1. SVD

In [1]:
from surprise import Dataset
# Use movielens-100K
data = Dataset.load_builtin("ml-100k")

In [2]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=0.2)

In [3]:
trainset_users = set(trainset.all_users())
trainset_items = set(trainset.all_items())
testset_users = set(testset[x][0] for x in range(len(testset)))
testset_items = set(testset[x][1] for x in range(len(testset)))

# Print the number of unique user IDs and item IDs in the trainset and testset
print("Number of users in trainset: ", len(trainset_users))
print("Number of items in trainset: ", len(trainset_items))
print("Number of users in testset: ", len(testset_users))
print("Number of items in testset: ", len(testset_items))

Number of users in trainset:  943
Number of items in trainset:  1655
Number of users in testset:  943
Number of items in testset:  1410


In [4]:
for i, (user_id, item_id, rating) in enumerate(trainset.all_ratings()):
    print(f'user_id: {user_id}, item_id: {item_id}, rating: {rating}')
    if i >= 4:
        break

user_id: 0, item_id: 0, rating: 3.0
user_id: 0, item_id: 207, rating: 5.0
user_id: 0, item_id: 614, rating: 4.0
user_id: 0, item_id: 668, rating: 5.0
user_id: 0, item_id: 626, rating: 4.0


In [5]:
for i, (user_id, item_id, rating) in enumerate(testset):
    print(f'user_id: {user_id}, item_id: {item_id}, rating: {rating}')
    if i >= 4:
        break

user_id: 483, item_id: 313, rating: 2.0
user_id: 22, item_id: 1000, rating: 3.0
user_id: 793, item_id: 405, rating: 3.0
user_id: 456, item_id: 222, rating: 2.0
user_id: 896, item_id: 966, rating: 4.0


In [6]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [50, 100, 150], "n_epochs": [15, 20, 25], "lr_all": [0.002, 0.004, 0.006], "reg_all": [0.01, 0.02, 0.03]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)

gs.fit(data)

# best RMSE and MAE score
print(gs.best_score["rmse"])
print(gs.best_score["mae"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

0.9299554607390046
0.7313789386080696
{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.03}


In [7]:
from surprise.model_selection import cross_validate
svd_algo = SVD(n_factors=50, n_epochs=25, lr_all=0.006, reg_all=0.03)

# Run 5-fold cross-validation and print results.
cross_validate(svd_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9261  0.9329  0.9332  0.9269  0.9320  0.9302  0.0031  
MAE (testset)     0.7290  0.7346  0.7332  0.7285  0.7341  0.7319  0.0026  
Fit time          0.80    0.86    0.86    0.84    0.84    0.84    0.02    
Test time         0.15    0.20    0.14    0.19    0.12    0.16    0.03    


{'test_rmse': array([0.92611778, 0.93289917, 0.93315091, 0.92685046, 0.9319569 ]),
 'test_mae': array([0.72904509, 0.73459753, 0.73323626, 0.72851411, 0.7340828 ]),
 'fit_time': (0.7966892719268799,
  0.8596572875976562,
  0.8594143390655518,
  0.8435530662536621,
  0.8435530662536621),
 'test_time': (0.14855623245239258,
  0.20307588577270508,
  0.14059185981750488,
  0.1874549388885498,
  0.1249699592590332)}

In [8]:
from surprise import accuracy
import time
start_time = time.time()
svd_algo.fit(trainset)
# Test the algorithm on the test set
svd_predict = svd_algo.test(testset)
end_time = time.time()

# Compute and print the RMSE and MAE scores
rmse = accuracy.rmse(svd_predict)
mae = accuracy.mae(svd_predict)
print ('Running time: ', end_time - start_time)

RMSE: 0.9300
MAE:  0.7326
Running time:  1.0506892204284668


2. Random

In [9]:
from surprise import NormalPredictor
random_algo = NormalPredictor()

# Run 5-fold cross-validation and print results.
cross_validate(random_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5188  1.5238  1.5096  1.5156  1.5321  1.5200  0.0076  
MAE (testset)     1.2200  1.2216  1.2156  1.2156  1.2285  1.2203  0.0048  
Fit time          0.08    0.09    0.15    0.10    0.14    0.11    0.03    
Test time         0.15    0.09    0.13    0.09    0.15    0.12    0.03    


{'test_rmse': array([1.51884098, 1.52379018, 1.50962558, 1.51557709, 1.53210444]),
 'test_mae': array([1.22004762, 1.22161062, 1.21556831, 1.21559523, 1.22851889]),
 'fit_time': (0.07810473442077637,
  0.09151077270507812,
  0.14562129974365234,
  0.09586668014526367,
  0.13865423202514648),
 'test_time': (0.15069293975830078,
  0.09009027481079102,
  0.1309680938720703,
  0.09372830390930176,
  0.14760613441467285)}

In [10]:
start_time = time.time()
random_algo.fit(trainset)
# Test the algorithm on the test set
random_predict = random_algo.test(testset)
end_time = time.time()

# Compute and print the RMSE and MAE scores
rmse = accuracy.rmse(random_predict)
mae = accuracy.mae(random_predict)
print ('Running time: ', end_time - start_time)

RMSE: 1.5196
MAE:  1.2151
Running time:  0.15746688842773438


3. KNN

In [11]:
from surprise import KNNBasic

param_grid = {"k": [40, 80, 120, 160, 200]}
gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse", "mae"], cv=5)

gs.fit(data)

# best RMSE and MAE score
print(gs.best_score["rmse"])
print(gs.best_score["mae"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [12]:
knn_algo = KNNBasic(k=40)

# Run 5-fold cross-validation and print results.
cross_validate(knn_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9733  0.9745  0.9754  0.9847  0.9841  0.9784  0.0049  
MAE (testset)     0.7687  0.7715  0.7709  0.7773  0.7755  0.7728  0.0031  
Fit time          0.44    0.47    0.47    0.45    0.45    0.46    0.01    
Test time         3.36    3.30    3.32    3.22    3.29    3.30    0.05    


{'test_rmse': array([0.97332174, 0.97445508, 0.97543347, 0.98467607, 0.9841351 ]),
 'test_mae': array([0.7686914 , 0.77154319, 0.77093742, 0.7773245 , 0.77546492]),
 'fit_time': (0.43740105628967285,
  0.4686408042907715,
  0.46863746643066406,
  0.4530189037322998,
  0.4530184268951416),
 'test_time': (3.3554916381835938,
  3.3029673099517822,
  3.3231208324432373,
  3.2188596725463867,
  3.291860818862915)}

In [13]:
start_time = time.time()
knn_algo.fit(trainset)
# Test the algorithm on the test set
knn_predict = knn_algo.test(testset)
end_time = time.time()

# Compute and print the RMSE and MAE scores
rmse = accuracy.rmse(knn_predict)
mae = accuracy.mae(knn_predict)
print ('Running time: ', end_time - start_time)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9825
MAE:  0.7762
Running time:  3.7444703578948975


4. Matrix Factorization with Regulation

In [14]:
import numpy as np

# Convert the trainset to a numpy array
trainset_np = np.zeros((trainset.n_users, trainset.n_items))
for (uid, iid, rating) in trainset.all_ratings():
    trainset_np[uid-1, iid-1] = rating

# Convert the testset to a numpy array
testset_np = np.zeros((len(testset), 3))
# Iterate over the testset ratings and store them in the numpy array
for i, (uid, iid, rating) in enumerate(testset):
    testset_np[i, :] = [int(uid)-1, int(iid)-1, rating]

In [15]:
import numpy as np
class MatrixFactorization:
    def __init__(self, n_factors, n_epochs, lr, reg):
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg = reg
        
    def fit(self, trainset):
        # Initialize biases and latent factor matrices
        self.global_bias = np.mean([r for (_, _, r) in trainset.all_ratings()])
        self.user_bias = np.zeros(trainset.n_users)
        self.item_bias = np.zeros(trainset.n_items)
        self.user_mat = np.random.normal(size=(trainset.n_users, self.n_factors))
        self.item_mat = np.random.normal(size=(trainset.n_items, self.n_factors))
        
        for epoch in range(self.n_epochs):
            # Update biases and latent factor matrices using training set
            for (uid, iid, rating) in trainset.all_ratings():
                # Predict rating
                pred = self.test(uid, iid)
                
                # Calculate error
                err = rating - pred
                
                # Update biases
                self.global_bias += self.lr * err
                self.user_bias[uid] += self.lr * (err - self.reg * self.user_bias[uid])
                self.item_bias[iid] += self.lr * (err - self.reg * self.item_bias[iid])
                
                # Update latent factor matrices
                self.user_mat[uid] += self.lr * (err * self.item_mat[iid] - self.reg * self.user_mat[uid])
                self.item_mat[iid] += self.lr * (err * self.user_mat[uid] - self.reg * self.item_mat[iid])
                
            # Print RMSE on training set
            train_preds = [self.test(uid, iid) for (uid, iid, _) in trainset.all_ratings()]
            train_rmse = np.sqrt(np.mean([(r_ui - r_pred) ** 2 for (_, _, r_ui), r_pred in zip(trainset.all_ratings(), train_preds)]))
            #print("Epoch", epoch, ":", "train rmse =", train_rmse)
                
    def test(self, uid, iid):
        # Convert user and item IDs to integers
        uid = int(uid)
        iid = int(iid)
        
        if uid >= self.user_mat.shape[0] or iid >= self.item_mat.shape[0]:
            return 3.0
        
        # Calculate predicted rating
        pred = self.global_bias + self.user_bias[uid] + self.item_bias[iid] + np.dot(self.user_mat[uid], self.item_mat[iid])
        
        # Clip predicted rating to [1, 5] range
        pred = np.clip(pred, 1, 5)
        
        return pred
    
    def calculate_rmse(self, testset):
        rmse = 0
        for (uid, iid, rating) in testset:
            pred = self.test(uid, iid)
            rmse += (rating - pred) ** 2
        rmse = np.sqrt(rmse / len(testset))
        return rmse
    
    def calculate_mae(self, testset):
        mae = 0
        for (uid, iid, rating) in testset:
            pred = self.test(uid, iid)
            mae += abs(rating - pred)
        mae = mae / len(testset)
        return mae


In [16]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_factors': [50, 100, 200],
              'n_epochs': [10, 20, 30],
              'lr_all': [0.001, 0.01, 0.1],
              'reg_all': [0.001, 0.01, 0.1]}
# Use SVD in hyperparameter tuning as alternative because SVD is a subcategory of Matrix Factorization
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)

gs.fit(data)

# best RMSE and MAE score
print(gs.best_score["rmse"])
print(gs.best_score["mae"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

0.9110555778704447
0.7205422439812971
{'n_factors': 200, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [17]:
start_time = time.time()
mf_algo = MatrixFactorization(n_factors=200, n_epochs=30, lr=0.01, reg=0.1)
mf_algo.fit(trainset)
mf_predicts = [(uid, iid, r_ui, mf_algo.test(uid, iid)) for uid, iid, r_ui in testset]
#mf_predict = [mf_algo.test(uid, iid) for (uid, iid, _) in testset]
end_time = time.time()

rmse = mf_algo.calculate_rmse(testset)
mae = mf_algo.calculate_mae(testset)

print('RMSE:', rmse)
print('MAE:', mae)
print ('Running time: ', end_time - start_time)

RMSE: 1.511873742425185
MAE: 1.2016523866438047
Running time:  180.71247673034668


5. Deep neural network

In [18]:
import pandas as pd

trainset_ratings = [{'user_id': r[0], 'item_id': r[1], 'rating': r[2]} for r in trainset.all_ratings()]
trainset_df = pd.DataFrame(trainset_ratings, columns=['user_id', 'item_id', 'rating'])

testset_ratings = [{'user_id': r[0], 'item_id': r[1], 'rating': r[2]} for r in testset]
testset_df = pd.DataFrame(testset_ratings, columns=['user_id', 'item_id', 'rating'])

In [19]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout

def create_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units1', min_value=32, max_value=512, step=32), activation='relu', input_shape=(2,)))
    model.add(Dropout(hp.Float('dropout1', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(units=hp.Int('units2', min_value=32, max_value=256, step=32), activation='relu'))
    model.add(Dropout(hp.Float('dropout2', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='linear'))
    model.compile(
        optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'])
    return model

In [20]:
from keras_tuner import RandomSearch

tuner = RandomSearch(
    create_model,
    objective='val_mean_absolute_error',
    max_trials=5,
    executions_per_trial=3,
    directory='tuner_results',
    project_name='neural_recommender')

INFO:tensorflow:Reloading Tuner from tuner_results\neural_recommender\tuner0.json


In [21]:
tuner.search(
    x=[trainset_df['user_id'], trainset_df['item_id']],
    y=trainset_df['rating'],
    epochs=10,
    validation_data=(
        [testset_df['user_id'], testset_df['item_id']],
        testset_df['rating']
    ))

INFO:tensorflow:Oracle triggered exit


In [32]:
# Retrieve the best hyperparameters
best_hp = tuner.get_best_hyperparameters(1)[0]

# Print out the best hyperparameters
print(f"Best units1: {best_hp.get('units1')}")
print(f"Best dropout1: {best_hp.get('dropout1')}")
print(f"Best units2: {best_hp.get('units2')}")
print(f"Best dropout2: {best_hp.get('dropout2')}")
print(f"Best learning_rate: {best_hp.get('learning_rate')}")

Best units1: 512
Best dropout1: 0.1
Best units2: 128
Best dropout2: 0.30000000000000004
Best learning_rate: 0.01


In [78]:
from keras import backend as K
# Define the number of factors for the model
n_factors = 50

# Define the model architecture with the best hyperparameters
dnn_algo = Sequential([
    Dense(512, activation='relu', input_shape=(2,)),
    Dropout(0.1),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='linear')
])

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Compile the model with the specified loss function, optimizer, and metrics
dnn_algo.compile(loss=root_mean_squared_error, optimizer='adam')

In [101]:
from keras.callbacks import EarlyStopping

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

In [87]:
from surprise.model_selection import train_test_split as tts_surprise
# Load the MovieLens 100K dataset
data = Dataset.load_builtin("ml-100k")

# Split the data into training and test sets
trainset, testset = tts_surprise(data, test_size=0.2)

# Convert the training and test data to numpy arrays
X_train = np.array([(x[0], x[1]) for x in trainset.all_ratings()])
y_train = np.array([x[2] for x in trainset.all_ratings()])
X_test = np.array([(x[0], x[1]) for x in testset])
y_test = np.array([x[2] for x in testset])

# Print the shapes of the training and test data
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (80000, 2)
y_train shape: (80000,)
X_test shape: (20000, 2)
y_test shape: (20000,)


In [100]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

start_time = time.time()
# Train the model on the training data
history = dnn_algo.fit(X_train, y_train, batch_size=128, epochs=50,
                       validation_split=0.1, callbacks=[early_stopping])
# Use the trained model to make predictions on the test data
X_test = X_test.astype('int32')
dnn_predicts = dnn_algo.predict(X_test)
end_time = time.time()

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

print ('Running time: ', end_time - start_time)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
RMSE: 1.1128241033207071
MAE: 0.9356616813659668
Running time:  13.352808713912964


Cross RMSE between models

In [104]:
rmse1 = accuracy.rmse(svd_predict, random_predict)
print("SVD - Random RMSE = ", rmse1)
rmse2 = accuracy.rmse(svd_predict, knn_predict)
print("SVD - kNN RMSE = ", rmse2)
rmse3 = accuracy.rmse(random_predict, knn_predict)
print("Random - kNN RMSE = ", rmse3)

RMSE: 0.9300
SVD - Random RMSE =  0.9299628516573654
RMSE: 0.9300
SVD - kNN RMSE =  0.9299628516573654
RMSE: 1.5196
Random - kNN RMSE =  1.5196434977773465


In [111]:
from sklearn.metrics import mean_squared_error
mf_predicts_np_arr = np.array([x[3] for x in mf_predicts])
rmse4 = np.sqrt(mean_squared_error(mf_predicts_np_arr, dnn_predicts.flatten()))
print("Matrix Factorization - Deep Neural Network = ", rmse4)

Matrix Factorization - Deep Neural Network =  1.043538010413047


In [120]:
# Get the actual ratings
random_predict_np_arr = np.array([pred.est for pred in random_predict])
knn_predict_np_arr = np.array([pred.est for pred in knn_predict])
svd_predict_np_arr = np.array([pred.est for pred in svd_predict])

In [124]:
rmse5 = np.sqrt(mean_squared_error(mf_predicts_np_arr, random_predict_np_arr))
print("Matrix Factorization - Random = ", rmse5)
rmse6 = np.sqrt(mean_squared_error(mf_predicts_np_arr, knn_predict_np_arr))
print("Matrix Factorization - kNN = ", rmse6)
rmse7 = np.sqrt(mean_squared_error(mf_predicts_np_arr, svd_predict_np_arr))
print("Matrix Factorization - SVD = ", rmse7)

Matrix Factorization - Random =  1.4345210804525301
Matrix Factorization - kNN =  1.1512531272913682
Matrix Factorization - SVD =  1.2030992517865915


In [126]:
rmse8 = np.sqrt(mean_squared_error(random_predict_np_arr, dnn_predicts.flatten()))
print("Random - Deep Neural Network = ", rmse8)
rmse9 = np.sqrt(mean_squared_error(knn_predict_np_arr, dnn_predicts.flatten()))
print("kNN - Deep Neural Network = ", rmse9)
rmse10 = np.sqrt(mean_squared_error(svd_predict_np_arr, dnn_predicts.flatten()))
print("SVD - Deep Neural Network = ", rmse10)

Random - Deep Neural Network =  1.0335376833910799
kNN - Deep Neural Network =  0.5761915233128835
SVD - Deep Neural Network =  0.6782341744545051
