In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from Modules.dataEngineering import dataEngineering
dataEngineering = dataEngineering()

In [2]:
df_ratings        = dataEngineering.loadRatings()['data']

In [17]:
df_ratings.head(5)

Unnamed: 0,UserID,MovieID,userRating,Title,Genres,meanRating
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92


In [4]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import SVD, KNNWithMeans, KNNBasic, KNNWithZScore

options_KNNwithMeans = {
    'bsl_options': {'learning_rate': 0.0005, 'method': 'sgd'},
    'sim_options': {'name': 'cosine', 'user_based': True}
}

option_SVD = {
    'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4
}

option_KNNBasic = {
    'k': 40,
    'sim_options': {'name': 'pearson', 'user_based': False},
    'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}
}

option_KNNWithZScore = {
    'k': 40,
    'sim_options': {'name': 'pearson', 'user_based': True},
    'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}
}

# Define the metrics you want to use for evaluation
metrics = ['rmse', 'mae']
fold = 5

In [5]:
# Create a Reader object
reader = Reader(rating_scale=(1, 5))

# Load the dataset using the Reader
data = Dataset.load_from_df(df_ratings[['UserID', 'MovieID', 'userRating']], reader)

In [6]:
# Define the models
model_SVD = SVD(
    n_epochs=option_SVD['n_epochs'], lr_all=option_SVD['lr_all'], reg_all=option_SVD['reg_all'],
)

model_KNNwithMeans = KNNWithMeans( k=40, 
    sim_options=options_KNNwithMeans['sim_options'], bsl_options=options_KNNwithMeans['bsl_options']
)

model_KNNBasic = KNNBasic(k=40, 
    sim_options=option_KNNBasic['sim_options'], bsl_options=option_KNNBasic['bsl_options'],
)

model_KNNWithZScore = KNNWithZScore(k=40, 
    sim_options=option_KNNWithZScore['sim_options'], bsl_options=option_KNNWithZScore['bsl_options'],
)



In [7]:
# Perform cross-validation for SVD
results_SVD = cross_validate(model_SVD, data, measures=metrics, cv=fold, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8879  0.8889  0.8913  0.8939  0.8904  0.8905  0.0021  
MAE (testset)     0.6882  0.6913  0.6909  0.6880  0.6867  0.6890  0.0018  
Fit time          0.75    0.79    0.71    0.68    0.68    0.72    0.04    
Test time         0.17    0.15    0.19    0.20    0.14    0.17    0.02    


In [8]:
# Perform cross-validation for KNN with Means
results_KNNwithMeans = cross_validate(model_KNNwithMeans, data, measures=metrics, cv=fold, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9002  0.9029  0.9017  0.8976  0.8984  0.9002  0.0020  
MAE (testset)     0.6877  0.6905  0.6875  0.6878  0.6877  0.6882  0.0012  
Fit time          0.61    0.56    0.60    0.60    0.64    0.60    0.02    
Test time         1.47    1.42    1.45    1.53    1.47    1.47    0.03    


In [9]:
results_KNNBasic = cross_validate(model_KNNBasic, data, measures=metrics, cv=fold, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9692  0.9691  0.9668  0.9717  0.9733  0.9700  0.0023  
MAE (testset)     0.7528  0.7508  0.7535  0.7557  0.7565  0.7538  0.0021  
Fit time          11.28   13.30   11.03   11.14   10.65   11.48   0.93    
Test time         8.30    8.16    8.21    8.09    8.01    8.16    0.10    


In [10]:
results_KNNWithZScore = cross_validate(model_KNNWithZScore, data, measures=metrics, cv=fold, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8932  0.8926  0.8972  0.8963  0.8812  0.8921  0.0057  
MAE (testset)     0.6749  0.6742  0.6791  0.6785  0.6665  0.6746  0.0045  
Fit time          0.73    0.77    0.79    0.90    0.83    0.80    0.06    
Test time         1.55    1.45    1.58    1.43    1.55    1.51    0.06    


In [11]:
def displatFold(results, name):
    print(name+':')
    print("\tRMSE (testset)\tMAE (testset)\tFit time\tTest time")
    for fold_idx, (rmse, mae, fit_time, test_time) in enumerate(zip(results['test_rmse'], results['test_mae'], results['fit_time'], results['test_time']), start=1):
        print("Fold %d: %.4f\t\t%.4f\t\t%.4f\t\t%.4f"%(fold_idx, rmse, mae, fit_time, test_time))

In [12]:
displatFold(results_KNNwithMeans, 'KNNwithMeans Results')

KNNwithMeans Results:
	RMSE (testset)	MAE (testset)	Fit time	Test time
Fold 1: 0.9002		0.6877		0.6100		1.4681
Fold 2: 0.9029		0.6905		0.5640		1.4231
Fold 3: 0.9017		0.6875		0.5980		1.4501
Fold 4: 0.8976		0.6878		0.6012		1.5291
Fold 5: 0.8984		0.6877		0.6355		1.4741


In [13]:
displatFold(results_SVD, 'SVD Results')

SVD Results:
	RMSE (testset)	MAE (testset)	Fit time	Test time
Fold 1: 0.8879		0.6882		0.7501		0.1700
Fold 2: 0.8889		0.6913		0.7921		0.1490
Fold 3: 0.8913		0.6909		0.7101		0.1940
Fold 4: 0.8939		0.6880		0.6841		0.2000
Fold 5: 0.8904		0.6867		0.6840		0.1370


In [14]:
displatFold(results_KNNBasic, 'KNNBasic Results')

KNNBasic Results:
	RMSE (testset)	MAE (testset)	Fit time	Test time
Fold 1: 0.9692		0.7528		11.2788		8.2966
Fold 2: 0.9691		0.7508		13.2970		8.1611
Fold 3: 0.9668		0.7535		11.0348		8.2136
Fold 4: 0.9717		0.7557		11.1408		8.0938
Fold 5: 0.9733		0.7565		10.6458		8.0126


In [15]:
displatFold(results_KNNWithZScore, 'KNNWithZScore Results')

KNNWithZScore Results:
	RMSE (testset)	MAE (testset)	Fit time	Test time
Fold 1: 0.8932		0.6749		0.7271		1.5521
Fold 2: 0.8926		0.6742		0.7701		1.4521
Fold 3: 0.8972		0.6791		0.7851		1.5811
Fold 4: 0.8963		0.6785		0.9011		1.4331
Fold 5: 0.8812		0.6665		0.8271		1.5491
