In [68]:
!pip install scikit-surprise



In [101]:
import numpy as np
import pandas as pd
from surprise import SVD, SVDpp, NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [71]:
data = Dataset.load_builtin("ml-100k")

In [90]:
# default model SVD n_factors = 100, lr = 0.005
algo = SVD()

In [91]:
mean_mae = cross_validate(algo, data, cv=5, measures=['mae'], verbose=True)['test_mae'].mean().round(5)
print(f"Mean MAE = {mean_mae}")

Evaluating MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7361  0.7367  0.7379  0.7345  0.7400  0.7370  0.0018  
Fit time          1.27    1.84    1.67    1.28    1.27    1.47    0.24    
Test time         0.27    0.25    0.13    0.26    0.15    0.21    0.06    
Mean MAE = 0.73704


In [74]:
param_grid = {
    'n_factors': [3, 5, 10, 50],
    'n_epochs': [10, 20, 30],
    'lr_all': [0.005, 0.01, 0.05]
    }
model_gs =  GridSearchCV(algo_class=SVD, param_grid=param_grid, measures=['mae'], joblib_verbose = 1)

In [82]:
model_gs.fit(data)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   40.7s


In [92]:
model_gs.best_params["mae"]

{'n_factors': 3, 'n_epochs': 30, 'lr_all': 0.01}

In [93]:
best_svd = model_gs.best_estimator["mae"]

In [87]:
cross_validate(best_svd, data, cv=5, measures=['mae'], verbose=True)['test_mae'].mean().round(5)

Evaluating MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7240  0.7293  0.7306  0.7174  0.7264  0.7255  0.0047  
Fit time          1.20    0.85    0.82    0.83    0.80    0.90    0.15    
Test time         0.40    0.16    0.26    0.27    0.23    0.26    0.08    


0.72553

# Conclusion about SVD:
MAE default SVD model: 0.7376 (lr_all=0.005, n_factor=100, n_epochs=20)

MAE best estimates SVD model 0.7275 (lr_all=0.01, n_factor=3, n_epochs=30)

Overfitting become after n_epochs>30 because std of cross validation grow up twice

Calculate about 2 minutes

In [80]:
# default model SVD++ n_factors = 20, lr = 0.007, n_epochs = 20
algo = SVDpp()
mean_mae = cross_validate(algo, data, cv=5, measures=['mae'], verbose=True)['test_mae'].mean().round(5)
print(f"Mean MAE = {mean_mae}")

Evaluating MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7241  0.7205  0.7181  0.7212  0.7224  0.7213  0.0020  
Fit time          27.76   27.38   27.30   27.67   27.75   27.57   0.19    
Test time         4.08    5.96    4.43    4.14    5.06    4.73    0.70    
Mean MAE = 0.72127


In [88]:
param_grid_pp = {
    'n_factors': [3, 5, 10, 20],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01, 0.05]
    }
model_gs_pp =  GridSearchCV(algo_class=SVDpp, param_grid=param_grid_pp, measures=['mae'], joblib_verbose = 10)

In [89]:
model_gs_pp.fit(data)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed: 13.4min


In [94]:
model_gs_pp.best_params["mae"]

{'n_factors': 10, 'n_epochs': 30, 'lr_all': 0.005}

In [96]:
best_svd_pp = model_gs_pp.best_estimator["mae"]

In [97]:
cross_validate(best_svd_pp, data, cv=5, measures=['mae'], verbose=True)['test_mae'].mean().round(5)

Evaluating MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7175  0.7232  0.7270  0.7209  0.7131  0.7203  0.0048  
Fit time          26.03   28.39   24.62   24.08   24.96   25.62   1.52    
Test time         4.50    6.30    5.99    5.25    4.65    5.34    0.71    


0.72032

# Conclusion about SVD++:
MAE default SVD++ model: 0.72127 (lr_all=0.007, n_factor=20, n_epochs=20)

MAE best estimates SVD++ model 0.72012 (lr_all=0.005, n_factor=10, n_epochs=30)

Calculate searchgrid about 50 minutes searchgrid and 3 cross-validation

In [98]:
# default model MNF n_factors=15, n_epochs=50
algo = NMF()

In [99]:
mean_mae = cross_validate(algo, data, cv=5, measures=['mae'], verbose=True)['test_mae'].mean().round(5)
print(f"Mean MAE = {mean_mae}")

Evaluating MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7565  0.7574  0.7567  0.7617  0.7623  0.7589  0.0025  
Fit time          2.64    2.44    1.90    1.91    1.88    2.15    0.32    
Test time         0.21    0.28    0.12    0.15    0.32    0.22    0.08    
Mean MAE = 0.7589


In [103]:
param_grid_nmf = {
    'n_factors': [10, 15, 20],
    'n_epochs': [40, 50, 75],
    'init_low': [0, 1, 4],
    'init_high': [5, 10]
    }
model_gs_nmf =  GridSearchCV(algo_class=NMF, param_grid=param_grid_nmf, measures=['mae'], joblib_verbose = 10)

In [104]:
model_gs_nmf.fit(data)

[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:    5.8s
[Parallel(n_jobs=1)]: Done   7 tasks      | elapsed:    9.0s
[Parallel(n_jobs=1)]: Done  12 tasks      | elapsed:   14.0s
[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:   20.7s
[Parallel(n_jobs=1)]: Done  24 tasks      | elapsed:   27.9s
[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:   37.3s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   55.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  60 tasks      | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done  71 tasks      | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  84 tasks      | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  97 tasks      | elapsed:  3.3min
[Parallel(n_jobs=1)]: Done 112 tasks      | elapsed:  3.6min
[Parallel(n_jobs=1)]: Done 127 tasks      | elapsed:  4.1min
[Parallel(n_jobs=1)]: Done 144 tasks      | elapsed:  4.7min
[Parallel(n_jobs=1)]: Do

In [111]:
model_gs_nmf.best_params["mae"]

{'n_factors': 20, 'n_epochs': 75, 'init_low': 0, 'init_high': 5}

In [108]:
best_nmf = model_gs_nmf.best_estimator["mae"]

In [109]:
cross_validate(best_svd, data, cv=5, measures=['mae'], verbose=True)['test_mae'].mean().round(5)

Evaluating MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7305  0.7250  0.7271  0.7269  0.7207  0.7261  0.0032  
Fit time          0.81    0.74    0.75    0.75    1.10    0.83    0.14    
Test time         0.15    0.22    0.29    0.14    0.24    0.21    0.06    


0.72606

# Conclusion about MNF:
MAE default MNF: 0.7589 (n_factor=15, n_epochs=50)

MAE best estimates MNF model 0.7261 (n_factors=20, n_epochs=75, init_low=0, init_high=5)

Calculate searchgrid about 10 minutes

# Conclusion:
The best result

1.   MAE_SVD++ = 0.72012 High accuracy but slow calculation
2.   MAE_MNF = 0.7261 Bad accuracy with default and medium accuracy after modify, difficult make overfitting
3.   MAE_SVD = 0.7275 Medium accuracy but fast calculating, easy overfitting


