In [38]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

#cimport numpy as np # noqa
import numpy as np

from surprise import Reader, AlgoBase, PredictionImpossible
from surprise import Dataset, SVD
from surprise.utils import get_rng

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

from surprise import accuracy

import pandas as pd
import os
import time
import math

from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, precision_score, recall_score
from math import sqrt

from surprise import CoSVDv9, CoSVDv5, SVDpp, SVD

In [39]:
data_source = 'ml-latest-small' # 100k MovieLens dataset 2016
#data_source = 'mlsmall' # 100k MovieLens dataset 2018
#data_source = 'ml10M100K' # 10M MovieLens dataset

reader = Reader()
path = os.path.join('../','Data',data_source)
rate = pd.read_csv(path+'/ratings.csv')
raw_tags = pd.read_csv(path+'/tags.csv', encoding='utf-8')

data = Dataset.load_from_df(rate[['userId', 'movieId', 'rating']], reader)

cv = KFold(n_splits=10, random_state=123)

In [40]:
m_list=['RMSE', 'MAE', 'PREC_5', 'REC_5', 'NDCG_5', 'PREC_10', 'REC_10', 'NDCG_10', 'PREC_15', 'REC_15', 'NDCG_15']

### CoSVD (F=40)

In [17]:
algo = CoSVDv9(verbose=False, n_epochs=65, lr_all=0.0028, n_factors=40, tags=raw_tags, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm CoSVDv9 on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8928  0.8845  0.8732  0.8734  0.8767  0.8791  0.8811  0.8862  0.8822  0.8834  0.8813  0.0057  
MAE (testset)     0.6853  0.6801  0.6723  0.6707  0.6694  0.6735  0.6742  0.6790  0.6776  0.6773  0.6759  0.0046  
NDCG_5 (testset)  0.8318  0.8279  0.8343  0.8307  0.8391  0.8348  0.8314  0.8339  0.8272  0.8288  0.8320  0.0035  
NDCG_10 (testset) 0.8693  0.8660  0.8685  0.8650  0.8728  0.8690  0.8672  0.8690  0.8626  0.8656  0.8675  0.0027  
NDCG_15 (testset) 0.8851  0.8825  0.8846  0.8815  0.8891  0.8853  0.8829  0.8855  0.8788  0.8814  0.8837  0.0027  
PREC_5 (testset)  0.3577  0.3555  0.3488  0.3585  0.3563  0.3627  0.3653  0.3578  0.3633  0.3588  0.3585  0.0044  
PREC_10 (testset) 0.3556  0.3526  0.3479  0.3546  0.3539  0.3

### CoSVD (F=30)

In [18]:
algo = CoSVDv9(verbose=False, n_epochs=65, lr_all=0.0028, n_factors=30, tags=raw_tags, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm CoSVDv9 on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8880  0.8900  0.8745  0.8749  0.8781  0.8814  0.8821  0.8889  0.8830  0.8761  0.8817  0.0055  
MAE (testset)     0.6814  0.6846  0.6729  0.6682  0.6719  0.6745  0.6758  0.6806  0.6799  0.6734  0.6763  0.0049  
NDCG_5 (testset)  0.8399  0.8256  0.8325  0.8288  0.8354  0.8330  0.8249  0.8348  0.8317  0.8336  0.8320  0.0043  
NDCG_10 (testset) 0.8755  0.8615  0.8648  0.8664  0.8707  0.8653  0.8631  0.8683  0.8642  0.8699  0.8670  0.0040  
NDCG_15 (testset) 0.8908  0.8790  0.8808  0.8810  0.8875  0.8811  0.8783  0.8847  0.8827  0.8854  0.8831  0.0037  
PREC_5 (testset)  0.3588  0.3617  0.3356  0.3520  0.3601  0.3554  0.3498  0.3578  0.3605  0.3675  0.3559  0.0083  
PREC_10 (testset) 0.3529  0.3565  0.3321  0.3507  0.3568  0.3

### CoSVD(F=20)

In [19]:
algo = CoSVDv9(verbose=False, n_epochs=65, lr_all=0.0028, n_factors=20, tags=raw_tags, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm CoSVDv9 on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8894  0.8879  0.8745  0.8764  0.8801  0.8821  0.8833  0.8915  0.8858  0.8812  0.8832  0.0052  
MAE (testset)     0.6837  0.6819  0.6731  0.6698  0.6735  0.6754  0.6744  0.6845  0.6800  0.6765  0.6773  0.0047  
NDCG_5 (testset)  0.8379  0.8230  0.8309  0.8297  0.8335  0.8349  0.8207  0.8384  0.8297  0.8299  0.8309  0.0055  
NDCG_10 (testset) 0.8714  0.8626  0.8652  0.8651  0.8692  0.8677  0.8588  0.8706  0.8628  0.8679  0.8661  0.0038  
NDCG_15 (testset) 0.8877  0.8806  0.8820  0.8806  0.8851  0.8837  0.8758  0.8873  0.8803  0.8835  0.8826  0.0034  
PREC_5 (testset)  0.3712  0.3473  0.3244  0.3524  0.3545  0.3756  0.3429  0.3501  0.3539  0.3566  0.3529  0.0135  
PREC_10 (testset) 0.3677  0.3447  0.3214  0.3512  0.3527  0.3

### SVD(F=40)

In [20]:
algo = SVD(verbose=False, n_epochs=60, lr_all=0.008, reg_all=0.09, n_factors=40, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8834  0.8751  0.8625  0.8624  0.8678  0.8667  0.8717  0.8761  0.8714  0.8710  0.8708  0.0061  
MAE (testset)     0.6770  0.6736  0.6629  0.6615  0.6640  0.6646  0.6682  0.6722  0.6705  0.6679  0.6682  0.0048  
NDCG_5 (testset)  0.8363  0.8282  0.8412  0.8392  0.8389  0.8407  0.8371  0.8373  0.8335  0.8328  0.8365  0.0038  
NDCG_10 (testset) 0.8739  0.8691  0.8730  0.8708  0.8746  0.8735  0.8711  0.8726  0.8669  0.8682  0.8714  0.0025  
NDCG_15 (testset) 0.8899  0.8858  0.8882  0.8866  0.8901  0.8893  0.8860  0.8884  0.8817  0.8837  0.8870  0.0026  
PREC_5 (testset)  0.3534  0.3507  0.3362  0.3590  0.3375  0.3548  0.3618  0.3439  0.3483  0.3506  0.3496  0.0080  
PREC_10 (testset) 0.3512  0.3481  0.3338  0.3569  0.3366  0.3517 

### SVD(F=30)

In [21]:
algo = SVD(verbose=False, n_epochs=60, lr_all=0.008, reg_all=0.09, n_factors=30, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8825  0.8757  0.8654  0.8658  0.8705  0.8679  0.8729  0.8776  0.8728  0.8670  0.8718  0.0053  
MAE (testset)     0.6772  0.6752  0.6654  0.6614  0.6659  0.6648  0.6684  0.6739  0.6710  0.6653  0.6688  0.0050  
NDCG_5 (testset)  0.8359  0.8298  0.8394  0.8352  0.8350  0.8422  0.8300  0.8376  0.8357  0.8372  0.8358  0.0036  
NDCG_10 (testset) 0.8722  0.8675  0.8700  0.8693  0.8723  0.8735  0.8653  0.8700  0.8679  0.8750  0.8703  0.0028  
NDCG_15 (testset) 0.8886  0.8852  0.8852  0.8843  0.8872  0.8889  0.8813  0.8856  0.8845  0.8896  0.8860  0.0024  
PREC_5 (testset)  0.3434  0.3426  0.3386  0.3449  0.3545  0.3647  0.3433  0.3554  0.3552  0.3587  0.3501  0.0082  
PREC_10 (testset) 0.3406  0.3395  0.3335  0.3426  0.3505  0.3604 

### SVD(F=20)

In [22]:
algo = SVD(verbose=False, n_epochs=60, lr_all=0.008, reg_all=0.09, n_factors=20, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8830  0.8779  0.8674  0.8652  0.8730  0.8691  0.8751  0.8814  0.8772  0.8716  0.8741  0.0056  
MAE (testset)     0.6783  0.6759  0.6668  0.6607  0.6679  0.6661  0.6694  0.6769  0.6730  0.6686  0.6704  0.0053  
NDCG_5 (testset)  0.8381  0.8299  0.8349  0.8361  0.8355  0.8403  0.8308  0.8353  0.8330  0.8356  0.8350  0.0030  
NDCG_10 (testset) 0.8741  0.8662  0.8697  0.8698  0.8713  0.8704  0.8658  0.8697  0.8654  0.8720  0.8694  0.0027  
NDCG_15 (testset) 0.8904  0.8837  0.8854  0.8850  0.8875  0.8871  0.8806  0.8863  0.8820  0.8877  0.8856  0.0027  
PREC_5 (testset)  0.3699  0.3341  0.3407  0.3502  0.3480  0.3677  0.3407  0.3481  0.3549  0.3493  0.3504  0.0108  
PREC_10 (testset) 0.3651  0.3309  0.3394  0.3472  0.3437  0.3655 

## SVD++(F=40)

In [41]:
algo = SVDpp(verbose=False, n_epochs=45, lr_all=0.0012, reg_all=0.0012, n_factors=40, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8966  0.8878  0.8770  0.8787  0.8814  0.8827  0.8884  0.8884  0.8898  0.8868  0.8858  0.0055  
MAE (testset)     0.6875  0.6817  0.6725  0.6724  0.6741  0.6760  0.6793  0.6803  0.6799  0.6787  0.6782  0.0044  
PREC_5 (testset)  0.3451  0.3084  0.3045  0.3321  0.3272  0.3340  0.3381  0.3254  0.3371  0.3328  0.3285  0.0122  
REC_5 (testset)   0.1817  0.1656  0.1655  0.1803  0.1767  0.1794  0.1765  0.1698  0.1849  0.1814  0.1762  0.0065  
NDCG_5 (testset)  0.8368  0.8236  0.8347  0.8304  0.8372  0.8319  0.8228  0.8272  0.8279  0.8311  0.8303  0.0048  
PREC_10 (testset) 0.3410  0.3074  0.3009  0.3294  0.3253  0.3317  0.3339  0.3245  0.3316  0.3271  0.3253  0.0116  
REC_10 (testset)  0.2059  0.1948  0.1908  0.2058  0.2045  0.204

## SVD++(F=30)

In [42]:
algo = SVDpp(verbose=False, n_epochs=45, lr_all=0.0012, reg_all=0.0012, n_factors=30, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8920  0.8916  0.8755  0.8788  0.8817  0.8844  0.8855  0.8911  0.8861  0.8810  0.8848  0.0054  
MAE (testset)     0.6841  0.6849  0.6750  0.6710  0.6750  0.6767  0.6762  0.6813  0.6804  0.6763  0.6781  0.0042  
PREC_5 (testset)  0.3281  0.3212  0.3023  0.3380  0.3363  0.3337  0.3411  0.3238  0.3238  0.3344  0.3283  0.0108  
REC_5 (testset)   0.1734  0.1748  0.1699  0.1880  0.1853  0.1778  0.1804  0.1710  0.1877  0.1767  0.1785  0.0063  
NDCG_5 (testset)  0.8387  0.8276  0.8364  0.8297  0.8326  0.8308  0.8254  0.8294  0.8266  0.8294  0.8307  0.0040  
PREC_10 (testset) 0.3222  0.3187  0.2992  0.3373  0.3339  0.3297  0.3353  0.3191  0.3190  0.3286  0.3243  0.0107  
REC_10 (testset)  0.1967  0.2037  0.1958  0.2119  0.2121  0.202

## SVD++(F=20)

In [43]:
algo = SVDpp(verbose=False, n_epochs=45, lr_all=0.0012, reg_all=0.0012, n_factors=20, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8925  0.8875  0.8766  0.8787  0.8799  0.8853  0.8853  0.8918  0.8889  0.8795  0.8846  0.0054  
MAE (testset)     0.6861  0.6821  0.6743  0.6702  0.6734  0.6777  0.6755  0.6833  0.6820  0.6742  0.6779  0.0049  
PREC_5 (testset)  0.3324  0.3254  0.3008  0.3250  0.3130  0.3157  0.3123  0.3244  0.3192  0.3372  0.3205  0.0101  
REC_5 (testset)   0.1722  0.1695  0.1653  0.1819  0.1734  0.1697  0.1686  0.1698  0.1836  0.1805  0.1734  0.0060  
NDCG_5 (testset)  0.8343  0.8278  0.8323  0.8291  0.8331  0.8360  0.8205  0.8332  0.8220  0.8301  0.8298  0.0049  
PREC_10 (testset) 0.3267  0.3204  0.2976  0.3228  0.3094  0.3117  0.3086  0.3203  0.3171  0.3323  0.3167  0.0096  
REC_10 (testset)  0.1972  0.1976  0.1908  0.2053  0.1989  0.195

## NMF(F=40)

In [44]:
algo = NMF(verbose=False, n_epochs=40, reg_pu=0.19, reg_qi=0.08, lr_bu=0.001, lr_bi=0.001, reg_bu=0.001, reg_bi=0.001, n_factors=40, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm NMF on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.9169  0.9151  0.9004  0.9059  0.9088  0.9044  0.9087  0.9144  0.9101  0.9029  0.9088  0.0052  
MAE (testset)     0.7042  0.7024  0.6947  0.6933  0.6960  0.6912  0.6983  0.7031  0.6998  0.6951  0.6978  0.0042  
PREC_5 (testset)  0.3322  0.3336  0.3136  0.3470  0.3428  0.3569  0.3363  0.3378  0.3549  0.3423  0.3397  0.0118  
REC_5 (testset)   0.1908  0.1866  0.1900  0.1958  0.1940  0.2025  0.1917  0.1879  0.2042  0.1932  0.1937  0.0055  
NDCG_5 (testset)  0.8247  0.8155  0.8234  0.8250  0.8233  0.8252  0.8185  0.8217  0.8173  0.8215  0.8216  0.0033  
PREC_10 (testset) 0.3307  0.3308  0.3105  0.3445  0.3428  0.3543  0.3338  0.3375  0.3504  0.3408  0.3376  0.0117  
REC_10 (testset)  0.2143  0.2111  0.2144  0.2192  0.2200  0.2281 

## NMF(F=30)

In [45]:
algo = NMF(verbose=False, n_epochs=40, reg_pu=0.19, reg_qi=0.08, lr_bu=0.001, lr_bi=0.001, reg_bu=0.001, reg_bi=0.001, n_factors=30, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm NMF on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.9215  0.9200  0.9035  0.9088  0.9136  0.9039  0.9124  0.9169  0.9164  0.9070  0.9124  0.0061  
MAE (testset)     0.7092  0.7089  0.6984  0.6977  0.7012  0.6947  0.7029  0.7087  0.7081  0.7009  0.7031  0.0051  
PREC_5 (testset)  0.3521  0.3123  0.3139  0.3328  0.3185  0.3431  0.3382  0.3342  0.3307  0.3356  0.3312  0.0122  
REC_5 (testset)   0.1892  0.1758  0.1768  0.1843  0.1787  0.1877  0.1847  0.1771  0.1932  0.1841  0.1832  0.0056  
NDCG_5 (testset)  0.8316  0.8125  0.8213  0.8193  0.8212  0.8288  0.8119  0.8232  0.8147  0.8225  0.8207  0.0061  
PREC_10 (testset) 0.3479  0.3100  0.3119  0.3308  0.3190  0.3394  0.3362  0.3323  0.3298  0.3332  0.3291  0.0114  
REC_10 (testset)  0.2106  0.1998  0.1997  0.2057  0.2017  0.2122 

## NMF(F=20)

In [46]:
algo = NMF(verbose=False, n_epochs=40, reg_pu=0.19, reg_qi=0.08, lr_bu=0.001, lr_bi=0.001, reg_bu=0.001, reg_bi=0.001, n_factors=20, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm NMF on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.9256  0.9241  0.9052  0.9121  0.9167  0.9090  0.9186  0.9226  0.9213  0.9089  0.9164  0.0068  
MAE (testset)     0.7175  0.7152  0.7030  0.7022  0.7060  0.7019  0.7116  0.7162  0.7140  0.7063  0.7094  0.0058  
PREC_5 (testset)  0.3341  0.3167  0.2974  0.3237  0.3144  0.3366  0.3084  0.3246  0.3127  0.3243  0.3193  0.0112  
REC_5 (testset)   0.1748  0.1673  0.1638  0.1752  0.1689  0.1745  0.1690  0.1704  0.1770  0.1640  0.1705  0.0045  
NDCG_5 (testset)  0.8269  0.8145  0.8223  0.8232  0.8202  0.8286  0.8130  0.8164  0.8171  0.8200  0.8202  0.0049  
PREC_10 (testset) 0.3326  0.3151  0.2963  0.3228  0.3129  0.3337  0.3076  0.3235  0.3117  0.3217  0.3178  0.0109  
REC_10 (testset)  0.1923  0.1887  0.1835  0.1956  0.1888  0.1958 