In [17]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

#cimport numpy as np # noqa
import numpy as np

from surprise import Reader, AlgoBase, PredictionImpossible
from surprise import Dataset, SVD, SVDpp, NMF
from surprise.utils import get_rng

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

from surprise import accuracy

import pandas as pd
import os
import time
import math

from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, precision_score, recall_score
from math import sqrt

from surprise import CoSVDv9, CoSVDv5

In [14]:
#data_source = 'ml-latest-small' # 100k MovieLens dataset 2016
#data_source = 'mlsmall' # 100k MovieLens dataset 2018
data_source = 'ml-10M100K' # 10M MovieLens dataset

reader = Reader()
path = os.path.join('../','Data',data_source)
rate = pd.read_csv(path+'/ratings.csv')
raw_tags = pd.read_csv(path+'/tags.csv', encoding='utf-8')

data = Dataset.load_from_df(rate[['userId', 'movieId', 'rating']], reader)

cv = KFold(n_splits=10, random_state=123)

In [15]:
m_list=['RMSE', 'MAE', 'PREC_5', 'REC_5', 'NDCG_5', 'PREC_10', 'REC_10', 'NDCG_10', 'PREC_15', 'REC_15', 'NDCG_15']

### CoSVD (F=40)

In [5]:
algo = CoSVDv9(verbose=False, n_epochs=65, lr_all=0.0028, n_factors=40, tags=raw_tags, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=4)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm CoSVDv9 on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.7815  0.7831  0.7830  0.7843  0.7839  0.7836  0.7842  0.7833  0.7829  0.7839  0.7834  0.0008  
MAE (testset)     0.6000  0.6007  0.6009  0.6020  0.6017  0.6012  0.6017  0.6012  0.6010  0.6014  0.6012  0.0005  
NDCG_5 (testset)  0.8742  0.8738  0.8746  0.8734  0.8732  0.8740  0.8739  0.8740  0.8740  0.8741  0.8739  0.0004  
NDCG_10 (testset) 0.9016  0.9012  0.9018  0.9009  0.9007  0.9012  0.9010  0.9013  0.9011  0.9014  0.9012  0.0003  
NDCG_15 (testset) 0.9133  0.9129  0.9134  0.9125  0.9123  0.9128  0.9128  0.9130  0.9128  0.9130  0.9129  0.0003  
PREC_5 (testset)  0.3994  0.3999  0.4010  0.3995  0.3984  0.3980  0.3996  0.4009  0.3999  0.4003  0.3997  0.0009  
PREC_10 (testset) 0.3952  0.3956  0.3966  0.3955  0.3940  0.3

### CoSVD (F=30)

In [8]:
algo = CoSVDv9(verbose=False, n_epochs=65, lr_all=0.0028, n_factors=30, tags=raw_tags, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm CoSVDv9 on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.7828  0.7838  0.7838  0.7850  0.7844  0.7844  0.7849  0.7840  0.7837  0.7849  0.7842  0.0007  
MAE (testset)     0.6010  0.6012  0.6015  0.6026  0.6021  0.6019  0.6023  0.6019  0.6018  0.6021  0.6019  0.0005  
NDCG_5 (testset)  0.8735  0.8734  0.8746  0.8732  0.8731  0.8734  0.8732  0.8734  0.8736  0.8737  0.8735  0.0004  
NDCG_10 (testset) 0.9011  0.9010  0.9017  0.9006  0.9006  0.9007  0.9005  0.9010  0.9008  0.9009  0.9009  0.0003  
NDCG_15 (testset) 0.9128  0.9127  0.9133  0.9123  0.9123  0.9124  0.9123  0.9128  0.9126  0.9126  0.9126  0.0003  
PREC_5 (testset)  0.3978  0.4000  0.3999  0.3986  0.3992  0.3971  0.3990  0.3991  0.3986  0.3980  0.3987  0.0009  
PREC_10 (testset) 0.3937  0.3958  0.3954  0.3946  0.3950  0.3

### CoSVD(F=20)

In [9]:
algo = CoSVDv9(verbose=False, n_epochs=65, lr_all=0.0028, n_factors=20, tags=raw_tags, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm CoSVDv9 on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.7845  0.7861  0.7865  0.7875  0.7870  0.7869  0.7874  0.7866  0.7860  0.7872  0.7866  0.0008  
MAE (testset)     0.6025  0.6032  0.6037  0.6048  0.6044  0.6040  0.6043  0.6041  0.6038  0.6042  0.6039  0.0006  
NDCG_5 (testset)  0.8726  0.8722  0.8728  0.8718  0.8721  0.8723  0.8722  0.8720  0.8725  0.8725  0.8723  0.0003  
NDCG_10 (testset) 0.9004  0.9000  0.9004  0.8996  0.8997  0.8999  0.8997  0.8998  0.8999  0.9000  0.8999  0.0003  
NDCG_15 (testset) 0.9122  0.9118  0.9122  0.9115  0.9116  0.9117  0.9117  0.9118  0.9118  0.9119  0.9118  0.0002  
PREC_5 (testset)  0.3957  0.3974  0.3975  0.3964  0.3963  0.3953  0.3950  0.3966  0.3966  0.3960  0.3963  0.0008  
PREC_10 (testset) 0.3915  0.3932  0.3932  0.3922  0.3918  0.3

### SVD(F=40)

In [6]:
algo = SVD(verbose=False, n_epochs=60, lr_all=0.008, reg_all=0.09, n_factors=40, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8157  0.8169  0.8166  0.8176  0.8172  0.8172  0.8176  0.8167  0.8161  0.8179  0.8170  0.0007  
MAE (testset)     0.6312  0.6315  0.6318  0.6323  0.6321  0.6319  0.6321  0.6317  0.6314  0.6322  0.6318  0.0004  
NDCG_5 (testset)  0.8619  0.8619  0.8628  0.8618  0.8618  0.8618  0.8621  0.8623  0.8624  0.8622  0.8621  0.0003  
NDCG_10 (testset) 0.8919  0.8918  0.8925  0.8917  0.8916  0.8916  0.8916  0.8921  0.8918  0.8918  0.8919  0.0003  
NDCG_15 (testset) 0.9050  0.9048  0.9054  0.9049  0.9046  0.9046  0.9048  0.9052  0.9047  0.9049  0.9049  0.0002  
PREC_5 (testset)  0.3565  0.3588  0.3595  0.3584  0.3599  0.3577  0.3584  0.3598  0.3597  0.3594  0.3588  0.0010  
PREC_10 (testset) 0.3530  0.3552  0.3559  0.3549  0.3563  0.3541 

### SVD(F=30)

In [None]:
algo = SVD(verbose=False, n_epochs=60, lr_all=0.008, reg_all=0.09, n_factors=30, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

### SVD(F=20)

In [11]:
algo = SVD(verbose=False, n_epochs=60, lr_all=0.008, reg_all=0.09, n_factors=20, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

Evaluating RMSE, MAE, NDCG_5, NDCG_10, NDCG_15, PREC_5, PREC_10, PREC_15, REC_5, REC_10, REC_15 of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8165  0.8177  0.8175  0.8184  0.8181  0.8180  0.8184  0.8176  0.8170  0.8188  0.8178  0.0006  
MAE (testset)     0.6319  0.6320  0.6324  0.6329  0.6328  0.6324  0.6327  0.6324  0.6320  0.6329  0.6324  0.0004  
NDCG_5 (testset)  0.8613  0.8614  0.8625  0.8613  0.8612  0.8615  0.8617  0.8619  0.8621  0.8617  0.8616  0.0004  
NDCG_10 (testset) 0.8914  0.8914  0.8923  0.8914  0.8912  0.8913  0.8912  0.8918  0.8916  0.8914  0.8915  0.0003  
NDCG_15 (testset) 0.9045  0.9044  0.9052  0.9045  0.9042  0.9044  0.9045  0.9049  0.9045  0.9046  0.9046  0.0003  
PREC_5 (testset)  0.3555  0.3591  0.3587  0.3575  0.3592  0.3569  0.3582  0.3595  0.3596  0.3594  0.3584  0.0013  
PREC_10 (testset) 0.3522  0.3554  0.3551  0.3541  0.3557  0.3534 

### SVD++(F=40)

In [18]:
algo = SVDpp(verbose=False, n_epochs=45, lr_all=0.0012, reg_all=0.0012, n_factors=40, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8030  0.8027  0.8028  0.8038  0.8038  0.8039  0.8038  0.8036  0.8031  0.8035  0.8034  0.0004  
MAE (testset)     0.6118  0.6115  0.6117  0.6123  0.6122  0.6123  0.6121  0.6125  0.6121  0.6121  0.6121  0.0003  
PREC_5 (testset)  0.4564  0.4592  0.4600  0.4589  0.4594  0.4573  0.4581  0.4586  0.4588  0.4600  0.4587  0.0011  
REC_5 (testset)   0.2644  0.2646  0.2650  0.2649  0.2644  0.2633  0.2630  0.2643  0.2635  0.2653  0.2643  0.0007  
NDCG_5 (testset)  0.8703  0.8701  0.8707  0.8701  0.8697  0.8706  0.8698  0.8697  0.8705  0.8704  0.8702  0.0003  
PREC_10 (testset) 0.4506  0.4532  0.4543  0.4529  0.4533  0.4515  0.4520  0.4527  0.4532  0.4542  0.4528  0.0011  
REC_10 (testset)  0.2989  0.2997  0.2998  0.3002  0.2998  0.298

### SVD++(F=30)

In [19]:
algo = SVDpp(verbose=False, n_epochs=45, lr_all=0.0012, reg_all=0.0012, n_factors=30, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8004  0.8005  0.8009  0.8020  0.8012  0.8013  0.8023  0.8012  0.8009  0.8015  0.8012  0.0006  
MAE (testset)     0.6101  0.6097  0.6100  0.6109  0.6104  0.6102  0.6111  0.6106  0.6102  0.6104  0.6104  0.0004  
PREC_5 (testset)  0.4541  0.4579  0.4583  0.4583  0.4576  0.4565  0.4571  0.4560  0.4567  0.4560  0.4569  0.0012  
REC_5 (testset)   0.2616  0.2648  0.2650  0.2648  0.2648  0.2627  0.2636  0.2637  0.2630  0.2631  0.2637  0.0011  
NDCG_5 (testset)  0.8713  0.8708  0.8718  0.8709  0.8711  0.8712  0.8706  0.8708  0.8709  0.8711  0.8710  0.0003  
PREC_10 (testset) 0.4482  0.4521  0.4524  0.4524  0.4514  0.4504  0.4513  0.4503  0.4510  0.4503  0.4510  0.0012  
REC_10 (testset)  0.2959  0.3003  0.3004  0.3003  0.3003  0.297

### SVD++(F=20)

In [20]:
algo = SVDpp(verbose=False, n_epochs=45, lr_all=0.0012, reg_all=0.0012, n_factors=20, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=5)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.7995  0.7993  0.7995  0.8015  0.8003  0.8007  0.8006  0.7998  0.8000  0.8001  0.8001  0.0006  
MAE (testset)     0.6092  0.6091  0.6093  0.6108  0.6100  0.6100  0.6099  0.6098  0.6098  0.6096  0.6098  0.0005  
PREC_5 (testset)  0.4511  0.4541  0.4549  0.4540  0.4540  0.4524  0.4546  0.4548  0.4544  0.4533  0.4538  0.0011  
REC_5 (testset)   0.2606  0.2625  0.2623  0.2630  0.2618  0.2610  0.2614  0.2626  0.2614  0.2611  0.2618  0.0008  
NDCG_5 (testset)  0.8715  0.8709  0.8716  0.8708  0.8707  0.8716  0.8712  0.8711  0.8717  0.8717  0.8713  0.0004  
PREC_10 (testset) 0.4455  0.4482  0.4487  0.4483  0.4479  0.4465  0.4484  0.4492  0.4487  0.4473  0.4479  0.0011  
REC_10 (testset)  0.2949  0.2979  0.2973  0.2981  0.2974  0.296

## NMF(F=40)

In [21]:
algo = NMF(verbose=False, n_epochs=40, reg_pu=0.19, reg_qi=0.08, lr_bu=0.001, lr_bi=0.001, reg_bu=0.001, reg_bi=0.001, n_factors=40, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm NMF on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8653  0.8678  0.8671  0.8682  0.8672  0.8676  0.8680  0.8671  0.8665  0.8681  0.8673  0.0008  
MAE (testset)     0.6693  0.6706  0.6703  0.6714  0.6708  0.6706  0.6709  0.6702  0.6700  0.6707  0.6705  0.0005  
PREC_5 (testset)  0.3345  0.3350  0.3361  0.3342  0.3357  0.3337  0.3347  0.3354  0.3348  0.3353  0.3349  0.0007  
REC_5 (testset)   0.1884  0.1879  0.1894  0.1882  0.1892  0.1875  0.1885  0.1887  0.1885  0.1890  0.1885  0.0006  
NDCG_5 (testset)  0.8387  0.8373  0.8382  0.8375  0.8379  0.8380  0.8376  0.8375  0.8384  0.8381  0.8379  0.0004  
PREC_10 (testset) 0.3319  0.3320  0.3332  0.3313  0.3329  0.3309  0.3318  0.3327  0.3320  0.3324  0.3321  0.0007  
REC_10 (testset)  0.2137  0.2141  0.2150  0.2138  0.2151  0.2131 

## NMF(F=30)

In [22]:
algo = NMF(verbose=False, n_epochs=40, reg_pu=0.19, reg_qi=0.08, lr_bu=0.001, lr_bi=0.001, reg_bu=0.001, reg_bi=0.001, n_factors=30, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm NMF on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8668  0.8681  0.8674  0.8688  0.8676  0.8680  0.8684  0.8676  0.8670  0.8685  0.8678  0.0006  
MAE (testset)     0.6727  0.6731  0.6727  0.6740  0.6732  0.6732  0.6734  0.6728  0.6726  0.6733  0.6731  0.0004  
PREC_5 (testset)  0.3232  0.3247  0.3259  0.3238  0.3252  0.3235  0.3239  0.3249  0.3233  0.3243  0.3243  0.0009  
REC_5 (testset)   0.1780  0.1780  0.1787  0.1778  0.1791  0.1778  0.1782  0.1788  0.1776  0.1791  0.1783  0.0005  
NDCG_5 (testset)  0.8383  0.8371  0.8381  0.8371  0.8381  0.8379  0.8374  0.8371  0.8380  0.8376  0.8377  0.0004  
PREC_10 (testset) 0.3207  0.3221  0.3235  0.3212  0.3225  0.3211  0.3213  0.3225  0.3210  0.3217  0.3218  0.0008  
REC_10 (testset)  0.2014  0.2022  0.2027  0.2016  0.2032  0.2016 

## NMF(F=20)

In [23]:
algo = NMF(verbose=False, n_epochs=40, reg_pu=0.19, reg_qi=0.08, lr_bu=0.001, lr_bi=0.001, reg_bu=0.001, reg_bi=0.001, n_factors=20, random_state=123)

start = time.time()
cross_validate(algo, data, measures=m_list, cv=cv, verbose=True, n_jobs=-1)
print(time.time() - start)

Evaluating RMSE, MAE, PREC_5, REC_5, NDCG_5, PREC_10, REC_10, NDCG_10, PREC_15, REC_15, NDCG_15 of algorithm NMF on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8699  0.8721  0.8716  0.8727  0.8718  0.8721  0.8724  0.8716  0.8711  0.8726  0.8718  0.0008  
MAE (testset)     0.6786  0.6797  0.6796  0.6806  0.6801  0.6799  0.6802  0.6796  0.6795  0.6802  0.6798  0.0005  
PREC_5 (testset)  0.3073  0.3074  0.3079  0.3061  0.3087  0.3051  0.3069  0.3071  0.3066  0.3072  0.3070  0.0009  
REC_5 (testset)   0.1635  0.1628  0.1639  0.1625  0.1637  0.1623  0.1628  0.1632  0.1622  0.1631  0.1630  0.0005  
NDCG_5 (testset)  0.8375  0.8362  0.8377  0.8367  0.8373  0.8369  0.8364  0.8368  0.8373  0.8371  0.8370  0.0005  
PREC_10 (testset) 0.3054  0.3054  0.3059  0.3039  0.3064  0.3031  0.3047  0.3051  0.3047  0.3051  0.3050  0.0009  
REC_10 (testset)  0.1843  0.1843  0.1848  0.1835  0.1851  0.1834 