In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/154.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357243 sha256=99686ae765bc4ddc6fab2444cb83a5982b32ebdef6e5c4dda4d0ba03c109f0f2
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD, SVDpp, NMF
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
import random

In [4]:
# Load the built-in MovieLens-100k ratings. prompt=False lets Surprise download
# the dataset without the interactive [Y/n] question, which would otherwise
# stall a "Restart Kernel -> Run All".
data = Dataset.load_builtin('ml-100k', prompt=False)
raw_ratings = data.raw_ratings

# Mirror the raw rating tuples in a DataFrame for inspection and subsampling.
df = pd.DataFrame(raw_ratings, columns=['user', 'item', 'rating', 'timestamp'])

# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
display(df)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user       100000 non-null  object 
 1   item       100000 non-null  object 
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 3.1+ MB
None


Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
...,...,...,...,...
99995,880,476,3.0,880175444
99996,716,204,5.0,879795543
99997,276,1090,1.0,874795795
99998,13,225,2.0,882399156


In [5]:
# Hyperparameter tuning runs on a fixed 10,000-rating random subsample of the
# ratings frame (random_state pins which rows are drawn).
df_sample = df.sample(n=10000, random_state=42)
reader = Reader(rating_scale=(1, 5))
subset_data = Dataset.load_from_df(
    df_sample.loc[:, ['user', 'item', 'rating']],
    reader,
)

# Search space shared by the SVD and SVD++ grid searches.
param_grid = dict(
    n_epochs=[20, 25, 30],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.05, 0.1],
)

# Search space for NMF, which is tuned over its factor count and the separate
# user/item regularisation terms instead of lr_all/reg_all.
param_grid_nmf = dict(
    n_epochs=[20, 25, 30],
    n_factors=[15, 20, 25],
    reg_pu=[0.02, 0.05, 0.1],
    reg_qi=[0.02, 0.05, 0.1],
)

In [6]:
# Seed both RNGs before any cross-validation: Surprise draws its CV folds and
# the initial factor values from them, so without a seed the "best" parameters
# below change from run to run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Perform the grid search for SVD
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(subset_data)

# Perform the grid search for SVD++
gs_svdpp = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)
gs_svdpp.fit(subset_data)

# Perform the grid search for NMF
gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=3)
gs_nmf.fit(subset_data)

# Report the best configuration and its mean CV RMSE for each algorithm
# (same order and wording as before: SVD, NMF, SVD++).
for label, search in (("SVD", gs), ("NMF", gs_nmf), ("SVD++", gs_svdpp)):
    print(f"Best parameters for {label}:", search.best_params['rmse'])
    print(f"Best RMSE score for {label}:", search.best_score['rmse'])

Best parameters for SVD: {'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}
Best RMSE score for SVD: 1.0134711760655402
Best parameters for NMF: {'n_epochs': 30, 'n_factors': 25, 'reg_pu': 0.1, 'reg_qi': 0.1}
Best RMSE score for NMF: 1.124500795266929
Best parameters for SVD++: {'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.1}
Best RMSE score for SVD++: 1.012378023069051


In [7]:
# SVD with the library's default hyperparameters, evaluated with 10-fold
# cross-validation on the full ml-100k dataset.
mod_svd = SVD()
res = cross_validate(
    algo=mod_svd,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=10,
    verbose=True,
)
# Collapse the per-fold values into one mean per reported quantity.
SVD_res = pd.DataFrame.from_dict(res).mean(axis=0)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.9330  0.9376  0.9320  0.9330  0.9248  0.9296  0.9252  0.9343  0.9330  0.9207  0.9303  0.0049  
MAE (testset)     0.7339  0.7372  0.7311  0.7349  0.7263  0.7315  0.7277  0.7384  0.7376  0.7283  0.7327  0.0042  
Fit time          1.55    1.60    1.58    1.58    2.25    1.66    1.56    1.55    1.55    1.59    1.65    0.20    
Test time         0.08    0.20    0.06    0.06    0.09    0.14    0.06    0.07    0.11    0.17    0.10    0.05    


In [8]:
# SVD++ with the library's default hyperparameters, evaluated with 10-fold
# cross-validation on the full ml-100k dataset.
mod_svdpp = SVDpp()
res = cross_validate(mod_svdpp, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)
# Store the per-fold means under an SVD++-specific name; this cell previously
# assigned to SVD_res, clobbering the SVD summary and mislabelling this one.
SVDpp_res = pd.DataFrame.from_dict(res).mean(axis=0)

Evaluating RMSE, MAE of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.9164  0.9119  0.9138  0.9207  0.9052  0.9217  0.9205  0.9128  0.9147  0.9058  0.9143  0.0055  
MAE (testset)     0.7200  0.7163  0.7168  0.7200  0.7082  0.7247  0.7219  0.7139  0.7178  0.7084  0.7168  0.0051  
Fit time          33.34   33.02   34.31   34.33   34.15   34.41   34.71   34.26   34.19   34.16   34.09   0.49    
Test time         3.46    3.03    2.94    2.83    2.42    2.41    2.45    2.44    2.47    2.46    2.69    0.34    


In [9]:
# NMF with the library's default hyperparameters, evaluated with 10-fold
# cross-validation on the full ml-100k dataset.
mod_nmf = NMF()
res = cross_validate(mod_nmf, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)
# Store the per-fold means under an NMF-specific name; this cell previously
# assigned to SVD_res, clobbering the SVD summary and mislabelling this one.
NMF_res = pd.DataFrame.from_dict(res).mean(axis=0)

Evaluating RMSE, MAE of algorithm NMF on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.9555  0.9555  0.9731  0.9578  0.9488  0.9528  0.9652  0.9673  0.9618  0.9552  0.9593  0.0070  
MAE (testset)     0.7502  0.7515  0.7669  0.7525  0.7459  0.7484  0.7573  0.7602  0.7564  0.7538  0.7543  0.0058  
Fit time          2.25    2.43    2.81    2.21    2.27    2.23    2.90    2.35    2.24    2.18    2.39    0.24    
Test time         0.06    0.25    0.05    0.08    0.05    0.06    0.09    0.07    0.06    0.05    0.08    0.06    


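Найоптимальнішим є метод SVD: він навчається значно швидше, ніж SVD++ (у середньому ≈1.7 с проти ≈34 с на фолд), і є точнішим, ніж NMF (RMSE ≈0.93 проти ≈0.96), який до того ж навчається трохи довше (≈2.4 с на фолд). SVD++ дає найменшу похибку (RMSE ≈0.91), але ціною приблизно у 20 разів більшого часу навчання.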