In [1]:
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import root_mean_squared_error
import os, io
import torch

%matplotlib inline

import movie_sim

# Plot defaults for every figure in the notebook.
plt.rcParams.update({'figure.figsize': (12.0, 8.0), 'font.size': 14})

# Seed the global RNGs once so that Restart & Run All reproduces the RMSE
# tables below. Per the Surprise FAQ on reproducible experiments, seeding
# `random` and `numpy.random` is what makes fold shuffling and factor
# initialisation deterministic when no explicit random_state is passed.
# NOTE(review): movie_sim's sampling (and torch, if it is used there) may draw
# from other generators -- confirm and seed them here too if so.
import random

SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Ratings table: used below both as the full baseline dataset and as the pool
# that movie_sim.sample_new_data draws from.
# NOTE(review): it is handed straight to Dataset.load_from_df, so its columns
# are presumably (user, item, rating) in that order -- TODO confirm.
sample = pd.read_csv('train_umr.csv')

In [3]:
# Movie table passed to movie_sim.sample_new_data further down.
# NOTE(review): schema is not visible here; the file name suggests encoded
# movie features plus a 'Name' column (dropped in the next cell) -- TODO confirm.
movies = pd.read_csv('movies_encoded.csv')

In [4]:
# Remove the 'Name' column before the table is handed to movie_sim.
# Rebinding instead of mutating with inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
movies = movies.drop(columns=['Name'], errors='ignore')

In [5]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV

from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import NMF
from surprise import accuracy

In [6]:
# Shared Reader declaring that ratings lie on a 1-5 scale; it is reused for
# every Dataset.load_from_df call in this notebook.
reader = Reader(rating_scale=(1,5))  # construct the Surprise Reader instance

In [7]:
# Baseline experiment: the full ratings table wrapped as a Surprise Dataset.
baseline = Dataset.load_from_df(df=sample, reader=reader)
# `data` is bound to the very same Dataset object, as in the original chained
# assignment. NOTE(review): `data` is rebound to the sampled dataset in cell
# 10; if that cell is skipped, later cells silently evaluate on the full table.
data = baseline

# SVD configuration evaluated on the baseline dataset.
algo = SVD(n_factors=32, n_epochs=20, reg_all=0.1)

In [8]:
# 5-fold cross-validated RMSE of the baseline SVD on the full ratings table.
# verbose=True prints the per-fold table; the returned dict is echoed as the
# cell result.
cross_validate(
    algo=algo,
    data=baseline,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9038  0.9021  0.9024  0.9021  0.9031  0.9027  0.0007  
Fit time          83.10   80.27   73.19   73.59   85.17   79.06   4.89    
Test time         22.06   18.75   17.63   20.75   19.67   19.77   1.54    


{'test_rmse': array([0.90383813, 0.90205166, 0.90238476, 0.90209612, 0.90311142]),
 'fit_time': (83.0952684879303,
  80.2717833518982,
  73.19341015815735,
  73.59276795387268,
  85.16736674308777),
 'test_time': (22.06394362449646,
  18.75208067893982,
  17.62871241569519,
  20.747841596603394,
  19.66888403892517)}

In [9]:
# Build a reduced ratings set with the project-local movie_sim helper, from the
# ratings table and the movie table.
# NOTE(review): the third argument (712664) is presumably the target user id --
# the helper's log lines talk about "the user" -- TODO confirm in movie_sim.
new_data=movie_sim.sample_new_data(sample,movies,712664)

317  retrieved movies for the user
finding similar movies...
304  similar movies found
retrieving new data
0  movies out of 304 done
current new data entries: 0
100  movies out of 304 done
current new data entries: 340447
200  movies out of 304 done
current new data entries: 474978
300  movies out of 304 done
current new data entries: 474978


In [10]:
# Wrap the sampled ratings in a Surprise Dataset. This rebinds `data`, which
# until now pointed at the full baseline dataset.
data = Dataset.load_from_df(df=new_data, reader=reader)

In [42]:
# SVD on the sampled dataset. n_factors=22 is the value the SVD grid search
# over n_factors 16-24 (which appears further down in this notebook) reports
# as best for RMSE.
algo = SVD(n_factors=22, n_epochs=20, reg_all=0.1)
cross_validate(
    algo=algo,
    data=data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9413  0.9426  0.9401  0.9419  0.9395  0.9411  0.0012  
Fit time          3.13    3.13    3.43    3.17    3.51    3.27    0.17    
Test time         1.10    0.67    1.02    1.11    0.75    0.93    0.18    


{'test_rmse': array([0.9413179 , 0.9426125 , 0.94008239, 0.94194453, 0.93946011]),
 'fit_time': (3.126126527786255,
  3.1256372928619385,
  3.43276309967041,
  3.1681153774261475,
  3.5126681327819824),
 'test_time': (1.0958611965179443,
  0.6731874942779541,
  1.0245027542114258,
  1.1063010692596436,
  0.7513422966003418)}

In [12]:
# Item-based KNN: cosine similarity, with user_based=False so similarities are
# computed between items rather than between users.
sim_options = dict(name='cosine', user_based=False)
algo = KNNBasic(sim_options=sim_options)

# 5-fold cross-validation on the sampled dataset; prints the per-fold table.
cross_validate(
    algo=algo,
    data=data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1181  1.1132  1.1100  1.1100  1.1148  1.1132  0.0031  
Fit time          0.22    0.32    0.33    0.31    0.32    0.30    0.04    
Test time         1.48    1.44    1.44    1.44    1.45    1.45    0.02    


{'test_rmse': array([1.11814598, 1.11322889, 1.1099607 , 1.11000707, 1.11484954]),
 'fit_time': (0.2214035987854004,
  0.32292747497558594,
  0.3274238109588623,
  0.31276512145996094,
  0.3215632438659668),
 'test_time': (1.4840407371520996,
  1.4387507438659668,
  1.4364845752716064,
  1.4443776607513428,
  1.450974464416504)}

In [13]:
# NMF with the library's default hyper-parameters, as a further comparison
# point on the sampled dataset.
algo = NMF()

# 5-fold cross-validation; prints the per-fold table.
cross_validate(
    algo=algo,
    data=data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1190  1.1045  1.1144  1.1181  1.1143  1.1140  0.0051  
Fit time          14.23   14.21   14.74   14.84   14.93   14.59   0.31    
Test time         1.00    0.98    1.06    0.99    0.75    0.96    0.11    


{'test_rmse': array([1.11896033, 1.1044911 , 1.11437511, 1.11807629, 1.11431621]),
 'fit_time': (14.22623324394226,
  14.21005654335022,
  14.740297079086304,
  14.835734128952026,
  14.930388927459717),
 'test_time': (0.996485710144043,
  0.9840614795684814,
  1.058833122253418,
  0.9902048110961914,
  0.7482039928436279)}

In [38]:
# Fine-grained SVD search: only n_factors varies (16 through 24 inclusive);
# regularisation and epoch count are held fixed.
param_grid = {
    'n_factors': list(range(16, 25)),
    'reg_all': [0.1],
    'n_epochs': [20],
}

In [39]:
# Grid search over the SVD grid defined above, scored by 5-fold RMSE.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=5,
)

In [40]:
# Run the search on whatever `data` is currently bound to -- in top-to-bottom
# order that is the sampled dataset built from new_data.
gs.fit(data)

In [41]:
# Show the parameter combination the search recorded as best for RMSE.
print(gs.best_params['rmse'])

{'n_factors': 22, 'reg_all': 0.1, 'n_epochs': 20}


In [18]:
# Coarse search grid (used with SVDpp in the next cell): a wide sweep over
# n_factors with regularisation and epoch count held fixed.
param_grid = dict(
    n_factors=[20, 30, 35, 40, 50],
    reg_all=[0.1],
    n_epochs=[20],
)

In [20]:
# Grid search for SVDpp over the coarse grid above, scored by 5-fold RMSE.
# This rebinds `gs`, replacing the earlier SVD search object.
gs = GridSearchCV(
    algo_class=SVDpp,
    param_grid=param_grid,
    measures=['rmse'],
    cv=5,
)

In [21]:
# Run the coarse SVDpp search on the dataset currently bound to `data`.
gs.fit(data)

In [22]:
# Show the best-RMSE parameter combination from the coarse SVDpp search.
print(gs.best_params['rmse'])

{'n_factors': 30, 'reg_all': 0.1, 'n_epochs': 20}


In [27]:
# Refined SVDpp grid: n_factors 32 through 38 inclusive, other settings fixed.
param_grid = {
    'n_factors': list(range(32, 39)),
    'reg_all': [0.1],
    'n_epochs': [20],
}

In [28]:
# Grid search for SVDpp over the refined grid above, scored by 5-fold RMSE.
gs = GridSearchCV(
    algo_class=SVDpp,
    param_grid=param_grid,
    measures=['rmse'],
    cv=5,
)

In [29]:
# Run the refined SVDpp search on the dataset currently bound to `data`.
gs.fit(data)

In [30]:
# Show the best-RMSE parameter combination from the refined SVDpp search.
print(gs.best_params['rmse'])

{'n_factors': 32, 'reg_all': 0.1, 'n_epochs': 20}


In [31]:
# SVDpp with the n_factors value reported by the refined grid search (32-38).
# NOTE(review): 32 is the lower edge of that grid, and the coarse search
# preferred 30; 31 was never tried, so the optimum may lie just below 32.
algo = SVDpp(n_factors=32, n_epochs=20, reg_all=0.1)

# 5-fold cross-validation on the sampled dataset; prints the per-fold table.
cross_validate(
    algo=algo,
    data=data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9357  0.9420  0.9385  0.9388  0.9347  0.9379  0.0026  
Fit time          9.11    8.95    9.04    8.83    8.88    8.96    0.10    
Test time         2.47    2.00    1.99    2.28    2.44    2.23    0.21    


{'test_rmse': array([0.93571759, 0.94199013, 0.93847123, 0.93875542, 0.93467206]),
 'fit_time': (9.110584259033203,
  8.949052572250366,
  9.038138151168823,
  8.828859090805054,
  8.88344120979309),
 'test_time': (2.465672492980957,
  1.998081922531128,
  1.9874858856201172,
  2.279963970184326,
  2.440664291381836)}

In [32]:
# Second sampled ratings set, built the same way as new_data but with a
# different third argument.
# NOTE(review): 6 is presumably another user id -- TODO confirm in movie_sim.
new_data2=movie_sim.sample_new_data(sample,movies,6)

119  retrieved movies for the user
finding similar movies...
115  similar movies found
retrieving new data
0  movies out of 115 done
current new data entries: 0
100  movies out of 115 done
current new data entries: 223699


In [33]:
# Evaluate SVD on the second sampled dataset. The hyper-parameters are the
# same as the baseline SVD (n_factors=32); no grid search was run for this
# sample.
data2 = Dataset.load_from_df(df=new_data2, reader=reader)
algo = SVD(n_factors=32, n_epochs=20, reg_all=0.1)
cross_validate(
    algo=algo,
    data=data2,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9762  0.9740  0.9745  0.9786  0.9721  0.9751  0.0022  
Fit time          1.56    1.54    1.58    1.59    1.57    1.57    0.02    
Test time         0.61    0.33    0.28    0.65    0.65    0.50    0.16    


{'test_rmse': array([0.97623876, 0.97398699, 0.97448653, 0.978643  , 0.97206172]),
 'fit_time': (1.56135892868042,
  1.535557746887207,
  1.5756502151489258,
  1.5887093544006348,
  1.5720744132995605),
 'test_time': (0.6052358150482178,
  0.3299703598022461,
  0.2841343879699707,
  0.6450252532958984,
  0.6453144550323486)}