# Summary

In this notebook, we take a look at model-based recommender systems, using the Surprise library's `SVD` and `SVDpp` classes. For completeness, `SlopeOne` and `CoClustering` are also evaluated at the end.

# Infrastructure

In [1]:
# standard library
import csv
import os
import random

# third-party
import numpy as np
import pandas as pd

# surprise library
from surprise import Dataset, Reader

from surprise import SVD, SVDpp
from surprise.model_selection import \
    train_test_split, GridSearchCV, cross_validate

from surprise import accuracy
from surprise.model_selection import KFold

from surprise import SlopeOne, CoClustering

from surprise import dump

# my functions for this project
import bgg_data_func
import bgg_model_func
from game_name_converter import NameConverter

In [2]:
# Seed every random number generator Surprise may draw from, so that data
# splits and model initialisations are repeatable from a fresh kernel.
# The Surprise FAQ recommends seeding both Python's `random` module and
# NumPy's global generator.
my_seed = 12345
random.seed(my_seed)
np.random.seed(my_seed)

In [3]:
# Ratings file: one comma-separated "user,item,rating" record per line.
file_path = './data_input/games_100_summary.csv'

# Describe the file layout and the 1-10 rating scale to Surprise.
reader = Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(1, 10),
)

# Parse the ratings file into a Surprise dataset using that reader.
data = Dataset.load_from_file(file_path, reader=reader)

In [4]:
# Hold out 20% of the ratings for evaluation. Passing `random_state` pins
# the split to `my_seed` directly, so it no longer depends on how much of
# NumPy's global random stream earlier cells have consumed (i.e. on the
# order in which the cells happened to be executed).
trainset, testset = train_test_split(data, test_size=0.2, random_state=my_seed)
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  225784 

Number of items:  100 



In [5]:
# Inner item ids of the training split.
trainset_iids = [inner_iid for inner_iid in trainset.all_items()]


def iid_converter(x):
    """Return the raw item id that `trainset` associates with inner id `x`."""
    return trainset.to_raw_iid(x)


# Raw item ids, in the same order as `trainset_iids`.
trainset_raw_iids = [iid_converter(inner_iid) for inner_iid in trainset_iids]

In [6]:
# Build a trainset from every rating in `data` (no hold-out) and report
# how many users and items it contains.
trainsetfull = data.build_full_trainset()
print('Number of users: ', trainsetfull.n_users, '\n')
print('Number of items: ', trainsetfull.n_items, '\n')

Number of users:  237253 

Number of items:  100 



In [7]:
# Inner item ids of the full trainset.
trainsetfull_iids = [inner_iid for inner_iid in trainsetfull.all_items()]


def iid_converter(x):
    """Return the raw item id that `trainsetfull` associates with inner id `x`."""
    return trainsetfull.to_raw_iid(x)


# Raw item ids, in the same order as `trainsetfull_iids`.
trainsetfull_raw_iids = [iid_converter(inner_iid) for inner_iid in trainsetfull_iids]

In [8]:
# Build the project's NameConverter from the games master list CSV.
# NOTE(review): presumably translates between game ids and game names —
# confirm against game_name_converter.
name_converter = NameConverter('games_master_list.csv')

# SVD

In [17]:
# Coarse search over the number of latent factors, epochs, learning rate and
# regularisation strength: 3 * 3 * 2 * 2 = 36 combinations.
param_grid_2 = {
    'n_factors': [5, 10, 20],
    'n_epochs': [5, 10, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# n_jobs=-1 runs the fits in parallel workers; joblib_verbose=5 prints
# progress messages while the search is running.
gs_model_2 = GridSearchCV(
    SVD,
    param_grid=param_grid_2,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_2.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 46.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 110.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 130.7min finished


In [20]:
# Best parameter combination found by the search, keyed by accuracy measure.
gs_model_2.best_params

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.6},
 'mae': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.6}}

In [22]:
# Refit the best configuration reported by `gs_model_2` on the training
# split and score it on the held-out ratings.
# NOTE: the grid search cross-validated on all of `data`, which includes
# `testset`, so this RMSE is not a fully independent estimate.
svd2 = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.6)
svd2.fit(trainset)
predictions = svd2.test(testset)
# `accuracy.rmse` prints the rounded score itself and returns the exact
# value; leaving the call as the cell's last expression displays that value
# without an extra print(), consistent with the other evaluation cells.
accuracy.rmse(predictions)

RMSE: 1.3232
1.3232164478061834


In [23]:
# Refine lr_all and reg_all around the previous best values, with n_factors
# and n_epochs fixed at 20: 3 * 3 = 9 combinations.
param_grid_3 = {
    'n_factors': [20],
    'n_epochs': [20],
    'lr_all': [0.004, 0.005, 0.006],
    'reg_all': [0.5, 0.6, 0.7],
}

gs_model_3 = GridSearchCV(
    SVD,
    param_grid=param_grid_3,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_3.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 35.1min finished


In [25]:
# Best parameter combination found by the search, keyed by accuracy measure.
gs_model_3.best_params

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.7},
 'mae': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.5}}

In [8]:
# Extend lr_all and reg_all upward from the previous RMSE-best values
# (0.006 and 0.7) and also try 30 factors: 2 * 3 * 3 = 18 combinations.
param_grid_4 = {
    'n_factors': [20, 30],
    'n_epochs': [20],
    'lr_all': [0.006, 0.01, 0.015],
    'reg_all': [0.7, 1, 1.5],
}

gs_model_4 = GridSearchCV(
    SVD,
    param_grid=param_grid_4,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_4.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 50.4min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 71.3min finished


In [9]:
# Best parameter combination found by the search, keyed by accuracy measure.
gs_model_4.best_params

{'rmse': {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.7},
 'mae': {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.7}}

In [11]:
# Best score reached by the search for each accuracy measure.
gs_model_4.best_score

{'rmse': 1.3208204848108498, 'mae': 0.9727448775549237}

In [10]:
# One more grid search, this time varying only n_factors.

In [13]:
# Vary only the number of latent factors; the other parameters stay at the
# RMSE-best values from the previous search (3 combinations).
param_grid_5 = {
    'n_factors': [30, 40, 50],
    'n_epochs': [20],
    'lr_all': [0.006],
    'reg_all': [0.7],
}

gs_model_5 = GridSearchCV(
    SVD,
    param_grid=param_grid_5,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_5.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 11.4min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 13.8min finished


In [14]:
# Best parameter combination found by the search, keyed by accuracy measure.
gs_model_5.best_params

{'rmse': {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.7},
 'mae': {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.7}}

In [15]:
# Best score reached by the search for each accuracy measure.
gs_model_5.best_score

{'rmse': 1.3194819718027482, 'mae': 0.9704103337407952}

There is not much improvement, so I am going to stop here and train the chosen SVD model with these parameters. 

In [19]:
# Final SVD configuration: the RMSE-best parameters reported by `gs_model_5`.
chosen_SVD = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.006,
    reg_all=0.7,
)
# fit() returns the fitted algorithm, so training and scoring can be chained.
predictions = chosen_SVD.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 1.3214


1.3214368638381457

# SVD++

In [10]:
# Baseline SVD++ run with the library's default hyper-parameters.
chosen_SVDpp = SVDpp()
# fit() returns the fitted algorithm, so training and scoring can be chained.
predictions = chosen_SVDpp.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 1.3428


1.3427719518498817

In [None]:
# Small SVD++ search around the values that worked well for SVD:
# 2 * 2 * 2 = 8 combinations.
param_grid_6 = {
    'n_factors': [20, 30],
    'n_epochs': [20],
    'lr_all': [0.006, 0.01],
    'reg_all': [0.7, 1],
}

gs_model_6 = GridSearchCV(
    SVDpp,
    param_grid=param_grid_6,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_6.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 73.4min


# Other Models

For the sake of completeness, I also tested two remaining algorithms from the surprise library: `SlopeOne` and `CoClustering`. 

## SlopeOne

In [17]:
# Cross-validate SlopeOne on the whole dataset, recording RMSE on both the
# training and the test part of each fold.
algo = SlopeOne()
results = cross_validate(
    algo,
    data,
    measures=['RMSE'],
    return_train_measures=True,
)
results

In [18]:
# Show the per-fold cross-validation scores and timings for SlopeOne.
results

{'test_rmse': array([1.27893209, 1.27806236, 1.27959544, 1.27605057, 1.27443387]),
 'train_rmse': array([1.12999103, 1.13032548, 1.12965681, 1.13074599, 1.13101295]),
 'fit_time': (4.924248218536377,
  5.99888801574707,
  7.023576021194458,
  7.737403869628906,
  7.064083099365234),
 'test_time': (20.624468088150024,
  15.740535020828247,
  21.612120866775513,
  29.29262113571167,
  15.716026067733765)}

In [19]:
# Train SlopeOne on the training split and score it on the held-out ratings.
slopeone = SlopeOne()
# fit() returns the fitted algorithm, so training and scoring can be chained.
predictions = slopeone.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 1.2803


1.2803008950789092

In [26]:
# Persist the fitted SlopeOne model so it can be reloaded without retraining.
# `dump.dump` writes straight to the given path and does not create missing
# directories, so make sure the target folder exists first.
os.makedirs('./models', exist_ok=True)
dump.dump('./models/slopeone', algo=slopeone)

In [31]:
# Reload the model saved under './models/slopeone'. `dump.load` returns a
# (predictions, algo) pair, so index 1 is the algorithm.
# NOTE(review): loading unpickles the file — only load dumps from a trusted source.
testalgo = dump.load('./models/slopeone')[1]

## CoClustering

In [14]:
# Co-clustering with 3 user clusters and 3 item clusters.
algo = CoClustering(
    n_cltr_u=3,
    n_cltr_i=3,
)
# fit() returns the fitted algorithm, so training and scoring can be chained.
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 1.3243


1.32430564102699

In [15]:
# Co-clustering with 5 user clusters and 5 item clusters.
algo = CoClustering(
    n_cltr_u=5,
    n_cltr_i=5,
)
# fit() returns the fitted algorithm, so training and scoring can be chained.
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 1.3372


1.3371894880624036