In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader
from surprise.model_selection import KFold

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson

import pandas as pd

from surprise.model_selection import GridSearchCV

import bgg_data_func

import bgg_model_func

from game_name_converter import NameConverter

In [3]:
# Ratings file parsed as comma-separated "user item rating" records.
file_path = './data_input/games_100_summary.csv'

# Ratings are declared to lie on a 1-10 scale.
reader = Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(1, 10),
)

data = Dataset.load_from_file(file_path, reader=reader)

In [4]:
# Hold out 20% of the ratings for testing. The split is seeded so that the
# same test set is produced after every kernel restart; without a seed the
# RMSE values reported for the different models below are not reproducible.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
# Size of the training split; each line is followed by a blank line.
print('Number of users: ', trainset.n_users, end=' \n\n')
print('Number of items: ', trainset.n_items, end=' \n\n')

Number of users:  225960 

Number of items:  100 



In [6]:
# Trainset built from every rating in `data` (no hold-out), for comparison
# with the sizes of the 80% training split.
trainsetfull = data.build_full_trainset()

print('Number of users: ', trainsetfull.n_users, end=' \n\n')
print('Number of items: ', trainsetfull.n_items, end=' \n\n')

Number of users:  237253 

Number of items:  100 



In [7]:
def iid_converter(x):
    """Translate one of `trainset`'s inner item ids into its raw item id."""
    return trainset.to_raw_iid(x)


# Inner item ids of the training split, and the matching raw ids in the same order.
trainset_iids = list(trainset.all_items())
trainset_raw_iids = [iid_converter(inner_iid) for inner_iid in trainset_iids]

In [8]:
# NameConverter is a project-local helper (game_name_converter module).
# NOTE(review): presumably it loads an id -> name table from the CSV - confirm.
name_converter = NameConverter('games_master_list.csv')
# Spot check: look up the name for a single game id.
name_converter.get_game_name_from_id(84876)

'The Castles of Burgundy'

# Model Based

This section uses matrix factorization (SVD).

In [16]:
# Baseline model: SVD with the library's default hyperparameters.
algo = SVD()

# Fit on the training split, then predict every rating held out in the test split.
predictions = algo.fit(trainset).test(testset)

# Root-mean-squared error of those predictions (also the cell's displayed value).
accuracy.rmse(predictions)

RMSE: 1.3358


1.3357673434348671

In [17]:
# First, coarse search: 3 x 3 x 2 x 2 = 36 hyperparameter combinations.
param_grid_2 = {
    'n_factors': [5, 10, 20],
    'n_epochs': [5, 10, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# n_jobs=-1 runs the fits in parallel; joblib_verbose=5 logs progress.
gs_model_2 = GridSearchCV(
    SVD,
    param_grid=param_grid_2,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_2.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 46.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 110.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 130.7min finished


In [20]:
# Best hyperparameter combination from the first search, keyed by accuracy measure.
gs_model_2.best_params

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.6},
 'mae': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.6}}

In [22]:
# Refit SVD with the best combination reported by the first grid search and
# evaluate it on the held-out test split.
svd2 = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.6)
svd2.fit(trainset)
predictions = svd2.test(testset)

# accuracy.rmse already prints "RMSE: ..." and returns the value, so it is left
# as the cell's last expression instead of being wrapped in print(), which
# reported the same number a second time.
accuracy.rmse(predictions)

RMSE: 1.3232
1.3232164478061834


In [23]:
# Second, finer search around the previous optimum: n_factors and n_epochs are
# fixed at 20 while lr_all and reg_all are varied (3 x 3 = 9 combinations).
param_grid_3 = {
    'n_factors': [20],
    'n_epochs': [20],
    'lr_all': [0.004, 0.005, 0.006],
    'reg_all': [0.5, 0.6, 0.7],
}

gs_model_3 = GridSearchCV(
    SVD,
    param_grid=param_grid_3,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_3.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 35.1min finished


In [25]:
# Best hyperparameter combination from the second search, keyed by accuracy measure.
gs_model_3.best_params

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.7},
 'mae': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.5}}

In [None]:
# Third search: extends lr_all and reg_all beyond the upper edge of the previous
# grid and also tries 30 factors (2 x 1 x 3 x 3 = 18 combinations).
param_grid_4 = {
    'n_factors': [20, 30],
    'n_epochs': [20],
    'lr_all': [0.006, 0.01, 0.015],
    'reg_all': [0.7, 1, 1.5],
}

gs_model_4 = GridSearchCV(
    SVD,
    param_grid=param_grid_4,
    n_jobs=-1,
    joblib_verbose=5,
)
gs_model_4.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 49.9min


to insert:

Grid-search results can be analysed by loading them into a DataFrame:
`results_df = pd.DataFrame.from_dict(gs.cv_results)`

`bsl_options` can be passed when creating an algorithm; the method can be `sgd` or `als`, and once a method is chosen its other variables can be defined too.



# Examining Data

In [83]:
# The file has no header row, so columns are labelled 0, 1, 2.
# NOTE(review): this reads games_31_summary.csv, whereas the models above are
# trained on games_100_summary.csv - confirm that is intended.
df = pd.read_csv('./data_input/games_31_summary.csv', header=None)

In [85]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,0,1,2
0,DRCrain,174430,7.0
1,Huligan1475,174430,7.0
2,Cama891,174430,9.0
3,ryosaeba83,174430,9.0
4,Creation,174430,10.0


In [87]:
# Summary statistics for column 2.
# NOTE(review): column 2 appears to hold the rating values - confirm.
df[2].describe()

count    866517.000000
mean          8.237334
std           1.440595
min           1.000000
25%           7.500000
50%           8.300000
75%           9.000000
max          10.000000
Name: 2, dtype: float64