# Memory Based Model

this calculates nearest users

In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader
from surprise.model_selection import KFold

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson

import pandas as pd

from surprise.model_selection import GridSearchCV

import bgg_data_func

import bgg_model_func

from game_name_converter import NameConverter

In [3]:
file_path = './data_input/games_40_summary.csv'

reader = Reader(line_format='user item rating', sep=',', rating_scale = (1,10))

data = Dataset.load_from_file(file_path, reader=reader)

In [4]:
trainset, testset = train_test_split(data, test_size=0.2)

In [5]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  179856 

Number of items:  40 



In [6]:
trainsetfull = data.build_full_trainset()
print('Number of users: ', trainsetfull.n_users, '\n')
print('Number of items: ', trainsetfull.n_items, '\n')

Number of users:  192606 

Number of items:  40 



In [7]:
trainset_iids = list(trainset.all_items())
iid_converter = lambda x: trainset.to_raw_iid(x)
trainset_raw_iids = list(map(iid_converter, trainset_iids))

In [8]:
name_converter = NameConverter('games_master_list.csv')
name_converter.get_game_name_from_id(84876)

'The Castles of Burgundy'

## Similarity

In [11]:
sim_cos = {'name':'cosine', 'user_based':False}
basic_cos = knns.KNNBasic(sim_options=sim_cos)
basic_cos.fit(trainset)
predictions = basic_cos.test(testset)
print(accuracy.rmse(predictions))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3831
1.383147354364512


In [32]:
df = bgg_model_func.return_top_similar_dataframe(basic_cos.sim, trainset_raw_iids, 3)
for column in df.columns:
    df[column] = df[column].map(name_converter.get_game_name_from_id)
df.sort_values(['game'], inplace = True, axis = 0)
df

Unnamed: 0,game,similar_1,similar_2,similar_3
2,7 Wonders Duel,Orléans,The Castles of Burgundy,Viticulture Essential Edition
24,A Feast for Odin,Caverna: The Cave Farmers,Orléans,Great Western Trail
7,Agricola,Caverna: The Cave Farmers,Puerto Rico,Tzolk'in: The Mayan Calendar
29,Arkham Horror: The Card Game,Mansions of Madness: Second Edition,Star Wars: Imperial Assault,Star Wars: Rebellion
30,Blood Rage,7 Wonders Duel,Scythe,Viticulture Essential Edition
6,Brass: Birmingham,Brass: Lancashire,Great Western Trail,Concordia
3,Brass: Lancashire,Brass: Birmingham,Concordia,Great Western Trail
13,Caverna: The Cave Farmers,A Feast for Odin,Orléans,Great Western Trail
31,Concordia,Orléans,Great Western Trail,The Castles of Burgundy
28,Food Chain Magnate,Brass: Birmingham,Brass: Lancashire,Great Western Trail


In [21]:
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(trainset)
predictions = basic_pearson.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.3672
1.3672169234920741


In [33]:
df = bgg_model_func.return_top_similar_dataframe(basic_pearson.sim, trainset_raw_iids, 3)
for column in df.columns:
    df[column] = df[column].map(name_converter.get_game_name_from_id)
df.sort_values(['game'], inplace = True)
df

Unnamed: 0,game,similar_1,similar_2,similar_3
2,7 Wonders Duel,Caverna: The Cave Farmers,Orléans,Viticulture Essential Edition
24,A Feast for Odin,Caverna: The Cave Farmers,Orléans,Great Western Trail
7,Agricola,Caverna: The Cave Farmers,Puerto Rico,Terra Mystica
29,Arkham Horror: The Card Game,Mansions of Madness: Second Edition,The 7th Continent,Star Wars: Imperial Assault
30,Blood Rage,Scythe,Star Wars: Imperial Assault,Star Wars: Rebellion
6,Brass: Birmingham,Brass: Lancashire,Great Western Trail,Food Chain Magnate
3,Brass: Lancashire,Brass: Birmingham,Food Chain Magnate,Through the Ages: A Story of Civilization
13,Caverna: The Cave Farmers,A Feast for Odin,Agricola,Terra Mystica
31,Concordia,Orléans,Great Western Trail,The Castles of Burgundy
28,Food Chain Magnate,Kingdom Death: Monster,Brass: Lancashire,Brass: Birmingham


In [23]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(trainset)
predictions = knn_means.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.3468
1.3467983608876295


In [25]:
# sim: same as above, just calculation is different

In [96]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(trainset)
predictions = knn_baseline.test(testset)
print(accuracy.rmse(predictions))

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.3549
1.3548980401686634


In [None]:
knn_baseline.predict()

# Model Based

this uses maxtor factorization

In [97]:
# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 1.3697


1.3697325105207512

In [75]:
# param_grid = {'n_factors':[5, 10,20],'n_epochs': [5, 10, 20], 'lr_all': [0.002, 0.005],
#               'reg_all': [0.4, 0.6]}

param_grid = {'n_factors':[5, 10]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.9min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.9min finished


In [77]:
gs_model.best_params

{'rmse': {'n_factors': 10}, 'mae': {'n_factors': 10}}

In [78]:
svd = SVD(n_factors=10)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 3.5453
3.5453165741430865


In [79]:
param_grid = {'n_factors':[5, 10,20],'n_epochs': [5, 10, 20], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

gs_model_2 = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model_2.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 39.5min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 46.9min finished


In [80]:
gs_model_2.best_params

{'rmse': {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4},
 'mae': {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}}

In [81]:
svd2 = SVD(n_factors=5, n_epochs=20, lr_all=0.005, reg_all=0.4)
svd2.fit(trainset)
predictions = svd2.test(testset)
print(accuracy.rmse(predictions))

RMSE: 3.5450
3.545013880031576


to insert:

gridsearch results can be analysed this way:
results_df = pd.DataFrame.from_dict(gs.cv_results)

bsl_options when creating an algorithm, can be sgd or als, if we do that, can define other variables too



# Examining Data

In [83]:
df = pd.read_csv('./data_input/games_31_summary.csv', header = None)

In [85]:
df.head()

Unnamed: 0,0,1,2
0,DRCrain,174430,7.0
1,Huligan1475,174430,7.0
2,Cama891,174430,9.0
3,ryosaeba83,174430,9.0
4,Creation,174430,10.0


In [87]:
df[2].describe()

count    866517.000000
mean          8.237334
std           1.440595
min           1.000000
25%           7.500000
50%           8.300000
75%           9.000000
max          10.000000
Name: 2, dtype: float64