# Collaborative based filtering

Collaborative filtering based models finds similaries between items or users through ratings or items that other users have liked as well.

### Importing necessary packages


In [1]:
# installing the library suprise
!pip install scikit-surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 2.5MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1675388 sha256=572691bc77b3a6611bc30271b9db9ada2be412ea501ecb4469575a38a9169e8d
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.0


In [2]:
# importing necessary packages
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler 
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import knns
from surprise import accuracy
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

from surprise import Reader, Dataset
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
nltk.download('punkt')
import re
import string
import os

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

  import pandas.util.testing as tm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Importing and checking through the data sets

In [3]:
# unziping the folder containing the steam_rs.csv data
!unzip steam_rs.zip

Archive:  steam_rs.zip
  inflating: steam_rs.csv            


In [0]:
# importing the data set
steam = pd.read_csv('steam_rs.csv')

In [0]:
# removing release_date column
steam = steam.drop('release_date', axis = 1)

In [6]:
# displaying the data frame
steam.head()

Unnamed: 0,id,appid,name,purchase,hours_of_play,developer,publisher,positive,negative,english,platforms,required_age,categories,genres,steamspy_tags,achievements,average_playtime,median_playtime,owners,detailed_description,about_the_game,short_description,price,rank
0,151603712,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
1,151603712,570,Dota 2,play,0.5,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
2,187131847,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
3,187131847,570,Dota 2,play,2.3,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
4,176410694,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113


In [0]:
# modifying the columns in the data frame
steam = steam[['id', 'appid', 'name', 'rank', 'genres', 'steamspy_tags', 'short_description', 'hours_of_play']]

Since Steam does not use ratings for games, i will base the recommendations on the hours of play for users.

In [0]:
# alterting the data frame further to show the id, game name and its hours of play
steam = steam[['id', 'name', 'hours_of_play']]

In [9]:
# displaying shape of data frame
steam.shape

(99632, 3)

In [0]:
# dropping the duplicate ids, and keeping the most recent ones as making sure it dosent remove all the unique games in the data frame
steam.drop_duplicates(subset = ['id', 'name'],
                     keep = 'last', inplace = True)

In [11]:
# displaying shape of data frame
steam.shape

(60446, 3)

In [0]:
# transforming the current data set into something that is compatible with surpirse 
reader = Reader()
steam = Dataset.load_from_df(steam,reader)

### Train/ test split

The train and test data sets will contain randomly selected user ratings and items instead of the entire list of users and items. 80% of these ratings reside in the training set and the remaining 20% is in the test set.

In [0]:
# preforming train/test split
trainset, testset = train_test_split(steam, test_size=0.2)

In [14]:
# checking the number of items and users in the data set
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  9167 

Number of items:  2175 



From this it can be seen that there is a fewer number of items rather than the number of users.

# Memory Based

For these memory based models I will be using KNNBasic which is a basic collaborative filtering algorithm. In addition to that i will also be using KNNBaseline which is a basic collaborative filtering algorithm taking into account a baseline rating. KNNWithMeans is basic collaborative filtering algorithm, taking into account the mean ratings of each user.

## Cosine similarity

## KNNBasic with cosine similarity (USER BASED)

In [0]:
#   cosine similarity
sim_cos = {'name':'cosine', 'user_based':True}

In [0]:
#   training the model with user_based = True
basic_user = knns.KNNBasic(sim_options=sim_cos)

In [46]:
#   fitting the model
simcos_cv_user = cross_validate(basic_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    228.9718223.7241213.3561222.01736.4883  
MAE (testset)     35.3183 35.2097 34.1601 34.8961 0.5223  
RMSE (trainset)   218.6245221.3231226.3897222.11243.2189  
MAE (trainset)    34.3881 34.4445 34.9625 34.5984 0.2585  
Fit time          25.84   27.36   14.91   22.70   5.54    
Test time         6.46    6.73    3.41    5.53    1.50    


In [47]:
for i in simcos_cv_user.items():
    print(i)
print('-----------------')
print(np.mean(simcos_cv_user['test_rmse']))

('test_rmse', array([228.97183166, 223.7240792 , 213.35606093]))
('train_rmse', array([218.62447138, 221.32310377, 226.38968214]))
('test_mae', array([35.31827624, 35.20974549, 34.16013318]))
('train_mae', array([34.3881082 , 34.44449754, 34.96253912]))
('fit_time', (25.84223961830139, 27.357378005981445, 14.913233757019043))
('test_time', (6.455743312835693, 6.730473279953003, 3.410547971725464))
-----------------
222.0173239291706


From the results this model appears overfit.

## KNNBasic with cosine similarity (ITEM BASED)


In [0]:
#   cosine similarity
sim_cos = {'name':'cosine', 'user_based':False}

In [0]:
#   training the model with user_based = True
basic_item = knns.KNNBasic(sim_options=sim_cos)

In [20]:
#   fitting the model
simcos_cv_item = cross_validate(basic_item, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    233.1314207.1824225.5939221.969210.8993 
MAE (testset)     34.9303 34.7733 36.0902 35.2646 0.5873  
RMSE (trainset)   216.4389229.2539220.4181222.03695.3555  
MAE (trainset)    34.7965 34.8390 34.1977 34.6111 0.2928  
Fit time          1.95    2.41    1.60    1.99    0.33    
Test time         2.27    1.77    0.92    1.66    0.56    


In [24]:
for i in simcos_cv_item.items():
    print(i)
print('-----------------')
print(np.mean(simcos_cv_item['test_rmse']))

('test_rmse', array([233.13144171, 207.18237231, 225.59386048]))
('train_rmse', array([216.43888121, 229.25390399, 220.41805831]))
('test_mae', array([34.93033289, 34.77326367, 36.09018717]))
('train_mae', array([34.79653516, 34.83900194, 34.19768521]))
('fit_time', (1.9526290893554688, 2.4080560207366943, 1.6034140586853027))
('test_time', (2.274155616760254, 1.7709479331970215, 0.9229907989501953))
-----------------
221.9692248330687


Although like the previous model this model also appears quite overfit, there is a slight improvement in the RMSE and MAE scores.1


### Pearson similarity

## KNNBaseline with pearson similarity (USER BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':True}

In [0]:
#   training the model with user_based = True
knn_baseline_user = knns.KNNBaseline(sim_options=sim_pearson)


In [25]:
#   fitting the model
sim_pearson_cv_user = cross_validate(knn_baseline_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    227.7171193.8803242.0029221.200120.1792 
MAE (testset)     34.2870 33.4675 37.1238 34.9594 1.5666  
RMSE (trainset)   219.2476234.9514211.4571221.88549.7712  
MAE (trainset)    34.5946 34.9833 33.1351 34.2377 0.7956  
Fit time          33.99   34.03   16.42   28.15   8.29    
Test time         7.31    7.48    4.02    6.27    1.59    


In [26]:
for i in sim_pearson_cv_user.items():
    print(i)
print('-----------------')
print(np.mean(sim_pearson_cv_user['test_rmse']))

('test_rmse', array([227.71714958, 193.88029079, 242.00294524]))
('train_rmse', array([219.24760772, 234.95144508, 211.45711046]))
('test_mae', array([34.28702211, 33.46748549, 37.12380177]))
('train_mae', array([34.59459711, 34.98332315, 33.13510688]))
('fit_time', (33.990625858306885, 34.03328895568848, 16.423507928848267))
('test_time', (7.305282115936279, 7.478791236877441, 4.016505241394043))
-----------------
221.20012853862636


From the RMSE and MAE scores obtained, there is evident improvement in this user based KNNBaseline model with pearson similarity. The lower RMSE scores shown above indicates an improved performance compared to previous models.

## KNNBaseline with pearson similarity (ITEM BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':False}

In [0]:
#   training the model with user_based = False
knn_baseline_item = knns.KNNBaseline(sim_options=sim_pearson)

In [33]:
#   fitting the model
sim_pearson_cv_item = cross_validate(knn_baseline_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    224.3914236.8364203.9276221.718513.5673 
MAE (testset)     35.6523 35.2078 34.2885 35.0496 0.5679  
RMSE (trainset)   220.9566214.3677230.6685221.99766.6954  
MAE (trainset)    33.8830 34.1239 34.5715 34.1928 0.2853  
Fit time          1.76    2.48    1.60    1.95    0.38    
Test time         2.57    2.01    1.10    1.89    0.61    


In [34]:
for i in sim_pearson_cv_item.items():
    print(i)
print('-----------------')
print(np.mean(sim_pearson_cv_item['test_rmse']))

('test_rmse', array([224.39144468, 236.83644249, 203.92760893]))
('train_rmse', array([220.95663133, 214.36766449, 230.66851105]))
('test_mae', array([35.65233519, 35.20784097, 34.28852744]))
('train_mae', array([33.88297921, 34.12385282, 34.5715336 ]))
('fit_time', (1.7622437477111816, 2.47896671295166, 1.5959157943725586))
('test_time', (2.567018985748291, 2.0062038898468018, 1.096261739730835))
-----------------
221.71849870102267


This models preformance was also decent

## KNNWithMeans with pearson similarity (USER BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':True}

In [0]:
#   training the model with user_based = True
knn_WithMeans_user = knns.KNNWithMeans(sim_options=sim_pearson)

In [37]:
#   fitting the model
sim_pearson_wm_cv_user = cross_validate(knn_WithMeans_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    235.8071236.3716191.4307221.203221.0535 
MAE (testset)     34.4921 36.2347 33.6681 34.7983 1.0700  
RMSE (trainset)   214.9773214.6713235.9929221.88059.9798  
MAE (trainset)    34.3131 33.4291 34.7388 34.1603 0.5455  
Fit time          31.14   30.92   16.14   26.07   7.02    
Test time         7.21    7.33    3.55    6.03    1.75    


In [38]:
for i in sim_pearson_wm_cv_user.items():
    print(i)
print('-----------------')
print(np.mean(sim_pearson_wm_cv_user['test_rmse']))

('test_rmse', array([235.8070977 , 236.37162456, 191.43073826]))
('train_rmse', array([214.97730375, 214.67134826, 235.99294221]))
('test_mae', array([34.49207968, 36.23473695, 33.66810759]))
('train_mae', array([34.31313786, 33.42913201, 34.7387793 ]))
('fit_time', (31.140739917755127, 30.92330241203308, 16.14375615119934))
('test_time', (7.214906454086304, 7.32607102394104, 3.5543196201324463))
-----------------
221.20315350460828


## KNNWithMeans with pearson similarity (ITEM BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':False}

In [0]:
#   training the model with user_based = False
knn_WithMeans_item = knns.KNNWithMeans(sim_options=sim_pearson)

In [41]:
#   fitting the model
sim_pearson_wm_cv_item = cross_validate(knn_WithMeans_item, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)



Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    240.2601228.4892195.1499221.299719.1049 
MAE (testset)     37.1344 33.7052 33.4021 34.7472 1.6925  
RMSE (trainset)   212.4455218.8437234.4237221.90439.2298  
MAE (trainset)    32.8357 34.5785 34.7244 34.0462 0.8580  
Fit time          1.73    2.35    1.17    1.75    0.48    
Test time         2.39    1.76    0.93    1.69    0.59    


In [42]:
for i in sim_pearson_wm_cv_item.items():
    print(i)
print('-----------------')
print(np.mean(sim_pearson_wm_cv_item['test_rmse']))

('test_rmse', array([240.26008838, 228.48918675, 195.14992952]))
('train_rmse', array([212.44551117, 218.84373337, 234.42365478]))
('test_mae', array([37.1343957 , 33.70524854, 33.40210195]))
('train_mae', array([32.835666  , 34.57854936, 34.72439679]))
('fit_time', (1.7314190864562988, 2.353318214416504, 1.1666381359100342))
('test_time', (2.3850784301757812, 1.764460563659668, 0.9341492652893066))
-----------------
221.29973488229982


# Model based 

In [48]:
param_grid = {'n_factors':[20, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
               'reg_all': [0.4, 0.6]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(steam)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.2min finished


In [49]:
print(gs_model.best_score)
print(gs_model.best_params)

{'rmse': 221.07945902505745, 'mae': 35.76528369897345}
{'rmse': {'n_factors': 20, 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}, 'mae': {'n_factors': 20, 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}}


In [0]:
## Perform a gridsearch with SVD
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(steam)

In [51]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 221.31836417839622, 'mae': 35.76531379544365}
{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [0]:
#create our high performance algorithm
our_algo = gs_model.best_estimator['rmse']

In [55]:
#Retrain on the whole set A
trainset = steam.build_full_trainset()
our_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc3cb33c5f8>

In [56]:
#This is the BIASED Accuracy on A
predictions = our_algo.test(trainset.build_testset())
print('Biased accuract on A', end='  ')
accuracy.rmse(predictions)

Biased accuract on A  RMSE: 222.1159


222.1159269640164

In [58]:
#This is the UNBIASED Accuract on B
testset = steam.construct_testset(B_raw_ratings) #testset is now the set B
predictions = our_algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)

NameError: ignored

In [0]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

# Making recommendations

In [0]:
df_games = pd.read_csv('steam_rs.csv')

In [0]:
df_games.head()

In [0]:
# dropping the duplicate ids, and keeping the most recent ones as making sure it dosent remove all the unique games in the data frame
df_games.drop_duplicates(subset = ['id', 'name'],
                     keep = 'last', inplace = True)

In [0]:
# dropping the duplicate ids
df_games.drop_duplicates(subset = ['appid'],
                     keep = 'last', inplace = True)

In [0]:
df_games = df_games[['id', 'appid', 'name', 'purchase', 'developer', 'genres', 'rank']]

In [0]:
svd = SVD(n_factors= 20, reg_all=0.02)
svd.fit(trainset)

In [0]:
svd.predict(2, 4)

In [0]:
def game_rater(df_games, num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            game = df_games[df_games['genres'].str.contains(genre)].sample(1)
        else:
            game = df_games.sample(1)
        print(game)
        rating = input('How many hours have you spent on this game, press n if you have not played it :\n')
        if rating == 'n':
            continue
        else:
            rating_one_game = {'id':userID,'gameId':game['appid'].values[0],'hours_of_play':rating}
            rating_list.append(rating_one_game) 
            num -= 1
    return rating_list      

In [0]:
user_rating = game_rater(df_games, 4, 'RPG')

In [0]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = df_games.append(user_rating,ignore_index=True)

In [0]:
new_ratings_df = new_ratings_df[['id', 'appid', 'hours_of_play']]


In [0]:
# transforming the current data set into something that is compatible with surpirse 
reader = Reader()
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [0]:
# train a model using the new combined DataFrame
svd_ = SVD(n_factors= 20, reg_all=0.02)
svd_.fit(new_data.build_full_trainset())

In [0]:
# make predictions for the user
list_of_games = []
for m_id in df_games['appid'].unique():
    list_of_games.append( (m_id,svd_.predict(1000,m_id)[3]))

In [0]:
# order the predictions from highest to lowest rated
ranked_games = sorted(list_of_games, key=lambda x:x[1], reverse=True)

In [0]:
# return the top n recommendations
def recommended_games(user_ratings, df_games, n):
        for idx, rec in enumerate(user_ratings):
            title = df_games.loc[df_games['appid'] == int(rec[0])]['name']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

In [0]:
recommended_games(ranked_games, df_games, 5)

# FUTURE WORK



*   Developing improved recommendation systems using the hybrid filtering methods
*   Soley using Steam's API to form game recommendations​
*   Using matrix factorization techniques

