# Collaborative filtering based model

Collaborative filtering based models finds similaries between items or users through ratings or items that other users have liked as well.

### Importing necessary packages


In [28]:
# importing necessary packages
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

!pip install scikit-surprise
from sklearn.preprocessing import StandardScaler 
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

from surprise import Reader, Dataset
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
nltk.download('punkt')
import re
import string
import os

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
!unzip data.zip

Archive:  data.zip
  inflating: data/steam_rs.csv       


In [0]:
# importing the data set
df = pd.read_csv('data/steam_rs.csv')

In [0]:
# drop the timestamp column
df_cf = df.drop('release_date', axis = 1)

In [13]:
# displaying the data frame
df_cf.head()

Unnamed: 0,id,appid,name,purchase,hours_of_play,developer,publisher,positive,negative,english,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,detailed_description,about_the_game,short_description,price,rank
0,151603712,570,Dota 2,purchase,1.0,Valve,Valve,1096061,194043,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,863507,142079,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95912
1,151603712,570,Dota 2,play,0.5,Valve,Valve,1096061,194043,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,863507,142079,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95912
2,187131847,570,Dota 2,purchase,1.0,Valve,Valve,1096061,194043,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,863507,142079,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95912
3,187131847,570,Dota 2,play,2.3,Valve,Valve,1096061,194043,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,863507,142079,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95912
4,176410694,570,Dota 2,purchase,1.0,Valve,Valve,1096061,194043,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,863507,142079,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95912


In [0]:
# modifying the columns in the data frame
df_cf = df_cf[['id', 'appid', 'name', 'rank', 'genres', 'steamspy_tags', 'short_description']]

In [0]:
# alterting the data frame further to show the id, game name and its rank
df = df_cf[['id', 'name', 'rank']]

In [16]:
# displaying shape of data frame
df.shape

(99898, 3)

In [0]:
reader = Reader()
data = Dataset.load_from_df(df,reader)

In [18]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  10385 

Number of items:  2298


### Train/ test split

The train and test data sets will contain randomly selected user ratings and items instead of the entire list of users and items.

In [0]:
# preforming train/test split
trainset, testset = train_test_split(df, test_size=0.2)

In [0]:
trainsetfull = data.build_full_trainset()

In [33]:
print('Number of users: ', trainsetfull.n_users, '\n')
print('Number of items: ', trainsetfull.n_items)

Number of users:  10385 

Number of items:  2298


# Memory Based

### Cosine similarity
KNNBasic with cosine similarity (USER BASED)

In [0]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, cv=3, n_jobs=-1)

In [0]:
#   cosine similarity
sim_cos = {'name':'cosine', 'user_based':True}

In [0]:
#   instantiate the knns.KNNBasic() object with the similarity settings:
basic_user = knns.KNNBasic(sim_options=sim_cos)

In [39]:
#   fitting the model
simcos_cv_item = cross_validate(basic_user, data, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    80.8900 80.9149 80.8634 80.8894 0.0210  
MAE (testset)     79.9889 80.0325 79.9605 79.9939 0.0296  
RMSE (trainset)   80.8891 80.8767 80.9024 80.8894 0.0105  
MAE (trainset)    79.9965 79.9746 80.0106 79.9939 0.0148  
Fit time          1.33    1.93    1.28    1.51    0.29    
Test time         4.90    4.38    2.47    3.92    1.04    


KNNBasic with cosine similarity (ITEM BASED)


In [0]:
#   cosine similarity
sim_cos = {'name':'cosine', 'user_based':False}

In [0]:
#   instantiate the knns.KNNBasic() object with the similarity settings
basic_item = knns.KNNBasic(sim_options=sim_cos)

In [38]:
#   fitting the model
simcos_cv_item = cross_validate(basic_item, data, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    80.8773 80.8585 80.9325 80.8894 0.0314  
MAE (testset)     79.9628 79.9687 80.0504 79.9939 0.0400  
RMSE (trainset)   80.8955 80.9049 80.8679 80.8894 0.0157  
MAE (trainset)    80.0095 80.0065 79.9657 79.9939 0.0200  
Fit time          1.43    1.81    1.27    1.51    0.23    
Test time         5.16    4.45    2.22    3.94    1.25    


### Pearson similarity

KNNBaseline with pearson similarity (USER BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':True}

In [0]:
#   instantiate the knns.KNNBasic() object with the similarity settings
knn_baseline_user = knns.KNNBaseline(sim_options=sim_pearson)


In [42]:
#   fitting the model
sim_pearson_cv_user = cross_validate(knn_baseline_user, data, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    80.8542 80.8735 80.9405 80.8894 0.0370  
MAE (testset)     79.9513 79.9782 80.0524 79.9939 0.0427  
RMSE (trainset)   80.9070 80.8974 80.8639 80.8894 0.0185  
MAE (trainset)    80.0152 80.0018 79.9647 79.9939 0.0214  
Fit time          51.18   51.68   28.54   43.80   10.79   
Test time         30.40   30.74   15.39   25.51   7.16    


KNNBaseline with pearson similarity (ITEM BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':False}

In [0]:
#   instantiate the knns.KNNBasic() object with the similarity settings
knn_baseline_user = knns.KNNBaseline(sim_options=sim_pearson)

In [45]:
#   fitting the model
sim_pearson_cv_user = cross_validate(knn_baseline_user, data, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    80.8790 80.9109 80.8784 80.8894 0.0152  
MAE (testset)     79.9737 80.0098 79.9984 79.9939 0.0151  
RMSE (trainset)   80.8946 80.8787 80.8949 80.8894 0.0076  
MAE (trainset)    80.0041 79.9860 79.9917 79.9939 0.0076  
Fit time          2.00    2.77    1.97    2.25    0.37    
Test time         4.56    3.58    1.89    3.34    1.10    


# Model based

In [0]:
## Perform a gridsearch with SVD
# ⏰ This cell may take several minutes to run
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)

In [47]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 80.88938877028063, 'mae': 79.9939387940746}
{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [48]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([80.90348422, 80.88443837, 80.88035288]))
('test_mae', array([80.00366297, 79.98126541, 79.99688807]))
('fit_time', (56.57424354553223, 55.431344747543335, 26.176817417144775))
('test_time', (27.336431741714478, 26.979588508605957, 13.666398048400879))
-----------------------
80.88942515560707


In [49]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [50]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([80.83992433, 80.93252361, 80.93049124, 80.79261078, 80.95145951]))
('test_mae', array([79.9537338 , 80.03242269, 80.04074163, 79.8812427 , 80.06155569]))
('fit_time', (40.17104411125183, 35.938589572906494, 34.390875816345215, 35.39463806152344, 34.732157468795776))
('test_time', (14.515562057495117, 14.345680713653564, 13.92281460762024, 13.855585098266602, 14.048386096954346))


80.88940189613027

# Making recommendations

In [55]:
svd = SVD(n_factors= 50, reg_all=0.05)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fafe89370b8>

In [56]:
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=5, details={'was_impossible': False})

In [0]:
def game_rater(game_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            game = df_cf[df_cf['genres'].str.contains(genre)].sample(1)
        else:
            game = df_cf.sample(1)
        print(game)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_game = {'id':userID,'gameId':game['appid'].values[0],'rating':rating}
            rating_list.append(rating_one_game) 
            num -= 1
    return rating_list      

In [59]:
user_rating = game_rater(df_cf, 4, 'RPG')

              id   appid       name       rank                                             genres  \
68502  131785763  212160  Vindictus  70.974753  Action;Adventure;Free to Play;Massively Multip...   

                            steamspy_tags                                  short_description  
68502  Free to Play;Action;Hack and Slash  Enter a dark and sinister world where you must...  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
              id   appid           name       rank                      genres  \
49889  102921672  200710  Torchlight II  94.245684  Action;Adventure;Indie;RPG   

                       steamspy_tags                                  short_description  
49889  RPG;Action RPG;Hack and Slash  The adventure continues in Torchlight II! An A...  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
              id   appid   name       rank                                             genres  \
32182  

KeyboardInterrupt: ignored