# Collaborative based filtering

Collaborative filtering based models finds similaries between items or users through ratings or items that other users have liked as well.

### Importing necessary packages


In [6]:
# installing the library suprise
!pip install scikit-surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 2.7MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1675388 sha256=f67d9f8973e61621460bb77bafe7d20513e347c4a67e418d65e6433d0a111616
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.0


In [7]:
# importing necessary packages
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler 
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import knns
from surprise import accuracy
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

from surprise import Reader, Dataset
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
nltk.download('punkt')
import re
import string
import os

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# unziping the folder containing the steam_rs.csv data
!unzip steam_rs.zip

Archive:  steam_rs.zip
  inflating: steam_rs.csv            


### Importing and checking through the data sets

In [0]:
# importing the data set
steam = pd.read_csv('steam_rs.csv')

In [0]:
# removing release_date column
steam = steam.drop('release_date', axis = 1)

In [95]:
# displaying the data frame
steam.head()

Unnamed: 0,id,appid,name,purchase,hours_of_play,developer,publisher,positive,negative,english,platforms,required_age,categories,genres,steamspy_tags,achievements,average_playtime,median_playtime,owners,detailed_description,about_the_game,short_description,price,rank
0,151603712,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
1,151603712,570,Dota 2,play,0.5,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
2,187131847,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
3,187131847,570,Dota 2,play,2.3,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
4,176410694,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113


In [0]:
# modifying the columns in the data frame
steam = steam[['id', 'appid', 'name', 'rank', 'genres', 'steamspy_tags', 'short_description', 'hours_of_play']]

Since Steam does not maintain explicit ratings for games, we derive our ratings based on the hours of play.

In [0]:
# alterting the data frame further to show the id, game name and its hours of play
steam = steam[['id', 'name', 'hours_of_play']]

In [98]:
# displaying shape of data frame
steam.shape

(99632, 3)

In [0]:
# transforming the current data set into something that is compatible with surpirse 
reader = Reader()
steam = Dataset.load_from_df(steam,reader)

### Train/ test split

The train and test data sets will contain randomly selected user ratings and items instead of the entire list of users and items. 80% of these ratings reside in the training set and the remaining 20% is in the test set.

In [0]:
# preforming train/test split
trainset, testset = train_test_split(steam, test_size=0.2)

In [101]:
# checking the number of items and users in the data set
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  10034 

Number of items:  2224 



From this it can be seen that there is a fewer number of items rather than the number of users.

# Memory Based

For these memory based models I will be using KNNBasic which is a basic collaborative filtering algorithm. In addition to that i will also be using KNNBaseline which is a basic collaborative filtering algorithm taking into account a baseline rating. KNNWithMeans is basic collaborative filtering algorithm, taking into account the mean ratings of each user.

## Cosine similarity

KNNBasic with cosine similarity (USER BASED)

In [0]:
#   cosine similarity
sim_cos = {'name':'cosine', 'user_based':True}

In [0]:
#   training the model with user_based = True
basic_user = knns.KNNBasic(sim_options=sim_cos)

In [104]:
#   fitting the model
simcos_cv_item = cross_validate(basic_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    174.6109169.0096175.6148173.07842.9062  
MAE (testset)     22.4693 22.3509 23.2253 22.6818 0.3873  
RMSE (trainset)   172.2598175.0270171.7426173.00981.4419  
MAE (trainset)    22.4076 22.4724 22.0488 22.3096 0.1863  
Fit time          59.48   58.46   30.60   49.51   13.38   
Test time         29.97   30.26   14.94   25.06   7.15    


KNNBasic with cosine similarity (ITEM BASED)


In [0]:
#   cosine similarity
sim_cos = {'name':'cosine', 'user_based':False}

In [0]:
#   training the model with user_based = True
basic_item = knns.KNNBasic(sim_options=sim_cos)

In [56]:
#   fitting the model
simcos_cv_item = cross_validate(basic_item, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    169.2498172.0856178.3425173.22593.7986  
MAE (testset)     22.2881 23.7911 22.5680 22.8824 0.6526  
RMSE (trainset)   174.9768173.6106170.4203173.00261.9092  
MAE (trainset)    22.5215 21.7882 22.4265 22.2454 0.3256  
Fit time          2.18    3.39    2.09    2.55    0.59    
Test time         5.11    3.84    2.08    3.68    1.24    


### Pearson similarity

KNNBaseline with pearson similarity (USER BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':True}

In [0]:
#   training the model with user_based = True
knn_baseline_user = knns.KNNBaseline(sim_options=sim_pearson)


In [59]:
#   fitting the model
sim_pearson_cv_user = cross_validate(knn_baseline_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)



Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    174.9807177.7516166.8525173.19494.6253  
MAE (testset)     22.3736 23.3717 22.2430 22.6628 0.5041  
RMSE (trainset)   172.1432170.7203176.1185172.99402.2844  
MAE (trainset)    21.9747 21.4711 22.0538 21.8332 0.2580  
Fit time          56.55   56.83   29.00   47.46   13.05   
Test time         41.16   41.38   17.08   33.21   11.40   


KNNBaseline with pearson similarity (ITEM BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':False}

In [0]:
#   training the model with user_based = False
knn_baseline_user = knns.KNNBaseline(sim_options=sim_pearson)

In [62]:
#   fitting the model
sim_pearson_cv_user = cross_validate(knn_baseline_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    169.1324177.4867172.8173173.14553.4185  
MAE (testset)     23.1095 22.6109 22.3299 22.6834 0.3224  
RMSE (trainset)   175.0018170.8100173.1844172.99881.7163  
MAE (trainset)    21.6548 21.8811 22.0344 21.8567 0.1559  
Fit time          1.92    3.03    1.93    2.29    0.52    
Test time         5.87    4.71    2.95    4.51    1.20    


KNNWithMeans with pearson similarity (USER BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':True}

In [0]:
#   training the model with user_based = True
knn_WithMeans_user = knns.KNNWithMeans(sim_options=sim_pearson)

In [65]:
#   fitting the model
sim_pearson_wm_cv_user = cross_validate(knn_WithMeans_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    174.5953171.9645173.3167173.29221.0742  
MAE (testset)     21.9322 23.1473 22.5149 22.5315 0.4962  
RMSE (trainset)   172.3533173.6734173.0030173.00990.5389  
MAE (trainset)    22.0488 21.4549 21.7902 21.7646 0.2432  
Fit time          58.09   61.63   30.19   49.97   14.06   
Test time         31.68   32.46   16.20   26.78   7.49    


KNNWithMeans with pearson similarity (ITEM BASED)

In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':False}

In [0]:
#   training the model with user_based = False
knn_WithMeans_item = knns.KNNWithMeans(sim_options=sim_pearson)

In [68]:
#   fitting the model
sim_pearson_wm_cv_item = cross_validate(knn_WithMeans_item, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    195.4273162.4315159.3542172.404316.3281 
MAE (testset)     24.2049 21.7889 21.5890 22.5276 1.1888  
RMSE (trainset)   160.7320178.1345179.5169172.79458.5481  
MAE (trainset)    20.9565 22.1461 22.2506 21.7844 0.5870  
Fit time          2.08    2.96    1.86    2.30    0.47    
Test time         5.15    4.10    2.41    3.89    1.13    


# Model based (matrix factorization)

In [106]:
param_grid = {'n_factors':[20, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
               'reg_all': [0.4, 0.6]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(steam)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.5min finished


In [107]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 192.3314
192.33141233929877


this model performed worse than the others

In [0]:
## Perform a gridsearch with SVD
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(steam)

In [112]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 172.8549464425004, 'mae': 23.66983378924563}
{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [0]:
## Perform a gridsearch with SVD
params = {'n_factors': [20],
         'reg_all': [0.02]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(steam)

In [114]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 172.45558694700102, 'mae': 23.6698244001123}
{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [0]:
# person similarity
sim_pearson = {'name':'pearson', 'user_based':False}

In [0]:
#   training the model with user_based = False
knn_baseline_user = knns.KNNBaseline(sim_options=sim_pearson)

In [117]:
#   fitting the model
sim_pearson_cv_user = cross_validate(knn_baseline_user, steam, measures=['rmse', 'mae'],
                           cv = 3, return_train_measures=True, n_jobs= -1,
                           verbose = True)

Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    175.3776164.5200179.3078173.06856.2540  
MAE (testset)     22.7405 22.0885 23.1969 22.6753 0.4548  
RMSE (trainset)   171.8941177.1833169.8621172.97983.0859  
MAE (trainset)    21.8442 22.1439 21.5853 21.8578 0.2283  
Fit time          2.42    3.49    2.19    2.70    0.57    
Test time         5.86    5.08    2.91    4.62    1.25    


In [119]:
for i in sim_pearson_cv_user.items():
    print(i)
print('-----------------------')
print(np.mean(sim_pearson_cv_user['test_rmse']))

('test_rmse', array([175.3775971 , 164.51996491, 179.30780282]))
('train_rmse', array([171.89413719, 177.18327502, 169.86209837]))
('test_mae', array([22.74047644, 22.0885182 , 23.19693017]))
('train_mae', array([21.84417243, 22.14393612, 21.58533206]))
('fit_time', (2.4215846061706543, 3.4935102462768555, 2.1876068115234375))
('test_time', (5.8621826171875, 5.076887607574463, 2.9144668579101562))
-----------------------
173.0684549408837


In [120]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline, steam)

Estimating biases using als...
Computing the pearson similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [121]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([183.08275528, 169.36486003, 157.65612901, 182.36099013,
       172.68985089]))
('test_mae', array([24.06104803, 21.91421191, 21.84608246, 23.87831401, 21.4690861 ]))
('fit_time', (48.995821714401245, 41.32681584358215, 41.72800850868225, 41.782835721969604, 43.42121410369873))
('test_time', (21.798069953918457, 22.222237586975098, 22.609481811523438, 21.038819313049316, 21.44190001487732))


173.03091706803121

# Making recommendations

In [0]:
df_games = pd.read_csv('steam_rs.csv')

In [140]:
df_games.head()

Unnamed: 0,id,appid,name,purchase,hours_of_play,developer,publisher,positive,negative,release_date,english,platforms,required_age,categories,genres,steamspy_tags,achievements,average_playtime,median_playtime,owners,detailed_description,about_the_game,short_description,price,rank
0,151603712,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,2013-07-09,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
1,151603712,570,Dota 2,play,0.5,Valve,Valve,1097301,194384,2013-07-09,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
2,187131847,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,2013-07-09,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
3,187131847,570,Dota 2,play,2.3,Valve,Valve,1097301,194384,2013-07-09,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113
4,176410694,570,Dota 2,purchase,0.0,Valve,Valve,1097301,194384,2013-07-09,1,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,Free to Play;MOBA;Strategy,0,23944,801,100000000-200000000,<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...",0.0,84.95113


In [142]:
svd = SVD(n_factors= 20, reg_all=0.02)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f5ac405ff28>

In [143]:
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=5, details={'was_impossible': False})

In [0]:
def game_rater(df_games, num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            game = df_games[df_games['genres'].str.contains(genre)].sample(1)
        else:
            game = df_games.sample(1)
        print(game)
        hours = input('How do you rate this game on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_game = {'id':userID,'gameId':game['appid'].values[0],'hours_of_play':rating}
            rating_list.append(rating_one_game) 
            num -= 1
    return rating_list      

In [148]:
user_rating = game_rater(df_games, 4, 'RPG')

             id   appid            name  purchase  hours_of_play     developer  publisher  \
67331  61676491  250400  How to Survive  purchase            0.0  Eko Software  505 Games   

       positive  negative release_date  english platforms  required_age  \
67331     13456      2842   2014-08-29        1   windows             0   

                                              categories                genres  \
67331  Single-player;Multi-player;Co-op;Shared/Split ...  Action;Adventure;RPG   

                   steamspy_tags  achievements  average_playtime  median_playtime  \
67331  Survival;Zombies;Crafting            43               333              256   

                owners                               detailed_description  \
67331  1000000-2000000  <h1>Special Offer</h1><p>How to Survive 2 on E...   

                                          about_the_game  \
67331  <ul class="bb_ul"><li>“<i><strong>A real Gem</...   

                                       short_descr

NameError: ignored

In [138]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = steam.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

AttributeError: ignored

In [0]:
# train a model using the new combined DataFrame
svd_ = SVD(n_factors= 50, reg_all=0.05)
svd_.fit(new_data.build_full_trainset())

In [0]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in new_df['movieId'].unique():
    list_of_movies.append( (m_id,svd_.predict(1000,m_id)[3]))

In [0]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [0]:
# return the top n recommendations using the 
def recommended_movies(trainset,n):
        for idx, rec in enumerate(trainset):
            title = trainset.loc[trainset['id'] == int(rec[0])]['name']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

In [134]:
recommended_movies(trainset,5)

TypeError: ignored