# Recommender System
## Prerequisites

In [41]:
# Data manipulation
import numpy as np
import pandas as pd
from datetime import datetime

# NLP
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Machine Learning
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score,precision_score,average_precision_score
from surprise.prediction_algorithms import SVD
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
from surprise.model_selection import KFold
from collections import defaultdict, Counter
from itertools import combinations

# Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from scipy.sparse import vstack

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context("paper")
sns.set_style('darkgrid')

# Miscellaneous
pd.set_option('max_columns', 40)
pd.set_option('max_colwidth', 99)

In [42]:
combined = pd.read_csv('../Dataset/combined.csv')

In [43]:
combined.drop(columns=['name_y'], inplace=True)

In [44]:
combined.rename(columns={'name_x':'name'}, inplace=True)

In [45]:
games = pd.read_csv('../Dataset/game_info.csv')
users = pd.read_csv('../Dataset/user_info.csv')

## Appetizer
**Applying cosine similarity on name and description.**
The easiest and fastest way is to build a cosine similarity matrix of name and game description and recommend the most similar games base on how similar the game name and descriptions are. In this way, our engine will always recommend games which are in the same series or games that are having the same storyline as the games already played by users. 

In [46]:
games['name_n_descr'] = games['name_clean']+games['descr_clean']

In [47]:
stops = list(set(stopwords.words('english')))
tfidf_name_descr = TfidfVectorizer(stop_words=stops, ngram_range=(1,3))

In [48]:
tfidf_name_descr_matrix = tfidf_name_descr.fit_transform(games['name_n_descr'])

In [49]:
name_descr_sim = cosine_similarity(tfidf_name_descr_matrix)

In [50]:
name_descr_sim = pd.DataFrame(name_descr_sim, columns=games['name'].values, index=games['name'].values)

In [51]:
name_descr_sim['Gulman 5'].sort_values(ascending=False).head(10)

Gulman 5                            1.000000
Gulman 4: Still alive               0.070887
Terrorist Elimination               0.054947
Shadow Ops: Red Mercury             0.052188
Strike Force Remastered             0.050525
Hydrophobia: Prophecy               0.043320
Hostage: Rescue Mission             0.043178
Strike Force: Desert Thunder        0.042653
Strike Force: Arctic Storm          0.040970
Counter-Strike: Global Offensive    0.039728
Name: Gulman 5, dtype: float64

>**Analysis:**
>- Gulman 4 which is in the same series as Gulman 5 is at the top of list. If player input his preferred game as Gulman 5, he will probably enjoy Gulman 4 as well. 
>- The rest of the recommendations are having the same genre as the input game. They are all action/shooting games. 
>- Cosine similarity of game name and description is the easier and fastest way to recommend player games, however, it is not personalized enough, as players will have the same suggestion if they input the same game name. 

## Content Based

### Cosine Similarity
**Applying cosine similarity on game attributes.** Applying cosine similarity on game attributes will enable us to find similar games base on game features such as genre, category, developer etc.

In [52]:
features = list(games.select_dtypes(exclude='object').columns)

In [53]:
to_remove = ['rank', 'month', 'presence', 'pos_ratings', 'avg_playtime', 'median_playtime','language_count', 
             'lang_french', 'lang_german', 'lang_russian', 'lang_english', 'lang_italian', 'lang_japanese']
features = [i for i in features if i not in to_remove]
features.append('name')

In [54]:
games_new = games[features]

In [55]:
games_new.set_index('name', inplace=True)

In [56]:
def de_mean_normalize (df):
    return round((df.T - df.mean(axis=1)),2).T

In [57]:
games_new = de_mean_normalize(games_new)

In [58]:
games_similarity_cs = pd.DataFrame(cosine_similarity(games_new), columns=games_new.index, index=games_new.index)

In [59]:
games_similarity_cs['Dota 2'].sort_values(ascending=False)

name
Dota 2                                         1.000000
Counter-Strike: Global Offensive               1.000000
Team Fortress 2                                1.000000
Warframe                                       1.000000
Unturned                                       1.000000
                                                 ...   
aMAZE St.Patrick                               0.994923
Chocolate makes you happy: Valentine's Day     0.994923
Invasion Zero                                  0.994923
Chocolate makes you happy: St.Patrick's Day    0.994923
Author Clicker                                 0.994923
Name: Dota 2, Length: 18367, dtype: float64

In [60]:
games[(games['name']=='Dota 2')|(games['name']=='Counter-Strike: Global Offensive')]

Unnamed: 0,rank,name,presence,price,controller,languages,description,tags,platforms,categories,genres,pos_ratings,avg_playtime,median_playtime,owners,avg_score,name_clean,year,month,descr_clean,...,gr_racing,gr_rpg,gr_mmp,categ_clean,cat_ach_cloud,cat_ctl_sp,cat_ctl_ach,cat_pctl_sp,cat_sp_leader,cat_ach_leader,cat_sp_mp,cat_ctl_cloud,cat_pctl_ach,cat_mp_ach,cat_mp_coop,cat_singleplayer,cat_multiplayer,cat_steamtradingcards,cat_steamleaderboards,name_n_descr
0,1,Counter-Strike: Global Offensive,1009588,0.0,1.0,"English, Czech, Danish, Dutch, Finnish, French, German, Hungarian, Italian, Japanese, Korean, N...",Counter-Strike is a multiplayer phenomenon in its simplicity. No complicated narratives to expl...,"FPS, Shooter, Multiplayer, Competitive, Action, Team-Based, eSports, Tactical, First-Person, Pv...","['xbox', 'playstation', 'pc']",Multi-player;Steam Achievements;Full controller support;Steam Trading Cards;Steam Workshop;In-A...,"action,freetoplay",2644405,22494,6502,150000000.0,5,Counter Strike Global Offensive,2012,8,counter strike multiplayer phenomenon simplicity complicated narrative explain source conflict ...,...,0,0,0,"['multiplayer', 'steamachievements', 'fullcontrollersupport', 'steamtradingcards', 'steamworksh...",0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,Counter Strike Global Offensivecounter strike multiplayer phenomenon simplicity complicated nar...
1,3,Dota 2,1009306,0.0,1.0,"English, Bulgarian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian, Italian, J...","What used to be an unofficial modded map for the Warcraft 3, ended up being the most budgeted c...","Free to Play, MOBA, Multiplayer, Strategy, eSports, Team-Based, Competitive, Action, Online Co-...",['pc'],Multi-player;Co-op;Steam Trading Cards;Steam Workshop;SteamVR Collectibles;In-App Purchases;Val...,"action,freetoplay,strategy",863508,23944,801,300000000.0,5,Dota 2,2013,9,used unofficial modded map warcraft ended budgeted cybersport discipline gathering people watch...,...,0,0,0,"['multiplayer', 'coop', 'steamtradingcards', 'steamworkshop', 'steamvrcollectibles', 'inapppurc...",0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,Dota 2used unofficial modded map warcraft ended budgeted cybersport discipline gathering people...


>**Analysis:**
>- Majority of games are very much similar to each other after applying cosine similarity. 
>- If we take a look at 'Dota 2' and 'Counter Strike', from game attributes perspective they are quite similar. They are both free to play, require controller, have similar social media presence etc. 
>- We shall compare euclidean distance for these games as well. 

### Euclidean Distance

**Calculating euclidean distance between games.** Euclidean distance takes into consideration of magnitude of features which is important in our analysis as well. 

In [61]:
games_similarity_ed = pd.DataFrame(euclidean_distances(games_new), columns=games_new.index, index=games_new.index)

In [62]:
# force the values to be between 0 and 1
games_similarity_ed = games_similarity_ed/300_000_000

In [63]:
games_similarity_ed['Dota 2'].sort_values(ascending=True).head(10)

name
Dota 2                              0.000000
Counter-Strike: Global Offensive    0.494152
Team Fortress 2                     0.757699
Unturned                            0.757699
Warframe                            0.757699
Half-Life 2: Lost Coast             0.889473
Heroes & Generals                   0.889473
Half-Life 2: Deathmatch             0.889473
Warface                             0.889473
Counter-Strike: Condition Zero      0.889473
Name: Dota 2, dtype: float64

>**Analysis:**
>- Different from cosine similarity, the lower the euclidean distance value, the more similar the two games are. 
>- After checking against euclidean distance, Counter Strike is still the most similar game to Dota 2. However, in euclidean distance analysis, Counter Strike is not identical to Dota 2 as shown in cosine similarity analysis. If we take a look at average play time and number of positive ratings, they are quite different for these two games. Hence the euclidean distance between them. 

## Collaborative Filtering

### Model Based
There are many ways to achieve collaborative filtering. <br>
One of the ways is matrix factorization and dimension reduction via **`Singular Value Decomposition`**. Through dimension reduction, our model is able to summarize unique game profiles for us.

In [64]:
users[['userid', 'name', 'user_score']].head(3)

Unnamed: 0,userid,name,user_score
0,151603712,The Elder Scrolls V Skyrim,3.388639
1,151603712,Fallout 4,2.89889
2,151603712,Spore,2.143195


In [65]:
reader = Reader()
user_rating = Dataset.load_from_df(users[['userid', 'name', 'user_score']],reader)

In [66]:
trainset, testset = train_test_split(user_rating, test_size=0.2)

- **Grid search to get the best parameters.** As we are using MAE as our main evaluation matrix, we will be optimizing MAE via grid search. 

In [67]:
param_grid = {'n_factors':[8, 10, 12], 
              'lr_all': [0.002, 0.005, 0.0007], 
              'reg_all': [0.02, 0.05, 0.1]}
svd_model = GridSearchCV(SVD, param_grid=param_grid, cv=5)
svd_model.fit(user_rating)

In [68]:
print(svd_model.best_score)

{'rmse': 0.762837920350982, 'mae': 0.6040717314439344}


In [69]:
print(svd_model.best_params)

{'rmse': {'n_factors': 8, 'lr_all': 0.005, 'reg_all': 0.05}, 'mae': {'n_factors': 10, 'lr_all': 0.005, 'reg_all': 0.02}}


- **Let's build a model with best parameters**

In [70]:
best_model = SVD(n_factors= 8, reg_all=0.02)

In [71]:
trainset = user_rating.build_full_trainset()
testset = trainset.build_testset()
best_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb33a862150>

In [72]:
predictions = best_model.test(testset)

In [73]:
cf_error = accuracy.mae(predictions)

MAE:  0.5488


In [74]:
cf_error

0.5487560464911426

>**Analysis:**
>- We have just predicted user rating for all the games in our dataset, and our mean absolute error is 0.54. This means that on average, our predictions are about half a score away from user's actual score. Since our score ranges from 0 to 5, 0.5 error is considered as good. 

**Let's define a function to make recommendations to users.**

In [75]:
def get_top_n(predictions, userid, n=5):
    top_n = defaultdict(list)
    top_n_filtered = []
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    for game in top_n[userid]:
        top_n_filtered.append(game[0])
    for num, game in enumerate(top_n_filtered):
        print(f'Recommendatin {num+1}: {game}')

In [76]:
get_top_n(predictions, 43284145)

Recommendatin 1: Total War ROME II - Emperor Edition
Recommendatin 2: Mount & Blade Warband
Recommendatin 3: Empire Total War
Recommendatin 4: Napoleon Total War
Recommendatin 5: Left 4 Dead 2


In [77]:
users[users['userid']==43284145][['userid','name','user_score']][:5]

Unnamed: 0,userid,name,user_score
47879,43284145,Napoleon Total War,3.050656
47880,43284145,Total War ROME II - Emperor Edition,2.428392
47881,43284145,Mount & Blade Warband,2.413364
47882,43284145,Rising Storm/Red Orchestra 2 Multiplayer,2.397789
47883,43284145,Tomb Raider,2.364829


In [78]:
get_top_n(predictions, 33282871)

Recommendatin 1: Counter-Strike Global Offensive
Recommendatin 2: Grand Theft Auto V
Recommendatin 3: Arma 3
Recommendatin 4: DARK SOULS II
Recommendatin 5: Fallout 4


In [79]:
users[users['userid']==33282871][['userid','name','user_score']][:5]

Unnamed: 0,userid,name,user_score
18607,33282871,Counter-Strike Global Offensive,3.475007
18608,33282871,ARK Survival Evolved,3.471143
18609,33282871,Terraria,2.913409
18610,33282871,Team Fortress 2,2.88893
18611,33282871,XCOM Enemy Unknown,2.702498


>**Analysis:**
>- If we look at the recommendation for user '43284145', he generally likes war games. We recommended 5 games to him and 2 out of 5 appeared on his top 5 game list
>- For user '98848653', he generally likes shooting games. We recommended 5 games to him but unfortunately we did not manage to capture any of the top 5 games.
>- To evaluate these two predictions, let's calculate the **`precision@k`** for these two users.

- **`Precision @K.`** Another important evaluation matrix we are looking at is precision@k. Precision@k looks at the following criteria:<br>
    - 1. how many recommended games actually appeared on user's list of played games (**relevant games**)<br>
    - 2. out of all the relevant games, how many games are qualified to be a good recommendation (score >2) (**qualified games**)<br>
    - 3. finally, precision@k is calculated by taking the **ratio of qualified games to relevant games**. 

In [80]:
def precision_at_k(predictions, k=5, threshold=2.0):
    """Return precisionfor each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    return precisions

In [81]:
precision_dictionary = precision_at_k(predictions)

In [82]:
cf_avg_precision = sum(precision_dictionary.values())/len(precision_dictionary.values())
cf_avg_precision

0.36968428781204205

In [83]:
precision_dictionary[33282871]

0.4

- Precision is okay for player 33282871. Although our recommended games are not his top 5 favorite games, he seems to like 3 out of the 5 games we recommended. 

In [84]:
precision_dictionary[43284145]

1.0

- Precision is perfect for player 43284145 because we managed to recommend all 5 games which he likes.

### User Based
Another way of achieving collaborative filtering is matrix factorization coupled with **`neural network`** and **`gradient descent`**. 

In [85]:
users.shape

(70489, 6)

In [86]:
users_new_pivot = users.pivot_table(index='userid', columns='name', values='user_score')

In [87]:
users_new_pivot = users_new_pivot.fillna(0)

In [88]:
# Shuffle DataFrame
users_new = users.sample(frac=1).reset_index(drop=True)

# Test size
n = 7000
users_new_train = users_new[:-n]
users_new_test = users_new[-n:]

In [89]:
# Create user- & game-id mapping
unique_user_ids = users_new['userid'].unique()
unique_game_ids = users_new['name'].unique()
user_id_mapping = {id:i for i, id in enumerate(unique_user_ids)}
game_id_mapping = {id:i for i, id in enumerate(unique_game_ids)}

# Create correctly mapped train- & testset
train_user_data = users_new_train['userid'].map(user_id_mapping)
train_game_data = users_new_train['name'].map(game_id_mapping)
test_user_data = users_new_test['userid'].map(user_id_mapping)
test_game_data = users_new_test['name'].map(game_id_mapping)

# Get input variable-sizes
num_user = len(user_id_mapping)
num_games = len(game_id_mapping)

In [90]:
embedding_size = 32
# Inputs
user_id_input = Input(shape=[1], name='userid')
game_id_input = Input(shape=[1], name='name')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=embedding_size, input_dim=num_user, input_length=1, 
                           name='user_embedding')(user_id_input)
game_embedding = Embedding(output_dim=embedding_size, input_dim=num_games,input_length=1, 
                            name='item_embedding')(game_id_input)

# Reshape the embedding layers
user_vector = Reshape([embedding_size])(user_embedding)
game_vector = Reshape([embedding_size])(game_embedding)

# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, game_vector])

# Setup model
model = Model(inputs=[user_id_input, game_id_input], outputs=y)
model.compile(loss='mae', optimizer='adam')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# Fit model
model.fit([train_user_data, train_game_data], users_new_train['user_score'].values,
          batch_size=256, epochs=50, validation_split=0.1, callbacks=[es], shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_game_data])
y_true = users_new_test['user_score'].values

#  Compute RMSE
mae = mean_absolute_error(y_pred=y_pred, y_true=y_true)
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} MAE'.format(mae))

Train on 57140 samples, validate on 6349 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: early stopping


Testing Result With Keras Matrix-Factorization: 0.7338 MAE


> Analysis:
> - Neural network with gradient descent is having higher error as compared to matrix factorization with SVD. This is because SVD is able to reduce dimension of utility matrix, extract latent vector and facilitate a clearer representation of relationships between user and items as compared to neural network. 
>- Therefore, we will use SVD in our model instead. 

## Hybrid Model
**Wisdom of crowds.** It is always better to balance our recommendations by including more sources of opinions. In this section, we are going to use a hybrid model taking in user ratings, game id and game descriptions. 

Before getting into feature rich recommender engine, let's build a model with user and game description to test it's performance. 

In [91]:
game_list_mapping = combined[['name_clean']]

In [92]:
game_id_mapping2 = {id:i for i, id in enumerate(game_list_mapping['name_clean'].unique())}

In [93]:
game_list_mapping['name_clean_map'] = game_list_mapping['name_clean'].map(game_id_mapping2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [94]:
# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(combined['userid'].unique())}
game_id_mapping = {id:i for i, id in enumerate(combined['name_clean'].unique())}

# Use mapping to get better ids
combined['userid'] = combined['userid'].map(user_id_mapping)
combined['name_clean'] = combined['name_clean'].map(game_id_mapping)

In [95]:
# Preprocess metadata
game_descr = combined[['name_clean','description']].copy()
game_descr = game_descr.set_index('name_clean', drop=True)

In [96]:
hybrid = combined[['name_clean', 'userid', 'user_score']]

In [97]:
# Split train- & testset
n = 6000

users_hybrid_train = hybrid[:n]
users_hybrid_test = hybrid[-n:]
    # Create user- & game-id mapping
unique_user_ids = hybrid['userid'].unique()
unique_game_ids = hybrid['name_clean'].unique()
user_id_mapping = {id:i for i, id in enumerate(unique_user_ids)}
game_id_mapping = {id:i for i, id in enumerate(unique_game_ids)}

# Create correctly mapped train- & testset
train_user_data = users_hybrid_train['userid'].map(user_id_mapping)
train_game_data = users_hybrid_train['name_clean'].map(game_id_mapping)
test_user_data = users_hybrid_test['userid'].map(user_id_mapping)
test_game_data = users_hybrid_test['name_clean'].map(game_id_mapping)

# Get input variable-sizes
num_user = len(user_id_mapping)
num_games = len(game_id_mapping)
    
    

hybrid = hybrid.sample(frac=1).reset_index(drop=True)
hybrid_train = hybrid[:n]
hybrid_test = hybrid[-n:]

In [98]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(combined['description'])

In [99]:
tfidf_mapping = {id:i for i, id in enumerate(game_descr.index)}

train_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in hybrid_train['name_clean'].values:
#train_tfidf = hybrid_train['name_clean'].map(tfidf_mapping)
    index = tfidf_mapping[id]
    train_tfidf.append(tfidf_hybrid[index])
    
test_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in hybrid_test['name_clean'].values:
    index = tfidf_mapping[id]
    test_tfidf.append(tfidf_hybrid[index])

In [100]:
# Stack the sparse matrices
train_tfidf = vstack(train_tfidf)
test_tfidf = vstack(test_tfidf)

In [101]:
user_embed = 32
game_embed = 32
tfidf_embed = 32

# Create three input layers
user_id_input = Input(shape=[1], name='user')
game_id_input = Input(shape=[1], name='game')
tfidf_input = Input(shape=[24494], name='tfidf')

In [102]:
# Create separate embeddings for users and movies
user_embedding = Embedding(output_dim=user_embed, input_dim=len(user_id_mapping),
                           input_length=1, name='user_embedding')(user_id_input)
game_embedding = Embedding(output_dim=game_embed, input_dim=len(game_id_mapping),
                           input_length=1, name='game_embedding')(game_id_input)

In [103]:
# Dimensionality reduction with Dense layers
tfidf_vectors = Dense(128, activation='relu')(tfidf_input)
tfidf_vectors = Dense(32, activation='relu')(tfidf_vectors)

# Reshape both embedding layers
user_vectors = Reshape([user_embed])(user_embedding)
game_vectors = Reshape([game_embed])(game_embedding)

In [104]:
tfidf_vectors.shape

TensorShape([None, 32])

In [105]:
y = Dot(1, normalize=False)([user_vectors, tfidf_vectors])

In [106]:
y.shape

TensorShape([None, 1])

In [107]:
model = Model(inputs=[user_id_input, tfidf_input], outputs=y)

In [108]:
model.compile(loss='mae', optimizer='adam')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# Fit model
model.fit([train_user_data, train_tfidf], hybrid_train['user_score'].values,
          batch_size=256, epochs=50, validation_split=0.1, callbacks=[es], shuffle=True)

Train on 5400 samples, validate on 600 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 00006: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fb3a05af250>

In [109]:
# Test model
y_pred = model.predict([test_user_data, test_tfidf])
y_true = hybrid_test['user_score'].values



In [110]:
#  Compute MAE
mae = mean_absolute_error(y_pred=y_pred, y_true=y_true)
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} MAE'.format(mae))



Testing Result With Keras Matrix-Factorization: 1.1907 MAE


>**Analysis:**
> - It is observed that with user information and game description, predictions for user ratings are worse than the previous model when we use user and game ratings. 

**Now let's add game description to our model.**

In [111]:
# Concatenate all layers into one vector
both = Concatenate()([user_vectors, game_vectors, tfidf_vectors])

# Add dense layers for combinations and scalar output
dense = Dense(512, activation='relu')(both)
dense = Dropout(0.2)(dense)
output = Dense(1)(dense)

In [112]:
# Create and compile model
model = Model(inputs=[user_id_input, game_id_input, tfidf_input], outputs=output)
model.compile(loss='mae', optimizer='adam')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# Train and test the network
model.fit([hybrid_train['userid'], hybrid_train['name_clean'], train_tfidf],
          hybrid_train['user_score'].values,
          batch_size=1024, epochs=50, validation_split=0.1, callbacks=[es], shuffle=True)

Train on 5400 samples, validate on 600 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 00005: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fb3241eb950>

In [113]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
game (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
tfidf (InputLayer)              [(None, 24494)]      0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 32)        307008      user[0][0]                       
____________________________________________________________________________________________

In [73]:
y_pred = model.predict([hybrid_test['userid'], hybrid_test['name_clean'], test_tfidf])
y_true = hybrid_test['user_score'].values



In [74]:
hybrid_error = mean_absolute_error(y_pred=y_pred, y_true=y_true)
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} MAE'.format(hybrid_error))



Testing Result With Keras Hybrid Deep Learning: 0.6405 MAE


- Let's make some recommendations to our users

In [75]:
predicted_ratings = hybrid_test.copy()

In [76]:
game_list_mapping_dict ={}
for num in list(game_list_mapping['name_clean_map'].unique()):
    game_list_mapping_dict[num]=game_list_mapping[game_list_mapping['name_clean_map']==num]['name_clean']

In [77]:
predicted_ratings['name'] = predicted_ratings['name_clean'].apply(lambda x: game_list_mapping_dict[x].values[0])

In [78]:
predicted_ratings['pred_ratings'] = y_pred

In [79]:
predicted_ratings.sort_values(by='pred_ratings')

Unnamed: 0,name_clean,userid,user_score,name,pred_ratings
37944,708,1151,0.000178,Proteus,0.026518
40764,708,5644,0.297029,Proteus,0.074365
37054,798,5766,0.689444,WORLD END ECONOMiCA episode 01,0.238471
42404,1410,4899,2.173700,AaAaAA A Reckless Disregard for Gravity,0.347779
38716,574,7712,0.297029,I Have No Mouth and I Must Scream,0.375354
...,...,...,...,...,...
39094,0,1329,2.110351,Counter Strike Global Offensive,2.952793
41108,0,4,1.716380,Counter Strike Global Offensive,2.959100
38811,0,1141,4.235245,Counter Strike Global Offensive,2.961461
38387,2,345,3.270115,Rust,2.963229


In [80]:
def get_top_k(df, userid):
    ratings = df[df['userid']==userid]
    ratings.sort_values(by='pred_ratings', ascending=False)
    for num, idx in enumerate(ratings[:5].index):
        print(f"Recommendation {num+1}: {ratings.loc[idx]['name']}")

In [81]:
get_top_k(predicted_ratings, 1232)

Recommendation 1: PAYDAY 2
Recommendation 2: The Wolf Among Us
Recommendation 3: Tropico 4
Recommendation 4: LUFTRAUSERS
Recommendation 5: Psychonauts


In [124]:
predicted_ratings[predicted_ratings['userid']==1232][['userid', 'name', 'user_score']].sort_values(by='user_score',ascending=False)[:5]

Unnamed: 0,userid,name,user_score
41501,1232,BioShock Infinite,2.78702
36843,1232,Tropico 4,2.520993
40370,1232,XCOM Enemy Unknown,2.508929
39941,1232,Saints Row The Third,2.470551
39793,1232,Recettear An Item Shop s Tale,2.347346


In [135]:
get_top_k(predicted_ratings, 5644)

Recommendation 1: Overgrowth
Recommendation 2: Space Pirates and Zombies
Recommendation 3: Miasmata
Recommendation 4: Intrusion 2
Recommendation 5: Portal


In [136]:
predicted_ratings[predicted_ratings['userid']==5644][['userid', 'name', 'user_score']].sort_values(by='user_score',ascending=False)[:5]

Unnamed: 0,userid,name,user_score
39656,5644,Grand Theft Auto V,2.893939
40856,5644,Saints Row The Third,2.508929
40727,5644,Far Cry 3,2.508929
37094,5644,Space Pirates and Zombies,2.347346
40630,5644,Rebuild 3 Gangs of Deadsville,1.897744


In [82]:
def precision_at_k_hybrid(prediction_df, threshold=2.0):
    """Return precisionfor each user"""
    precisions = dict()
    n_rec=0
    n_rel_and_rec_5 = 0
    unique_users = list(prediction_df['userid'].unique())
    for user in unique_users:
        user_df_5 = prediction_df[prediction_df['userid']==user].sort_values(by='pred_ratings', ascending=False)[:5]
        
        # Number of relevant and recommended items in top k
        for idx in list(user_df_5.index):
            if (user_df_5.loc[idx]['pred_ratings']>=threshold) & (user_df_5.loc[idx]['user_score']>=threshold):
                n_rel_and_rec_5 += 1
            else:
                n_rel_and_rec_5 += 0
        
        # Number of recommended items:
            if user_df_5.loc[idx]['pred_ratings']>=threshold:
                n_rec += 1
            else:
                n_rec += 0
        precisions[user] = n_rel_and_rec_5/n_rec if n_rec!=0 else 0
    return precisions

In [83]:
precision_dictionary_hybrid = precision_at_k_hybrid(predicted_ratings)

In [84]:
hybrid_avg_precision = sum(precision_dictionary_hybrid.values())/len(precision_dictionary_hybrid.values())
hybrid_avg_precision

0.6081552237258504

In [85]:
precision_dictionary_hybrid[1232]

0.6470588235294118

In [137]:
precision_dictionary_hybrid[5644]

0.5824742268041238

## Conclusion

In [86]:
print(f'Collaborative Filtering MAE: {round(cf_error,2)}')
print(f'           Hybrid Model MAE: {round(hybrid_error,2)}')

Collaborative Filtering MAE: 0.55
           Hybrid Model MAE: 0.64


In [87]:
print(f'Collaborative Filtering Average Precision: {round(cf_avg_precision*100,2)}%')
print(f'           Hybrid Model Average Precision: {round(hybrid_avg_precision*100,2)}%')

Collaborative Filtering Average Precision: 37.49%
           Hybrid Model Average Precision: 60.82%


>**Model Comparison:**<br>

>- Collaborative filtering gives us an error of 0.55 from true user scores for games. Hybrid model is having slightly larger error of 0.65 from true user scores for games. (game score has a range from 0 to 5)<br>
>- However, if we take a look at the average precision, hybrid model has close to twice the precision as compared to collaborative filtering model. This proofs our point above that including more sources of recommendations and gather wisdoms of crowds by analyzing both user rating and game description, we are able to suggest games which are more similar to the game input by users. 
>-Therefore, **hybrid model** is a better recommender engine. 

>**Limitations and Future Work:**:

>- **Limitations:**
    1. The proposed hybrid model only has three features, user id, user ratings and game descriptions. A better performed model can be built using more features. 
    2. The proposed model works well on smaller datasets. However, when our utility matrix gets too large and too sparse, performance of recommender system might not be as well.  <br>
    
>- **Future Work:**
    1. This model could be improved by including more input layers consisting of other game features such as owner count, average playtime etc to make predictions and recommendations more accurate. 
    2. We can consider time element in our future versions of recommender engine. Recent reviews and ratings should carry a higher weight as compared to older reviews and ratings. This is to capture user's change of preference and make our predictions more accurate. 
    3. We want to find a way to speed up our recommender engine. The current recommender system could take couple of minutes before it can produce any recommendations. However, this is not acceptable in production environment, simply due to the fact that user would lost interest while waiting for our recommendations. 