In [3]:
import numpy as np
import pandas as pd
from cmfrec import CMF, ContentBased
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from collections import defaultdict



Load data

In [4]:
def reduce_memory(df):
    for col in df.columns:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
        return df
games_data = reduce_memory(pd.read_csv('../data/games.csv'))
users_data = reduce_memory(pd.read_csv('../reduced_data/users.csv'))
rec_data = reduce_memory(pd.read_csv('../reduced_data/recommendations.csv'))
gamesmeta_data = pd.read_json('../data/games_metadata.json',lines = True)

Pre-process data

In [5]:
MFrec_data = rec_data[["user_id","app_id","is_recommended"]]
MFrec_data.rename(columns={'user_id': 'UserId', "app_id": "ItemId", "is_recommended":"Rating"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MFrec_data.rename(columns={'user_id': 'UserId', "app_id": "ItemId", "is_recommended":"Rating"}, inplace=True)


In [6]:
users_unique = MFrec_data["UserId"].unique()
games_unique = MFrec_data["ItemId"].unique()
users_train, users_test = train_test_split(users_unique, test_size= 0.05, random_state= 123)
games_train, games_test = train_test_split(games_unique, test_size= 0.1, random_state= 40)
rec_train, rec_test = train_test_split(MFrec_data.loc[MFrec_data["UserId"].isin(users_train)
                                                     & MFrec_data["ItemId"].isin(games_train)],
                                                     test_size= 0.1, random_state= 1)

users_train,games_train = rec_train["UserId"].unique(), rec_train["ItemId"].unique()

rec_test1 =rec_test.loc[rec_test["UserId"].isin(users_train)&
                                          rec_test["ItemId"].isin(games_train)]
rec_test2 = MFrec_data.loc[MFrec_data["UserId"].isin(users_train)&
                                          ~MFrec_data["ItemId"].isin(games_train)]
rec_test3 = MFrec_data.loc[~MFrec_data["UserId"].isin(users_train)&
                                          MFrec_data["ItemId"].isin(games_train)]
rec_test4 = MFrec_data.loc[~MFrec_data["UserId"].isin(users_train)&
                                          ~MFrec_data["ItemId"].isin(games_train)]
print(rec_train.shape)
print(rec_test1.shape)
print(rec_test2.shape)
print(rec_test3.shape)
print(rec_test4.shape)

(3161313, 3)
(276576, 3)
(312852, 3)
(257321, 3)
(101052, 3)


In [7]:
sentiment_mapping = {
    'Overwhelmingly Positive':8,
    'Very Positive':7,
    'Positive': 6,
    'Mostly Positive': 5,
    'Mixed': 4,
    'Mostly Negative': 3,
    'Negative': 2,
    'Very Negative': 1,
    'Overwhelmingly Negative': 0, 
}
games_data['rating'].replace(sentiment_mapping,inplace=True)
MFgames_data  = games_data.drop(['title','win','mac','linux','price_original','steam_deck','date_release'], axis = 1).rename(columns={"app_id" :"ItemId"})
MFgames_data.iloc[:,1:] = (MFgames_data.iloc[:,1:]-MFgames_data.iloc[:,1:].mean()) / MFgames_data.iloc[:,1:].std()

MFusers_data = users_data.rename(columns={"user_id" : "UserId"})
MFusers_data.iloc[:,1:] = (MFusers_data.iloc[:,1:] - MFusers_data.iloc[:,1:].mean()) / MFusers_data.iloc[:,1:].std()

# Model

Normal Matrix Factoriztion

In [9]:
model = CMF(method = "als", k = 40, lambda_ = 1e+1)
model.fit(rec_train)

Collective matrix factorization model
(explicit-feedback variant)


Matrix Factorization with side info

In [6]:
model_with_sideinfo = CMF(method = "als", lambda_ = 1e+1, w_main = 0.5, w_item= 0.25, w_user= 0.25)
model_with_sideinfo.fit(X = MFrec_data, U = MFusers_data, I = MFgames_data)

# Predict one user

View info the user

In [8]:
title = defaultdict(lambda: 0)
reviews = defaultdict(lambda : 0)
positive_ratio = defaultdict(lambda : 0)
for i in games_data.itertuples():
    title[i.app_id] = i.title
    reviews[i.app_id] = i.user_reviews
    positive_ratio[i.app_id] = i.positive_ratio
   

In [9]:
userId = 10
users_data[lambda x: x["user_id"] == userId]
rec_data[lambda x: x["user_id"] == userId].assign(Movie=lambda x: x["app_id"].map(title),
                                                  Number_of_reviews = lambda x: x["app_id"].map(reviews),
                                                  positive_ratio=lambda x: x["app_id"].map(positive_ratio))

Unnamed: 0,user_id,products,reviews
5310600,10,387,2


Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,Movie,Number_of_reviews,positive_ratio
305473,394360,0,0,2021-11-26,True,107.0,10,305473,Hearts of Iron IV,176243,92
9771144,469600,0,0,2021-10-13,True,26.2,10,9771144,Legion TD 2 - Multiplayer Tower Defense,8426,86


Examine top recommended lists

In [10]:
exclude = MFrec_data["ItemId"].loc[MFrec_data['UserId'] == userId]

rec_list_m = model.topN(user =userId, n= 5, exclude =exclude)
rec_list_msideinfo =model_with_sideinfo.topN(user =userId, n=5, exclude = exclude)

In [11]:
avg_games_rating = defaultdict(lambda: 0)
for i in MFrec_data.groupby("ItemId")["Rating"].mean().to_frame().itertuples():
    avg_games_rating[i.Index] = i.Rating
def printRec(reclist):
    list_w_info = [str(m + 1) + ") - " + title[reclist[m]] +\
        " - Average Rating: " + str(np.round(avg_games_rating[reclist[m]], 2))+\
        " - Number of ratings: " + str(reviews[reclist[m]])\
                   for m in range(len(reclist))]
    print("\n".join(list_w_info))
print("Recommended from ratings-only model")
printRec(rec_list_m)
print("----------------")
print("Recommended from hybrid model")
printRec(rec_list_msideinfo)

Recommended from ratings-only model
1) - Ready or Not - Average Rating: 0.91 - Number of ratings: 95170
2) - METAL GEAR SOLID V: THE PHANTOM PAIN - Average Rating: 0.93 - Number of ratings: 57296
3) - Fallout: New Vegas - Average Rating: 0.95 - Number of ratings: 147417
4) - DOOM - Average Rating: 0.96 - Number of ratings: 121343
5) - Pavlov VR - Average Rating: 0.94 - Number of ratings: 34735
----------------
Recommended from hybrid model
1) - Among Us - Average Rating: 0.88 - Number of ratings: 587821
2) - Warframe - Average Rating: 0.88 - Number of ratings: 542198
3) - Grand Theft Auto IV: The Complete Edition - Average Rating: 0.79 - Number of ratings: 114405
4) - DARK SOULS™ III - Average Rating: 0.93 - Number of ratings: 230407
5) - NieR:Automata™ - Average Rating: 0.86 - Number of ratings: 86896


# Evaluate model

Train-test split

In [12]:
users_unique = MFrec_data["UserId"].unique()
games_unique = MFrec_data["ItemId"].unique()
users_train, users_test = train_test_split(users_unique, test_size= 0.2, random_state= 123)
games_train, games_test = train_test_split(games_unique, test_size= 0.2, random_state= 42)
rec_train, rec_test = train_test_split(MFrec_data.loc[MFrec_data["UserId"].isin(users_train)
                                                     & MFrec_data["ItemId"].isin(games_train)],
                                                     test_size= 0.2, random_state= 1)

users_train,games_train = rec_train["UserId"].unique(), rec_train["ItemId"].unique()

rec_test1 =rec_test.loc[rec_test["UserId"].isin(users_train)&
                                          rec_test["ItemId"].isin(games_train)]
rec_test2 = MFrec_data.loc[MFrec_data["UserId"].isin(users_train)&
                                          ~MFrec_data["ItemId"].isin(games_train)]
rec_test3 = MFrec_data.loc[~MFrec_data["UserId"].isin(users_train)&
                                          MFrec_data["ItemId"].isin(games_train)]
rec_test4 = MFrec_data.loc[~MFrec_data["UserId"].isin(users_train)&
                                          ~MFrec_data["ItemId"].isin(games_train)]

print(rec_test1.shape)
print(rec_test2.shape)
print(rec_test3.shape)
print(rec_test4.shape)

(4046993, 3)
(4787903, 3)
(7942295, 3)
(3069932, 3)


In [13]:
users_data_train = MFusers_data.loc[lambda x: x["UserId"].isin(users_train)]
games_data_train = MFgames_data.loc[lambda x: x["ItemId"].isin(games_train)]

m_classic = CMF(k=40).fit(rec_train)
m_collective = CMF(k=40, w_main=0.5, w_user=0.5, w_item=0.5).fit(X=rec_train,U=users_data_train,I=games_data_train)

In [19]:
pred_ratings_only = m_classic.predict(rec_test1["UserId"], rec_test1["ItemId"])
print("RMSE type 1 ratings-only model: %.3f [rho: %.3f]" %(np.sqrt(mean_squared_error(rec_test1["Rating"], pred_ratings_only, squared=True)),np.corrcoef(rec_test1["Rating"], pred_ratings_only)[0,1]))

RMSE type 1 ratings-only model: 0.338 [rho: 0.349]




In [20]:

pred_hybrid = m_collective.predict(rec_test1["UserId"], rec_test1["ItemId"])
print("RMSE type 1 hybrid model: %.3f [rho: %.3f]" %(np.sqrt(mean_squared_error(rec_test1["Rating"],pred_hybrid,squared=True)),np.corrcoef(rec_test1["Rating"], pred_hybrid)[0,1]))

RMSE type 1 hybrid model: 0.696 [rho: 0.019]


