## Loading data and Necessary libraries

**References:**
* Splitting into Train and Test https://towardsdatascience.com/if-you-cant-measure-it-you-can-t-improve-it-5c059014faad
* Official documentation http://surpriselib.com/
* Basic tutorial https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
* Predicting on Test https://surprise.readthedocs.io/en/stable/FAQ.html#raw-inner-note
* Saving some data for unbiased accuracy estimation https://surprise.readthedocs.io/en/stable/FAQ.html#how-to-save-some-data-for-unbiased-accuracy-estimation

In [101]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
#from sklearn.decomposition import NMF
import surprise
from surprise import NMF
#from sklearn.model_selection import GridSearchCV
#from surprise.model_selection import cross_validate
#from surprise.model_selection import train_test_split

In [15]:
# Read data
# The path is relative to the notebook's working directory; index_col=0 makes
# the first CSV column the DataFrame index instead of a regular column.
raw_data = pd.read_csv("../../../merged_data_edited.csv", index_col = 0)

In [16]:
# Column names of the raw frame (used to decide which columns to drop/keep)
raw_data.columns

Index(['Unnamed: 0.1', 'reviewID', 'overall', 'verified', 'reviewTime',
       'reviewerID', 'productID', 'reviewText', 'summary', 'vote', 'style',
       'category', 'title', 'brand', 'rank', 'main_cat', 'description',
       'also_buy', 'also_view', 'feature', 'numberOfReviews'],
      dtype='object')

In [17]:
# Remove the "Unnamed: 0.1" column; every other column is kept and
# raw_data itself is left unchanged.
data = raw_data.drop(columns=["Unnamed: 0.1"])

In [18]:
# Explore resulting df: preview the first rows after dropping the extra column
data.head()

Unnamed: 0,reviewID,overall,verified,reviewTime,reviewerID,productID,reviewText,summary,vote,style,category,title,brand,rank,main_cat,description,also_buy,also_view,feature,numberOfReviews
0,0,5,True,2015-10-17,A1HP7NVNPFMA4N,700026657,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
1,1,4,False,2015-07-27,A1JGAP0185YJI6,700026657,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
2,2,3,True,2015-02-23,A1YJWEXHQBWK2B,700026657,ok game.,Three Stars,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
3,3,2,True,2015-02-20,A2204E1TH211HT,700026657,"found the game a bit too complicated, not what...",Two Stars,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
4,4,5,True,2014-12-25,A2RF5B5H74JLPE,700026657,"great game, I love it and have played it since...",love this game,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13


In [19]:
# Shape as (number of reviews, number of columns)
data.shape

(473824, 20)

In [20]:
# Count rows that repeat an earlier (reviewer, product, review date) triple;
# a result of 0 means there are no duplicates.
data.duplicated(subset=["reviewerID", "productID", "reviewTime"]).sum()

0

In [21]:
# Possible values for ratings, with the number of reviews for each value.
# The observed values determine the rating_scale passed to surprise.Reader below.
data["overall"].value_counts()

5    286845
4     88462
3     46548
1     29137
2     22832
Name: overall, dtype: int64

## Splitting into Train and Test

In [75]:
# Split data into Train and Test
# The rows are shuffled with a fixed seed so the split is reproducible, then
# the first `train_ratio` share goes to Train and the remainder to Test.
train_ratio = 0.7
shuffled_data = shuffle(data, random_state = 232323)
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# int/round pair yields the same cut point.
split_cut = int(round(len(shuffled_data) * train_ratio))
train_df = shuffled_data.iloc[:split_cut]
test_df = shuffled_data.iloc[split_cut:]
# Keep only Test rows whose reviewer AND product both occur in Train, so the
# model is never asked to predict for a user or item it has not seen.
known_reviewer = test_df['reviewerID'].isin(train_df['reviewerID'])
known_product = test_df['productID'].isin(train_df['productID'])
test_df = test_df[known_reviewer & known_product]


In [76]:
# Unique users in full dataset
print(data["reviewerID"].nunique(dropna=False))
# Unique products in full dataset
print(data["productID"].nunique(dropna=False))

55223
17408


In [77]:
# Unique users in Train dataset
print(train_df["reviewerID"].nunique(dropna=False))
# Unique products in Train dataset
print(train_df["productID"].nunique(dropna=False))

55133
17393


In [78]:
# Unique users in Test dataset
print(test_df["reviewerID"].nunique(dropna=False))
# Unique products in Test dataset
print(test_df["productID"].nunique(dropna=False))

49197
16503


In [91]:
# Number of rows in Test (after removing rows with users/products unseen in Train)
len(test_df)

141704

In [90]:
# All reviewers contained in Train
# Counts the Test rows whose reviewer also occurs in Train; this should equal
# len(test_df). The check uses train_df directly because `train_users` is not
# defined anywhere in this notebook.
test_df["reviewerID"].isin(train_df["reviewerID"]).sum()

141704

In [93]:
# All products contained in Train
# Counts the Test rows whose product also occurs in Train; this should equal
# len(test_df). The check uses train_df directly because `train_items` is not
# defined anywhere in this notebook.
test_df["productID"].isin(train_df["productID"]).sum()

141704

## Loading data into Surprise

In [94]:
# Instantiate reader object, to parse ratings
# rating_scale=(1, 5) matches the values observed in the `overall` column.
reader = surprise.Reader(rating_scale=(1, 5))

In [100]:
# Wrap the Train ratings in a Surprise dataset. Only the three columns below
# are passed, in (user, item, rating) order, and parsed with `reader`.
rating_columns = ['reviewerID', 'productID', 'overall']
surprise_data_train = surprise.Dataset.load_from_df(train_df[rating_columns], reader)

## Training the model

In [108]:
# Hyper-parameter grid for the NMF search: every n_factors value is combined
# with every n_epochs value.
param_grid = dict(
    n_factors=[5, 10, 15, 20, 25, 30, 40, 45, 50],
    n_epochs=[25, 50, 75],
)

In [109]:
# Instantiate grid search CV
# Evaluates every combination in `param_grid` for NMF with 5-fold
# cross-validation, scored by RMSE, using 7 parallel jobs.
# Surprise's GridSearchCV has no `verbose` argument; progress messages of the
# underlying joblib workers are controlled through `joblib_verbose`.
grid_search = surprise.model_selection.GridSearchCV(algo_class = NMF,
                                                    param_grid = param_grid,
                                                    measures=['rmse'],
                                                    cv = 5,
                                                    n_jobs = 7,
                                                    joblib_verbose = 5)

In [110]:
# Look for best parameters on train data
# Runs the cross-validated search configured above; only Train ratings are
# used, so the Test set stays untouched for the final evaluation.
grid_search.fit(surprise_data_train)

In [186]:
# best RMSE score (cross-validated RMSE of the best parameter combination)
print(grid_search.best_score['rmse'])

In [112]:
# combination of parameters that gave the best RMSE score
# (keyed by measure name, because the search was run with measures=['rmse'])
print(grid_search.best_params['rmse'])

{'n_factors': 40, 'n_epochs': 50}


In [113]:
# Get model with best performance
# NOTE(review): `refit` is not passed to GridSearchCV above, so this instance
# presumably only carries the best parameters and still needs fitting, which
# the next cell does -- confirm against the Surprise documentation.
best_NMF = grid_search.best_estimator['rmse']

In [114]:
# Fit best NMF on Train
# build_full_trainset() turns all Train ratings into a single trainset (no
# held-out fold), so the final model is fitted on the whole Train split.
best_NMF.fit(surprise_data_train.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1a7ee73710>

In [122]:
# RMSE on the Train set itself.
# `trainset` was not defined anywhere in this notebook, so it is rebuilt here
# from the Surprise training data. build_testset() converts the trainset's
# ratings into a list that test() can score.
# This is an in-sample error: the model is scored on the ratings it was fitted
# on, so it is expected to look better than the Test RMSE.
trainset = surprise_data_train.build_full_trainset()
predictions = best_NMF.test(trainset.build_testset())
surprise.accuracy.rmse(predictions)

RMSE: 0.4733


0.4732875436103701

In [141]:
# Inspect a single prediction from the Train-set predictions
predictions[0]

Prediction(uid='A1G0WXV8BJWMO6', iid='B00EZPCX1A', r_ui=5.0, est=5, details={'was_impossible': False})

In [140]:
# Check the type of the objects returned by test()
type(predictions[0])

surprise.prediction_algorithms.predictions.Prediction

## Predicting on Test Set

In [169]:
# Empty list that will hold the Test set as (reviewerID, productID, overall) tuples
test_list = []

In [182]:
# Build the Test set as a list of (reviewerID, productID, overall) tuples.
# The list is created in a single statement instead of appending inside an
# iterrows() loop, so re-running this cell cannot duplicate the test rows.
# itertuples(index=False, name=None) yields plain tuples in column order.
test_list = list(
    test_df[["reviewerID", "productID", "overall"]].itertuples(index=False, name=None)
)

In [184]:
# Predict using best model found
# test_list holds (reviewerID, productID, overall) tuples built from test_df.
test_predictions = best_NMF.test(test_list)

In [185]:
# Compute Test RMSE
# NOTE(review): the Test RMSE (about 1.12) is far above the RMSE measured on
# the Train ratings (about 0.47), which suggests the model overfits -- worth
# stating in the conclusions.
surprise.accuracy.rmse(test_predictions)

RMSE: 1.1225


1.1225321863396875