### Author
Group C: Fidelis Mnyanyi, Lucas Holstein, Michael Wagdy, Ricardo Santos, Siddhi Dhavale, Tomas Susedik

### Last revision
03/November/2020

### Group Assignment
Recommendation Engines

---

## 1. Setting up the environment
### 1.1 Packages

In [1]:
import pandas as pd
import numpy as np
import json
import datetime
from sklearn.metrics.pairwise import linear_kernel
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import KNNBaseline
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import accuracy
import random


### 1.2 Reading the database file

In [2]:
with open('reviews_Musical_Instruments_5.json') as handle:
    json_data = [json.loads(line) for line in handle]

In [3]:
df = pd.DataFrame(json_data, columns = ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime'])

In [4]:
df.shape

(10261, 9)

### 1.3 First look at the database

In [5]:
df.sample(1)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
4012,A1HZRYGGNMOWRQ,B0009G1E0K,Joseph F. Bierley,"[8, 10]","Very nice quality, inexpensive guitar strap. A...",5.0,Protec Guitar Strap,1313020800,"08 11, 2011"


Let's quickly visualize the dataset to ensure that everything looks good

In [6]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


## 2. Understanding database properties
### 2.1 Shape

In [7]:
df.shape

(10261, 9)

In [8]:
print('Dataset shape: {}'.format(df.shape))
print('-Dataset examples-')
print(df.iloc[::20000, :])

Dataset shape: (10261, 9)
-Dataset examples-
       reviewerID        asin  \
0  A2IBPI20UZIR0U  1384719342   

                                       reviewerName helpful  \
0  cassandra tu "Yeah, well, that's just like, u...  [0, 0]   

                                          reviewText  overall summary  \
0  Not much to write about here, but it does exac...      5.0    good   

   unixReviewTime   reviewTime  
0      1393545600  02 28, 2014  


### 2.2 Dates

In [9]:
del(df['reviewTime'])
df.rename(columns = {'asin':'productID', 'overall':'rating', 'unixReviewTime':'reviewDate'}, inplace = True)
df['reviewDate'] = pd.to_datetime(df['reviewDate'], unit = 's')
df.sample(1)

Unnamed: 0,reviewerID,productID,reviewerName,helpful,reviewText,rating,summary,reviewDate
735,A3K6M4WXNTYQEM,B0002D0B4K,Keane O'Kelley,"[0, 0]",I'm using them to angle my speakers up in addi...,3.0,They work,2013-08-07


### 2.3 Null values

In [10]:
df.isnull().sum()

reviewerID       0
productID        0
reviewerName    27
helpful          0
reviewText       0
rating           0
summary          0
reviewDate       0
dtype: int64

In [11]:
df = df.dropna()
df.shape

(10234, 8)

### 2.4 Distinct values

In [12]:
df.reviewerID.value_counts()

ADH0O8UVJOT10     42
A1L7M2JXN4EZCR    38
A15TYOEWBQYF0X    38
A2EZWZ8MBEDOLN    36
A2NYK9KWFMJV4Y    34
                  ..
A3BMYEA3J6RBVV     4
A3CVZRPQH1PFQ5     4
A2T8JRVJRVNX8R     4
AKP6IWJ24C1A5      3
AA5TINW2RJ195      2
Name: reviewerID, Length: 1428, dtype: int64

In [13]:
df.productID.value_counts()

B003VWJ2K8    163
B0002E1G5C    143
B0002F7K7Y    116
B003VWKPHC    114
B0002H0A3S     93
             ... 
B0018PZR86      5
B000SZVYLQ      4
B0002CZVB4      4
B000CBE3GE      4
B003CLIPHO      4
Name: productID, Length: 900, dtype: int64

### 2.5 Adjust 'helpful' column

In [14]:
helpfulness = pd.DataFrame(df['helpful'])
helpfulness[['num_helpful','num_reviews']] = pd.DataFrame(df.helpful.tolist(), index= helpfulness.index)
helpfulness['helpful_ratio'] = helpfulness['num_helpful']/helpfulness['num_reviews']
helpfulness['helpful_ratio'] = helpfulness['helpful_ratio'].fillna(0)
del(helpfulness['helpful'])
df = pd.concat([df, helpfulness], axis = 1, sort = False)
del(df['helpful'])
df.sample(1)

Unnamed: 0,reviewerID,productID,reviewerName,reviewText,rating,summary,reviewDate,num_helpful,num_reviews,helpful_ratio
3690,A2WYAHJGST6AOT,B0006NDF8A,Matt,"This is a great stand, it looks the same as an...",5.0,Way better than the musicians gear,2014-02-23,0,0,0.0


### 2.6 Distribution of ratings

In [15]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# Count the number of times each rating appears in the dataset
data = df['rating'].value_counts().sort_index(ascending=False)

# Create the histogram
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values)

# Create layout
layout = dict(title = 'Distribution Of {} musical instruments ratings'.format(df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [16]:
# Number of ratings per food 
data = df.groupby('productID')['rating'].count()

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per instrument',
                   xaxis = dict(title = 'Number of Ratings Per instrument'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

### 2.7 Number of ratings per reviewer

In [17]:
# Number of ratings per user
data = df.groupby('reviewerID')['rating'].count()

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0, size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Reviewer',
                   xaxis = dict(title = 'Ratings Per Reviewer'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## 3. Collaborative Filtering Recommender System

### 3.1 Read data with Surprise

Convert datasets to the format required by the Surprise library

In [18]:
from surprise import Dataset
from surprise import Reader

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['reviewerID', 'productID', 'rating']], reader = reader)

### 3.2 Split data between train and test for unbiased evaluation

In [19]:
from sklearn.model_selection import train_test_split

#trainset & testset will be used whenever no surprise algorithm is being used (content-based)
trainset, testset = train_test_split(df, test_size=0.2,stratify=df['reviewerID'], random_state=42)

#those 2 sets will be used for collaborative filtering
data_train = Dataset.load_from_df(trainset[['reviewerID', 'productID', 'rating']], reader = reader)
data_test = Dataset.load_from_df(testset[['reviewerID', 'productID', 'rating']], reader = reader)

### 3.3 Benchmarking

We will try different algorithm with defefault parameters to get the base

In [20]:
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import NormalPredictor


my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), BaselineOnly(), CoClustering()]:
    
    print("Testing: {}".format(algorithm))
    # Perform cross validation
    results = cross_validate(algorithm, data_train, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')   

Testing: <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001AD3C61DE88>
Testing: <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x000001AD3C622088>
Testing: <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001AD3C6220C8>
Testing: <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001AD3C622108>
Testing: <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001AD3C622148>
Testing: <surprise.prediction_algorithms.knns.KNNBaseline object at 0x000001AD3C386888>
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Testing: <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000001AD3C622188>
Estimating biases using

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.879134,0.007635,0.010316
SVD,0.881526,0.250555,0.01497
SVDpp,0.881913,0.73389,0.047872
KNNBaseline,0.951915,0.055066,0.050047
CoClustering,1.050085,0.290679,0.012499
SlopeOne,1.054882,0.023946,0.014294
NormalPredictor,1.137396,0.005606,0.014464
NMF,1.177164,0.374654,0.012946


Now that we have defined the algorithm configuration, we can actually test it. Again, Surprise provides some handy functions for it, in particular it includes a function to evaluate the algorithm via Cross Validation

Looking at the results we see that SVD++ (0.877386), BaselineOnly (0.875190) and SVD (0.877386) provide the best results.

### 3.4 Hyperparameters Tuning
#### 3.4.1 KNN Baseline

In [21]:
from surprise import KNNBaseline
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import KFold


my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5, 7, 10],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNBaseline, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data_train)
print('Grid Search...')
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

#### 3.4.2 SVDpp

Default parameters: <br>
n_factors – The number of factors. Default is 20.<br>
n_epochs – The number of iteration of the SGD procedure. Default is 20.<br>
lr_all – The learning rate for all parameters. Default is 0.007. <br>
reg_all – The regularization term for all parameters. Default is 0.02. <br>

In [22]:
from __future__ import (absolute_import, division, print_function, unicode_literals)   

my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

sim_options = {
    'n_factors':[5,10,20,30],
    "n_epochs": [5, 10, 20, 30], 
    "lr_all": [.0025, .007, .001, .01],
    'reg_all':[.0025, .005, .002,0.2]
}

gs = GridSearchCV(SVDpp, sim_options, measures = ["rmse"], cv = 3)
gs.fit(data_train)
print('Grid Search...')
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Grid Search...
0.8742613741603303
{'n_factors': 5, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.2}


#### 3.4.3 SVD

In [23]:
from __future__ import (absolute_import, division, print_function,         
                        unicode_literals)   

my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

sim_options = {
    'n_factors':[5,10,25,50],
    'n_epochs':[5,10,20],  
    'lr_all':[.0025, .005, .001, .01, 0.1],
    'reg_all':[.0025, .005, .001,0.1]}

gs_svd = GridSearchCV(SVD, sim_options, measures = ["rmse"], cv = 3)
gs_svd.fit(data_train)
print('Grid Search...')
print(gs_svd.best_score["rmse"])
print(gs_svd.best_params["rmse"])

Grid Search...
0.8745822973661915
{'n_factors': 10, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}


### 3.5 Final model with SVD

{'n_factors': 10, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}

In [24]:
algo = gs_svd.best_estimator['rmse']

In [25]:
# retrain on the whole Train set

#trainset = data.build_full_trainset()
trainset2 = data_train.build_full_trainset()
algo.fit(trainset2)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ad3e154e88>

In [26]:
# Compute biased accuracy on Train Set

#predictions = algo.test(trainset.build_testset())
predictions = algo.test(trainset2.build_testset())
print('Biased accuracy on Trainset:', end='   ')
accuracy.rmse(predictions)

Biased accuracy on Trainset:   RMSE: 0.7488


0.7487811564450433

In [27]:
# Compute unbiased accuracy on Test Dataset

#testset = data.construct_testset(B_raw_ratings)  # testset 
testset2 = data_test.build_full_trainset() #test dataset
predictions = algo.test(testset2.build_testset())
print('Unbiased accuracy on Testset,', end=' ')
accuracy.rmse(predictions)

Unbiased accuracy on Testset, RMSE: 0.8566


0.8566129895603092

We confirm that SVD provides the best results. Now that we have defined the algorithm configuration, we can actually test it. 

Since we selected a matrix factorization based algorithm as our best model, we can analyze the resulting matrices to understand what the algorithm is doing. We can analize Item and User Bias plus Factors.

### Analize Item Bias 

In [28]:
#item_bias = [(b, data.build_full_trainset().to_raw_iid(i)) for i, b in enumerate(algo.bi)]

item_bias = [(b, data.build_full_trainset().to_raw_iid(i)) for i, b in enumerate(algo.bi)]
print("Worst products ever:")
sorted(item_bias, key=lambda x: x[0])[:20]

Worst products ever:


[(-0.8831782020522051, 'B0002GZ052'),
 (-0.8169822507219041, 'B001RNHE30'),
 (-0.7655562631715239, 'B000LFCXL8'),
 (-0.6714576013638588, 'B001J1SX94'),
 (-0.6414192498289552, 'B0002DVBJY'),
 (-0.6258065375473804, 'B0002E1O3G'),
 (-0.5952796804399808, 'B001FQ74FW'),
 (-0.5792608324941906, 'B002GYWBIM'),
 (-0.5710906068654249, 'B0037M62JQ'),
 (-0.5624897530710264, 'B000EEJF4O'),
 (-0.5522412040439305, 'B000RKAFIU'),
 (-0.551160837674994, 'B004TE5HBU'),
 (-0.5445012928965443, 'B001D2TPZU'),
 (-0.5405004900835743, 'B0002II6V0'),
 (-0.5372842608675934, 'B0002E2XCW'),
 (-0.5362769576394226, 'B0002H03YY'),
 (-0.5313452612903876, 'B005ZV5K0O'),
 (-0.525701507304398, 'B000Y7Q2C4'),
 (-0.523880041077063, 'B000AC6DVS'),
 (-0.5153345085539228, 'B001RNOHHG')]

In [29]:
print("Best products ever:")
sorted(item_bias, key=lambda x: -x[0])[:20]

Best products ever:


[(0.4083371316743087, 'B0002E38AS'),
 (0.36665998486348345, 'B0002E1NNC'),
 (0.34579785146845915, 'B0002GXV3A'),
 (0.32382971441694136, 'B0002D02IU'),
 (0.3186390515312655, 'B0002H0H4A'),
 (0.2987696065317783, 'B000V8GA46'),
 (0.28996666284151307, 'B0002PBS6S'),
 (0.2899025142805928, 'B000068NSX'),
 (0.2894426211539652, 'B000LQLDM2'),
 (0.28738331783170984, 'B000L3FPUG'),
 (0.2820492497726461, 'B0002D0CIK'),
 (0.28075709624424344, 'B0002D0N70'),
 (0.2781358924023377, 'B001R2LQWQ'),
 (0.27492319521430925, 'B001GD07SK'),
 (0.2737293205605305, 'B0053CUHMG'),
 (0.27005683999569846, 'B000165DSM'),
 (0.2661709494015139, 'B000EEJJI6'),
 (0.2649030425500627, 'B0002E1NNM'),
 (0.2619507422418738, 'B000068O3X'),
 (0.25537457900831273, 'B0002F7K7Y')]

The following code prints the user with the most negative bias. Basically a user always giving bad ratings

In [30]:
user_bias = [(b, data.build_full_trainset().to_raw_uid(i)) for i, b in enumerate(algo.bu)]
print("Users giving most bad ratings:")
sorted(user_bias, key=lambda x: x[0])[:10]

Users giving most bad ratings:


[(-0.9900127429470958, 'AXJ19189TLBLJ'),
 (-0.8696991670433627, 'A1LJXZC1RFW07C'),
 (-0.8075545185550467, 'A14Z9LAETO21KL'),
 (-0.7909003520295033, 'A2CARFAX5FNQT9'),
 (-0.7539363406018373, 'AEMC16IPBZORW'),
 (-0.7467845323917176, 'ADH957M2NJJ8Z'),
 (-0.6999649702013144, 'A3NRNJN4GAQ2V6'),
 (-0.6493623136092694, 'A2XVE2J5WGCSGX'),
 (-0.6354343083381413, 'A3KOWHTH4J4ZIK'),
 (-0.6318784396492946, 'A1786LAQ18LHDC')]

In [31]:
df[df.reviewerID == 'AKGWRO6IC0VIE']

Unnamed: 0,reviewerID,productID,reviewerName,reviewText,rating,summary,reviewDate,num_helpful,num_reviews,helpful_ratio
1935,AKGWRO6IC0VIE,B0002E5518,Vivid,"Haven't had this long, but so far, it's just a...",5.0,Rock Solid - Just about perfect,2012-10-12,0,0,0.0
2511,AKGWRO6IC0VIE,B0002GWFEQ,Vivid,"Simple enough, right? Works as advertised. A g...",4.0,"Works well, a couple caveats",2011-05-09,11,13,0.846154
3538,AKGWRO6IC0VIE,B0002ZO3LK,Vivid,"OK, it was cheap (nine bucks at the time), the...",1.0,"Good Idea, Really Bad Execution - better unit ...",2012-10-05,3,3,1.0
4224,AKGWRO6IC0VIE,B000B6DHB2,Vivid,I wanted to like this. I really did. The idea ...,1.0,Bummer,2014-04-09,0,0,0.0
4708,AKGWRO6IC0VIE,B000EELFTW,Vivid,In case you're wondering about the difference ...,3.0,"Does the job well for cheap, but requires a bi...",2010-05-27,41,44,0.931818
9449,AKGWRO6IC0VIE,B005VM5Z5C,Vivid,"Hi Folks! OK, this thing is cheap. That's cool...",1.0,Ignore the tempting price - TOO SMALL for any ...,2014-06-19,0,0,0.0
10027,AKGWRO6IC0VIE,B00B1N06PO,Vivid,Soooo bummed. I simply love the sound on these...,2.0,Great Sound - Terrible Comfort,2014-06-19,0,0,0.0


The following code prints the user with the most positive bias. Basically a user mostly always giving great reviews

In [32]:
print("User with most great ratings:")
sorted(user_bias, key=lambda x: x[0])[-1]

User with most great ratings:


(0.37987235968910094, 'ASP978VO96X7C')

In [33]:
df[df.reviewerID == 'A2F2L5F9OWWVY6']

Unnamed: 0,reviewerID,productID,reviewerName,reviewText,rating,summary,reviewDate,num_helpful,num_reviews,helpful_ratio
124,A2F2L5F9OWWVY6,B000068O3X,AT,"If you need some 6"" cables to connect your ped...",5.0,Great pedal cable connectors!,2013-02-10,0,0,0.0
756,A2F2L5F9OWWVY6,B0002D0CAI,AT,These are the best acoustic money can buy in m...,5.0,"Two words ""THE BEST""",2013-02-11,0,0,0.0
1015,A2F2L5F9OWWVY6,B0002D0E8S,AT,I needed a new strap but I didn't want to pay ...,5.0,Great!,2013-02-10,1,1,1.0
1150,A2F2L5F9OWWVY6,B0002DURNK,AT,I have tried nearly every kind of popular stri...,5.0,Fantastic,2013-04-01,0,0,0.0
1409,A2F2L5F9OWWVY6,B0002E1J5E,AT,I'm not a metal player in any means but someti...,5.0,Great for metal,2013-04-03,0,0,0.0
1789,A2F2L5F9OWWVY6,B0002E3B78,AT,This acoustic pickup is the most amazing thing...,5.0,Just awesome,2013-02-10,0,0,0.0
2454,A2F2L5F9OWWVY6,B0002GTZR6,AT,This is a heavy duty strap. I use these on my ...,5.0,The best,2013-04-01,0,0,0.0
2595,A2F2L5F9OWWVY6,B0002GXV2Q,AT,These cables are awesome. They do what they ar...,5.0,Sweet!,2013-01-13,0,0,0.0
4608,A2F2L5F9OWWVY6,B000EEK4VM,AT,I hate winding strings. I got this to help wit...,5.0,Very good,2013-04-03,0,0,0.0
4900,A2F2L5F9OWWVY6,B000KIPTE4,AT,I had always wanted a phase shifter but I neve...,5.0,Worth every penny,2013-04-03,0,0,0.0


## 4. Content-based Recommender System
### 4.1 Reset index in order to match with TF-IDF matrix

In [34]:
trainset.reset_index(drop = True, inplace = True)

In [35]:
testset.reset_index(drop = True, inplace = True)

### 4.2 Create TF-IDF vectorizer

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(encoding='utf-8', decode_error='strict', strip_accents = None, lowercase = True,
                             preprocessor = None, tokenizer = None, analyzer = 'word', stop_words = 'english',
                             ngram_range = (1, 3), max_df = 1.0, min_df = 0, max_features = None,
                             vocabulary = None, binary = False, norm = 'l2', use_idf = True,
                             smooth_idf = True, sublinear_tf = False)
tfidf_matrix = vectorizer.fit_transform(trainset.reviewText)

In [37]:
print(tfidf_matrix)

  (0, 99736)	0.12919394517319274
  (0, 405220)	0.12919394517319274
  (0, 397200)	0.12919394517319274
  (0, 203789)	0.12919394517319274
  (0, 122198)	0.12919394517319274
  (0, 496526)	0.12919394517319274
  (0, 244195)	0.11958272383911703
  (0, 14761)	0.12919394517319274
  (0, 373157)	0.12919394517319274
  (0, 530413)	0.12919394517319274
  (0, 455122)	0.12919394517319274
  (0, 168785)	0.12919394517319274
  (0, 544653)	0.12919394517319274
  (0, 455170)	0.12919394517319274
  (0, 223952)	0.12919394517319274
  (0, 143761)	0.12919394517319274
  (0, 14340)	0.12919394517319274
  (0, 144046)	0.12919394517319274
  (0, 117790)	0.12919394517319274
  (0, 122065)	0.12919394517319274
  (0, 531162)	0.12919394517319274
  (0, 346241)	0.12919394517319274
  (0, 455531)	0.11958272383911703
  (0, 201541)	0.12919394517319274
  (0, 47391)	0.12919394517319274
  :	:
  (8186, 144898)	0.19022809652873976
  (8186, 212283)	0.19022809652873976
  (8186, 32490)	0.19022809652873976
  (8186, 240179)	0.19022809652873976
 

### 4.3 Cosine similarity

In [38]:
# Compute cosine similarity
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Iterate over the items in the dataset to find the most similar ones to each one
results = {}

for idx, row in trainset.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1] 
    similar_items = [(cosine_similarities[idx][i], trainset['productID'][i]) for i in similar_indices] 
    results[row['productID']] = similar_items[1:]

In [39]:
def item(id):
        return trainset.loc[trainset.productID == id]['productID'].tolist()[0].split(' - ')[0]

recommended_items = []

def recommender(item_id):
        scores = results[item_id]
        for x in scores:
                recommended_items.append((item(item_id),item(x[1]),x[0]))
        return recommended_items

### 4.4 Check model performance

In [40]:
#for loop storing all products and its recommendations
products = trainset['productID'].unique()

for i in products:
    recommender(item_id = i)

In [41]:
#converting the list into an Dataframe, so we can use the pd.merge function later

recommended_items_1 = pd.DataFrame(recommended_items, columns = ['productID','recommendations','score'])
recommended_items_1.head()

Unnamed: 0,productID,recommendations,score
0,B0002IHFVM,B0002E37MM,0.040283
1,B0002IHFVM,B0002E3DNK,0.035746
2,B0002IHFVM,B005AGGZ8K,0.03522
3,B0002IHFVM,B0002M3OVI,0.034722
4,B0002IHFVM,B0002GWFEQ,0.033161


In [42]:
# aggregating it per product & recommendation to get the max score (following the professors comment on deleting the duplicate rows, we decided to use the max instead of mean)
recommended_items_final = recommended_items_1.groupby(['productID','recommendations'], as_index=False).max().sort_values(by=['productID','score'], ascending=False)
recommended_items_final.head(10)

Unnamed: 0,productID,recommendations,score
59881,B00JBIVXGC,B00J4TBMVO,0.871533
59880,B00JBIVXGC,B00IZCSW3M,0.178013
59843,B00JBIVXGC,B0002Y6BJI,0.084426
59866,B00JBIVXGC,B003VWJ2K8,0.072453
59834,B00JBIVXGC,B0002GXPRM,0.061516
59877,B00JBIVXGC,B00AQBT3EW,0.056895
59829,B00JBIVXGC,B0002E1O2W,0.054532
59841,B00JBIVXGC,B0002OOMU8,0.051093
59833,B00JBIVXGC,B0002F7K7Y,0.047736
59847,B00JBIVXGC,B000A2HOB6,0.047291


In [43]:
#getting the original ratings for each purchased product per user
user_prod = trainset[['reviewerID','productID','rating']].drop_duplicates()
user_prod.head()

Unnamed: 0,reviewerID,productID,rating
0,AROOYR32BS1VL,B0002IHFVM,5.0
1,A1RGF9CS1V8O14,B0002GZM00,2.0
2,A5MC7LP0ZBO4Q,B0002E1G5C,5.0
3,A3ITN3125FJETP,B000L6GD04,5.0
4,A1ROUMJOGO4QMB,B0010SHU18,1.0


In [44]:
#Left Join the User Product Dataframe (which contains all bought products per user) and recommended items 
# result will be an array with all recommendations and scores 
rec_users = pd.merge(left=user_prod, right=recommended_items_final, left_on='productID', right_on='productID',how='left')

In [45]:
rec_users.head(10)

Unnamed: 0,reviewerID,productID,rating,recommendations,score
0,AROOYR32BS1VL,B0002IHFVM,5.0,B0002E37MM,0.040283
1,AROOYR32BS1VL,B0002IHFVM,5.0,B0002E3DNK,0.035746
2,AROOYR32BS1VL,B0002IHFVM,5.0,B005AGGZ8K,0.03522
3,AROOYR32BS1VL,B0002IHFVM,5.0,B0002M3OVI,0.034722
4,AROOYR32BS1VL,B0002IHFVM,5.0,B0002GWFEQ,0.033161
5,AROOYR32BS1VL,B0002IHFVM,5.0,B0037M62AK,0.032638
6,AROOYR32BS1VL,B0002IHFVM,5.0,B0009G1E0K,0.032401
7,AROOYR32BS1VL,B0002IHFVM,5.0,B0002GZQ1U,0.032372
8,AROOYR32BS1VL,B0002IHFVM,5.0,B000XPPURU,0.032274
9,AROOYR32BS1VL,B0002IHFVM,5.0,B004PFWZHM,0.030461


In [46]:
# aggregating the recommendations per reviewer since a product can be recommended coming from different products. Just taking the max score here
top_rec = rec_users.groupby(['reviewerID','recommendations'], as_index=False).max().sort_values(by=['reviewerID','score'], ascending=False)

In [47]:
# creating bins to get the predicted rating based on the score (the higher the score, the better the pred. rating)
top_rec['predictedRating'] = pd.cut(rec_users['score'], bins=[0,0.2,0.4,0.6,0.8,1], labels=[1,2,3,4,5])

In [48]:
#checking the distribution of our predictions
top_rec['predictedRating'].value_counts()

1    379958
2       127
5        36
3        21
4        14
Name: predictedRating, dtype: int64

In [49]:
#Next: Load the test dataset and check if product was one of the recommendations and what rating was predicted
test_content = testset[['reviewerID','productID','rating']]
test_content.head()

Unnamed: 0,reviewerID,productID,rating
0,A3NAA6BH9LWIH4,B002DYJEMQ,5.0
1,A3IKOEE8Z3T6BH,B0002CZV78,5.0
2,A2PBMCBT1R8TTL,B001V5K2S8,5.0
3,A3CSSZ6U5J4YS5,B000WN4J9S,5.0
4,A10FM4ILBIMJJ7,B0002E1G5C,5.0


In [50]:
#merging the test set with the predictions DF
results = pd.merge(test_content, top_rec[['reviewerID','recommendations','score','predictedRating']],  left_on=['reviewerID','productID'], right_on = ['reviewerID','recommendations'] ,how='left')

In [51]:
#calculating the error based on predicted rating and actual rating of the test set
results['error'] =pd.to_numeric(results['predictedRating'])- pd.to_numeric(results['rating'])

In [52]:
results.head()

Unnamed: 0,reviewerID,productID,rating,recommendations,score,predictedRating,error
0,A3NAA6BH9LWIH4,B002DYJEMQ,5.0,B002DYJEMQ,0.01595,1.0,-4.0
1,A3IKOEE8Z3T6BH,B0002CZV78,5.0,,,,
2,A2PBMCBT1R8TTL,B001V5K2S8,5.0,,,,
3,A3CSSZ6U5J4YS5,B000WN4J9S,5.0,B000WN4J9S,0.026259,1.0,-4.0
4,A10FM4ILBIMJJ7,B0002E1G5C,5.0,B0002E1G5C,0.053992,1.0,-4.0


In [54]:
#calculating the RMSE
rmse = np.sqrt((results['error']**2).mean())
rmse

3.6175557890028194