## Loading data and necessary libraries

In [49]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

In [5]:
# Load merged_file.csv, using its first column as the row index
data = pd.read_csv(
    "merged_file.csv",
    index_col=0,
)

In [6]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(497577, 20)

In [7]:
# Preview the first five rows of the loaded frame
data.iloc[:5]

Unnamed: 0,reviewID,overall,verified,reviewTime,reviewerID,productID,reviewText,summary,vote,style,category,title,brand,rank,main_cat,description,also_buy,also_view,feature,numberOfReviews
0,0,5,True,2015-10-17,A1HP7NVNPFMA4N,700026657,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
1,1,4,False,2015-07-27,A1JGAP0185YJI6,700026657,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
2,2,3,True,2015-02-23,A1YJWEXHQBWK2B,700026657,ok game.,Three Stars,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
3,3,2,True,2015-02-20,A2204E1TH211HT,700026657,"found the game a bit too complicated, not what...",Two Stars,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
4,4,5,True,2014-12-25,A2RF5B5H74JLPE,700026657,"great game, I love it and have played it since...",love this game,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13


In [11]:
# Distinct reviewerIDs as a percentage of all rows
# (dropna=False keeps a missing ID counted as one value, like len(unique()))
n_reviewers = data["reviewerID"].nunique(dropna=False)
n_reviewers / len(data) * 100

11.098382762868862

## Constructing the Matrix

In [28]:
# Build the user-item matrix: one row per reviewerID, one column per productID.
# Each cell is the mean "overall" rating for that pair; pairs without any
# review are filled with 0.
ratings_matrix = pd.pivot_table(
    data,
    values="overall",
    index="reviewerID",
    columns="productID",
    aggfunc="mean",
    fill_value=0,
)

In [29]:
# Preview the first five reviewers of the user-item matrix
ratings_matrix.iloc[:5]

productID,0700026398,0700026657,0700099867,0804161380,3828770193,6050036071,7293000936,7544256944,8176503290,8565000168,...,B01HD1B76O,B01HD2TECW,B01HDJFJKG,B01HDJFJLK,B01HDJFJOM,B01HFRICLE,B01HGPUTCA,B01HH6JEOC,B01HIZF7XE,B01HIZGKOE
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0059486XI1Z0P98KP35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0220159ZRNBTRKLG08H,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0266076X6KPZ6CCHGVS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0277912HT4JSJKVSL3E,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A02836981FYG9912C66F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Shape of the user-item matrix as (reviewers, products)
ratings_matrix.shape

(55223, 17408)

In [57]:
# Print the number of distinct users, then the number of distinct products
# (one count per line)
for id_column in ("reviewerID", "productID"):
    print(data[id_column].nunique(dropna=False))

55223
17408


In [30]:
# Count of non-null "overall" ratings for each (reviewerID, productID) pair.
# Values above 1 are pairs that appear more than once in the raw data.
# "overall" is selected before aggregating so only that column is counted,
# instead of counting every column and discarding all but one.
data.groupby(["reviewerID", "productID"])["overall"].count()

reviewerID            productID 
A0059486XI1Z0P98KP35  B00024W1U6    1
                      B000WE8JES    2
                      B00SUUTSA0    1
                      B012JMS4W2    1
A0220159ZRNBTRKLG08H  B000084318    1
                                   ..
AZZTOUKVTUMVM         B0012N1Z8A    2
                      B001CP5WXC    1
                      B0094X2066    1
                      B00BU3ZLJQ    1
                      B00CXTX2YW    1
Name: overall, Length: 473427, dtype: int64

In [35]:
# ORIGINAL DATA: raw review rows for user A0059486XI1Z0P98KP35 and product B000WE8JES
is_user = data["reviewerID"].eq("A0059486XI1Z0P98KP35")
is_product = data["productID"].eq("B000WE8JES")
data.loc[is_user & is_product]

Unnamed: 0,reviewID,overall,verified,reviewTime,reviewerID,productID,reviewText,summary,vote,style,category,title,brand,rank,main_cat,description,also_buy,also_view,feature,numberOfReviews
143178,143178,5,True,2014-06-07,A0059486XI1Z0P98KP35,B000WE8JES,If you're like me and happen to run out of spa...,"Great choice, you won't be disappointed",0,,"['Video Games', 'Retro Gaming & Microconsoles'...",Playstation 2 Memory Card 16MB,by Katana,">#16,000 in Video Games (See Top 100 in Video ...",Video Games,"[""Katana's official Sony-licensed 16 MB Memory...","['B000066TS5', 'B00004YRQ9', 'B0045L3SNQ', 'B0...","['B0792PXC9T', 'B0015DOL8I', 'B00006F2EP', 'B0...","['16 MB Storage', 'Allows the user to save gam...",53
143204,143204,5,True,2014-06-07,A0059486XI1Z0P98KP35,B000WE8JES,If you're like me and happen to run out of spa...,"Great choice, you won't be disappointed",0,,"['Video Games', 'Retro Gaming & Microconsoles'...",Playstation 2 Memory Card 16MB,by Katana,">#16,000 in Video Games (See Top 100 in Video ...",Video Games,"[""Katana's official Sony-licensed 16 MB Memory...","['B000066TS5', 'B00004YRQ9', 'B0045L3SNQ', 'B0...","['B0792PXC9T', 'B0015DOL8I', 'B00006F2EP', 'B0...","['16 MB Storage', 'Allows the user to save gam...",53


In [36]:
# Value stored in the user-item matrix for user A0059486XI1Z0P98KP35, product B000WE8JES
ratings_matrix.at["A0059486XI1Z0P98KP35", "B000WE8JES"]

5

In [46]:
# TODO: check unique combinations of user and product (this cell has no code yet)


## Implementing Non-negative Matrix Factorization

In [48]:
# Dense NumPy array of the user-item matrix (rows = reviewers, columns = products).
# DataFrame.to_numpy() is the pandas-recommended accessor and, unlike
# np.array(...), does not force an extra copy of this very large matrix.
R = ratings_matrix.to_numpy()

In [50]:
# Configure an NMF model with 4 components, a fixed seed and verbose output
nmf = NMF(
    n_components=4,
    max_iter=2000,
    random_state=232323,
    verbose=5,
)

In [51]:
# Fit the NMF model on R and keep the transformed data as W
# (one row per row of R, one column per component)
W = nmf.fit_transform(R)

violation: 1.0
violation: 0.4383189547612514
violation: 0.16048231225841422
violation: 0.1171522333200848
violation: 0.09651815126215055
violation: 0.08350624760043215
violation: 0.07309329943630058
violation: 0.06380219437466543
violation: 0.05509813433566676
violation: 0.0469522428326789
violation: 0.039498366108902075
violation: 0.03274643956874198
violation: 0.027025892710030596
violation: 0.02221280501040572
violation: 0.018192090933477862
violation: 0.014874722340107682
violation: 0.012164657498937045
violation: 0.00995144143959776
violation: 0.00815794217303045
violation: 0.006710692145675376
violation: 0.005542527784494082
violation: 0.004600994639416332
violation: 0.0038328722936516514
violation: 0.0032046669937021185
violation: 0.0026901726384994003
violation: 0.002267496855306264
violation: 0.0019196500299080883
violation: 0.0016334464723922152
violation: 0.001397839567040678
violation: 0.0012036428552340875
violation: 0.00104340278060873
violation: 0.0009110187138908943
vio

In [58]:
# Shape of the feature matrix W: one row per user, one column per NMF component
W.shape

(55223, 4)

In [62]:
# First five rows of the feature matrix W (one row per user, one value per component)
W[:5]

array([[0.0026341 , 0.00358641, 0.00070855, 0.00416665],
       [0.        , 0.0339253 , 0.        , 0.03151454],
       [0.00302407, 0.01389829, 0.        , 0.01052313],
       [0.04617359, 0.        , 0.00074319, 0.0729523 ],
       [0.06926943, 0.        , 0.0018468 , 0.00072501]])

In [64]:
# Coefficient matrix H taken from the fitted model; together with W it
# approximates the ratings array: R ~= W @ H
H = nmf.components_

In [67]:
# Explore shape of Coefficient Matrix (rows = features, columns = products)
H.shape

(4, 17408)

In [63]:
# Reconstruction error reported by the fitted model, i.e. how far W @ H is from R.
# NOTE(review): per the scikit-learn docs this is the Frobenius norm of the
# difference when beta_loss is left at its default - confirm for the installed version.
print(nmf.reconstruction_err_)

2998.448380876802


## Recommending based on NMF

In [68]:
# Predicted ratings: matrix product of the feature matrix W and the coefficient matrix H
predictions = W @ H

In [74]:
# Wrap the predictions in a DataFrame labelled exactly like ratings_matrix
# (same row index and same column labels)
predictions_df = pd.DataFrame(
    predictions,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

In [75]:
# Print the dimensions of the predictions frame, then preview its first five rows
print(predictions_df.shape)
predictions_df.iloc[:5]

(55223, 17408)


productID,0700026398,0700026657,0700099867,0804161380,3828770193,6050036071,7293000936,7544256944,8176503290,8565000168,...,B01HD1B76O,B01HD2TECW,B01HDJFJKG,B01HDJFJLK,B01HDJFJOM,B01HFRICLE,B01HGPUTCA,B01HH6JEOC,B01HIZF7XE,B01HIZGKOE
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0059486XI1Z0P98KP35,0.000118,0.000114,9.3e-05,0.000761,8.571009e-06,0.00013,4.9e-05,9.4e-05,1.1e-05,1.1e-05,...,6e-06,5e-05,4.2e-05,0.000142,5.4e-05,0.000158,1.7e-05,0.000104,0.000531,2.9e-05
A0220159ZRNBTRKLG08H,0.000419,0.000673,0.000287,0.006049,7.845146e-05,0.001078,0.000422,0.000708,8.9e-05,0.0001,...,1.9e-05,0.000163,0.000216,0.0,0.000128,0.001035,6.2e-05,0.000499,0.002027,5.5e-05
A0266076X6KPZ6CCHGVS,0.000212,0.000315,0.000189,0.002132,3.134602e-05,0.000398,0.000162,0.000236,3.6e-05,4e-05,...,1e-05,8.7e-05,8.9e-05,0.000163,7.4e-05,0.000369,3.1e-05,0.00021,0.000992,5e-05
A0277912HT4JSJKVSL3E,0.002068,0.000908,0.001087,0.010584,2.424416e-05,0.001323,0.000338,0.001638,5e-05,4.8e-05,...,9.6e-05,0.00087,0.000717,0.002496,0.000785,0.002758,0.000299,0.00182,0.009077,0.00041
A02836981FYG9912C66F,0.001656,0.001064,0.001631,0.000105,2.409425e-07,1.5e-05,3e-06,1.6e-05,2.3e-05,6e-06,...,7.8e-05,0.000745,0.000362,0.003744,0.000745,0.000568,0.000233,0.00101,0.007047,0.000615


For each user, we would recommend the products with the highest predicted ratings.