In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dfply import *

%matplotlib inline

In [2]:
df = pd.read_csv("./data/ratings_Electronics.csv", names=['userId', 'productId','rating','timestamp'])

In [3]:
df_sub = df >> mask(
    X.timestamp <= (X.timestamp.min() + X.timestamp.max())/2
)
df_sub.head(10)

Unnamed: 0,userId,productId,rating,timestamp
165,A2R4GEWPLORVSO,899336795,2.0,1103328000
166,A1KKUYTDUZDZSA,899336795,4.0,1104192000
170,A28K8QC9C4WPGE,899336795,1.0,1141084800
171,A266DODBJYK0X,899336795,1.0,1116806400
172,A17RBVZX3VTNBW,899336795,1.0,1111449600
173,AV4GK35MHBFMW,899336795,1.0,1133395200
174,A3UKB1QYS8KBW0,899336795,1.0,1104537600
3936,A3DX16W5GTC0TL,6301977173,1.0,1122595200
3937,A2XM8ANEZJR4X7,6301977173,5.0,1094342400
3949,A30PMBDGB7VPAO,6301977173,4.0,1006905600


In [4]:
product_lst = (df_sub >> group_by(
    X.productId
) >> summarize(
    Count = n(X.userId)
) >> ungroup() >> arrange(
    X.Count
)>> mask(
    X.Count <=20
))['productId'].values

product_lst

array(['B000085BCW', 'B000B6Q13S', 'B000B6Q0LQ', ..., 'B0000AQFUR',
       'B00009ZHS7', 'B000AMO2H0'], dtype=object)

In [5]:
df_sub["productId"].unique()

array(['0899336795', '6301977173', '7805717443', ..., 'B00D12U1IK',
       'B00DR0PDNE', 'B00E3QH61S'], dtype=object)

In [6]:
df_sub2=(df_sub >> mask(
    ~X.productId.isin(product_lst)
))

In [7]:
df_product_features = df_sub2.pivot(
    index='userId',
    columns='productId',
    values='rating'
).fillna(0)

In [8]:
df_product_features.head()

productId,6301977173,B00000DM9W,B00000DMA3,B00000G20L,B00000IGBF,B00000J05A,B00000J060,B00000J061,B00000J0D5,B00000J0D8,...,B000FDOWQK,B000FGEC94,B000FI2Y5Q,B000FJEYZS,B000H1UJRG,B000HCT12O,B000HEW0KW,B000HKIPVE,B000NONHYY,B0055P9K38
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1002MCOTLC9PR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003EYWAQFOUG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003HDK1GHMSP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003I35SXKFUG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1004OUGEY1MFQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from surprise import KNNWithMeans, SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV, train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_sub2[['userId', 'productId', 'rating']], reader)

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

1.3760560380861389
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [10]:
trainset, testset = train_test_split(data, test_size=0.2,random_state=123)

In [11]:
svd_model=SVD(n_epochs=10,reg_all=0.4,lr_all=0.005)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17887d790>

In [12]:
svd_model.predict(uid="AGHZXQL9F94T9",iid="B00002EQD2")

Prediction(uid='AGHZXQL9F94T9', iid='B00002EQD2', r_ui=None, est=3.6621681934609294, details={'was_impossible': False})

In [None]:
knn_model = KNNWithMeans(k=9, sim_options={'name': 'pearson_baseline', 'user_based': True})

In [None]:
knn_model.fit(trainset)

In [None]:
knn_model.predict(uid="AGHZXQL9F94T9",iid="B00002EQD2")

In [13]:
from collections import defaultdict
def get_top_n(model,data, n=n):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in model.test(data,verbose=0):
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.g
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [14]:
get_top_n(svd_model,testset,3)

defaultdict(list,
            {'AHCVWPLA1O4X8': [('B00065AO0K', 3.9842173763752293),
              ('B000BMXJR8', 3.900289671085697),
              ('B000A3WS84', 3.896146497108509)],
             'A81REJ5E4Y5R7': [('B0002WTK48', 3.670043838877743)],
             'A1BAPER1EEJOP5': [('B00004YZCV', 3.8768456935697193)],
             'A2M8R5FYX2SKON': [('B0000CE1VP', 4.1231729554586565)],
             'A2I1NGKVC2LGHX': [('B00004Z6BJ', 3.5146043421595747)],
             'A3GB2190SJIDSJ': [('B00006LHLC', 3.782492713640056)],
             'A3VE806XWJOKUA': [('B00008KWWF', 3.97896483906433)],
             'A2928S5BNHSPRF': [('B000EF1820', 3.610785807894593)],
             'AOME13PIKI4QB': [('B00006CXS8', 3.3220783137133547)],
             'A1T1GPUM2HH3S2': [('B00004YBVE', 3.2019069397606237)],
             'A3F4XPEH0F8WYI': [('B000CS7U1C', 4.178164835650888)],
             'A4X6TT3ZKXY6F': [('B00008W7LL', 3.8183010156962234)],
             'A2T60N64FXRD8C': [('B0000633EO', 3.7626826872578003)

In [15]:
def get_new_ratings(user,model,data):
    test_pred = model.test(data,verbose=0)
    pred = pd.DataFrame(test_pred)
    pred = pred[(pred['uid'] == user)][['iid', 'r_ui','est']].sort_values(by = 'est',ascending = False).head(10)
    return(pred)

In [17]:
get_new_ratings("AGHZXQL9F94T9",svd_model,testset)

Unnamed: 0,iid,r_ui,est
4722,B0007QKMQY,5.0,4.238884
2849,B00006DY6M,1.0,4.124844
9892,B00064O1P0,4.0,3.733312
4434,B00008IOEJ,2.0,3.712951
4737,B000063BGY,2.0,3.629094
4553,B0002ZAEYA,4.0,3.48103
15410,B00008J6VW,3.0,3.40203
15679,B000069J56,4.0,3.326462
4310,B0001KWGP6,4.0,3.25059


In [None]:
get_new_ratings("A231WM2Z2JL0U3",knn_model,testset)

In [18]:
df_product_features.head()

productId,6301977173,B00000DM9W,B00000DMA3,B00000G20L,B00000IGBF,B00000J05A,B00000J060,B00000J061,B00000J0D5,B00000J0D8,...,B000FDOWQK,B000FGEC94,B000FI2Y5Q,B000FJEYZS,B000H1UJRG,B000HCT12O,B000HEW0KW,B000HKIPVE,B000NONHYY,B0055P9K38
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1002MCOTLC9PR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003EYWAQFOUG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003HDK1GHMSP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003I35SXKFUG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1004OUGEY1MFQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
from scipy.sparse import csr_matrix
users_items_pivot_sparse_matrix = csr_matrix(df_product_features)
users_items_pivot_sparse_matrix

<98950x2289 sparse matrix of type '<class 'numpy.float64'>'
	with 120393 stored elements in Compressed Sparse Row format>

In [20]:
from scipy.sparse.linalg import svds
NUMBER_OF_FACTORS_MF = 15
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [21]:
U.shape

(98950, 15)

In [23]:
Vt.shape

(15, 2289)

In [24]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [25]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 2.84313431e-07,  1.15327885e-04,  5.18765229e-05, ...,
         7.71173358e-05,  1.07076256e-04,  1.00779214e-05],
       [ 4.65917167e-08,  2.67692719e-06,  6.74561844e-06, ...,
         1.65238015e-05,  4.31234703e-06,  3.33429037e-06],
       [ 9.69077004e-08,  3.71388823e-05,  1.13512368e-05, ...,
         1.96162761e-05,  3.49477552e-05,  4.40193200e-05],
       ...,
       [ 2.84313431e-07,  1.15327885e-04,  5.18765229e-05, ...,
         7.71173358e-05,  1.07076256e-04,  1.00779214e-05],
       [ 4.09561531e-08,  1.32532580e-05,  3.60351931e-06, ...,
         5.55788042e-06,  1.20054807e-05,  1.25278264e-06],
       [ 3.54571740e-07,  6.37095040e-08,  1.12987350e-06, ...,
         5.32781034e-06,  1.17970558e-06, -2.03371276e-07]])

In [26]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [31]:
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = df_product_features.columns, index=df_product_features.index)
cf_preds_df.head(10)

productId,6301977173,B00000DM9W,B00000DMA3,B00000G20L,B00000IGBF,B00000J05A,B00000J060,B00000J061,B00000J0D5,B00000J0D8,...,B000FDOWQK,B000FGEC94,B000FI2Y5Q,B000FJEYZS,B000H1UJRG,B000HCT12O,B000HEW0KW,B000HKIPVE,B000NONHYY,B0055P9K38
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1002MCOTLC9PR,0.110725,0.110735,0.110729,0.110728,0.110726,0.110738,0.110729,0.110779,0.110726,0.110726,...,0.110725,0.110726,0.110726,0.110725,0.110726,0.11073,0.110733,0.110731,0.110734,0.110725
A1003EYWAQFOUG,0.110725,0.110725,0.110725,0.110726,0.110725,0.110726,0.110725,0.110728,0.110725,0.110725,...,0.110725,0.110726,0.110725,0.110725,0.110725,0.110728,0.110725,0.110726,0.110725,0.110725
A1003HDK1GHMSP,0.110725,0.110728,0.110726,0.110726,0.110725,0.110729,0.110725,0.110736,0.110725,0.110725,...,0.110725,0.110725,0.110725,0.110725,0.110725,0.110733,0.110727,0.110726,0.110728,0.110728
A1003I35SXKFUG,0.110725,0.110734,0.110734,0.110735,0.110728,0.110743,0.110732,0.110803,0.110728,0.110729,...,0.110725,0.110727,0.110728,0.110725,0.110727,0.110754,0.110732,0.11074,0.110734,0.110734
A1004OUGEY1MFQ,0.110725,0.110729,0.11073,0.110729,0.110726,0.110736,0.110731,0.110783,0.110726,0.110727,...,0.110725,0.110725,0.110726,0.110725,0.110726,0.11074,0.110729,0.110732,0.110729,0.110728
A1007OOXKZK6ZR,0.110725,0.110725,0.110725,0.110725,0.110725,0.110726,0.110725,0.110727,0.110725,0.110725,...,0.110725,0.110725,0.110725,0.110725,0.110725,0.110725,0.110725,0.110725,0.110725,0.110725
A1008EEMWRT7DD,0.110725,0.110738,0.110733,0.110737,0.110728,0.110752,0.110728,0.110784,0.110728,0.110731,...,0.110725,0.110726,0.110728,0.110725,0.110727,0.110754,0.110736,0.110747,0.110737,0.110727
A1008ZG6MG1YYB,0.110725,0.11073,0.110727,0.110728,0.110726,0.110734,0.110726,0.110741,0.110726,0.110726,...,0.110725,0.110725,0.110726,0.110725,0.110725,0.110734,0.110729,0.11073,0.110729,0.110726
A100BAHQAYT379,0.110725,0.110725,0.110726,0.110725,0.110725,0.110725,0.110727,0.110741,0.110725,0.110725,...,0.110725,0.110725,0.110725,0.110725,0.110725,0.110726,0.110725,0.110725,0.110725,0.110725
A100H2PFTQXRDN,0.110725,0.110725,0.110726,0.110727,0.110725,0.110728,0.110725,0.110735,0.110725,0.110727,...,0.110725,0.110725,0.110725,0.110725,0.110725,0.110728,0.110725,0.110732,0.110725,0.110725


Collaborative filtering - memory based

In [34]:
df_product_features.head()

productId,6301977173,B00000DM9W,B00000DMA3,B00000G20L,B00000IGBF,B00000J05A,B00000J060,B00000J061,B00000J0D5,B00000J0D8,...,B000FDOWQK,B000FGEC94,B000FI2Y5Q,B000FJEYZS,B000H1UJRG,B000HCT12O,B000HEW0KW,B000HKIPVE,B000NONHYY,B0055P9K38
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1002MCOTLC9PR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003EYWAQFOUG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003HDK1GHMSP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1003I35SXKFUG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1004OUGEY1MFQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(df_product_features.values,metric="cosine")
item_similarity = pairwise_distances(df_product_features.values.T,metric="cosine")

In [38]:
def prediction(ratings,similarity,type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:,np.newaxis])
        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)])
    elif type =='item':
        pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])

    return pred

In [39]:
item_prediction = prediction(df_product_features.values,item_similarity,type='item')

In [40]:
item_prediction

array([[0.00043708, 0.00043714, 0.00043728, ..., 0.00043718, 0.00043712,
        0.00043718],
       [0.00218538, 0.0021857 , 0.00218639, ..., 0.00218592, 0.0021856 ,
        0.00218589],
       [0.00218538, 0.0021857 , 0.00218639, ..., 0.00218592, 0.0021856 ,
        0.00218589],
       ...,
       [0.00043708, 0.00043714, 0.00043728, ..., 0.00043718, 0.00043712,
        0.00043718],
       [0.00087415, 0.00087428, 0.00087456, ..., 0.00087437, 0.00087424,
        0.00087435],
       [0.00218538, 0.0021857 , 0.00218639, ..., 0.00218592, 0.0021856 ,
        0.00218589]])