In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dfply import *
from scipy.sparse.linalg import svds

%matplotlib inline

In [2]:
# Load the raw ratings file; column labels are supplied explicitly via `names`.
df = pd.read_csv(
    "./data/ratings_Electronics.csv",
    names=["userId", "productId", "rating", "timestamp"],
)

In [9]:
# Keep only the ratings whose timestamp lies at or before the midpoint between
# the earliest and the latest timestamp in the data (row index labels are kept).
df_sub = df[df["timestamp"] <= (df["timestamp"].min() + df["timestamp"].max()) / 2]
df_sub.head(10)

Unnamed: 0,userId,productId,rating,timestamp
165,A2R4GEWPLORVSO,899336795,2.0,1103328000
166,A1KKUYTDUZDZSA,899336795,4.0,1104192000
170,A28K8QC9C4WPGE,899336795,1.0,1141084800
171,A266DODBJYK0X,899336795,1.0,1116806400
172,A17RBVZX3VTNBW,899336795,1.0,1111449600
173,AV4GK35MHBFMW,899336795,1.0,1133395200
174,A3UKB1QYS8KBW0,899336795,1.0,1104537600
3936,A3DX16W5GTC0TL,6301977173,1.0,1122595200
3937,A2XM8ANEZJR4X7,6301977173,5.0,1094342400
3949,A30PMBDGB7VPAO,6301977173,4.0,1006905600


In [32]:
# Build the array of product ids used to filter the ratings further down.
# Pipeline: group the time-filtered ratings by productId, compute one `Count`
# per product from the userId column (presumably the number of rating rows in
# the group -- confirm against dfply's `n`), order by Count, keep only the
# products with Count <= 20, and extract the productId column as a NumPy array.
# NOTE(review): this keeps the products with AT MOST 20 ratings. Popularity
# filters for recommenders usually keep the frequently rated items (>=) --
# confirm that `<=` is the intended direction.
product_lst = (df_sub >> group_by(
    X.productId
) >> summarize(
    Count = n(X.userId)
) >> ungroup() >> arrange(
    X.Count
)>> mask(
    X.Count <=20
))['productId'].values

# Display the selected product ids.
product_lst

array(['B000085BCW', 'B000B6Q13S', 'B000B6Q0LQ', ..., 'B0000AQFUR',
       'B00009ZHS7', 'B000AMO2H0'], dtype=object)

In [33]:
# Distinct product ids present in the time-filtered ratings.
df_sub.productId.unique()

array(['0899336795', '6301977173', '7805717443', ..., 'B00D12U1IK',
       'B00DR0PDNE', 'B00E3QH61S'], dtype=object)

In [35]:
# Restrict the time-filtered ratings to the products listed in `product_lst`.
df_sub2 = df_sub[df_sub["productId"].isin(product_lst)]

In [38]:
# Reshape the long ratings table into a wide matrix: one row per userId, one
# column per productId, cell = rating. Pairs without a rating are set to 0.
df_product_features = df_sub2.pivot(index="userId", columns="productId", values="rating").fillna(0)

In [43]:
# Preview the first five rows of the user x product matrix.
df_product_features.head(5)

productId,0899336795,7805717443,9043413585,9751023327,9752890474,9758515810,9810521510,9864216155,B000000O48,B000000X4X,...,B007S02GRK,B007SEZT1Q,B0082E9K7U,B0088LYCZC,B0094SHQI8,B009YC3YDK,B00D05BKOW,B00D12U1IK,B00DR0PDNE,B00E3QH61S
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1000CO22IW508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10012K7DF3SBQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1004HHMSDY5IP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1006RKX3L8DUL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100HVSUE3S1GS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
from surprise import KNNWithMeans, SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV, train_test_split

# Reproducibility: neither the cross-validation folds nor the SVD
# initialisation below are given a random_state, so without a seed the best
# score/params printed at the end change from run to run.
# NOTE(review): this relies on Surprise drawing from NumPy's global RNG when no
# random_state is passed (see the Surprise FAQ on reproducible experiments) --
# confirm for the installed version.
np.random.seed(123)

# Wrap the (userId, productId, rating) triples in a Surprise dataset; ratings
# are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_sub2[['userId', 'productId', 'rating']], reader)

# Candidate SVD hyper-parameters explored by the grid search.
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}

# 3-fold cross-validated search, scored with RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Report the best RMSE and the parameter combination that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

1.4660802653195333
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [96]:
# Split the Surprise dataset into train/test parts (test_size=0.2); the fixed
# random_state keeps the split repeatable across runs.
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=123,
)

In [99]:
# Build the final SVD model from the grid-search winner instead of re-typing
# the values by hand, so this cell cannot drift out of sync with the search
# results. A fixed random_state makes the fitted factors repeatable.
svd_model = SVD(random_state=123, **gs.best_params["rmse"])
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x186368c70>

In [101]:
# Estimated rating from the SVD model for a single (user, product) pair.
svd_model.predict(
    uid="AGHZXQL9F94T9",
    iid="B00002EQD2",
)

Prediction(uid='AGHZXQL9F94T9', iid='B00002EQD2', r_ui=None, est=3.0134470097163017, details={'was_impossible': False})

In [102]:
# KNNWithMeans configured with user-based "pearson_baseline" similarity and
# k=50, fitted on the same training split as the SVD model.
knn_model = KNNWithMeans(
    k=50,
    sim_options={"name": "pearson_baseline", "user_based": True},
)
knn_model.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x186368dc0>

In [103]:
# Same (user, product) pair as the SVD prediction, scored with the KNN model.
knn_model.predict(
    uid="AGHZXQL9F94T9",
    iid="B00002EQD2",
)

Prediction(uid='AGHZXQL9F94T9', iid='B00002EQD2', r_ui=None, est=5, details={'actual_k': 1, 'was_impossible': False})

In [109]:
from collections import defaultdict
def get_top_n(model, data, n=10):
    """Return the ``n`` highest-estimated items for every user in ``data``.

    Parameters
    ----------
    model : object
        Estimator exposing ``test(data, verbose=...)`` that yields 5-tuples
        ``(uid, iid, true_r, est, details)``.
    data : object
        Passed unchanged to ``model.test``.
    n : int, default 10
        Maximum number of (item, estimate) pairs kept per user. The previous
        default (``n=n``) captured whatever global ``n`` existed when the
        function was defined rather than an integer, so calling without ``n``
        could not slice the list.

    Returns
    -------
    collections.defaultdict
        Maps each user id to a list of ``(iid, est)`` tuples sorted by
        descending estimate and truncated to ``n`` entries.
    """
    # First collect every (item, estimate) pair under its user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in model.test(data, verbose=0):
        top_n[uid].append((iid, est))

    # Then order each user's pairs by estimate (highest first) and keep the
    # first n. Only values are reassigned, so iterating over items() is safe.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [114]:
# Top-3 KNN-estimated products per user, computed over the held-out test set.
get_top_n(knn_model, testset, n=3)

defaultdict(list,
            {'A2EEN5VFJOE86Z': [('B0000669B8', 3.664129388018944),
              ('B000068O20', 3.664129388018944)],
             'A1PQ3EE1PJ45TP': [('B0001MHL0E', 3.664129388018944)],
             'A1AOQYE4FLES7J': [('B00005AMDA', 5)],
             'AX1URRBGS3CI8': [('B000ARLOE4', 4.0)],
             'AUBTSYY8R4F78': [('B0000AI0NK', 3.664129388018944)],
             'A11S77NFFS2ZNL': [('B0001OP226', 3.664129388018944)],
             'A1CV19Z3EDXMJ6': [('B0007L6TEI', 4.333333333333333),
              ('B00000J6WY', 4.333333333333333)],
             'A34Y6CNLT3ZKQA': [('B00005852M', 3.664129388018944)],
             'A8KCZ0P1YHHB3': [('B0006IWLX0', 5),
              ('B00030DEQE', 3.664129388018944)],
             'A3LAGZEURVPXM2': [('B00005R8TA', 5)],
             'A1BQX78OKLHGTU': [('B00004WZQO', 3.664129388018944)],
             'A32XBS76GRQRNN': [('B00004Z6R2', 3.664129388018944)],
             'A1X80TI1FMHG7F': [('B000G5WGYW', 3.664129388018944)],
             'A2

In [117]:
def get_new_ratings(user, model, data, n=10):
    """Return the ``n`` highest-estimated predictions for one user.

    Parameters
    ----------
    user : hashable
        User id to select; compared against the ``uid`` column.
    model : object
        Estimator exposing ``test(data, verbose=...)``. Its result is turned
        into a DataFrame that must provide ``uid``, ``iid``, ``r_ui`` and
        ``est`` columns.
    data : object
        Passed unchanged to ``model.test``.
    n : int, default 10
        Maximum number of rows returned (the previously hard-coded value).

    Returns
    -------
    pandas.DataFrame
        Columns ``iid``, ``r_ui``, ``est`` for ``user``, sorted by descending
        ``est``. The index holds each row's position in the full prediction
        list. Empty when there are no predictions for ``user``.
    """
    # Predictions are computed for the whole of `data` and filtered afterwards,
    # so every call pays for a full pass over `data`.
    pred = pd.DataFrame(model.test(data, verbose=0))
    if pred.empty:
        # No predictions at all: there is no 'uid' column to filter on, so
        # return an empty frame with the usual columns instead of raising.
        return pd.DataFrame(columns=['iid', 'r_ui', 'est'])

    user_pred = pred.loc[pred['uid'] == user, ['iid', 'r_ui', 'est']]
    return user_pred.sort_values(by='est', ascending=False).head(n)

In [121]:
# Highest SVD-estimated test-set ratings for one user.
get_new_ratings(user="A231WM2Z2JL0U3", model=svd_model, data=testset)

Unnamed: 0,iid,r_ui,est
6145,B00005B6YF,3.0,4.336592
15692,B00005OMZN,3.0,4.274609
9068,B000066CDG,3.0,4.265817
6774,B00006LU7E,4.0,4.251071
17677,B00005137P,5.0,4.214791
874,B00004SD9Q,5.0,4.212428
229,B00003CWCI,4.0,4.205112
11700,B00005NHAK,4.0,4.204746
6087,B0000510NY,3.0,4.198586
104,B00005A0R4,5.0,4.159496


In [122]:
# Same user as the SVD call, scored with the KNN model for comparison.
get_new_ratings(user="A231WM2Z2JL0U3", model=knn_model, data=testset)

Unnamed: 0,iid,r_ui,est
104,B00005A0R4,5.0,4.303797
8806,B00000J1US,5.0,4.303797
16628,B00005T3TL,5.0,4.303797
15692,B00005OMZN,3.0,4.303797
14297,B0000510YS,5.0,4.303797
13404,B00004SCIT,4.0,4.303797
13056,B00005I9P3,5.0,4.303797
11700,B00005NHAK,4.0,4.303797
10983,B0000A55F4,3.0,4.303797
10816,B00006LUJZ,4.0,4.303797
