# SVD baseline + Slope One Implementation
- This notebook includes the implementation of SVD baseline and Slope One


## Installation and Importing Libraries

In [None]:
!pip install --upgrade numpy
!pip install scikit-surprise

In [None]:
!pip install numba --upgrade
!pip install --upgrade pandas
!pip install ipysheet

In [921]:
import json
import sys
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from math import log2
from sklearn.metrics import mean_absolute_error
import random
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import surprise as sp
sys.path.insert(1, 'funk-svd-master')
from funk_svd import SVD
from surprise.model_selection import train_test_split
from ipysheet import sheet, cell, row, column, cell_range


## Loading Data and cleaning

In [770]:
df = pd.read_json('Grocery_and_Gourmet_Food_5.json',lines=True)

In [771]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143860 entries, 0 to 1143859
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   overall         1143860 non-null  int64 
 1   verified        1143860 non-null  bool  
 2   reviewTime      1143860 non-null  object
 3   reviewerID      1143860 non-null  object
 4   asin            1143860 non-null  object
 5   reviewerName    1143722 non-null  object
 6   reviewText      1143470 non-null  object
 7   summary         1143641 non-null  object
 8   unixReviewTime  1143860 non-null  int64 
 9   vote            158202 non-null   object
 10  style           592086 non-null   object
 11  image           9510 non-null     object
dtypes: bool(1), int64(2), object(9)
memory usage: 97.1+ MB


In [772]:
#drop useless variables - 'image','vote','style','review_text','summary','reviewTime'
#filter out unverified ratings
#rename columns
#drop duplicate ratings -- extract latest review

df = df[df['verified']==True]
drop_list = ['image','vote','style','reviewText','summary','reviewTime','reviewerName','unixReviewTime','verified']
df.drop(df[drop_list], axis=1, inplace=True)

df = df.iloc[:,[1,2,0]]
df.dropna()

df.rename(columns={'reviewerID':'u_id', 'asin':'i_id', 'overall':'rating'}, inplace=True)
df.drop_duplicates(inplace=True)

## Reduce dataset

In [773]:
df_TopReviewers = df.groupby('u_id').agg(mean_rating = ('rating',
'mean'),
number_of_ratings =
('rating', 'count')).reset_index()

df_TopRev = df_TopReviewers[df_TopReviewers['number_of_ratings']>20]
df_TopRev

Unnamed: 0,u_id,mean_rating,number_of_ratings
46,A100WO06OQR8BQ,4.416667,84
142,A1047EDJ84IMAS,4.491803,61
1799,A11WNQ3PPU73Y1,4.444444,90
2264,A12FLMSWRKK2IK,4.972603,73
2452,A12O5SEIF162P8,4.830189,53
...,...,...,...
122289,AWP3OETSAYH0V,4.545455,55
122754,AX80SWHDEKJCY,4.553571,56
123051,AXK37UZY8UPYP,4.436364,55
123499,AY1EF0GOH80EK,4.037500,80


In [774]:
df_final = pd.merge(df, df_TopRev[['u_id']], on='u_id',how='inner')
df_final

Unnamed: 0,u_id,i_id,rating
0,ALOGZDSPWZI72,4639725043,4
1,ALOGZDSPWZI72,B0000EYK3E,3
2,ALOGZDSPWZI72,B00014JNI0,5
3,ALOGZDSPWZI72,B00016LZT8,5
4,ALOGZDSPWZI72,B0001M0Z7A,5
...,...,...,...
27857,AA2YXSKQWPG11,B00K279FP2,5
27858,AA2YXSKQWPG11,B00Q1RT1N8,5
27859,AA2YXSKQWPG11,B00UH2J5M0,5
27860,AA2YXSKQWPG11,B01AAYESMG,5


In [775]:
print('The ratings dataset has', df_final['u_id'].nunique(), 'reviewers')
print('The ratings dataset has', df_final['i_id'].nunique(), 'unique products')
print('The unique ratings are', sorted(df_final['rating'].unique()), 'ratings')


The ratings dataset has 376 reviewers
The ratings dataset has 13697 unique products
The unique ratings are [1, 2, 3, 4, 5] ratings


In [776]:
#reviewer_products_ratings = df_final.groupby('reviewerID')[['asin', 'overall']].agg(list)
reviewer_products_ratings = df_final.groupby('u_id')
reviewer_products_ratings

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f77d4f47820>

In [777]:
reviewer_ids = list(reviewer_products_ratings.groups.keys())
#reviewer_ids

In [778]:
asin_values = reviewer_products_ratings.get_group(reviewer_ids[1])['i_id'].tolist()
#asin_values

In [779]:
def split_train_test(group):
    train = group.sample(frac=0.6, random_state=123)
    val = group.drop(train.index).sample(frac=0.5, random_state=124)
    test = group.drop(train.index).drop(val.index)
    return train, val, test

In [780]:
train_test_splits = reviewer_products_ratings.apply(split_train_test)


In [781]:
train_test_splits = train_test_splits.reset_index(level=[0], drop=True)


In [782]:
train_df = pd.concat([split[0] for split in train_test_splits])
val_df = pd.concat([split[1] for split in train_test_splits])
test_df = pd.concat([split[2] for split in train_test_splits])


In [783]:
train_df

Unnamed: 0,u_id,i_id,rating
18210,A100WO06OQR8BQ,B00OQDD5ZY,3
18163,A100WO06OQR8BQ,B0029XLH4Y,5
18144,A100WO06OQR8BQ,B000F3Q4AM,3
18219,A100WO06OQR8BQ,B0167BR01C,5
18202,A100WO06OQR8BQ,B00GMWB09U,5
...,...,...,...
14540,AZV26LP92E6WU,B000S5RBZ4,5
14554,AZV26LP92E6WU,B001E53016,5
14584,AZV26LP92E6WU,B002DHMVQE,5
14553,AZV26LP92E6WU,B001E52YY0,5


In [784]:
val_df

Unnamed: 0,u_id,i_id,rating
18165,A100WO06OQR8BQ,B003DQDK50,5
18200,A100WO06OQR8BQ,B00FNP3LC6,5
18157,A100WO06OQR8BQ,B001ELL4ZY,2
18213,A100WO06OQR8BQ,B00SUE81HC,5
18167,A100WO06OQR8BQ,B00474ASJI,5
...,...,...,...
14513,AZV26LP92E6WU,B000E1FZHS,5
14608,AZV26LP92E6WU,B00N4Z3IZW,5
14562,AZV26LP92E6WU,B001EPQVFS,5
14606,AZV26LP92E6WU,B00IAE8ZQY,5


In [785]:
test_df

Unnamed: 0,u_id,i_id,rating
18142,A100WO06OQR8BQ,B00099XOQO,5
18152,A100WO06OQR8BQ,B0014EW4C8,5
18168,A100WO06OQR8BQ,B004S7TZD4,5
18170,A100WO06OQR8BQ,B004WZ4KK0,5
18172,A100WO06OQR8BQ,B005HB19KW,1
...,...,...,...
14596,AZV26LP92E6WU,B005G2FCI2,5
14598,AZV26LP92E6WU,B005VBD46U,5
14602,AZV26LP92E6WU,B00BT7C9R0,5
14610,AZV26LP92E6WU,B000F0DSAO,5


## SVD using FunkSGD
- changed column names
- added cross-validation data split

In [933]:
svd = SVD(lr=0.009, reg=0.05, n_epochs=100, n_factors=90, 
          early_stopping=True, shuffle=False, min_rating=1, max_rating=5)

In [939]:
svd.fit(X=train_df, X_val=val_df)

In [935]:
pred_svd = svd.predict(test_df)

In [936]:
rmse_svd = mean_squared_error(test_df['rating'], pred_svd, squared=False)
rmse_svd

0.855878538450241

In [937]:
mae_svd = mean_absolute_error(test_df['rating'], pred_svd)
mae_svd

0.6623146671535849

## Prediction using Slope One

In [938]:
# Slope One Algorithm
algo_slope = sp.SlopeOne()

r = sp.Reader()
train2_df = train_df.append(val_df)
trainset = sp.Dataset.load_from_df(train2_df,r)
train2_df.insert(len(train2_df.columns), 'timestamp', np.zeros(len(train2_df.index)).tolist())


trainset = trainset.construct_trainset(train2_df.values.tolist())


algo_slope.fit(trainset)
pred_slope = algo_slope.test(test_df.values.tolist())

# Then compute RMSE + MAE
sp.accuracy.rmse(predictions)
sp.accuracy.mae(predictions)

train2_df.drop('timestamp', axis=1)


RMSE: 1.0529
MAE:  0.7481


Unnamed: 0,u_id,i_id,rating
18210,A100WO06OQR8BQ,B00OQDD5ZY,3
18163,A100WO06OQR8BQ,B0029XLH4Y,5
18144,A100WO06OQR8BQ,B000F3Q4AM,3
18219,A100WO06OQR8BQ,B0167BR01C,5
18202,A100WO06OQR8BQ,B00GMWB09U,5
...,...,...,...
14513,AZV26LP92E6WU,B000E1FZHS,5
14608,AZV26LP92E6WU,B00N4Z3IZW,5
14562,AZV26LP92E6WU,B001EPQVFS,5
14606,AZV26LP92E6WU,B00IAE8ZQY,5


In [905]:
def get_topN(base_df, rating_list, flag, known):
    if(not known):
        results = base_df[['u_id','i_id']].copy()
        if flag == 'Slope':
            results.insert(2, 'rating', [x.est for x in rating_list])
        else:
            results.insert(2, 'rating', [x for x in rating_list])
    else:
        results = base_df.copy()
        
    results['item_rating'] = results[['i_id', 'rating']].values.tolist()
    results.drop(['i_id', 'rating'], axis=1, inplace=True)

    results_groups = results.groupby('u_id')
    results_groups = results_groups[['item_rating']].agg(list)
    results_groups['item_rating'] = results_groups.apply(lambda row: sort_rec(row['item_rating'])
                                                                           , axis=1)
    results_groups.reset_index(inplace=True)
    return results_groups


In [902]:
def sort_rec(rating_list):
    return sorted(rating_list, key= lambda x: x[1], reverse=True) #[:10]

In [903]:
results_slope_groups = get_topN(test_df, pred_slope, 'Slope', False)
results_slope_groups.apply(lambda row : sort_rec(row['item_rating']), axis=1)

0      [[B00C584ALK, 4.932835820895522], [B00EDADBSQ,...
1      [[B000GAT6NG, 5.0], [B002SWB73C, 5.0], [B000EQ...
2      [[B000SKLPME, 4.4341756811347], [B004ZK48M2, 4...
3      [[B0001N48SG, 5.0], [B00U9W0VPS, 5.0], [B0106G...
4      [[B002HQCWYM, 5.0], [B004LWOJC8, 4.95833333333...
                             ...                        
371    [[B0009F3PIA, 5.0], [B000WR8TT0, 4.86090909090...
372    [[B011FEEBTU, 5.0], [B000F4DKAI, 4.62222222222...
373    [[B000GZSDZI, 4.737121212121212], [B003ZXCFQ6,...
374    [[B00B59QBE4, 5.0], [B00ESDZIRA, 5.0], [B002B8...
375    [[B000E1HUVC, 5.0], [B000ED9L6C, 5.0], [B00BT7...
Length: 376, dtype: object

In [795]:
known_ratings = get_topN(test_df, test_df['rating'],'', True)
known_ratings.apply(lambda row : sort_rec(row['item_rating']), axis=1)

0      [[B00099XOQO, 5], [B0014EW4C8, 5], [B004S7TZD4...
1      [[B000EQT77M, 5], [B000GAT6NG, 5], [B002SWB73C...
2      [[B000P6G12U, 5], [B000R4JI2U, 5], [B004ZK48M2...
3      [[B0001N48SG, 5], [B000LKXADS, 5], [B009OWOKFQ...
4      [[B000E1FZHS, 5], [B000F4DKAI, 5], [B001SB4FTW...
                             ...                        
371    [[B0009F3PIA, 5], [B000CL4MFQ, 5], [B000JJHDVG...
372    [[B000F4DKAI, 5], [B00F94YPAS, 5], [B00L2BZCYA...
373    [[B000GZSDZI, 5], [B001HTR1ZU, 5], [B002C4QLGE...
374    [[B000DZDJ0K, 5], [B001EO5NAS, 5], [B003ZXAMPW...
375    [[B000ED9L6C, 5], [B000LKTXNY, 5], [B000LQNK50...
Length: 376, dtype: object

In [906]:
train_ratings = get_topN(train_df, train_df['rating'],'',True)
train_ratings.apply(lambda row : sort_rec(row['item_rating']), axis=1)

0      [[B0029XLH4Y, 5], [B0167BR01C, 5], [B00GMWB09U...
1      [[B0078DQ85S, 5], [B00DHFJVIC, 5], [B00MFC5P4M...
2      [[B00N3FP5G8, 5], [B002VH4H4A, 5], [B00HZZ0NZK...
3      [[B01CECIVF2, 5], [B01FT13LNC, 5], [B000WLW9TW...
4      [[B00J2C7L1S, 5], [B0072DGXJ0, 5], [B0197ALU34...
                             ...                        
371    [[B00WIXYVIY, 5], [B00FCZJ7CK, 5], [B000HDJXH6...
372    [[B00DB8KKDK, 5], [B00JD7MDT2, 5], [B00T2DGTWO...
373    [[B001EQ4Y4W, 5], [B008YUL4KI, 5], [B005VOOOR0...
374    [[B00G8O0W8W, 5], [B01A1H2R7U, 5], [B00BBR1G9U...
375    [[B001EPQTSW, 5], [B001PEZLCM, 5], [B00XOORKRK...
Length: 376, dtype: object

In [796]:
def Precision(predicted_list, known_list):
    tp = 0
    #fp = 0
    known_list = [x[0] for x in known_list]
    predicted_list = [x[0] for x in predicted_list]
    for i in range(0,10):
        if predicted_list[i] in known_list[:10]:
            tp += 1
    
    return (tp/10)
    
        
        

In [797]:
def Recall(predicted_list, known_list):
    tp = 0
    fn = 0
    known_list = [x[0] for x in known_list]
    predicted_list = [x[0] for x in predicted_list]
    for i in range(0,10):
        if predicted_list[i] in known_list[i][:10]:
            tp += 1
    
    
    return (tp/len(known_list))
    

In [798]:
def ndcg(predicted_list, known_list):
    rel = 0
    dcg = 0
    idcg = 0
    known_list = [x[0] for x in known_list][:10]
    predicted_list = [x[0] for x in predicted_list][:10]
    for i in range(0,10):
        if predicted_list[i] == known_list[i]:
            dcg += 1/(log2(i+1+1))
            rel += 1
    
    for j in range(0,rel):
        idcg += 1/(log2(j+1+1))
    
    
    if(idcg):
        return (dcg/idcg)
    else:
        return 0

In [800]:
results_svd_groups = get_topN(test_df, pred_svd, 'svd', False)
results_svd_groups.apply(lambda row : sort_rec(row['item_rating']), axis=1)

0      [[B009E7YBEQ, 4.591134037333008], [B005HB19KW,...
1      [[B000EQT77M, 4.688993930610735], [B000GAT6NG,...
2      [[B000RGYJI6, 4.553275129504622], [B000SKLPME,...
3      [[B009OWOKFQ, 4.961142631747975], [B00RW0MZ6S,...
4      [[B002HQCWYM, 4.936396521390749], [B00C3YAH5E,...
                             ...                        
371    [[B00BCG0OAC, 4.585511343758281], [B000WR8TT0,...
372    [[B011FEEBTU, 4.785913475966577], [B00F94YPAS,...
373    [[B005HB19KW, 4.696248581886642], [B000E46GGO,...
374    [[B00ESDZIRA, 4.113267383986644], [B00B59QBE4,...
375    [[B001L498A2, 4.952719919690081], [B001FA1EHC,...
Length: 376, dtype: object

## Getting Metadata to extract Item Titles

In [848]:
#take an example 
df_meta = pd.read_json('meta_Grocery_and_Gourmet_Food.json', lines=True)

In [838]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287051 entries, 0 to 287050
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   category         287051 non-null  object
 1   tech1            287051 non-null  object
 2   description      287051 non-null  object
 3   fit              287051 non-null  object
 4   title            287051 non-null  object
 5   also_buy         287051 non-null  object
 6   tech2            287051 non-null  object
 7   brand            287051 non-null  object
 8   feature          287051 non-null  object
 9   rank             287051 non-null  object
 10  also_view        287051 non-null  object
 11  main_cat         287051 non-null  object
 12  similar_item     287051 non-null  object
 13  date             287051 non-null  object
 14  price            287051 non-null  object
 15  asin             287051 non-null  object
 16  imageURL         287051 non-null  object
 17  imageURLHi

In [849]:
df_meta = df_meta.iloc[:,[4,15]]
df_meta.drop_duplicates()
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287051 entries, 0 to 287050
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   287051 non-null  object
 1   asin    287051 non-null  object
dtypes: object(2)
memory usage: 4.4+ MB


## Results + Predictions

In [944]:

print("#----------SLOPE ONE METRICS------------#")
print("RMSE: ", rmse_slope)
print("MAE: ",mae_slope)
df_prec_slope = results_slope_groups.apply(lambda row : Precision(row.iloc[[1]].values[0], 
                                                           known_ratings[known_ratings['u_id'] == row['u_id']].values[0][1]),
                                       axis=1)
avg_prec_slope = df_prec_slope.mean()
print("Precision: ",avg_prec_slope)


df_recall_slope = results_slope_groups.apply(lambda row : Recall(row.iloc[[1]].values[0], 
                                                           known_ratings[known_ratings['u_id'] == row['u_id']].values[0][1]),
                                       axis=1)
avg_recall_slope = df_recall_slope.mean()
print("Recall: ",avg_recall_slope)

print("F-score: ", (2*avg_prec_slope*avg_recall_slope)/(avg_prec_slope+avg_recall_slope))
df_ndcg_slope = results_slope_groups.apply(lambda row : ndcg(row.iloc[[1]].values[0], 
                                                           known_ratings[known_ratings['u_id'] == row['u_id']].values[0][1]),
                                       axis=1)
print("NDCG: ", df_ndcg_slope.mean())
               

#----------SLOPE ONE METRICS------------#
RMSE:  0.9929189612315695
MAE:  0.6404119190604137
Precision:  0.7784574468085108
Recall:  0.10299482441879389
F-score:  0.18192042988308205
NDCG:  0.34033226858944593


In [945]:
print("#----------SVD METRICS------------#")
print("RMSE: ", rmse_svd)
print("MAE: ",mae_svd)
df_prec_svd = results_svd_groups.apply(lambda row : Precision(row.iloc[[1]].values[0], 
                                                           known_ratings[known_ratings['u_id'] == row['u_id']].values[0][1]),
                                       axis=1)
avg_prec_svd = df_prec_svd.mean()
print("Precision: ", avg_prec_svd)


df_recall_svd = results_svd_groups.apply(lambda row : Recall(row.iloc[[1]].values[0], 
                                                           known_ratings[known_ratings['u_id'] == row['u_id']].values[0][1]),
                                       axis=1)
avg_recall_svd = df_recall_svd.mean()
print("Recall: ", avg_recall_svd)

print("F-score: ", (2*avg_prec_svd*avg_recall_svd)/(avg_prec_svd+avg_recall_svd))
df_ndcg_svd = results_svd_groups.apply(lambda row : ndcg(row.iloc[[1]].values[0], 
                                                           known_ratings[known_ratings['u_id'] == row['u_id']].values[0][1]),
                                       axis=1)
print("NDCG: ",df_ndcg_svd.mean())

#----------SVD METRICS------------#
RMSE:  0.855878538450241
MAE:  0.6623146671535849
Precision:  0.772340425531915
Recall:  0.10162609299411798
F-score:  0.17961772732575237
NDCG:  0.3140628979696285


In [942]:
def get_recommendations(reviewer_id):
    
    print("SVD+baseline\t|\tTest Data\t|\tTrain Data")
    
    recc_svd = results_svd_groups[results_svd_groups['u_id'] == reviewer_id]['item_rating'].values.tolist()[0]
    #recc_slope = results_slope_groups[results_slope_groups['u_id'] == reviewer_id]['item_rating'].values.tolist()[0]
    recc_known = known_ratings[known_ratings['u_id'] == reviewer_id]['item_rating'].values.tolist()[0]
    recc_train = train_ratings[train_ratings['u_id'] == reviewer_id]['item_rating'].values.tolist()[0]
    #print(recc)
    for i in range(0,10):
        #print(recc[i][0], end='\t)
        
        print(df_meta[df_meta['asin']==recc_svd[i][0]]['title'].values.tolist()[0],"|", end='')
        #print(df_meta[df_meta['asin']==recc_slope[i][0]]['title'].values.tolist()[0], "|", end='')
        print(df_meta[df_meta['asin']==recc_known[i][0]]['title'].values.tolist()[0], "|", end='')
        print(df_meta[df_meta['asin']==recc_train[i][0]]['title'].values.tolist()[0])

                               
        '''
        print(recc_svd[i][1],"|", end='')
        print(recc_slope[i][1], "|", end='')
        print(recc_known[i][1], "|", end='')
        print(recc_train[i][1])
        '''
    
            
        

In [943]:
recc_list = get_recommendations('A100WO06OQR8BQ')

SVD+baseline	|	Test Data	|	Train Data
Bell Plantation PB2 Powdered Peanut Butter, 1 lb Jar (2-pack) |Knorr Pasta Sides, Beef 4.3 oz |Keurig, Coffee People, Jet Fuel, K-Cup Counts,Dark Roast Coffee 50 Count
Bai Flavored Water, Sumatra Dragonfruit, Antioxidant Infused Drinks, 18 Fluid Ounce Bottles, 12 count |V8 Original Low Sodium 100% Vegetable Juice, 5.5 oz. Can (8 packs of 6, Total of 48) |Napalm Coffee, EXTRA DARK ROAST, 100% Arabica, Single Serve Cups for Keurig K-Cup Brewers, 12 Count
Community Coffee Caf&eacute; Special Medium Dark Roast Single Serve, 36 Ct Box, Compatible with Keurig 2.0 K Cup Brewers, Full Body Smooth Full Flavor, 100% Arabica Coffee Beans |Coffee People DARK Roast Variety Sampler * JET FUEL &amp; BLACK TIGER * Extra Bold 48 K-Cups for Keurig Brewers |ALMOSTcoffee Coffee Substitute, 16 Oz. Brews Like Real Coffee, Great Tasting, Healthy, Naturally Caffeine-Free and Acid-Free, Non GMO.
Brooklyn Beans Oh Fudge Coffee Pods for Keurig K Cups Coffee Maker, 40 Count |