# Recommendation System | Project | KG

In [1]:
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf

# Environment report: library versions first, then GPU visibility.
version_report = [
    f"Tensor Flow Version: {tf.__version__}",
    f"Keras Version: {tensorflow.keras.__version__}",
    "",
    f"Python {sys.version}",
    f"Pandas {pd.__version__}",
    f"Scikit-Learn {sk.__version__}",
]
print("\n".join(version_report))

# A non-empty device list means TensorFlow found at least one GPU.
gpu = bool(tf.config.list_physical_devices('GPU'))
gpu_status = "available" if gpu else "NOT AVAILABLE"
print("GPU is", gpu_status)

Tensor Flow Version: 2.1.0
Keras Version: 2.2.4-tf

Python 3.7.7 (default, May  6 2020, 11:45:54) [MSC v.1916 64 bit (AMD64)]
Pandas 1.1.1
Scikit-Learn 0.23.2
GPU is available


In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Passing `names` makes pandas read every row as data (no header row is consumed).
rating_columns = ['userID', 'prodID', 'rating', 'timeStamp']
rawdata = pd.read_csv("ratings_Electronics.csv", names=rating_columns)

In [5]:
# Summary statistics for the numeric columns (rating and timeStamp).
rawdata.describe()

Unnamed: 0,rating,timeStamp
count,7824482.0,7824482.0
mean,4.012337,1338178000.0
std,1.38091,69004260.0
min,1.0,912729600.0
25%,3.0,1315354000.0
50%,5.0,1361059000.0
75%,5.0,1386115000.0
max,5.0,1406074000.0


In [6]:
# Column dtypes, row count and memory footprint of the raw ratings frame.
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userID     object 
 1   prodID     object 
 2   rating     float64
 3   timeStamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


In [7]:
# The timestamp is not used by any later step, so it is dropped here.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
rawdata = rawdata.drop(columns=['timeStamp'], errors='ignore')

In [8]:
# Count missing values per column.
rawdata.isnull().sum()

userID    0
prodID    0
rating    0
dtype: int64

There are no null values!

In [9]:
# Unique user and product counts in the raw data.
raw_user_count = rawdata['userID'].nunique()
raw_product_count = rawdata['prodID'].nunique()
print("There are ", raw_user_count, " users who reviewed ", raw_product_count, " products")

There are  4201696  users who reviewed  476002  products


In [10]:
# Number of ratings submitted by each user (value_counts sorts most-active first).
user_rating_counts = rawdata['userID'].value_counts()

In [11]:
# Display the per-user rating counts.
user_rating_counts

A5JLAU2ARJ0BO     520
ADLVFFE4VBT8      501
A3OXHLG6DIBRW8    498
A6FIAB28IS79      431
A680RUE1FDO8B     406
                 ... 
A30WC6JE6FBEBO      1
A350YH5R3X0G5I      1
A3QQ15T3HNRJBJ      1
ARI9ZOEA34YE0       1
A3BH4N97KHC615      1
Name: userID, Length: 4201696, dtype: int64

In [12]:
# Keep only the ratings made by users who submitted more than 50 ratings overall.
frequent_users = user_rating_counts[user_rating_counts > 50].index
from_frequent_user = rawdata['userID'].isin(frequent_users)
workingData = rawdata[from_frequent_user]

In [13]:
# (rows, columns) of the filtered working set.
workingData.shape

(122171, 3)

In [14]:
# Unique user and product counts in the filtered working set.
working_user_count = workingData['userID'].nunique()
working_product_count = workingData['prodID'].nunique()
print("There are ", working_user_count, " users who reviewed ", working_product_count, " products")
print("The current overview in our working data set ")

There are  1466  users who reviewed  47155  products
The current overview in our working data set 


In [15]:
# NOTE: despite the name, this groups the ratings by product (prodID), not by user.
groupedUser = workingData.groupby('prodID')

In [16]:
# Per product, count how many ratings (one row per rating user) it received.
popularProducts = groupedUser[['userID']].count()

In [17]:
# The per-product rating count is the popularity "score".
# Rebinding instead of inplace=True avoids mutating a frame shared across cells.
popularProducts = popularProducts.rename(columns={'userID': 'score'})

In [18]:
# prodID is kept as the index. reset_index() returns a new frame, so calling it
# without assigning the result had no effect and has been dropped.
popularProducts.head()

Unnamed: 0_level_0,score
prodID,Unnamed: 1_level_1
594481813,1
970407998,2
972683275,3
1400501466,5
1400501520,1


In [19]:
# Rank products by number of ratings received and keep the ten most-rated.
ranked_products = popularProducts.sort_values(by='score', ascending=False)
topPopularProducts = ranked_products.head(10)

Note that "popular" here means most reviewed, not necessarily most highly rated: the score is simply the number of ratings a product received.

In [20]:
# Show the ten most-reviewed products.
topPopularProducts

Unnamed: 0_level_0,score
prodID,Unnamed: 1_level_1
B0088CJT4U,204
B003ES5ZUU,177
B000N99BBC,163
B007WTAJTO,156
B00829TIEK,146
B008DWCRQW,135
B00829THK0,132
B002R5AM7C,127
B004CLYEDC,117
B004T9RR6I,108


## Train test split

In [34]:
from sklearn.model_selection import train_test_split

In [32]:
# from surprise.model_selection import train_test_split

In [35]:
# 70/30 random split of the rating rows with a fixed seed for reproducibility.
# NOTE: this expects scikit-learn's train_test_split; a later cell imports
# surprise's function under the same name, so run this cell before that import.
trainData, testData = train_test_split(workingData, test_size = 0.3, random_state = 0)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() also emitted a stray "None" line.
trainData.info()
print()
testData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85519 entries, 1374364 to 4178710
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userID  85519 non-null  object 
 1   prodID  85519 non-null  object 
 2   rating  85519 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.6+ MB
None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36652 entries, 7799371 to 4091526
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userID  36652 non-null  object 
 1   prodID  36652 non-null  object 
 2   rating  36652 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.1+ MB
None


# Collaborative Filtering

In [27]:
from surprise import Reader, Dataset

from surprise import KNNBasic, KNNBaseline, KNNWithZScore, KNNWithMeans, NormalPredictor, SVD, SVDpp, NMF, SlopeOne, CoClustering, BaselineOnly

from surprise.model_selection import cross_validate

In [48]:
# Ratings are on a 1-5 scale.
r = Reader(rating_scale=(1, 5))
# The surprise data set is built from the whole filtered frame; to use only the
# sklearn training split, pass trainData instead of workingData.
d = Dataset.load_from_df(df=workingData, reader=r)

In [49]:
# Benchmark every candidate algorithm with 2-fold cross-validation on `d`,
# recording the mean test RMSE and the mean fit/test times per algorithm.
bm = []

algos = [KNNBasic(), KNNBaseline(), KNNWithZScore(), KNNWithMeans(), NormalPredictor(), SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering(), BaselineOnly() ]

for algo in algos:

    results = cross_validate(algo, d, measures = ['RMSE'], cv = 2, verbose = False)
    # Average each metric over the folds.
    output = pd.DataFrame.from_dict(results).mean(axis = 0).to_dict()
    # Label the row with the class name. A plain dict record replaces
    # Series.append, which was removed in pandas 2.0.
    output['Algorithm'] = algo.__class__.__name__
    bm.append(output)

# One row per algorithm, lowest (best) RMSE first.
pd.DataFrame(bm).set_index('Algorithm').sort_values('test_rmse')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.985082,0.323106,0.516501
SVD,0.987144,5.607271,0.738349
SVDpp,0.988322,73.310576,5.607432
KNNBaseline,1.043523,0.433004,1.795501
KNNWithMeans,1.066131,0.183502,1.585498
KNNWithZScore,1.070207,0.290001,1.924999
CoClustering,1.090862,7.004556,0.519003
SlopeOne,1.0963,41.289502,3.062246
KNNBasic,1.113432,0.123174,1.703263
NMF,1.154593,10.694884,0.696503


In [50]:
from surprise.model_selection import train_test_split

In [51]:
# surprise and scikit-learn both export a function called train_test_split, and
# this notebook imports both. Importing surprise's version under an explicit
# alias makes this cell independent of which import happened to run last.
from surprise.model_selection import train_test_split as surprise_train_test_split

trainset, testset = surprise_train_test_split(d, test_size=0.3, random_state=20)

In [46]:

# baselineOnly_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5 }
# algo = BaselineOnly(bsl_options=baselineOnly_options)
# cross_validate(algo, trainset, measures=['RMSE'], cv=3, verbose=False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.9848268 , 0.99080434, 0.97784835]),
 'fit_time': (0.16100215911865234, 0.1750044822692871, 0.1979992389678955),
 'test_time': (0.22900032997131348, 0.2559995651245117, 0.26599979400634766)}

In [42]:
# algos

In [43]:
# d

In [52]:
# `accuracy` was never imported anywhere in the notebook, so this cell raised
# NameError on a fresh kernel.
from surprise import accuracy

# Fit each candidate on the training split and report its RMSE on the held-out split.
# NOTE: after the loop, test_predictions holds the predictions of the LAST
# algorithm in `algos` only.
for algo in algos:
    algo.fit(trainset)
    test_predictions = algo.test(testset)
    # verbose=False: rmse() would otherwise print the score itself, duplicating
    # the value printed on the next line.
    test_rmse = accuracy.rmse(test_predictions, verbose=False)
    print(algo.__class__.__name__, " Test Prediction accuracy ", test_rmse)
    print()




Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1086
KNNBasic  Test Prediction accuracy  1.1085573112306182
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0415
KNNBaseline  Test Prediction accuracy  1.0415018537731429
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0635
KNNWithZScore  Test Prediction accuracy  1.063501937599031
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0614
KNNWithMeans  Test Prediction accuracy  1.0614354113224311
RMSE: 1.3601
NormalPredictor  Test Prediction accuracy  1.3601235941053618
RMSE: 0.9870
SVD  Test Prediction accuracy  0.9869895676314099
RMSE: 0.9875
SVDpp  Test Prediction accuracy  0.9874588556161215
RMSE: 1.1455
NMF  Test Prediction accuracy  1.1454994942441592
RMSE: 1.0946
SlopeOne  Test Prediction accuracy  1.0946289557606204
RMSE: 1.0762
CoClustering  Test Prediction accuracy  1.07623

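## *Lower RMSE is better: among the results shown, SVD (0.9870) and SVDpp (0.9875) have the lowest test RMSE, while NormalPredictor (1.3601) has the highest, i.e. it is the least accurate*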

### The code below is taken from the Surprise FAQ, "How to get the top-N recommendations for each user": https://surprise.readthedocs.io/en/stable/FAQ.html

In [55]:
from collections import defaultdict

In [56]:

def get_top_n(predictions, n=10):
    """Pick the n best-scoring items per user out of a list of predictions.

    Args:
        predictions: Iterable of 5-field prediction records
            (user id, item id, true rating, estimated rating, details), as
            returned by the test method of an algorithm.
        n(int): How many items to keep for each user. Default is 10.

    Returns:
        A defaultdict mapping each user id to a list of at most n
        (item id, estimated rating) tuples, highest estimate first. Items with
        equal estimates keep the order in which they appeared in the input.
    """

    # Bucket the (item, estimate) pairs by user.
    by_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        by_user[uid].append((iid, est))

    # Rank each user's bucket by estimated rating and keep the first n entries.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for uid in by_user:
        ranked = sorted(by_user[uid], key=lambda pair: pair[1], reverse=True)
        by_user[uid] = ranked[:n]

    return by_user

In [57]:
# NOTE(review): test_predictions is whatever the evaluation loop assigned last,
# i.e. the predictions of the final entry in `algos` (BaselineOnly) on `testset`.
# The ranking is therefore over held-out ratings, not over unseen items.
top_n = get_top_n(test_predictions, n=5)



In [58]:
# Print each user's recommended item ids (estimates omitted).
for uid, user_ratings in top_n.items():
    recommended_items = [iid for iid, _est in user_ratings]
    print(uid, recommended_items)

PZQYE6', 'B003TM3EL8']
A15Q2L6DYRTFIU ['B004UQ05M8', 'B00483WRZ6', 'B00394T7BW', 'B001HJTK5Y', 'B00GUQJZCW']
A2PC9ZZIFV31B1 ['B007WTAJTO', 'B0044YPN0A', 'B008SFPMRK', 'B004L3R0Z0', 'B004I43F9C']
A3NXJDNTQK2YJL ['B0052SCU8U', 'B000IJY8DS', 'B0082E9K7U', 'B00005T3G0', 'B003M0NURK']
A3TR3KLL5PXSZ8 ['B004AZ38Z0', 'B001GCU0MY', 'B0000ZJDXU', 'B004XY65WQ', 'B00AO0K3KS']
A3BI8BKIHESDNQ ['B000FBK3QK', 'B0009ON12G', 'B0003NN83K', 'B0057XC2X8', 'B007P4VOWC']
A11LNPG39A2ZV4 ['B00BOHNYU6', 'B00BOHNYTW', 'B0041OSAZ8', 'B001KUL012', 'B000LRMS66']
AFFVUZEGP1FDQ ['B004HKJTT2', 'B001TH7GT6', 'B00FJWKYYG', 'B00IX2VGFA', 'B000BSOBG0']
A3GQY0TFRG0MPZ ['B008JCVF0U', 'B001XURP7W', 'B009FD4UDG', 'B007I5BRIE', 'B003V42O6K']
A1143YEAZYMYHJ ['B0002L5R78', 'B001FA1NZK', 'B00894YP00', 'B001EYV9TM', 'B00HR7FWUC']
A3AGQAXTX6VRJ3 ['B0019EHU8G', 'B0007QKMQY', 'B007WTAJTO', 'B0001CNMFM', 'B003ZUIHY8']
AELK0E5DK7LIZ ['B00BOHNYU6', 'B00829THK0', 'B00005N5WW', 'B000GINMOW', 'B00007FS0F']
AZCE11PSTCH1L ['B003W9B4H4', 'B00

## It is quite evident that, rather than recommending a product to a user based on the product's popularity alone, recommending the products identified (*read: predicted*) from the user's past affiliation patterns is more meaningful, both for the user and for the movement of the product(s).