# Recommendation Systems (Popularity and Collaborative Filtering)

In [2]:
import pandas as pd
#importing the pandas library

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
#importing the seaborn and matplotlib library

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
#importing numpy and train_test_split library

In [5]:
colnames = [
    'Userid',
    'Itemid',
    'Ratings',
    'TimeStamp',
]
#column names for the ratings file; they are handed to read_csv through `names=` when the file is loaded

In [6]:
ratings_data = pd.read_csv(
    'ratings_Electronics.csv',
    header=None,
    names=colnames,
)
#reading the CSV file; every line is treated as data (no header row) and the columns are named from `colnames`

In [7]:
ratings_data.shape
#Getting the shape of the dataset

(7824482, 4)

In [8]:
ratings_data.dtypes
#checking the data type for the dataset

Userid        object
Itemid        object
Ratings      float64
TimeStamp      int64
dtype: object

In [9]:
ratings_data.head()
#checking the first five rows using the head function

Unnamed: 0,Userid,Itemid,Ratings,TimeStamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [10]:
ratings_data.isnull().values.any()
#Checking If the dataset contains any null values

False

In [125]:
ratings_data['Userid'].value_counts()
#number of ratings given by each user, most active users first; the index of the result holds the unique user ids

A5JLAU2ARJ0BO     520
ADLVFFE4VBT8      501
A3OXHLG6DIBRW8    498
A6FIAB28IS79      431
A680RUE1FDO8B     406
A1ODOGXEYECQQ8    380
A36K2N527TXXJN    314
A2AY4YUOX2N1BQ    311
AWPODHOB4GFWL     308
ARBKYIVNYWK3C     296
A25C2M3QF9G7OQ    296
A22CW0ZHY3NJH8    292
A3EXWV8FNSSFL6    282
A38RMU1Y5TDP9     282
A3LGT6UZL99IW1    279
A2NOW4U7W3F7RI    277
A23GFTVIETX7DS    270
A3PD8JD9L4WEII    266
A17BUUBOU0598B    261
A3AYSYSLHU26U9    257
A2XRMQA6PJ5ZJ8    253
A231WM2Z2JL0U3    252
A12DQZKRKTNF5E    252
A1UQBFCERIP7VJ    247
AGVWTYW0ULXHT     244
A203OCQQ12MAVT    240
AEJAGHLC675A7     239
A2NYK9KWFMJV4Y    238
A3A4ZAIBQWKOZS    236
A1T1YSCDW0PD25    227
                 ... 
A39MG6QWPGBNZ7      1
A3NOF8OVCNMCN0      1
A22QHGNSH22ZUS      1
A2KLU88CDXUYJ9      1
A3DQE41C9U9MDJ      1
AF6B5TNJXH20I       1
A1282NPGGSQ0HU      1
A3V5GS6KCHN0TA      1
A1C976N1W1JD47      1
A26KLH2KM047FR      1
A12J5PUHG9B3NG      1
A1YFHY9K2X6HH4      1
A6NU4TFDJUJOX       1
A2UNC7RO47VSXB      1
A18X32L2M2

In [126]:
ratings_data['Itemid'].value_counts()
#number of ratings received by each product, most-rated products first; the index of the result holds the unique product ids

B0074BW614    18244
B00DR0PDNE    16454
B007WTAJTO    14172
B0019EHU8G    12285
B006GWO5WK    12226
B003ELYQGG    11617
B003ES5ZUU    10276
B007R5YDYA     9907
B00622AG6S     9823
B0002L5R78     9487
B008OHNZI0     8966
B003LR7ME6     8840
B000LRMS66     8715
B009SYZ8OC     8370
B00BGA9WK2     7561
B004QK7HI8     7060
B009A5204K     7059
B00BGGDVOO     6893
B0098F5W0Q     6616
B002MAPRYU     6599
B002WE6D44     6509
B005HMKKH4     6134
B0012S4APK     5642
B0052YFYFK     5521
B0001FTVEK     5345
B0044YU60M     5239
B00316263Y     5038
B000I68BD4     4903
B006ZP8UOW     4842
B0041Q38NU     4774
              ...  
B004C3V4EC        1
B0000AI1EX        1
B000LSJCIC        1
B00478SMVU        1
B004RC7ZL4        1
B00H5HFAWY        1
B008NNYVH4        1
B004U2FSV0        1
B00A3RXRLE        1
B00A2V95N0        1
B00C3DDDZG        1
B0050I0E9O        1
B0009Y6IIA        1
B0002BG6JG        1
B000HKMNNU        1
B002COJ9ZO        1
B008PJV77S        1
B00AB4D6KG        1
B00KATCXII        1


In [133]:
ratings_data['Ratings'].value_counts()
# getting the unique ratings applied to different products

5.0    4347541
4.0    1485781
1.0     901765
3.0     633073
2.0     456322
Name: Ratings, dtype: int64

In [18]:
top_fifty = (
    ratings_data['Userid']
    .value_counts()
    .rename_axis('UserId')
    .reset_index(name="Rating count")
)
#one row per user ('UserId') with the number of ratings that user gave ('Rating count'), in descending order.
#NOTE: despite its name this frame holds every user; the 50-rating filter is applied in a later cell.

In [19]:
amazon_df = pd.DataFrame(
    ratings_data[
        ratings_data['Userid'].isin(
            top_fifty.loc[top_fifty['Rating count'] >= 50, 'UserId']
        )
    ]
)
#keeping only the ratings made by users who have rated 50 or more products (the threshold is inclusive)

In [20]:
users = amazon_df.Userid.unique()
#array of the distinct user ids that remain after the 50-rating filter

In [21]:
train_data, test_data = train_test_split(
    amazon_df,
    test_size=0.30,
    random_state=0,
)
#70/30 split of the filtered ratings with a fixed seed; this call expects sklearn's train_test_split (imported at the top), which takes a DataFrame

## Popularity-based model: ranking the most-rated products

In [52]:
train_data_grouped = (
    train_data
    .groupby('Itemid')['Userid']
    .count()
    .reset_index()
)
#group the training ratings by product and count the ratings each product received (the count sits in the 'Userid' column until it is renamed)

In [53]:
train_data_grouped = train_data_grouped.rename(columns={'Userid': 'score'})
#rename the count column from 'Userid' to 'score'

In [54]:
train_data_sort = train_data_grouped.sort_values(
    by=['score', 'Itemid'],
    ascending=[False, True],
)
#sort by score (highest first); ties are broken by Itemid in ascending order

In [55]:
train_data_sort['Rank'] = train_data_sort.score.rank(method='first', ascending=False)
#add a 'Rank' column: 1 for the highest score; method='first' gives tied scores distinct ranks in the order they appear

In [38]:
popularity_recommendations = train_data_sort.iloc[:10]
# the first 10 rows of the sorted frame, i.e. the 10 products with the most ratings in the training data

In [57]:
cols = popularity_recommendations.columns.tolist()
cols = [*cols[-1:], *cols[:-1]]
user_recommendations = popularity_recommendations[cols]
# take the top products and rearrange the columns so that the last one moves to the front (this puts Rank first, as the output below shows)

In [58]:
user_recommendations

Unnamed: 0,Rank,Itemid,score
30847,1.0,B0088CJT4U,133
30287,2.0,B007WTAJTO,124
19647,3.0,B003ES5ZUU,122
8752,4.0,B000N99BBC,114
30555,5.0,B00829THK0,97
30559,6.0,B00829TIEK,97
17384,7.0,B002R5AM7C,94
31107,8.0,B008DWCRQW,91
17573,9.0,B002SZEOLG,84
22744,10.0,B004CLYEDC,82


# Collaborative based filtering model

In [59]:
from surprise import Reader, Dataset
#importing the Surprise library

In [60]:
reader=Reader(rating_scale=(1,5))
#initialising the Reader object with the rating scale of the data (1 to 5)

In [127]:
data = Dataset.load_from_df(
    amazon_df.loc[:, ['Userid', 'Itemid', 'Ratings']],
    reader,
)
#loading the filtered ratings into a surprise Dataset; the three columns are passed in the order user id, item id, rating, together with the reader

In [128]:
data
#data is now a SURPRISE object.

<surprise.dataset.DatasetAutoFolds at 0x17f6be109b0>

In [63]:
from surprise.model_selection import train_test_split
#importing train-test-split function from surprise

In [88]:
trainset, testset = train_test_split(data,test_size=.30, random_state=1233)
#Using the train-test split function to split the data in the ratio 70:30

In [89]:
trainset
#Trainset again is a surprise object

<surprise.trainset.Trainset at 0x17e178abc18>

In [90]:
from surprise import KNNWithMeans
#importing KNNwithMeans from the surprise library

In [91]:
from surprise import accuracy
#importing accuracy from Surprise library

In [92]:
from surprise import Prediction
#importing prediction from surprise library

In [93]:
algo = KNNWithMeans(
    k=51,
    sim_options={'name': 'pearson', 'user_based': True},
)
#Creating the model with the value of K=51; Pearson similarity, user-user (user_based=True) rather than item-item

In [94]:
algo.fit(trainset)
#fitting the model on the training set (the log below shows the Pearson similarity matrix being computed)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x17e0c5a8b38>

In [95]:
len(testset)
#calculating the length of the test set.

37762

In [96]:
test_pred=algo.test(testset)
#getting the predictions for the test set and storing them in a variable called test_pred

In [100]:
test_pred[5]
# inspecting a single test-set prediction: user id, item id, true rating (r_ui), estimated rating (est) and details

Prediction(uid='A3V6QSTG2T9MIO', iid='B00829TIEK', r_ui=5.0, est=4.985848628463074, details={'actual_k': 2, 'was_impossible': False})

In [101]:
test_pred_df=pd.DataFrame(test_pred)
#turning the list of predictions into a dataframe (columns uid, iid, r_ui, est, details, as shown below)

In [102]:
test_pred_df

Unnamed: 0,uid,iid,r_ui,est,details
0,AHR86ZEYETLJI,B0073HSH8U,5.0,4.522388,"{'actual_k': 0, 'was_impossible': False}"
1,A3PD8JD9L4WEII,B0085BR4RS,5.0,4.265989,"{'was_impossible': True, 'reason': 'User and/o..."
2,A3GX0FAMEXV6FB,B007PJ4Q4A,4.0,4.045455,"{'actual_k': 0, 'was_impossible': False}"
3,ADLVFFE4VBT8,B002QQ7TLY,5.0,4.385184,"{'actual_k': 3, 'was_impossible': False}"
4,A2HKUM9OEBQKDW,B006Z0Q2SI,5.0,4.183673,"{'actual_k': 0, 'was_impossible': False}"
5,A3V6QSTG2T9MIO,B00829TIEK,5.0,4.985849,"{'actual_k': 2, 'was_impossible': False}"
6,A2JXS1JII6SAUD,B009HP42HQ,5.0,4.607843,"{'actual_k': 0, 'was_impossible': False}"
7,A38Z6QG6988WS7,B008YQAFH0,5.0,4.857143,"{'actual_k': 0, 'was_impossible': False}"
8,A1SX9PHJWTDMF9,B000O8I474,5.0,4.733333,"{'actual_k': 0, 'was_impossible': False}"
9,A2BYV7S1QP2YIG,B007IFVSYM,5.0,4.265989,"{'was_impossible': True, 'reason': 'User and/o..."


In [104]:
algo.predict(uid='AQNSQW4495SG0',iid='B002SA9N8K')
#asking the model for the estimated rating of one user/item pair. In the output below the prediction is
#flagged was_impossible=True ('User and/or item is unkown.'), so `est` is a fallback value rather than a
#neighbour-based estimate; it does not tell us whether the item should be recommended to this user.

Prediction(uid='AQNSQW4495SG0', iid='B002SA9N8K', r_ui=None, est=4.265988718519107, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

# Calculating the accuracy for the collaborative model

In [131]:
# we use the accuracy module from the surprise package and report RMSE (root mean squared error) on the test-set predictions
accuracy.rmse(test_pred)

RMSE: 1.0582


1.0581779546301344

# Get top - K ( K = 5) recommendations. 

In [105]:
testset_new=trainset.build_anti_testset()
# building the anti-testset: the (user, item) pairs that have no rating in the training set, each with a placeholder rating; these are the candidates to score for recommendations

In [106]:
len(testset_new)
#checking the length of the dataset

58652111

In [107]:
testset_new[0:5]
#peeking at the first five (user, item, placeholder rating) entries of the anti-testset; these are candidate pairs, not recommendations yet

[('A30JPZ9TZ7I61U', 'B0017KH6OU', 4.265988718519107),
 ('A30JPZ9TZ7I61U', 'B001VFUN82', 4.265988718519107),
 ('A30JPZ9TZ7I61U', 'B00009W3SK', 4.265988718519107),
 ('A30JPZ9TZ7I61U', 'B00B9BUZW2', 4.265988718519107),
 ('A30JPZ9TZ7I61U', 'B0076POAOE', 4.265988718519107)]

In [108]:
predictions = algo.test(testset_new[:10000])
#scoring only the first 10000 anti-testset pairs (the full list has 58,652,111 entries)
#NOTE(review): such a small slice may cover very few users - in the run shown only one user id appears in the results

In [109]:
predictions_df = pd.DataFrame(
    [(pred.uid, pred.iid, pred.est) for pred in predictions]
)
#storing the user id, item id and estimated rating of every prediction in a dataframe (columns are still unnamed: 0, 1, 2)

In [110]:
predictions_df.head()
#checking the dataframe

Unnamed: 0,0,1,2
0,A30JPZ9TZ7I61U,B0017KH6OU,4.416667
1,A30JPZ9TZ7I61U,B001VFUN82,3.752521
2,A30JPZ9TZ7I61U,B00009W3SK,4.416667
3,A30JPZ9TZ7I61U,B00B9BUZW2,4.416667
4,A30JPZ9TZ7I61U,B0076POAOE,4.416667


In [111]:
predictions_df.columns = [
    'Userid',
    'Productid',
    'est_rating',
]
#naming the three columns: user id, product id and estimated rating

In [112]:
predictions_df.head()
# checking the data after naming the columns

Unnamed: 0,Userid,Productid,est_rating
0,A30JPZ9TZ7I61U,B0017KH6OU,4.416667
1,A30JPZ9TZ7I61U,B001VFUN82,3.752521
2,A30JPZ9TZ7I61U,B00009W3SK,4.416667
3,A30JPZ9TZ7I61U,B00B9BUZW2,4.416667
4,A30JPZ9TZ7I61U,B0076POAOE,4.416667


In [115]:
predictions_df = predictions_df.sort_values(
    ['Userid', 'est_rating'],
    ascending=False,
)
#sorting by user id and then by estimated rating, both descending, so each user's highest estimates come first

In [117]:
predictions_df.head()

Unnamed: 0,Userid,Productid,est_rating
22,A30JPZ9TZ7I61U,B00HFRWWAM,5.0
28,A30JPZ9TZ7I61U,B0015DYMVO,5.0
29,A30JPZ9TZ7I61U,B000LP0R3E,5.0
34,A30JPZ9TZ7I61U,B0062XB9FE,5.0
63,A30JPZ9TZ7I61U,B000IF51UQ,5.0


In [119]:
top_5_recos = (
    predictions_df
    .groupby('Userid')
    .head(5)
    .reset_index(drop=True)
)
#first five rows per user; these are the top 5 by estimated rating only because predictions_df was sorted by rating (descending) beforehand

In [123]:
top_5_recos
# the top 5 recommendations for the user.

Unnamed: 0,Userid,Productid,est_rating
0,A30JPZ9TZ7I61U,B00HFRWWAM,5.0
1,A30JPZ9TZ7I61U,B0015DYMVO,5.0
2,A30JPZ9TZ7I61U,B000LP0R3E,5.0
3,A30JPZ9TZ7I61U,B0062XB9FE,5.0
4,A30JPZ9TZ7I61U,B000IF51UQ,5.0


# Insights

### In this project we built two recommender systems: a popularity-based recommender and a collaborative filtering model. The two work differently: the popularity-based recommender suggests the same most popular products to every user, whereas collaborative filtering can follow one of two approaches, item-based or user-based. We used the user-based collaborative filtering model.

### No of users: 4201696

### No of products: 476002

### Rating scale: 1 to 5