In [None]:
#Problem Statement -

#Build your own recommendation system for products on an e-commerce website like Amazon.com.

#Dataset  - ratings_Electronics.csv

#Dataset columns - first three columns are userId, productId, and ratings and the fourth column is timestamp. 
#You can discard the timestamp column as in this case you may not need to use it.

#Source - Amazon Reviews data (http://jmcauley.ucsd.edu/data/amazon/)  The repository has several datasets. 
#For this case study, we are using the Electronics dataset.

In [234]:
#Read and explore the given dataset.  ( Rename column/add headers, plot histograms, find data characteristics)
import pandas as pd
from sklearn.model_selection import train_test_split
from surprise import Dataset,Reader
from surprise import KNNWithMeans
from surprise import accuracy
import Recommenders as Recommenders
import Evaluation as Evaluation
import time
from sklearn.externals import joblib

In [204]:
#loading data and assigning column names
electronic_df = pd.read_csv("ratings_Electronics.csv",header = None, names= ['userId','productId','ratings','timestamp'])
electronic_df.head()

Unnamed: 0,userId,productId,ratings,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [205]:
electronic_df = electronic_df.drop('timestamp', axis =1) # Dropping time stamp column as its not relevant for this analysis.
electronic_df.head()

Unnamed: 0,userId,productId,ratings
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


In [206]:
electronic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 3 columns):
userId       object
productId    object
ratings      float64
dtypes: float64(1), object(2)
memory usage: 179.1+ MB


In [207]:
electronic_df.shape

(7824482, 3)

In [208]:
print("Number of unique users: ",len(electronic_df['userId'].unique())) # There are 4201696 unique users.
users = electronic_df['userId']

Number of unique users:  4201696


In [209]:
len(electronic_df['productId'].unique()) # There are 476002 unique products.

476002

In [210]:
electronic_df['ratings'].value_counts() # Various ratings given by user.

5.0    4347541
4.0    1485781
1.0     901765
3.0     633073
2.0     456322
Name: ratings, dtype: int64

In [220]:
##Take a subset of the dataset to make it less sparse/ denser. 
#( For example, keep the users only who has given 50 or more number of ratings ) # Making it 125 as data set is still dense when 
# considered users only who has given 50+ ratings
userID_count = electronic_df["userId"].value_counts()
user_rated_100 = electronic_df[electronic_df["userId"].isin(userID_count[userID_count >= 155].index)]
#print(user_rated_100) # only has users who have rated atleast 100 or more movies
print(user_rated_100.shape)

(21355, 3)


In [221]:
user_rated_100.head()

Unnamed: 0,userId,productId,ratings
631,A3TAS1AG6FMBQW,972683275,5.0
2162,A5JLAU2ARJ0BO,1400532655,1.0
3383,A3PD8JD9L4WEII,1400699169,5.0
4495,A1ZU55TM45Y2R8,8862936826,2.0
4614,A1VQHH85U7PX0,9043413585,3.0


In [222]:
user_rated_100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21355 entries, 631 to 7824444
Data columns (total 3 columns):
userId       21355 non-null object
productId    21355 non-null object
ratings      21355 non-null float64
dtypes: float64(1), object(2)
memory usage: 667.3+ KB


In [239]:
train_data, test_data = train_test_split(user_rated_100, test_size = 0.30, random_state=0)
train_data.head()

Unnamed: 0,userId,productId,ratings
506118,A1EVV74UQYVKRY,B0002SQ2P2,5.0
3440851,A1ZU55TM45Y2R8,B003LVZO92,5.0
3703637,A38RMU1Y5TDP9,B003ZG9T62,5.0
392435,A4WEZJOIZIV4U,B0001G6U4S,5.0
316472,A5JLAU2ARJ0BO,B0000AH5HF,2.0


# Popularity based model 

In [231]:
#Build Popularity Recommender model.
pm = Recommenders.popularity_recommender_py()
pm.create(train_data, 'userId', 'productId')
user_id = users[20]
pm.recommend(user_id) # list of top 10 products recommended for user 20 based on popularity.

Unnamed: 0,userId,productId,score,Rank
7148,A37K02NKUIT68K,B003ES5ZUU,31,1.0
10388,A37K02NKUIT68K,B0088CJT4U,26,2.0
8052,A37K02NKUIT68K,B004CLYEDC,25,3.0
8054,A37K02NKUIT68K,B004CLYEFK,24,4.0
9985,A37K02NKUIT68K,B0079UAT0A,22,5.0
10319,A37K02NKUIT68K,B00829TIEK,22,6.0
10315,A37K02NKUIT68K,B00829THK0,21,7.0
10324,A37K02NKUIT68K,B0082E9K7U,21,8.0
6442,A37K02NKUIT68K,B002R5AM7C,20,9.0
10234,A37K02NKUIT68K,B007WTAJTO,20,10.0


# Build Collaborative Filtering model.

In [226]:

user_rated_100.userId=user_rated_100.userId.astype(str)
user_rated_100.productId=user_rated_100.productId.astype(str)
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(user_rated_100[['userId', 'productId', 'ratings']], reader)
data
from surprise.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.30,random_state=0)


In [227]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson', 'user_based': False})
algo.fit(train_set)
algo.get_neighbors

Computing the pearson similarity matrix...
Done computing similarity matrix.


<bound method AlgoBase.get_neighbors of <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000001D698E17208>>

In [228]:
# Evalute on test set
test_pred = algo.test(test_set)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 1.0532


1.0531914436523602

# Get top - K ( K = 5) recommendations. 
# Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.
# Summarise your insights.

In [232]:
testset_new = train_set.build_anti_testset()
predictions = algo.test(testset_new[0:10000])
predictions_df = pd.DataFrame([[x.uid,x.iid,x.est] for x in predictions])
predictions_df.columns = ["userId","productId","est_rating"]
predictions_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)
top_10_recos = predictions_df.groupby("userId").head(10).reset_index(drop=True)
top_10_recos

Unnamed: 0,userId,productId,est_rating
0,A6FIAB28IS79,B0006FTKYG,5.0
1,A6FIAB28IS79,B00AIHYHJA,5.0
2,A6FIAB28IS79,B004QZFOF6,5.0
3,A6FIAB28IS79,B001AHSNB0,5.0
4,A6FIAB28IS79,B00009UTCD,5.0
5,A6FIAB28IS79,B000YEMKGY,5.0
6,A6FIAB28IS79,B00GQS290Y,5.0
7,A6FIAB28IS79,B000EMTJ92,5.0
8,A6FIAB28IS79,B00FDLHNHO,5.0
9,A6FIAB28IS79,B003Z4G3I6,5.0


In [None]:
#Evaluate both the models. 
#( Once the model is trained on the training data, it can be used to compute the error (like RMSE) on predictions made on 
#the test data.) #You can also use a different method to evaluate the models.


In [240]:
# RMSE for collaborative is 

# Evalute on test set
test_pred = algo.test(test_set)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 1.0532


1.0531914436523602