In [1]:
!pip install PyDrive
!pip install surprise



In [0]:
import pandas as pd
import numpy as np
from collections import defaultdict

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

from surprise import Reader, Dataset, SVD, KNNWithMeans, accuracy
from surprise.model_selection import cross_validate, train_test_split

**Load the data**



In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded = drive.CreateFile({'id':"1ClBptsK3V5KgKXtK2GSRzFNAW7GnTPDW"}) #file id is present in the sharable link of the document in Google Drive
downloaded.GetContentFile('ratings_Electronics.csv')

In [5]:
import pandas as pd
data = pd.read_csv('ratings_Electronics.csv', names=['userId','productId','ratings','timestamp'])
data.head()

Unnamed: 0,userId,productId,ratings,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [6]:
data.shape

(7824482, 4)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
userId       object
productId    object
ratings      float64
timestamp    int64
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


**Filter data based on users giving 50 or more raings**

In [0]:
data1 = data.groupby(by=['userId'],axis=0).sum().reset_index()[['userId','ratings']]
#selecting users haven given 60 or more ratings since 50 is causing the RAM to crash
filtered_userId = data1[data1['ratings'] >= 60.0]['userId']

In [9]:
filtered_data = data[data['userId'].isin(list(filtered_userId))]
filtered_data.head(10)

Unnamed: 0,userId,productId,ratings,timestamp
6,A3J3BRHTDRFJ2G,511189877,2.0,1397433600
16,A3N7T0DY83Y4IG,528881469,3.0,1283990400
17,A1H8PY3QHMQQA0,528881469,2.0,1290556800
89,AAZ084UMH8VZ2,594451647,5.0,1399161600
94,A3BY5KCNQZXV5U,594451647,5.0,1390176000
113,A2QBZA4S1ROX9Q,594481813,3.0,1369440000
118,AT09WGFUM934H,594481813,3.0,1377907200
150,A2C8SNQ59NMJ1,594514681,5.0,1389916800
158,A2E14J26DQZOAA,777700018,5.0,1355529600
165,A2R4GEWPLORVSO,899336795,2.0,1103328000


In [10]:
filtered_data.shape

(685413, 4)

**Parameters for the Recommendation System**

In [0]:
k=5

**Popularity Recommender Model**

In [12]:
#recommending k top products based on the total ratings received
filtered_data.groupby('productId').sum().sort_values(by=['ratings'], ascending=False).reset_index()[['productId','ratings']].head(k)

Unnamed: 0,productId,ratings
0,B007WTAJTO,7426.0
1,B003ES5ZUU,6348.0
2,B0019EHU8G,4585.0
3,B002WE6D44,4243.0
4,B002V88HFE,3767.0


In [13]:
#recommending k top products based on the total mean ratings received
filtered_data.groupby('productId').mean().sort_values(by=['ratings'], ascending=False).reset_index()[['productId','ratings']].head(k)

Unnamed: 0,productId,ratings
0,B003XY6QC6,5.0
1,B002Q0WMBC,5.0
2,B008UG15QE,5.0
3,B008UG2W20,5.0
4,B002Q48XXO,5.0


**Split data**

In [14]:
sampled_data = filtered_data.sample(frac = 0.1, replace = False) 
#using 10% percent of the data because using more than that causes the RAM to crash
sampled_data.shape

(68541, 4)

In [0]:
reader = Reader(rating_scale=(1, 5))
model_data = Dataset.load_from_df(sampled_data[['userId', 'productId', 'ratings']], reader)
trainset, testset = train_test_split(model_data, test_size=.3)

**Build Collaborative Filtering Model**

In [16]:
#User-User Collaborative Filtering
algo_user = KNNWithMeans(k=50, sim_options={'name': 'cosine', 'user_based': True})
algo_user.fit(trainset)
test_pred_user = algo_user.test(testset)
print("User-based Model : Test Set")
accuracy.rmse(test_pred_user, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
User-based Model : Test Set
RMSE: 1.1981


1.198073965078897

In [17]:
#Item-Item Collaborative Filtering
algo_item = KNNWithMeans(k=50, sim_options={'name': 'cosine', 'user_based': False})
algo_item.fit(trainset)
test_pred_item = algo_item.test(testset)
print("Item-based Model : Test Set")
accuracy.rmse(test_pred_item, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Item-based Model : Test Set
RMSE: 1.1749


1.1748738435237422

In [18]:
# SVD based Model
algo_svd = SVD()
algo_svd.fit(trainset)
test_pred_svd = algo_svd.test(testset)
print("SVD Model : Test Set")
accuracy.rmse(test_pred_svd, verbose=True)

SVD Model : Test Set
RMSE: 1.0877


1.0877463199048651

**Make Predictions**

In [0]:
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [20]:
# Predict using User-based rec model
top_n = get_top_n(test_pred_user, n=k)
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

A1PO9F60BWSD11 ['B001M1DLYS']
A130VGG4P4PW5J ['B00004SQET']
A2F4IS0G0O6UT8 ['B002HZW6WQ', 'B004S4R5CK']
AV834JROS4D9X ['B00140P90G', 'B001602N4E', 'B005QAL2KG']
AL73NEC9YOLQB ['B00728ZBA2']
A38Q631TAJPPLL ['B00B588HY2']
ATFBVUXDIRXT6 ['B0035VFWNG', 'B002TZ4CRG', 'B008O6KES0', 'B002AKKFQ2', 'B002AEU3EW']
A1Y5RRW8I4YX5Y ['B0002WPSBC', 'B004SPVZKW', 'B001F0RPGG', 'B0003NN83K', 'B002KNT0M4']
A3MGHSSFCPSCMT ['B001DKMO0A', 'B000FS4OOU']
A2ERMAM9K6DEOR ['B005IQRMN4', 'B001GTXGVO']
AWEI6TB5Z2D37 ['B0054U6CEE']
A1GF4BH6WPBZ4Y ['B0017VRGXU']
A1WDKTBMZCUAM5 ['B0036Q7MV0', 'B00BFYUWZI', 'B007W1QBO4']
AGEKVD8JPZQMT ['B008MF3X9K', 'B004286VMW']
AWBA6E5E1FIR8 ['B003WU6KFO']
A5N0QU8JBRVQQ ['B001S2PPT0', 'B00007E7QS']
A2AAOTWCPRFYYT ['B00077AA5Q', 'B00004WCFY']
ADSXIDO4TTTLA ['B001TOD7ME']
AFQXRY4F0I7H3 ['B00FS9EXV0']
A1ZOPX3N6QCCRO ['B0088X4DEK', 'B000M4M70K', 'B00006JN3G']
A2OS3TIVAKUAHG ['B00IDYS0XY', 'B004GK0GKO', 'B009AEYDZA']
A13YCEHOME8UYC ['B001H9NR2Q']
AONUCJBC85BL ['B000NLLXUM', 'B005LMWP82',

In [21]:
# Predict using Item-based rec model
top_n = get_top_n(test_pred_item, n=k)
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

A1PO9F60BWSD11 ['B001M1DLYS']
A130VGG4P4PW5J ['B00004SQET']
A2F4IS0G0O6UT8 ['B004S4R5CK', 'B002HZW6WQ']
AV834JROS4D9X ['B00140P90G', 'B001602N4E', 'B005QAL2KG']
AL73NEC9YOLQB ['B00728ZBA2']
A38Q631TAJPPLL ['B00B588HY2']
ATFBVUXDIRXT6 ['B008O6KES0', 'B002R5AM7C', 'B0035VFWNG', 'B002TZ4CRG', 'B002AEU3EW']
A1Y5RRW8I4YX5Y ['B0003NN83K', 'B0002WPSBC', 'B001F0RPGG', 'B002KNT0M4', 'B004SPVZKW']
A3MGHSSFCPSCMT ['B001DKMO0A', 'B000FS4OOU']
A2ERMAM9K6DEOR ['B001GTXGVO', 'B005IQRMN4']
AWEI6TB5Z2D37 ['B0054U6CEE']
A1GF4BH6WPBZ4Y ['B0017VRGXU']
A1WDKTBMZCUAM5 ['B0036Q7MV0', 'B00BFYUWZI', 'B007W1QBO4']
AGEKVD8JPZQMT ['B004286VMW', 'B008MF3X9K']
AWBA6E5E1FIR8 ['B003WU6KFO']
A5N0QU8JBRVQQ ['B001S2PPT0', 'B00007E7QS']
A2AAOTWCPRFYYT ['B00004WCFY', 'B00077AA5Q']
ADSXIDO4TTTLA ['B001TOD7ME']
AFQXRY4F0I7H3 ['B00FS9EXV0']
A1ZOPX3N6QCCRO ['B00006JN3G', 'B0088X4DEK', 'B000M4M70K']
A2OS3TIVAKUAHG ['B009AEYDZA', 'B00IDYS0XY', 'B004GK0GKO']
A13YCEHOME8UYC ['B001H9NR2Q']
AONUCJBC85BL ['B005LMWP82', 'B004DI7CW4',

In [22]:
# Predict using SVD rec model
top_n = get_top_n(test_pred_svd, n=k)
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

A1PO9F60BWSD11 ['B001M1DLYS']
A130VGG4P4PW5J ['B00004SQET']
A2F4IS0G0O6UT8 ['B002HZW6WQ', 'B004S4R5CK']
AV834JROS4D9X ['B00140P90G', 'B001602N4E', 'B005QAL2KG']
AL73NEC9YOLQB ['B00728ZBA2']
A38Q631TAJPPLL ['B00B588HY2']
ATFBVUXDIRXT6 ['B002R5AM7C', 'B008O6KES0', 'B0035VFWNG', 'B002TZ4CRG', 'B002AEU3EW']
A1Y5RRW8I4YX5Y ['B001F0RPGG', 'B0002WPSBC', 'B002KNT0M4', 'B004SPVZKW', 'B0003NN83K']
A3MGHSSFCPSCMT ['B001DKMO0A', 'B000FS4OOU']
A2ERMAM9K6DEOR ['B001GTXGVO', 'B005IQRMN4']
AWEI6TB5Z2D37 ['B0054U6CEE']
A1GF4BH6WPBZ4Y ['B0017VRGXU']
A1WDKTBMZCUAM5 ['B0036Q7MV0', 'B00BFYUWZI', 'B007W1QBO4']
AGEKVD8JPZQMT ['B004286VMW', 'B008MF3X9K']
AWBA6E5E1FIR8 ['B003WU6KFO']
A5N0QU8JBRVQQ ['B001S2PPT0', 'B00007E7QS']
A2AAOTWCPRFYYT ['B00004WCFY', 'B00077AA5Q']
ADSXIDO4TTTLA ['B001TOD7ME']
AFQXRY4F0I7H3 ['B00FS9EXV0']
A1ZOPX3N6QCCRO ['B00006JN3G', 'B0088X4DEK', 'B000M4M70K']
A2OS3TIVAKUAHG ['B009AEYDZA', 'B00IDYS0XY', 'B004GK0GKO']
A13YCEHOME8UYC ['B001H9NR2Q']
AONUCJBC85BL ['B005LMWP82', 'B00AR95FOU',