1. Install Scikit-Surprise Library and Import Libraries

In [1]:
!pip install scikit-surprise
import numpy as np  
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict
from surprise import KNNWithMeans
from surprise import SVD, SVDpp
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

import time

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 33.1 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626488 sha256=2e8b1b1a26d83097360e848f5ff19657e464d4b01c5a7fa56363f4d201a95fa7
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


2. Connect to GDrive

In [3]:
# Mount Google Drive under /content/drive; the dataset CSV is read from that mount below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


3. Load Dataset

In [4]:
# Load the raw ratings CSV and report how long the read took.
start_time = time.time()

# Column names are supplied explicitly via `names`.
# The ID columns are forced to `str`: some product IDs consist only of digits
# (e.g. 0132793040) and would lose their leading zeros, or end up as a
# mixed int/str column, if pandas were left to infer the type.
df = pd.read_csv(
    "/content/drive/MyDrive/SNAProject/ratings_Electronics.csv",
    names=["userId", "productId", "rating", "timestamp"],
    dtype={"userId": str, "productId": str, "rating": float},
)
print(df.head())

computational_time = time.time() - start_time
print('Done in %0.3fs' %(computational_time))

           userId   productId  rating   timestamp
0   AKM1MP6P0OYPR  0132793040     5.0  1365811200
1  A2CX7LUOHB2NDG  0321732944     5.0  1341100800
2  A2NWSAGRHCP8N5  0439886341     1.0  1367193600
3  A2WNBOD3WNDNKT  0439886341     3.0  1374451200
4  A1GI0U4ZRJA8WN  0439886341     1.0  1334707200
Done in 11.317s


4. Print number of Rows and Columns in the dataset

In [5]:
# Report the dimensions of the raw ratings table.
rows_count = df.shape[0]
columns_count = df.shape[1]
print('Total Number of rows :', rows_count)
print('Total Number of columns :', columns_count)

Total Number of rows : 7824482
Total Number of columns : 4


5. Print number of unique users and products in the dataset

In [6]:
# Count the distinct users and distinct products in the raw ratings table.
unique_userId, unique_productId = (
    df[column].nunique() for column in ('userId', 'productId')
)
print('Total number of unique Users    : ', unique_userId)
print('Total number of unique Products : ', unique_productId)

Total number of unique Users    :  4201696
Total number of unique Products :  476002


6. Check if any null values are present in the dataset

In [7]:
# One boolean per column: True if that column contains at least one missing value.
df.isna().any()

userId       False
productId    False
rating       False
timestamp    False
dtype: bool

7. Drop Timestamp Column

In [8]:
# Only userId, productId and rating are used by the rest of the notebook.
# errors='ignore' keeps this cell idempotent: `df` is rebound to the result, so
# re-running the cell would otherwise raise KeyError because the column is already gone.
df = df.drop(columns=['timestamp'], errors='ignore')
df.head()

Unnamed: 0,userId,productId,rating
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


8. Select Users who have given at least 50 ratings

In [9]:
# Number of ratings given by each user, as a Series indexed by userId.
users_counts = df['userId'].value_counts().rename('users_counts')
# Attach that per-user count to every rating row (join on userId -> index of the counts).
users_data   = df.merge(users_counts.to_frame(),
                                left_on='userId',
                                right_index=True)
# Keep only ratings from users with 50 or more ratings (the comparison is inclusive).
subset_df = users_data[users_data.users_counts >= 50]
rows_count, columns_count = subset_df.shape
print('Total Number of rows :', rows_count)
print('Total Number of columns :', columns_count)
subset_df.head()

Total Number of rows : 125871
Total Number of columns : 4


Unnamed: 0,userId,productId,rating,users_counts
94,A3BY5KCNQZXV5U,0594451647,5.0,50
14863,A3BY5KCNQZXV5U,B00000JD4V,4.0,50
134213,A3BY5KCNQZXV5U,B000063574,5.0,50
338368,A3BY5KCNQZXV5U,B0000CDJP8,5.0,50
634048,A3BY5KCNQZXV5U,B0007Y794O,5.0,50


9. Select Products which have at least 5 ratings

In [10]:
# Number of ratings per product, counted within the already user-filtered subset_df
# (i.e. only ratings from the retained users are counted).
product_rating_counts = subset_df['productId'].value_counts().rename('product_rating_counts')
# Attach that per-product count to every rating row (join on productId -> index of the counts).
product_rating_data   = subset_df.merge(product_rating_counts.to_frame(),
                                left_on='productId',
                                right_index=True)
# Keep only products with 5 or more ratings (the comparison is inclusive).
product_rating_data = product_rating_data[product_rating_data.product_rating_counts >= 5]
print(product_rating_data.head())
rows_count, columns_count = product_rating_data.shape
print('Total Number of rows :', rows_count)
print('Total Number of columns :', columns_count)
product_rating_data.head()

                userId   productId  rating  users_counts  \
634048  A3BY5KCNQZXV5U  B0007Y794O     5.0            50   
633970   AKT8TGIT6VVZ5  B0007Y794O     5.0           192   
633944  A1ILWPH1GHUXE2  B0007Y794O     4.0            98   
634073   A1ZM846Y7AUYD  B0007Y794O     4.0            77   
633998  A2ED50E3KWKUKW  B0007Y794O     5.0            65   

        product_rating_counts  
634048                     18  
633970                     18  
633944                     18  
634073                     18  
633998                     18  
Total Number of rows : 65290
Total Number of columns : 5


Unnamed: 0,userId,productId,rating,users_counts,product_rating_counts
634048,A3BY5KCNQZXV5U,B0007Y794O,5.0,50,18
633970,AKT8TGIT6VVZ5,B0007Y794O,5.0,192,18
633944,A1ILWPH1GHUXE2,B0007Y794O,4.0,98,18
634073,A1ZM846Y7AUYD,B0007Y794O,4.0,77,18
633998,A2ED50E3KWKUKW,B0007Y794O,5.0,65,18


10. Final Dataset after preprocessing

In [11]:
# Work on a copy so the filtered frame itself is left untouched, then strip the two
# helper count columns that were only needed for filtering.
amazon_df = product_rating_data.copy()
panda_data = amazon_df.drop(columns=['users_counts', 'product_rating_counts'])
panda_data.info()
panda_data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65290 entries, 634048 to 3827474
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     65290 non-null  object 
 1   productId  65290 non-null  object 
 2   rating     65290 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.0+ MB


Unnamed: 0,userId,productId,rating
634048,A3BY5KCNQZXV5U,B0007Y794O,5.0
633970,AKT8TGIT6VVZ5,B0007Y794O,5.0
633944,A1ILWPH1GHUXE2,B0007Y794O,4.0
634073,A1ZM846Y7AUYD,B0007Y794O,4.0
633998,A2ED50E3KWKUKW,B0007Y794O,5.0


In [12]:
# Declare the rating scale (1 to 5) used when loading the ratings into Surprise.
reader = Reader(rating_scale=(1, 5))
# Columns are passed in the order userId, productId, rating.
surprise_data = Dataset.load_from_df(panda_data[['userId', 'productId', 'rating']], reader)

12. Split datasets to train and test sets

In [13]:
# Hold out 30% of the ratings as the test set; random_state pins the split.
trainset, testset = train_test_split(surprise_data, test_size=.30, random_state=7)

13. Popularity based filtering based on product ratings

In [14]:
#POPULARITY BASED FILTERING BASED ON PRODUCT RATINGS (FOR NEW CUSTOMERS)
# A single aggregation over the per-product groups yields both the mean rating
# ('rating') and the number of ratings ('product_rating_counts').
products_df = panda_data.groupby('productId')['rating'].agg(
    rating='mean',
    product_rating_counts='count',
)
print(products_df.head())
print(products_df.sort_values('rating', ascending=False).head())
print(products_df.sort_values('product_rating_counts', ascending=False).head())

              rating  product_rating_counts
productId                                  
1400501466  3.333333                      6
1400532655  3.833333                      6
1400599997  4.000000                      5
9983891212  4.875000                      8
B00000DM9W  5.000000                      5
            rating  product_rating_counts
productId                                
B00LGQ6HL8     5.0                      5
B003DZJQQI     5.0                     14
B005FDXF2C     5.0                      7
B00I6CVPVC     5.0                      7
B00B9KOCYA     5.0                      8
              rating  product_rating_counts
productId                                  
B0088CJT4U  4.218447                    206
B003ES5ZUU  4.864130                    184
B000N99BBC  4.772455                    167
B007WTAJTO  4.701220                    164
B00829TIEK  4.436242                    149


14. Selecting best hyperparameters

In [None]:
# Grid-search KNNWithMeans hyperparameters with 5-fold cross-validation and time the search.
start_time = time.time()

# Search space: 2 baseline methods x 2 'reg' values x 12 values of k x 1 similarity
# measure = 48 parameter combinations.
knn_param_grid = {'bsl_options': {'method': ['als', 'sgd'],
                              'reg': [1, 2]},
              'k': [15, 20, 25, 30,35,40,45,50,55,60,65,70],
              'sim_options': {'name': ['pearson_baseline']}
              }

# Both RMSE and MAE are tracked; n_jobs=5 requests 5 parallel workers.
knnmeans_gs = GridSearchCV(KNNWithMeans, knn_param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)

# The search is run on the full dataset (not on the train split).
knnmeans_gs.fit(surprise_data)

# Best RMSE found by the search.
print(knnmeans_gs.best_score['rmse'])

# Parameter combination that produced that best RMSE.
print(knnmeans_gs.best_params['rmse'])

computational_time = time.time() - start_time
print('\nComputational Time : %0.3fs' %(computational_time))

0.998835655284587
{'bsl_options': {'method': 'sgd', 'reg': 2}, 'k': 30, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}

Computational Time : 129.829s


15. User based Collaborative Filtering

In [15]:
#USER BASED
start_time = time.time()

# NOTE(review): the grid-search output recorded earlier in this notebook reports
# k=30 (with SGD baselines) as best for RMSE; k=60 is kept unchanged here --
# confirm which configuration is intended.
knnMeansUU_model = KNNWithMeans(k=60, sim_options={'name': 'pearson_baseline', 'user_based': True})

# Training the algorithm on the trainset
knnMeansUU_model.fit(trainset)

# Predicting for testset
prediction_knnMeansUU = knnMeansUU_model.test(testset)

# Evaluating RMSE, MAE of algorithm KNNWithMeans User-User on 5 split(s).
# cross_validate() calls fit() on the estimator it receives for every fold, so a
# separate, identically configured instance is passed in. Handing it
# knnMeansUU_model would leave that model trained on the last CV fold instead of
# on `trainset`, and later predict() calls would silently use the wrong fit.
knnMeansUU_cv = cross_validate(
    KNNWithMeans(k=60, sim_options={'name': 'pearson_baseline', 'user_based': True}),
    surprise_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Storing Crossvalidation Results in dataframe
knnMeansUU_df = pd.DataFrame.from_dict(knnMeansUU_cv)
knnMeansUU_described = knnMeansUU_df.describe()
# One-row summary (mean over folds) that can be concatenated into a model-comparison table.
knnMeansUU_cv_results = pd.DataFrame(
    [['KNNWithMeans User-User',
      knnMeansUU_described['test_rmse']['mean'],
      knnMeansUU_described['test_mae']['mean'],
      knnMeansUU_described['fit_time']['mean'],
      knnMeansUU_described['test_time']['mean']]],
    columns=['Model', 'RMSE', 'MAE', 'Fit Time', 'Test Time'])

# Hold-out accuracy of the model fitted on `trainset`.
print("\n\n==================== Model Evaluation ===============================")
accuracy.rmse(prediction_knnMeansUU, verbose=True)
print("=====================================================================")
accuracy.mae(prediction_knnMeansUU, verbose=True)

computational_time = time.time() - start_time
print('\n Computational Time : %0.3fs' %(computational_time))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9978  0.9896  0.9973  1.0208  0.9879  0.9987  0.0117  
MAE (testset)     0.7063  0.7013  0.7015  0.7156  0.6981  0.7045  0.0061  
Fit time  

In [17]:
# Predict the rating for one (user, product) pair with the user-based model.
# The commented-out IDs are alternative examples.
#uid='A23WPWLYT3P1CZ'
uid='AZBXKUH4AIW3X'
pid='B003RRY9RS'
#pid='0594451647'
# Get predictions

pred=knnMeansUU_model.predict(uid, pid, r_ui=4, verbose=True) #r_ui denotes actual rating aka expected rating, est is predicted rating

user: AZBXKUH4AIW3X item: B003RRY9RS r_ui = 4.00   est = 4.29   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


16. Item based Collaborative Filtering

In [19]:
#ITEM BASED
start_time = time.time()

# Same k and similarity measure as the user-based model, but with item-item
# similarities (user_based=False).
# NOTE(review): the grid search in this notebook did not vary `user_based`, so
# these values were not tuned for the item-based variant -- confirm.
knnMeansII_model = KNNWithMeans(k=60, sim_options={'name': 'pearson_baseline', 'user_based': False})

# Training the algorithm on the trainset
knnMeansII_model.fit(trainset)

# Predicting for testset
prediction_knnMeansII = knnMeansII_model.test(testset)

# Evaluating RMSE, MAE of algorithm KNNWithMeans Item-Item on 5 split(s).
# cross_validate() calls fit() on the estimator it receives for every fold, so a
# separate, identically configured instance is passed in. Handing it
# knnMeansII_model would leave that model trained on the last CV fold instead of
# on `trainset`.
knnMeansII_cv = cross_validate(
    KNNWithMeans(k=60, sim_options={'name': 'pearson_baseline', 'user_based': False}),
    surprise_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Storing Crossvalidation Results in dataframe
knnMeansII_df = pd.DataFrame.from_dict(knnMeansII_cv)
knnMeansII_described = knnMeansII_df.describe()
# One-row summary (mean over folds) that can be concatenated into a model-comparison table.
knnMeansII_cv_results = pd.DataFrame(
    [['KNNWithMeans Item-Item',
      knnMeansII_described['test_rmse']['mean'],
      knnMeansII_described['test_mae']['mean'],
      knnMeansII_described['fit_time']['mean'],
      knnMeansII_described['test_time']['mean']]],
    columns=['Model', 'RMSE', 'MAE', 'Fit Time', 'Test Time'])

# Hold-out accuracy of the model fitted on `trainset`.
print("\n\n==================== Model Evaluation ===============================")
accuracy.rmse(prediction_knnMeansII, verbose=True)
print("=====================================================================")
accuracy.mae(prediction_knnMeansII, verbose=True)

computational_time = time.time() - start_time
print('\n Computational Time : %0.3fs' %(computational_time))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0306  1.0238  1.0182  1.0101  1.0141  1.0194  0.0072  
MAE (testset)     0.7220  0.7188  0.7155  0.7128  0.7125  0.7163  0.0036  
Fit time  

17. Recommending top five products in user based collaborative filtering

In [23]:
k = 5
top_n = defaultdict(list)


def get_top_n(predictions, n=k):
    """Return the ``n`` highest-estimated items for every user.

    Args:
        predictions: iterable of 5-element records
            ``(user id, item id, true rating, estimated rating, details)``.
        n: number of items to keep per user; the default is the value ``k``
            had when this function was defined.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimated rating)`` tuples, highest estimate first.
    """
    ranked = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it belongs to.
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        ranked[user_id].append((item_id, estimate))

    # Order each bucket by estimate, best first, and truncate it to n entries.
    for user_id in ranked:
        best_first = sorted(ranked[user_id], key=lambda pair: pair[1], reverse=True)
        ranked[user_id] = best_first[:n]

    return ranked

# Build the per-user top-k lists from the user-based model's test-set predictions.
top_n = get_top_n(prediction_knnMeansUU, n=k)
# Display the whole mapping (one entry per user present in the predictions).
top_n

defaultdict(list,
            {'A1Z7U9K6X3FEOU': [('B000VDCT3C', 4.708571428571429),
              ('B004Q81CKY', 4.428571428571429),
              ('B002UT42UI', 4.428571428571429),
              ('B00834SJNA', 4.428571428571429),
              ('B002HU39BS', 4.428571428571429)],
             'A18HE80910BTZI': [('B001HBH1B2', 4.648875133593654),
              ('B00J238GV6', 4.325146198830409),
              ('B003ZSHKIO', 4.263497453310697),
              ('B0040JHVC2', 4.254857149460412),
              ('B005HTIWF4', 4.187056100155953)],
             'A1F1A0QQP2XVH5': [('B008X9Z44M', 4.787886359460263),
              ('B002C7481G', 4.7534996408238035),
              ('B00AJHDZSI', 4.750364248654246),
              ('B00BOHNYU6', 4.693684672276373),
              ('B00746LVOM', 4.64715854543668)],
             'ARBKYIVNYWK3C': [('B0015YJOK2', 5),
              ('B003WO7MZC', 5),
              ('B00DVFLJDS', 5),
              ('B001UI2FPE', 5),
              ('B001OORMVQ', 5)],
       

In [24]:
# Show the top-k list for one example user.
# top_n is a defaultdict(list), so a user id that is not present prints [].
uid= "A2EMUM49CE0JV4"
print(top_n[uid])

[('B000QUUFRW', 5), ('B001GGAIGI', 4.636363636363637), ('B00004ZCDD', 4.636363636363637), ('B000NB05MO', 4.636363636363637), ('B0058UUR6E', 4.636363636363637)]


18. Recommending top five products in item based collaborative filtering

In [25]:
# get_top_n is already defined in the user-based recommendation section above;
# the identical copy that used to be redefined here is dropped so there is a
# single definition, and the dead `top_n = defaultdict(list)` pre-assignment
# (immediately overwritten below) is removed.
k = 5

# Build the per-user top-k lists from the item-based model's test-set predictions.
top_n = get_top_n(prediction_knnMeansII, n=k)
top_n

defaultdict(list,
            {'A1Z7U9K6X3FEOU': [('B000VDCT3C', 5),
              ('B003GSCS3U', 5),
              ('B00834SJNA', 4.45945945945946),
              ('B004Q81CKY', 4.428571428571429),
              ('B002FYL7PG', 4.3076923076923075)],
             'A18HE80910BTZI': [('B002MPPHKY', 5),
              ('B00836H2BI', 5),
              ('B00J238GV6', 4.666666666666667),
              ('B003LPUWT0', 4.666666666666667),
              ('B0066CHKMW', 4.5)],
             'A1F1A0QQP2XVH5': [('B003CJTQJC', 5),
              ('B00000K4KH', 5),
              ('B00BQHD4B8', 5),
              ('B00BOHNYU6', 5),
              ('B000NK8EWI', 5)],
             'ARBKYIVNYWK3C': [('B0015YJOK2', 5),
              ('B004SBBD8U', 5),
              ('B0052SCU8U', 5),
              ('B000068MP2', 5),
              ('B00007IFED', 5)],
             'A2QDOJFFLFGF18': [('B00DQZQPNM', 5),
              ('B00G4FCHH4', 5),
              ('B00DQZRQPI', 5),
              ('B0002L5R78', 4.924081132969951),

In [26]:
# Show the item-based top-k list for the same example user.
# top_n is a defaultdict(list), so a user id that is not present prints [].
uid= "A2EMUM49CE0JV4"
print(top_n[uid])

[('B00004ZCDD', 5), ('B000QUUFRW', 5), ('B000Q85WRC', 5), ('B0058UUR6E', 4.526315789473684), ('B000NB05MO', 4.5)]
