In [1]:
# import libraries
import math
import json
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from funcsigs import signature
# import recmetrics
import ml_metrics

In [2]:
# Load rating data
df = pd.read_csv("ratings_Electronics (1).csv", names=['userId','productId','rating','timestamp'])
df = df.head(1000000)

In [3]:
# Gain information of the attributes of the dataser
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1000000 non-null  object 
 1   productId  1000000 non-null  object 
 2   rating     1000000 non-null  float64
 3   timestamp  1000000 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 30.5+ MB


In [4]:
# Show statistics of dataset
df.describe()

Unnamed: 0,rating,timestamp
count,1000000.0,1000000.0
mean,3.97362,1246846000.0
std,1.399741,110010400.0
min,1.0,912729600.0
25%,3.0,1168301000.0
50%,5.0,1246666000.0
75%,5.0,1355184000.0
max,5.0,1406074000.0


In [5]:
# No null values in the dataframe
df.isnull().sum()

userId       0
productId    0
rating       0
timestamp    0
dtype: int64

In [6]:
# We only need userId, productId, 
# and rating to train the model.
df.drop(['timestamp'], axis=1, inplace=True)

In [7]:
# Filter dataset to only recommend products with more than 50 reviews
filtered_df = df.groupby('productId').filter(lambda x: x['rating'].count() >= 50)

In [8]:
# convert dataset to the one used by Surprise library
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(filtered_df[['userId','productId','rating']],reader)

In [9]:
# Split training data and testing data, with testing data consisting of 10% of the original dataset
trainset, testset = train_test_split(dataset, test_size=0.1)

In [10]:
# Training Algorithm
algo = KNNWithMeans(k=5, verbose=True, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [11]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fa8f431b9d0>

In [12]:
# Prediction
test_pred = algo.test(testset)

In [13]:
rmse(test_pred, verbose=True)

RMSE: 1.3382


1.3382383168025949

In [14]:
# Something is 'recommended' to a user when the predicted
# rating of that user to that product is no lower than 4.
actual_rated = list(filter(lambda x: x.r_ui >= 4.0, test_pred))
predicted_rec = list(filter(lambda x: x.est >= 4.0, test_pred))

In [15]:
len(actual_rated)

48817

In [16]:
len(predicted_rec)

57997

In [17]:
rating_est = list(map(lambda x: x.iid, actual_rated))
rating_true = list(map(lambda x: x.iid, predicted_rec))
ml_metrics.apk(rating_true, rating_est, k=5)

1.0