In [24]:
# import libraries
import math
import json
# from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
# import recmetrics
import ml_metrics

In [2]:
# Load data
df = pd.read_csv("ratings_Electronics (1).csv", names=['userId','productId','rating','timestamp'])
df = df.head(1000000)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1000000 non-null  object 
 1   productId  1000000 non-null  object 
 2   rating     1000000 non-null  float64
 3   timestamp  1000000 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 30.5+ MB


In [4]:
df.describe()

Unnamed: 0,rating,timestamp
count,1000000.0,1000000.0
mean,3.97362,1246846000.0
std,1.399741,110010400.0
min,1.0,912729600.0
25%,3.0,1168301000.0
50%,5.0,1246666000.0
75%,5.0,1355184000.0
max,5.0,1406074000.0


In [5]:
df.isnull().sum()

userId       0
productId    0
rating       0
timestamp    0
dtype: int64

In [6]:
df.drop(['timestamp'], axis=1, inplace=True)

In [7]:
filtered_df = df.groupby('productId').filter(lambda x: x['rating'].count() >= 50)

In [8]:
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(filtered_df[['userId','productId','rating']],reader)

In [9]:
trainset, testset = train_test_split(dataset, test_size=0.1)

In [10]:
algo = KNNWithMeans(k=5, verbose=True, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [11]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f27b467f370>

In [12]:
test_pred = algo.test(testset)

In [35]:
rmse(test_pred, verbose=True)

RMSE: 1.3402


1.3401750218826656

In [36]:
rating_est = list(map(lambda x: x.est, test_pred))
rating_true = list(map(lambda x: x.r_ui, test_pred))
len(rating_est)

64263

In [34]:
ml_metrics.mapk(rating_true, rating_est, k=5)

TypeError: object of type 'numpy.float64' has no len()

In [28]:
new_df = filtered_df.head(10000)

In [31]:
ratings_matrix = new_df.pivot_table(values='rating', index='productId', columns='userId', fill_value=0)

In [32]:
ratings_matrix.head()

userId,A01852072Z7B68UHLI5UG,A0266076X6KPZ6CCHGVS,A0293130VTX2ZXA70JQS,A030530627MK66BD8V4LN,A0571176384K8RBNKGF8O,A0590501PZ7HOWJKBGQ4,A0641581307AKT5MAOU0Q,A076219533YHEV2LJO988,A0821988FXKFYX53V4QG,A099626739FNCRNHIKBCG,...,AZWOPBY75SGAM,AZX0ZDVAFMN78,AZX5LAN9JEAFF,AZX7I110AF0W2,AZXKUK895VGSM,AZXP46IB63PU8,AZYTSU42BZ7TP,AZZGJ2KMWB7R,AZZMV5VT9W7Y8,AZZST8OYL5P4Q
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
972683275,0,0,5,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1400501466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1400501520,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
1400501776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1400532620,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD()

In [35]:
decomposed_matrix = SVD.fit_transform(ratings_matrix)
decomposed_matrix.shape

(76, 2)

In [36]:
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(76, 76)

In [38]:
ratings_matrix.index[75]

'B00000K135'

In [40]:
i = 'B00000K135'
product_names = list(ratings_matrix.index)
product_ID = product_names.index(i)
product_ID

75

In [41]:
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

(76,)

In [44]:
recommend = list(ratings_matrix.index[correlation_product_ID > 0.65])
recommend.remove(i)
recommend[0:24]

['1400501466',
 '1400501520',
 '1400501776',
 '1400532620',
 '1400532655',
 '140053271X',
 '1400532736',
 '1400599997',
 '1400698987',
 '7214047977',
 '8862935293',
 '9625993428',
 '9984984354',
 'B000001OM4',
 'B000001ON6',
 'B00000DM9W',
 'B00000J0D2',
 'B00000J0D5',
 'B00000J1EP',
 'B00000J1F3',
 'B00000J1TX',
 'B00000J1U8',
 'B00000J1UQ',
 'B00000J1V3']