In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from random import sample
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [2]:
raw_data = pd.read_csv("Data/AMAZON_FASHION.csv", names=['Item', 'User', 'Rating', 'Timestamp'])
data = raw_data.sample(n=30000)
data.drop(['Timestamp'], axis=1,inplace=True)
data = data[['User', 'Item', 'Rating']]

In [3]:
data

Unnamed: 0,User,Item,Rating
839980,A11SSHGSVEK0HP,B01EKUOE1G,2.0
157771,A1O834L6Z6JJ1R,B00COM550W,2.0
668678,A1FRV055J81WYG,B0112S1YZI,5.0
812169,AMBFMS9T3OI5I,B01D1AO8QW,5.0
376418,A2DBDBSI8V6OJ8,B01E5T5IR6,5.0
...,...,...,...
565012,A3EUEW2268UTMH,B00OA5QYZG,4.0
460725,A3B0H8SQJF8N61,B00DQYW8RK,5.0
650316,A39KK9OK7NXO8,B00YG7RI58,3.0
598915,A9QUY6NCUPHJO,B00SM4YF34,4.0


In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data,reader)

In [5]:
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

In [6]:
algo = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fcabd500780>

In [7]:
test_pred = algo.test(testset)

In [8]:
test_pred

[Prediction(uid='AP16TKSDGN6TB', iid='B017IOPIK2', r_ui=5.0, est=3.9180952380952383, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A12OEVO9EBRBDT', iid='B00WLN18OG', r_ui=3.0, est=3.9180952380952383, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='AXXHCZJJLYFLL', iid='B0192A3JCO', r_ui=4.0, est=3.9180952380952383, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A1ZXRFJNGA0WDY', iid='B01DAFK1TG', r_ui=5.0, est=3.9180952380952383, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2DFR4K08CFXVM', iid='B00DH6LY90', r_ui=5.0, est=3.9180952380952383, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A1VR3L28GGFN6P', iid='B01DJDBG66', r_ui=1.0, est=3.9180952380952383, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2CKGVZ

In [9]:
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 1.4140


1.4139714923326716