In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from random import sample
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [2]:
# Load the raw ratings. Because `names` is given without a `header` argument,
# pandas treats every line of the file (including the first) as data.
raw_data = pd.read_csv("Data/AMAZON_FASHION.csv", names=['Item', 'User', 'Rating', 'Timestamp'])

# Work on a random 30,000-row subsample. The sample is seeded so that a
# "Restart & Run All" reproduces the same rows (and therefore the same metrics).
# Selecting the three columns drops 'Timestamp' without mutating in place and
# puts them in the (user, item, rating) order that Dataset.load_from_df expects.
data = raw_data.sample(n=30000, random_state=10)[['User', 'Item', 'Rating']]

In [3]:
# Preview the sampled (User, Item, Rating) frame; the bare expression is
# rendered as the cell output.
data

Unnamed: 0,User,Item,Rating
54052,A2XGDWW77FEE7V,B00201ER88,5.0
387961,A3ETHOWPDSIXN6,B01FXEHODI,5.0
569520,A1D143Y4JVNBCE,B00OQ04XZS,5.0
494483,AT5A00RYJDEZ8,B00HYKK6V2,5.0
607883,A1UTF7VDL4GI7R,B00TQ63BQO,3.0
...,...,...,...
102369,AB4KPD7OG3FZH,B00ACIFQGS,2.0
299999,A1U1D1KYZQ29JZ,B011M4A0LQ,5.0
292842,A3DJV2CJJDXRUM,B01090LK1S,5.0
672587,A78LWTHV27SHI,B011ISZW3C,1.0


In [4]:
# Ratings in this dataset are parsed on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

# Wrap the ratings frame as a Surprise Dataset.
# NOTE: this rebinds `data` from a DataFrame to a Dataset, so re-run the
# loading cell before re-running this one.
data = Dataset.load_from_df(df=data, reader=reader)

In [5]:
# Hold out 30% of the ratings for evaluation; the split is seeded for repeatability.
trainset, testset = train_test_split(
    data,
    test_size=0.3,
    random_state=10,
)

In [6]:
# Item-based (user_based=False) KNN with mean-centering, using the
# pearson_baseline similarity and at most k=5 neighbours per prediction.
algo = KNNWithMeans(
    k=5,
    sim_options=dict(name='pearson_baseline', user_based=False),
)

# Fit on the training split; the fitted algorithm object is the cell's output.
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f430f721b70>

In [7]:
# Score every held-out (user, item, rating) triple with the fitted model.
test_pred = algo.test(testset=testset)

In [8]:
# Show only the first few predictions: dumping the full list would write one
# Prediction repr per held-out rating into the notebook output.
test_pred[:10]

[Prediction(uid='A1B6V4Q3W65VUU', iid='B01DWBI8IO', r_ui=3.0, est=3.8943809523809523, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='AAK4EXLYIE5KM', iid='B00062NHH0', r_ui=5.0, est=3.8943809523809523, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A16GRITEI264PH', iid='B00FAD21NQ', r_ui=5.0, est=3.8943809523809523, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A6UJXYTPJC3NV', iid='B001G7R0CC', r_ui=4.0, est=3.8943809523809523, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A1PN7DYTLVESFT', iid='B01AHD2NQS', r_ui=5.0, est=3.8943809523809523, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2IGY77JABPBUA', iid='B019IIX3C6', r_ui=4.0, est=3.8943809523809523, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A9UKQYQ

In [9]:
# Report RMSE of the item-based model on the held-out ratings.
# verbose=True prints the rounded score; the exact value is the cell's output.
# NOTE(review): the predictions previewed in this notebook are all flagged
# 'was_impossible' with one identical estimate, so this score may mostly
# reflect the fallback estimate rather than the KNN model — confirm.
print("Item-based Model : Test Set")
accuracy.rmse(predictions=test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 1.4298


1.4297853035606714

#### 