In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from random import sample
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [2]:
raw_data = pd.read_csv("Data/Gift_Cards.csv", names=['Item', 'User', 'Rating', 'Timestamp'])
data = raw_data.sample(n=30000)
data.drop(['Timestamp'], axis=1,inplace=True)
data = data[['User', 'Item', 'Rating']]

In [3]:
data

Unnamed: 0,User,Item,Rating
79555,ADGECU172C477,B00C5UMB7S,5.0
45764,A13YUJJPHY0J8E,B006PJHP62,5.0
71207,A1NBVOP8G3UAYI,B00BWDHCFK,5.0
26835,A2Y5TLUQGF6P67,B004Q7CK9M,5.0
69053,A2MYKI5Z820DUF,B00B2TFSO6,5.0
...,...,...,...
36270,A1SXDT07RAO7WU,B005ESMJ02,5.0
118249,A13YDJCUE91G7F,B00UXLIQNY,5.0
123610,A3GBL75U8XGZIE,B014S24DAI,5.0
78767,A196VP7SA1O15O,B00C5UMEMA,4.0


In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data,reader)

In [5]:
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

In [6]:
algo = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f3f256c4e10>

In [7]:
test_pred = algo.test(testset)

In [8]:
test_pred

[Prediction(uid='A2RDO5IGKKKY6P', iid='B004LLIKVU', r_ui=5.0, est=4.6786666666666665, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A1ZXEPAGS4CYDR', iid='B005DHN6E2', r_ui=5.0, est=4.6786666666666665, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A86PYSJOACBTN', iid='B00G4IWEZG', r_ui=5.0, est=4.6786666666666665, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A1KVHTB0UM3RN2', iid='B004KNWWU4', r_ui=5.0, est=4.6786666666666665, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2GBV1DKXH1FGX', iid='B004Q7CK9M', r_ui=5.0, est=4.6786666666666665, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2ZCXXSCEVH0RA', iid='B004W8D0Y4', r_ui=5.0, est=4.6786666666666665, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='AG7TAN

In [9]:
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 0.9966


0.9965569861040184