In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from random import sample
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [2]:
# Load the Gift Cards ratings file. Column names are passed explicitly via
# `names=`, so pandas reads the first line of the file as data
# (presumably the CSV ships without a header row -- confirm against the file).
raw_data = pd.read_csv("Data/Gift_Cards.csv", names=['Item', 'User', 'Rating', 'Timestamp'])

# Work on a random subset of at most 30,000 ratings. The seed makes the subset
# (and therefore every downstream metric) reproducible on Restart & Run All;
# the `min` guard keeps `sample` from raising when the file has fewer rows.
data = raw_data.sample(n=min(30000, len(raw_data)), random_state=10)

# Keep only the columns the recommender needs, in (user, item, rating) order.
# Selecting them here also discards 'Timestamp', so no separate in-place drop
# is needed.
data = data[['User', 'Item', 'Rating']]

In [3]:
# Show the sampled ratings frame (columns User, Item, Rating) as the cell output.
data

Unnamed: 0,User,Item,Rating
119694,A285J3E7ND4BU8,B00YD743CW,5.0
213,A39T9O6ZD7Q4KX,B001GXRQW0,5.0
41622,A12J8Y4SSWD2HS,B0066AZGJI,5.0
142890,A1O70UI22A8HDE,B00JQKK3JS,4.0
53922,AXV2V2764280G,B0091JKJ0M,5.0
...,...,...,...
120246,A6UDVLI18K0RS,B0145WHTES,5.0
100160,A1U0KSI5YRTGQ1,B00I542CPE,5.0
51950,AJ0RRML9HVJVX,B0091JKVU0,5.0
17121,A1I1WAEVJ0DC61,B004LLIKVU,5.0


In [4]:
# Ratings are parsed on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

# Wrap the ratings frame in a Surprise Dataset.
# NOTE: `data` is rebound from a DataFrame to a Dataset here, so this cell
# must not be re-run without first re-running the cell that builds the frame.
data = Dataset.load_from_df(df=data, reader=reader)

In [5]:
# Hold out 30% of the ratings for evaluation; the seed fixes the split.
trainset, testset = train_test_split(
    data,
    test_size=0.3,
    random_state=10,
)

In [6]:
# Item-based k-NN with baseline ratings: 'user_based': False requests
# item-item similarities, measured with the 'pearson_baseline' metric, and
# k=5 is passed as the neighbourhood size.
algo = KNNBaseline(
    k=5,
    sim_options={
        'name': 'pearson_baseline',
        'user_based': False,
    },
)

# Train on the training split; the fitted estimator is the cell's output.
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fdf2aa24d68>

In [7]:
# Score every held-out (user, item) pair with the fitted model, without
# printing each individual prediction.
test_pred = algo.test(testset, verbose=False)

In [8]:
# Preview only the first few predictions: echoing the whole list prints one
# Prediction repr per held-out rating and floods the notebook with output.
test_pred[:10]

[Prediction(uid='A2EU5Q4LBWFW8I', iid='B005ESMMWW', r_ui=5.0, est=4.654572470651435, details={'was_impossible': False}),
 Prediction(uid='AX1LGXWKLZFOC', iid='B0066AZGD4', r_ui=5.0, est=4.602269927818454, details={'was_impossible': False}),
 Prediction(uid='A2AAKK8ZTE6G3X', iid='B00FTGFAF0', r_ui=5.0, est=4.693757823129252, details={'was_impossible': False}),
 Prediction(uid='A229VZ9LUQFWR7', iid='B00PG8502O', r_ui=5.0, est=4.8046793220875985, details={'was_impossible': False}),
 Prediction(uid='A25WPSZQS99YDC', iid='B00G4IVSWQ', r_ui=4.0, est=4.452798378926039, details={'was_impossible': False}),
 Prediction(uid='A2FGS9IH5PW22O', iid='B00JDQJZWG', r_ui=5.0, est=4.7456395896272126, details={'was_impossible': False}),
 Prediction(uid='A3NYBY0ZLNWQE6', iid='B016PASHH6', r_ui=5.0, est=3.8442990390678924, details={'was_impossible': False}),
 Prediction(uid='A3170S7Y9Z4WCT', iid='B00BXLVHZM', r_ui=5.0, est=4.647503901605563, details={'was_impossible': False}),
 Prediction(uid='A31C3MW1YWUTH

In [9]:
# Report the root-mean-squared error of the predictions on the held-out set.
# `rmse` prints the score itself (verbose=True) and also returns it, so the
# numeric value is shown as the cell's output as well.
print("Item-based Model : Test Set")
accuracy.rmse(
    test_pred,
    verbose=True,
)

Item-based Model : Test Set
RMSE: 0.9309


0.9309328084237224