In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from random import sample
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [2]:
raw_giftcard = pd.read_csv("Data/Gift_Cards.csv", names=['Item', 'User', 'Rating', 'Timestamp'])
giftcard = raw_giftcard.sample(n=10000)
giftcard.drop(['Timestamp'], axis=1,inplace=True)
giftcard = giftcard[['User', 'Item', 'Rating']]

In [3]:
raw_music = pd.read_csv("Data/Digital_Music.csv", names=['Item', 'User', 'Rating', 'Timestamp'])
music = raw_music.sample(n=10000)
music.drop(['Timestamp'], axis=1,inplace=True)
music = music[['User', 'Item', 'Rating']]

In [4]:
raw_fashion = pd.read_csv("Data/AMAZON_FASHION.csv", names=['Item', 'User', 'Rating', 'Timestamp'])
fashion = raw_fashion.sample(n=10000)
fashion.drop(['Timestamp'], axis=1,inplace=True)
fashion = fashion[['User', 'Item', 'Rating']]

In [5]:
data = pd.concat([giftcard, music, fashion])


In [6]:
data

Unnamed: 0,User,Item,Rating
89558,A4SPI5IVNO72W,B00G4IURXW,5.0
99281,A35FKL9SLQLHXV,B00H5BMH44,5.0
59698,A2267GKPOYUK2V,B0091JKY0M,5.0
19456,A1X5NWTJW8FTWF,B004LLIKVU,5.0
116863,A25WNBB1FOQRRX,B00PMOSXSK,5.0
...,...,...,...
473726,A2J3NNNR86CSCQ,B00FAR01MU,1.0
774553,A2GJC8IN158G83,B01ABSED08,5.0
748837,A1EGMB0S7SRVY8,B0184X947O,3.0
589961,A16QBV17FSAFLD,B00RCYND38,4.0


In [7]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data,reader)

In [8]:
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

In [9]:
algo = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fe001152d68>

In [10]:
test_pred = algo.test(testset)

In [11]:
test_pred

[Prediction(uid='A1GY7WIWY3O3CE', iid='B0066AZGD4', r_ui=5.0, est=4.416190476190476, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='AP2RMCXTMXSIZ', iid='B00122X5VG', r_ui=1.0, est=4.416190476190476, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2FU5D1IXCQ2TC', iid='B00UMI95DI', r_ui=5.0, est=4.416190476190476, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A48KUX9DSTFQS', iid='B004LLIL5A', r_ui=4.0, est=4.416190476190476, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='AQMYR173EELDB', iid='B000KPIHQ4', r_ui=5.0, est=4.416190476190476, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A22H1YNO8VW6D', iid='B00DG6VIJW', r_ui=4.0, est=4.416190476190476, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2HCUY33VECUD6'

In [12]:
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 1.1659


1.1658789535608503