In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from random import sample
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [2]:
# Load the raw ratings file. `names=` is supplied, so pandas treats the first
# line of the CSV as data rather than as a header row.
raw_data = pd.read_csv("Data/Digital_Music.csv", names=['Item', 'User', 'Rating', 'Timestamp'])

# Work on a 30,000-row random subset. The sample is seeded so that
# Restart & Run All reproduces the same rows (and therefore the same
# train/test split and RMSE) on every run.
# Selecting the three columns both drops 'Timestamp' and puts the columns in
# user, item, rating order for the later Dataset.load_from_df call, without
# mutating the sampled frame in place.
data = raw_data.sample(n=30000, random_state=10)[['User', 'Item', 'Rating']]

In [3]:
# Display the sampled ratings frame (columns: User, Item, Rating) as the cell output.
data

Unnamed: 0,User,Item,Rating
1010016,A1S51UW0CQDL20,B004JYTZWW,5.0
832500,APB1VAE7P1CAV,B001A7ZCVO,4.0
1337787,A18G7OQCDEU64C,B001DX97GQ,5.0
1010748,A2Y80QS3611TV,B001TCDSG6,5.0
1140072,AQV0NWB0Z5OVS,B008VPX6Q6,4.0
...,...,...,...
1579783,A1LX310NWOA6EK,B01GRGDNVS,5.0
816518,AIJBY6I15UKY5,B00160Y612,5.0
393671,A10EUNHCOZQUWH,B00GLP4CRK,5.0
1271246,A1W4RUMGNCWB7U,B00EZ7ESXE,5.0


In [4]:
# Wrap the ratings frame in a Surprise Dataset, declaring the 1-5 rating scale.
# NOTE: `data` is rebound here from a pandas DataFrame to a Surprise Dataset,
# so this cell cannot be re-run on its own; re-run the loading cell first.
rating_bounds = (1, 5)
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(data, reader)

In [5]:
# Hold out 30% of the ratings for evaluation; the seed keeps the split stable.
test_fraction = 0.3
split_seed = 10
trainset, testset = train_test_split(
    data,
    test_size=test_fraction,
    random_state=split_seed,
)

In [6]:
# Item-based k-NN (user_based=False) with mean-centred ratings, using the
# pearson_baseline similarity and at most 6 neighbours per prediction.
similarity_config = {
    'name': 'pearson_baseline',
    'user_based': False,
}
algo = KNNWithMeans(k=6, sim_options=similarity_config)

# fit() is left as the last expression so the fitted estimator is echoed
# as the cell output.
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7ff4aa604d68>

In [7]:
# Run the fitted model over the held-out test set and keep the resulting predictions.
test_pred = algo.test(testset)

In [8]:
# Preview only the first few predictions; echoing the whole list floods the
# notebook output with one repr per test rating.
test_pred[:10]

[Prediction(uid='A206GVIIE1S8TU', iid='B01EZLHRG4', r_ui=5.0, est=4.659761904761905, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2L9J2RR03LKQX', iid='B002T9ZE3S', r_ui=4.0, est=4.659761904761905, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='AIMKISJW60M4U', iid='B0013AVN9I', r_ui=5.0, est=4.659761904761905, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='AAQRDI2Y7LX47', iid='B00M8JD49K', r_ui=4.0, est=4.659761904761905, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A3E9MAGSDRWU97', iid='B00O51WB2A', r_ui=2.0, est=4.659761904761905, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A2A0XZF9LYIK2C', iid='B000S5111E', r_ui=5.0, est=4.659761904761905, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='A3PPQ57BCVI7D

In [9]:
# Report the test-set RMSE for the item-based model.
# NOTE(review): predictions flagged `was_impossible` (unknown user and/or item)
# are part of `test_pred` and therefore count towards this score; check how
# many there are before reading the RMSE as a measure of the k-NN model itself.
report_title = "Item-based Model : Test Set"
print(report_title)

# verbose=True prints the RMSE; the returned float is also echoed because the
# call is the cell's last expression.
accuracy.rmse(test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 0.8763


0.8762654867671217

###### 