In [1]:
import numpy as np
import pandas as pd
import json
import gzip
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [2]:
def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed text file holding one JSON document
      per line. The file is read as UTF-8 text.

  Yields:
    The value returned by ``json.loads`` for each non-blank line, in file
    order.

  Raises:
    OSError: During iteration, if the file cannot be opened or read.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted or closed, instead of leaving it open indefinitely.
  with gzip.open(path, 'rt', encoding='utf-8') as handle:
    for line in handle:
      # Skip blank lines (e.g. a trailing newline) rather than failing on them.
      if not line.strip():
        continue
      yield json.loads(line)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Args:
    path: Passed through unchanged to ``parse``.

  Returns:
    A ``pandas.DataFrame`` with one row per record, labelled by the record's
    0-based position in the stream; the record keys become the columns.
  """
  # Key each record by its position so that orient='index' turns it into a row.
  rows_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(rows_by_position, orient='index')

# Load the Gift Cards review file and keep only the three columns used
# downstream, in (reviewer, item, rating) order.
data = getDF('Data/Gift_Cards_5.json.gz')
# Select the wanted columns directly instead of dropping the unwanted ones:
# the result is the same, nothing is mutated in place, and the cell no longer
# raises KeyError when the file lacks one of the optional review fields.
data = data[['reviewerID', 'asin', 'overall']]

In [3]:
# Preview the ratings frame; the rendered table elides the middle rows.
data
# To inspect the column dtypes instead, evaluate `data.dtypes`.

Unnamed: 0,reviewerID,asin,overall
0,A31UBHTUUIFJUT,B004LLIKVU,5.0
1,A2MN5JQMIY0FQ2,B004LLIKVU,4.0
2,A25POI5IGGENPM,B004LLIKVU,5.0
3,A2HYGTHB4LJ9FW,B004LLIKVU,5.0
4,ACDG3M94UMZGJ,B004LLIKVU,5.0
...,...,...,...
2967,A1MXZ1CW0ZVTKL,B01DWOZKSC,5.0
2968,A1SVYJFIASQ46Z,B01DWOZKSC,4.0
2969,A1QZ08NSDCZBA3,B01E4QS95I,5.0
2970,A1L4GG3FBMIG6V,B01FERR9FW,5.0


In [4]:
# Ratings are declared to lie on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))
# Wrap the ratings DataFrame in a surprise Dataset.
# NOTE(review): `data` is rebound from a DataFrame to a Dataset here, so
# re-running this cell without first re-running the loading cell is likely to
# fail -- confirm before relying on out-of-order execution.
data = Dataset.load_from_df(df=data, reader=reader)

In [5]:
# Hold out 30% of the ratings for evaluation; the fixed random_state keeps the
# split the same across runs.
trainset, testset = train_test_split(
    data,
    test_size=0.3,
    random_state=10,
)

In [6]:
# Item-based k-NN (user_based=False) with at most 5 neighbours, using the
# 'pearson_baseline' similarity measure.
algo = KNNWithMeans(
    k=5,
    sim_options=dict(name='pearson_baseline', user_based=False),
)
# Kept as the final expression so the fitted estimator is echoed as the cell
# result.
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f11a9a29e10>

In [7]:
# Score every held-out rating with the fitted model; per-prediction printing
# stays off (the default).
test_pred = algo.test(testset, verbose=False)

In [8]:
# Show only the first few predictions: echoing the whole list floods the
# notebook with output without adding information.
test_pred[:10]

[Prediction(uid='A3JT9D014239AO', iid='B00CXZPG0O', r_ui=5.0, est=5, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='A27A4BCOBGPYR3', iid='B00FTJI60I', r_ui=5.0, est=4.8866930171278, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A33QLA6L0ZJ7ZM', iid='B00MV9O08G', r_ui=5.0, est=4.6, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='A1JH2VX7X8SFVR', iid='B01E4QUN0W', r_ui=5.0, est=4.988499521527636, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='A10CJ0DWV2M12X', iid='B00MV9H6VY', r_ui=5.0, est=5, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='A28WNQS5B4JGUJ', iid='B00GRLUECA', r_ui=5.0, est=5, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='AZKLLDZ59Q4A1', iid='B00BXLW5QC', r_ui=5.0, est=4.927547336151072, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='AD85M8MCAFJKY', iid='B01BLV4R8M', r_ui=5.0, est=5, details={'actual_k': 0, 'was_impossible': Fal

In [9]:
# Report the root-mean-square error of the held-out predictions.
print("Item-based Model : Test Set")
# rmse() prints the rounded score (verbose=True) and also returns the full
# value; as the last expression, that return value is echoed below the print.
accuracy.rmse(predictions=test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 0.4484


0.4483574473278089