In [1]:
import gzip
from collections import defaultdict
import json
import numpy as np
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split,cross_validate
import pandas as pd
import matplotlib 
from matplotlib import pyplot as plt
matplotlib.rcParams['figure.figsize'] = (10, 4)
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args

ModuleNotFoundError: No module named 'surprise'

In [None]:

def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed JSON-lines file.

  Args:
    path: Path to the ``.json.gz`` file; every non-blank line must hold one
      JSON document.

  Yields:
    The object decoded from each line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the gzip handle once the generator is exhausted
  # or closed; the handle was previously left open.
  with gzip.open(path, 'r') as g:
    for l in g:
      # Skip whitespace-only lines (e.g. a trailing newline) rather than
      # failing inside json.loads.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Collect every record yielded by parse(path) into a DataFrame.

  Records are keyed by their 0-based position in the file, and that position
  becomes the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the full review dump into memory.
df = getDF("Digital_Music_5.json.gz")

In [None]:
# Keep only rows whose 'verified' flag is True, narrowed to the item / user / rating columns.
# NOTE(review): this rebinds `df` and drops the 'verified' column, so running this
# cell a second time raises KeyError -- reload the data before re-running.
df = df.loc[df['verified'] == True, ['asin', 'reviewerID', 'overall']]

In [None]:
# Show the filtered ratings frame as this cell's output.
df

In [None]:
# helper functions
def MSE(y, ypred):
    """Return the mean squared error between true values and predictions.

    Args:
        y: Iterable of true values.
        ypred: Iterable of predicted values, paired element-wise with ``y``.

    Returns:
        The sum of squared differences divided by the number of pairs.

    Raises:
        ValueError: If ``y`` and ``ypred`` have different lengths. ``zip``
            would otherwise silently drop the unmatched tail and report a
            score for a different set of points than the caller intended.
        ZeroDivisionError: If both inputs are empty.
    """
    # Materialise once so lengths can be compared even for one-shot iterables.
    actual = list(y)
    predicted = list(ypred)
    if len(actual) != len(predicted):
        raise ValueError(
            f"y and ypred must have the same length, got {len(actual)} and {len(predicted)}"
        )
    differences = [(p - a) ** 2 for p, a in zip(predicted, actual)]
    return sum(differences) / len(differences)

In [None]:
# global mean predictor
ratingMean = df['overall'].mean()
# NOTE(review): ratingMean is computed but never used -- the baseline scored below
# predicts a constant 5 for each of the last 14000 rows. Confirm which baseline
# is intended before quoting this number.
holdout_ratings = df['overall'].tolist()[-14000:]
MSE(holdout_ratings, [5] * len(holdout_ratings))

In [None]:
# Persist the frame as a bare CSV: no header row and no index column.
df.to_csv('ratings.csv', index=False, header=False)

In [None]:
############################################################################################
# rating prediction begins
ratings_file = "ratings.csv"

# The file has no header; its columns are asin (item), reviewerID (user), overall (rating),
# which is the order declared in line_format.
reader = Reader(line_format="item user rating", sep=",", rating_scale=(1, 5))
data = Dataset.load_from_file(ratings_file, reader=reader)

# Fix the seed so the shuffled 90/10 split is identical on every run; otherwise
# validation scores from different kernel sessions are not comparable.
SPLIT_SEED = 42
trainset, testset = train_test_split(data, test_size=.1, random_state=SPLIT_SEED)  # data is shuffled by default
def validate(latent_factors, init_std_dev, lr_bu, lr_bi, lr_pu, lr_qi, reg_bu, reg_bi, reg_pu, reg_qi, n_iter, random_state=None):
    """Fit a biased SVD model on the training split and return its MSE on the test split.

    Reads the module-level ``trainset`` and ``testset``. Those come from a
    shuffled ``train_test_split``, so the validation data is a random 10%
    sample, not the chronologically last chunk of the file.

    Args:
        latent_factors: Passed to SVD as ``n_factors``.
        init_std_dev: Passed to SVD as ``init_std_dev`` (``init_mean`` is fixed at 0).
        lr_bu, lr_bi, lr_pu, lr_qi: Per-parameter learning rates passed to SVD.
        reg_bu, reg_bi, reg_pu, reg_qi: Per-parameter regularisation terms passed to SVD.
        n_iter: Passed to SVD as ``n_epochs``.
        random_state: Passed to SVD as ``random_state``. The default ``None``
            keeps the previous unseeded behaviour; pass an int for a
            repeatable score.

    Returns:
        Mean squared error of the fitted model's estimates over ``testset``.

    Raises:
        ZeroDivisionError: If ``testset`` yields no predictions.

    Note:
        A 10-fold alternative would be
        ``cross_validate(model, data, measures=["MSE"], cv=10, verbose=True)``
        in place of the single fit/test below.
    """
    model = SVD(
            n_factors=latent_factors,
            n_epochs=n_iter,
            biased=True,
            init_mean=0,
            init_std_dev=init_std_dev,
            # lr_all / reg_all are left as None because every per-parameter
            # value is supplied explicitly below.
            lr_all=None,
            reg_all=None,
            lr_bu=lr_bu,
            lr_bi=lr_bi,
            lr_pu=lr_pu,
            lr_qi=lr_qi,
            reg_bu=reg_bu,
            reg_bi=reg_bi,
            reg_pu=reg_pu,
            reg_qi=reg_qi,
            random_state=random_state,
            verbose=False
        )

    # Single hold-out evaluation on the random test split.
    model.fit(trainset)
    predictions = model.test(testset)

    # Accumulate squared error between the true rating (r_ui) and the estimate (est).
    sse = 0
    for p in predictions:
        sse += (p.r_ui - p.est)**2
    mse = sse / len(predictions)
    return mse

def tune(n_calls=200, random_state=None):
    """Search SVD hyperparameters with ``gp_minimize``, minimising hold-out MSE.

    Each candidate point is scored by ``validate`` on the module-level
    train/test split.

    Args:
        n_calls: Forwarded to ``gp_minimize`` as ``n_calls``. Defaults to 200,
            the previously hard-coded value.
        random_state: Forwarded to ``gp_minimize`` as ``random_state``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int to make the search repeatable.

    Returns:
        The result object returned by ``gp_minimize``.
    """
    # NOTE(review): latent_factors is pinned to 0 in objective(), so the pu/qi
    # learning rates, pu/qi regularisers and init_std_dev presumably have no
    # effect on the score -- confirm, and consider dropping those dimensions.
    # To tune a factor model instead, add e.g. Integer(1, 2, name='latent_factors').
    search_space = [
        Real(1e-5, 1e-2, prior = 'log-uniform', name ='lr_bu'),
        Real(1e-5, 1e-2, prior = 'log-uniform', name ='lr_bi'),
        Real(1e-5, 1e-2, prior = 'log-uniform', name ='lr_pu'),
        Real(1e-5, 1e-2, prior = 'log-uniform', name ='lr_qi'),
        Real(1e-7, 0.01, prior = 'log-uniform', name ='reg_bu'),
        Real(1e-7, 0.01, prior = 'log-uniform', name ='reg_bi'),
        Real(1e-7, 0.01, prior = 'log-uniform', name ='reg_pu'),
        Real(1e-7, 0.01, prior = 'log-uniform', name ='reg_qi'),
        Real(1e-4, 1e-2, prior = 'uniform', name ='init_std_dev'),
        Integer(20,100, name ='n_iter'),
    ]

    @use_named_args(dimensions=search_space)
    def objective(lr_bu, lr_bi, lr_pu, lr_qi, reg_bu, reg_bi, reg_pu, reg_qi, init_std_dev, n_iter):
        """Score one hyperparameter point; returns the hold-out MSE to minimise."""
        latent_factors = 0
        # validate() fixes init_mean at 0; it is echoed here only for the log line.
        init_mean = 0
        print('calculating objective...')
        print(f'latent_factors={latent_factors}, lr_bu={lr_bu}, lr_bi={lr_bi}, lr_pu={lr_pu}, lr_qi={lr_qi},reg_bu={reg_bu},reg_bi={reg_bi},reg_pu={reg_pu},reg_qi={reg_qi}, n_iter={n_iter},init_mean={init_mean}, init_std_dev={init_std_dev}')

        # Score on the held-out test split used by validate(). If validate() is
        # switched to 10-fold CV, take np.mean(cv_result['test_mse']) instead.
        mse = validate(latent_factors, init_std_dev, lr_bu, lr_bi, lr_pu, lr_qi, reg_bu, reg_bi, reg_pu, reg_qi, n_iter)
        print(f'objective = {mse}\n')
        return mse

    # Starting point, in search_space order:
    # lr_bu, lr_bi, lr_pu, lr_qi, reg_bu, reg_bi, reg_pu, reg_qi, init_std_dev, n_iter
    default_parameters = [0.0046905221106103065,0.004707606265909147,0.00034357514547643595,0.0001,1e-6,1e-6,1e-6,1e-6,1e-3,40]
    search_result = gp_minimize(func=objective,
                            dimensions=search_space,
                            # acq_func='EI', # Expected Improvement.
                            n_calls=n_calls,
                            x0=default_parameters,
                            random_state=random_state)
    return search_result

tune()
