In [21]:
# Preview three log-spaced values running from 1e-1 down to 1e-3.
np.logspace(start=-1, stop=-3, num=3)

array([ 0.1  ,  0.01 ,  0.001])

## Do cross validation

In [2]:
# Load the training ratings with Surprise.
# Every CSV row is read as "item,rating,user"; the first line of the file is skipped.
from surprise import Dataset, Reader
train_file = '../data/data_train_surprise.csv'
train_reader = Reader(
    line_format='item rating user',
    sep=',',
    skip_lines=1,
)
train_data = Dataset.load_from_file(train_file, reader=train_reader)

In [None]:
# do cross validation
from surprise import SVD
from surprise.accuracy import rmse
import numpy as np


# SVD hyper-parameters used for cross-validation (library defaults quoted for reference).
n_factors = 8    # Number of latent factors. Library default is 100.
n_epochs = 20    # Number of iterations of the SGD procedure. Library default is 20.
biased = True    # Whether to use baselines (or biases). Library default is True.
lr_all = 0.005   # Learning rate shared by all parameters. Library default is 0.005.
reg_all = 0.02   # Regularization term shared by all parameters. Library default is 0.02.
# Per-parameter overrides that are not set here (each defaults to None):
#   lr_bu, lr_bi, lr_pu, lr_qi      learning rates for b_u, b_i, p_u, q_i;
#                                   each takes precedence over lr_all if set.
#   reg_bu, reg_bi, reg_pu, reg_qi  regularization terms for b_u, b_i, p_u, q_i;
#                                   each takes precedence over reg_all if set.

def evaluate_k_fold(n_factors, n_epochs, lr_all, reg_all, n_folds=3):
    """Return the mean RMSE of an SVD model over a k-fold split of `train_data`.

    Parameters
    ----------
    n_factors : int
        Number of latent factors passed to SVD.
    n_epochs : int
        Number of SGD iterations passed to SVD.
    lr_all : float
        Learning rate applied to all parameters.
    reg_all : float
        Regularization term applied to all parameters.
    n_folds : int, optional
        Number of cross-validation folds (default 3, the value that used to
        be hard-coded).

    Returns
    -------
    float
        Mean of the per-fold RMSE values.

    Notes
    -----
    Reads the module-level `train_data` and `biased` settings. `biased` is
    forwarded to the SVD constructor; it used to be written into
    `algo.bsl_options`, which SVD does not consult for this flag, so the
    setting had no effect.
    """
    algo = SVD(
        n_factors=n_factors,
        n_epochs=n_epochs,
        lr_all=lr_all,
        reg_all=reg_all,
        biased=biased,
    )
    # NOTE(review): split() is invoked on every call, so different
    # hyper-parameter settings may be scored on different fold assignments
    # if the split shuffles - confirm against the installed Surprise version.
    train_data.split(n_folds=n_folds)
    fold_rmses = []
    for trainset, testset in train_data.folds():
        # Train on the fold's training part, then score its held-out part.
        algo.train(trainset)
        predictions = algo.test(testset)
        fold_rmses.append(rmse(predictions, verbose=False))
    return np.mean(fold_rmses)

# Grid search over the number of latent factors and the regularization term.
n_factors_range = range(20,50,5)
reg_all_range = np.logspace(-1,-3,3)
# Start from NaN rather than np.empty: if the run is interrupted, cells that
# were never evaluated stay recognisable instead of holding arbitrary values.
rmses = np.full((len(n_factors_range), len(reg_all_range)), np.nan)
# The loop variables are deliberately not called n_factors / reg_all, so the
# module-level settings of the same name are not overwritten by the search.
for i, n_factors_i in enumerate(n_factors_range):
    print('testing with n_factors={}'.format(n_factors_i))
    for j, reg_all_j in enumerate(reg_all_range):
        print('testing with reg_all={}'.format(reg_all_j))
        rmses[i, j] = evaluate_k_fold(n_factors_i, n_epochs, lr_all, reg_all_j)
        print('rmse={}'.format(rmses[i, j]))

testing with n_factors=20
testing with reg_all=0.1
rmse=0.9994457315784171
testing with reg_all=0.01
rmse=0.9988482932794094
testing with reg_all=0.001
rmse=0.9990355142519917
testing with n_factors=25
testing with reg_all=0.1
rmse=0.9997798460901125
testing with reg_all=0.01
rmse=0.9984176319251953
testing with reg_all=0.001


In [None]:
# Plot results: one panel per n_factors value, RMSE against reg_all (log x-axis).
import matplotlib.pyplot as plt
num_n_factors = rmses.shape[0]
num_reg_all = rmses.shape[1]
# subplots takes nrows and ncols as separate arguments, not as one tuple.
# squeeze=False always returns a 2-D array of axes, so flattening it gives an
# iterable even when there is only a single panel.
fig, axis = plt.subplots(1, num_n_factors, squeeze=False, sharey=True,
                         figsize=(4 * num_n_factors, 4))
axis = axis.ravel()
for i, ax in enumerate(axis):
    ax.semilogx(reg_all_range, rmses[i, :], marker='o')
    ax.set_title('n_factors={}'.format(n_factors_range[i]))
    ax.set_xlabel('reg_all')
axis[0].set_ylabel('RMSE');

## Train on the whole data set

In [None]:
# Load the data as one predefined fold.
# Every CSV row is read as "item,rating,user"; the first line of each file is skipped.
from surprise import Dataset, Reader
fold_reader = Reader(
    line_format='item rating user',
    sep=',',
    skip_lines=1,
)
# Each entry of fold_files is a (train_set, test_set) pair of file paths.
fold_files = [
    (
        '../data/data_train_surprise.csv',
        '../data/sampleSubmission_surprise.csv',
    ),
]
fold_data = Dataset.load_from_folds(fold_files, reader=fold_reader)

In [None]:
# Fit an SVD model (constructed with no arguments) on each predefined fold,
# predict that fold's test set, and dump predictions, trainset and model to disk.
from surprise import SVD
from surprise.accuracy import rmse
from surprise.dump import dump
algo = SVD()
for trainset, testset in fold_data.folds():
    print('training SVD...')
    algo.train(trainset)
    print('testing SVD...')
    predictions_svd = algo.test(testset)
    # RMSE reporting is currently disabled: rmse(predictions_svd)
    dump('../results/dump_SVD', predictions_svd, trainset, algo)

## Save submission

In [3]:
# Build the submission table from the dumped predictions and write it to CSV.
import pickle
import pandas as pd

# NOTE: unpickling can execute arbitrary code; only load dumps written by this project.
# The context manager closes the file handle once the dump has been read.
with open('../results/dump_SVD', 'rb') as dump_file:
    dump_obj_svd = pickle.load(dump_file)

df_svd = pd.DataFrame(dump_obj_svd['predictions'], columns=['uid', 'iid', 'rui', 'est', 'details'])
# Absolute difference between the estimate and the rating stored with each prediction.
df_svd['err'] = (df_svd.est - df_svd.rui).abs()
print('Raw: \n',df_svd.head())
# Submission ids have the form r<iid>_c<uid>.
df_svd["Id"] = "r" + df_svd["iid"].map(str) + "_c" +df_svd["uid"].map(str)

# TODO: choose if we want to round or not (unrounded alternative: df_svd["est"].clip(0,5)).
# NOTE(review): the lower clip bound is 0 - confirm it matches the lowest valid rating.
df_svd["Prediction"] = df_svd["est"].clip(0,5).round()

# Keep only the two submission columns. Selecting them avoids the positional
# `axis` argument of DataFrame.drop, which recent pandas versions reject.
df_svd_new = df_svd[["Id", "Prediction"]].copy()

# save submission
output_path = '../results/submission_surprise.csv'
df_svd_new.to_csv(output_path,columns=["Id","Prediction"],index=False)
print('Submission: \n',df_svd_new.head())

Raw: 
   uid  iid  rui       est                    details       err
0   1   37  3.0  3.373404  {'was_impossible': False}  0.373404
1   1   73  3.0  2.969716  {'was_impossible': False}  0.030284
2   1  156  3.0  3.734202  {'was_impossible': False}  0.734202
3   1  160  3.0  3.228011  {'was_impossible': False}  0.228011
4   1  248  3.0  3.510695  {'was_impossible': False}  0.510695
Submission: 
         Id  Prediction
0   r37_c1         3.0
1   r73_c1         3.0
2  r156_c1         4.0
3  r160_c1         3.0
4  r248_c1         4.0
