In [1]:
%matplotlib inline

## Load Data

In [2]:
# load data
from surprise import Dataset, Reader
def load_data(file_path):
    """Load a single ratings CSV into a Surprise dataset.

    The file is parsed as comma-separated ``item,rating,user`` rows and the
    first line (header) is skipped.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        Whatever ``Dataset.load_from_file`` returns for that file.
    """
    return Dataset.load_from_file(
        file_path,
        reader=Reader(sep=',', skip_lines=1, line_format='item rating user'),
    )
       
def load_folds_data(fold_file_paths):
    """Load predefined folds from pairs of ratings CSV files.

    Every file is parsed as comma-separated ``item,rating,user`` rows with the
    first line (header) skipped.

    Args:
        fold_file_paths: list of file-path tuples, one tuple per fold, passed
            unchanged to ``Dataset.load_from_folds``.

    Returns:
        Whatever ``Dataset.load_from_folds`` returns for those files.
    """
    return Dataset.load_from_folds(
        fold_file_paths,
        reader=Reader(sep=',', skip_lines=1, line_format='item rating user'),
    )
    
# Dataset built from the blending-train file alone; later cells split it into folds
# and use it for the SVD parameter search.
train_data=load_data('../data/blending_train_surprise.csv')
# One predefined fold given as a pair of files (blending-train file, data-train file).
# NOTE(review): presumably the first file is the training set and the second the test set -- confirm.
fold_data=load_folds_data([('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')])

## Machine Learning Methods

In [None]:
from surprise import SVD
from surprise import KNNBasic
from surprise import BaselineOnly

#params = {'n_factors':12,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
#params = {'n_factors':100,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
def my_SVD(n_factors, n_epochs, lr_all, reg_all, biased):
    """Build an (untrained) SVD algorithm with the given hyper-parameters.

    Args:
        n_factors: number of latent factors.
        n_epochs: number of SGD epochs.
        lr_all: learning rate applied to all parameters.
        reg_all: regularisation term applied to all parameters.
        biased: whether the model should use baselines (biases).

    Returns:
        A configured ``SVD`` instance.
    """
    # `biased` has to go through the constructor: SVD never looks at
    # `bsl_options`, so storing the flag there (as was done before) left the
    # model at its default and silently ignored the caller's choice.
    return SVD(n_factors=n_factors,
               n_epochs=n_epochs,
               biased=biased,
               lr_all=lr_all,
               reg_all=reg_all)


def ALS_BaselineOnly():
    """Return a ``BaselineOnly`` algorithm whose baselines use the 'als' method.

    Options: 5 epochs, ``reg_u`` = 15, ``reg_i`` = 10.
    The original author annotated this configuration with 1.0004
    (presumably the RMSE it reached -- confirm).
    """
    return BaselineOnly(
        bsl_options=dict(method='als', n_epochs=5, reg_u=15, reg_i=10)
    )

def SGD_BaselineOnly():
    """Return a ``BaselineOnly`` algorithm whose baselines use the 'sgd' method.

    Options: learning rate 0.005, regularisation 0.02.
    The original author annotated this configuration with 1.0021
    (presumably the RMSE it reached -- confirm).
    """
    return BaselineOnly(
        bsl_options=dict(method='sgd', learning_rate=.005, reg=0.02)
    )

def KNNBasic_ALS_pearson():
    """Return a ``KNNBasic`` algorithm using the 'pearson_baseline' similarity.

    Baseline options: 'als' method with 20 epochs.
    The original author annotated this configuration with 1.0657
    (presumably the RMSE it reached -- confirm).
    """
    return KNNBasic(
        bsl_options=dict(method='als', n_epochs=20),
        sim_options=dict(name='pearson_baseline'),
    )

def KNNBasic_pearson():
    """Return a ``KNNBasic`` algorithm using 'pearson_baseline' similarity without shrinkage.

    The original author annotated this configuration with 1.0700
    (presumably the RMSE it reached -- confirm).
    """
    # shrinkage=0 disables shrinkage of the similarity estimate
    similarity = dict(name='pearson_baseline', shrinkage=0)
    return KNNBasic(sim_options=similarity)


'''

# nepoch K   lr_all  reg_all  rmse 
# 120    40  0.001   0.00005  0.9984
# 80     40  0.001   0.00005  0.9999 
# 80     40  0.01    0.00005  1.0105 
# 80     40  0.01    0.02     1.0058
# 80     40  0.01    0.2      1.0071 
# 40     40  0.001   0.2      1.0042
# 80     40  0.001   0.2      1.0043
# 120    40  0.001   0.2      1.0047
# 40    100  0.001   0.00005  1.0013     
# 80    100  0.001   0.0005   1.0017 
# 120    100  0.001   0.0005  1.0002 
# 120    100  0.001   0.005   
# 120    40  0.001   0.0005   1.0016 
# 120    40  0.001   0.00001  1.0032 

algo = SVD(n_factors=n_factors,n_epochs=n_epochs,lr_all=lr_all,reg_all=reg_all)
algo.bsl_options['biased'] = biased

params = {'n_factors':12,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
params = {'n_factors':100,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
algo = SVD(n_factors=params['n_factors'],n_epochs=params['n_epochs'],lr_all=params['lr_all'],reg_all=params['reg_all'])
'''


"\n\n# nepoch K   lr_all  reg_all  rmse \n# 120    40  0.001   0.00005  0.9984\n# 80     40  0.001   0.00005  0.9999 \n# 80     40  0.01    0.00005  1.0105 \n# 80     40  0.01    0.02     1.0058\n# 80     40  0.01    0.2      1.0071 \n# 40     40  0.001   0.2      1.0042\n# 80     40  0.001   0.2      1.0043\n# 120    40  0.001   0.2      1.0047\n# 40    100  0.001   0.00005  1.0013     \n# 80    100  0.001   0.0005   1.0017 \n# 120    100  0.001   0.0005  1.0002 \n# 120    100  0.001   0.005   \n# 120    40  0.001   0.0005   1.0016 \n# 120    40  0.001   0.00001  1.0032 \n\nalgo = SVD(n_factors=n_factors,n_epochs=n_epochs,lr_all=lr_all,reg_all=reg_all)\nalgo.bsl_options['biased'] = biased\n\nparams = {'n_factors':12,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}\nparams = {'n_factors':100,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}\nalgo = SVD(n_factors=params['n_factors'],n_epochs=params['n_epochs'],lr_all=params['lr_all'],reg_all=params['reg_all'])\n"

## Cross Validation

In [None]:
# train algorithm.
import numpy as np
from surprise.evaluate import evaluate
from surprise.accuracy import rmse
from surprise.dump import dump

    
def cross_validation(algo, fold_data):
    """Train and evaluate `algo` on every fold of `fold_data`.

    For each (trainset, testset) pair the algorithm is trained, tested, scored
    with RMSE, and then dumped to '../results/dump_algo' (the same path is
    used for every fold).

    Args:
        algo: algorithm object exposing ``train(trainset)`` and ``test(testset)``.
        fold_data: dataset object whose ``folds()`` yields (trainset, testset) pairs.

    Returns:
        Tuple ``(mean_rmse, prediction)``: the mean RMSE over all folds and the
        predictions produced on the LAST fold only.

    Raises:
        ValueError: if `fold_data` yields no folds. Previously this case ended
            in an UnboundLocalError after computing a NaN mean.
    """
    rmse_list = []
    prediction = None
    for trainset, testset in fold_data.folds():
        algo.train(trainset)
        prediction = algo.test(testset)
        rmse_k = rmse(prediction, verbose=False)
        rmse_list.append(rmse_k)
        dump('../results/dump_algo', prediction, trainset, algo)
    if not rmse_list:
        raise ValueError('fold_data yielded no folds; nothing to cross-validate')
    return np.mean(rmse_list), prediction


algo=ALS_BaselineOnly()
#fold_data=load_folds_data([('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')])
# This split only affects train_data; the cross-validation call below runs on fold_data.
train_data.split(n_folds=3)
mean_rmse, prediction=cross_validation(algo, fold_data)
# `.format` must be called on the string; with a comma the builtin format()
# was passed to print as a second argument and the placeholder stayed unfilled.
print("mean_rmse.{}".format(mean_rmse))
#perf=evaluate(algo, train_data, measures=['rmse'], with_dump=False, dump_dir=None, verbose=1)

Estimating biases using als...


## Find Best parameters of SVD

In [None]:
def train_SVD():
    """Grid-search SVD hyper-parameters and collect the mean RMSE of each setting.

    The grid is n_factors x reg_all x n_epochs with a fixed learning rate and
    biased=True. Every configuration is scored with cross_validation() on the
    module-level `train_data`. After each (n_factors, reg_all) pair, the RMSEs
    over all n_epochs values are written to a CSV file under `results_path`.

    Returns:
        numpy array of shape
        (len(n_factors_range), len(reg_all_range), len(n_epochs_range))
        holding the mean RMSE of each configuration.
    """
    import os  # local import: only used to make sure the output directory exists

    n_factors_range = np.array([10,15,20])#np.linspace(10,30,21)
    reg_all_range = np.logspace(-1.9,-1,10)
    n_epochs_range = np.arange(10,60,10)
    lr_all=0.005
    biased=True

    results_path = '../results/SGD_surprise/'
    # np.savetxt does not create missing directories
    os.makedirs(results_path, exist_ok=True)
    rmses = np.empty((len(n_factors_range),len(reg_all_range), len(n_epochs_range)))

    for i,n_factors in enumerate(n_factors_range):
        print('testing with n_factors={}'.format(n_factors))
        for j,reg_all in enumerate(reg_all_range):
            print('testing with reg_all={}'.format(reg_all))
            for k,n_epochs in enumerate(n_epochs_range):
                print('testing with n_epochs={}'.format(n_epochs))
                algo=my_SVD(int(n_factors), int(n_epochs), lr_all, reg_all, biased)
                # cross_validation returns (mean_rmse, predictions); only the
                # scalar score can be stored in the grid -- assigning the whole
                # tuple to one array cell raises ValueError.
                mean_rmse, _ = cross_validation(algo, train_data)
                rmses[i,j,k]=mean_rmse
                print('rmse={}'.format(rmses[i,j,k]))
            results_name = 'rmse_{}_{}'.format(n_factors, reg_all)
            np.savetxt(results_path + results_name + '.csv', rmses[i,j,:], delimiter=",")
    return rmses

rmses = train_SVD()

## Save Submission


In [None]:
import pickle
import pandas as pd

# create correct format
def create_submission_dataframe(df_simple):
    """Turn a frame of predictions into the submission layout.

    Args:
        df_simple: DataFrame with the columns 'uid', 'iid', 'rui', 'est' and
            'details'. It is not modified.

    Returns:
        A new DataFrame where those five columns are replaced by
        'Id' ("r<iid>_c<uid>") and 'Prediction' ('est' clipped to [0, 5]).
    """
    print('Raw: \n',df_simple.head())
    # Build the result on a new frame so the caller's data is left untouched,
    # and drop by keyword: the positional `axis` argument of DataFrame.drop
    # is no longer accepted by pandas >= 2.0.
    df_submission = df_simple.assign(
        Id="r" + df_simple["iid"].map(str) + "_c" + df_simple["uid"].map(str),
        Prediction=df_simple["est"].clip(0,5),
    ).drop(columns=["iid","uid","est","details","rui"])
    print('Submission: \n',df_submission.head())
    return df_submission

def create_submition_csv(prediction, output_path):
    """Write predictions to a submission CSV with the columns 'Id' and 'Prediction'.

    Args:
        prediction: sequence of 5-field prediction records, read as
            (uid, iid, rui, est, details).
        output_path: path of the CSV file to write.
    """
    # The previous version also un-pickled a dump through an undefined
    # `dump_file_path` and never used the result; the predictions come from
    # the `prediction` argument, so that load was removed.
    df_svd = pd.DataFrame(prediction, columns=['uid', 'iid', 'rui', 'est', 'details'])
    df_svd_submission = create_submission_dataframe(df_svd)
    df_svd_submission.to_csv(output_path, columns=["Id","Prediction"],index=False)
    # show the frame that was actually written (`df_svd_new` never existed)
    print('Submission: \n',df_svd_submission.head())

    

## Run on whole data


In [None]:
def run_and_save(algo, fold_files, output_path='../results/submission_surprise.csv'):
    """Cross-validate `algo` on the given fold files and save its predictions as a submission CSV.

    Args:
        algo: algorithm instance to train and test.
        fold_files: list of file-path tuples describing the folds; loaded
            with load_folds_data().
        output_path: destination of the submission CSV. Defaults to the path
            used elsewhere in this notebook, so the function no longer depends
            on a global that is only defined in a later cell.
    """
    # Load the folds named by the argument instead of silently using whatever
    # `fold_data` global happens to be in the kernel.
    folds = load_folds_data(fold_files)
    mean_rmse, prediction = cross_validation(algo, folds)
    create_submition_csv(prediction, output_path)

fold_files=[('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')]
# Call the factory: run_and_save needs an algorithm instance, not the function itself.
algo=ALS_BaselineOnly()
run_and_save(algo, fold_files)

In [None]:
#print(rmses_clipped.shape)
# Hyper-parameter grid that produced `rmses`. These values are locals of
# train_SVD() and therefore not visible in this cell, so they are restated
# here. Keep them in sync with train_SVD().
n_factors_range = np.array([10,15,20])
reg_all_range = np.logspace(-1.9,-1,10)
n_epochs_range = np.arange(10,60,10)
lr_all = 0.005

rmses_clipped = rmses
results_path = '../results/SGD_surprise/'
name = '../results/SGD_surprise/rmses.npy'
#rmses_clipped = np.load(name)

print(rmses_clipped.shape)
print(n_factors_range.shape)
print(reg_all_range.shape)

# Plot results
import matplotlib.pyplot as plt

num_n_factors = rmses_clipped.shape[0]
num_reg_all = rmses_clipped.shape[1]

plt.rc('text', usetex=True)
plt.rc('font', family='verdana')

# Two figures per n_epochs value: RMSE against lambda (one curve per K) and
# RMSE against K (one curve per lambda).
for k,n_epochs in enumerate(n_epochs_range):
    # NOTE(review): the two arrays are formatted straight into the file name,
    # which yields brackets and spaces in it -- consider a shorter tag.
    results_name = 'rmse_{}_{}_{}_{}'.format(n_epochs,lr_all,n_factors_range,reg_all_range)
    fig = plt.figure()
    for i in range(num_n_factors):
        plt.semilogx(reg_all_range,rmses_clipped[i,:,k],label='$K$={}'.format(n_factors_range[i]))
    # raw strings: '\l' is not a valid escape sequence in a normal string literal
    plt.xlabel(r'$\lambda$')
    plt.ylabel('rmse')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.title('n epochs = {}'.format(n_epochs))
    fig.savefig(results_path + 'lambda_' + results_name + '.png')

    fig = plt.figure()
    for i in range(num_reg_all):
        plt.plot(n_factors_range,rmses_clipped[:,i,k],label=r'$\lambda$={:1.4f}'.format(reg_all_range[i]))
    plt.xlabel('$K$')
    plt.ylabel('rmse')
    plt.title('n epochs = {}'.format(n_epochs))
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    fig.savefig(results_path + 'K_' + results_name + '.png')

##  Train on whole data

In [None]:
# load data
from surprise import Dataset, Reader
# Reader for comma-separated 'item,rating,user' rows; the first line of each file is skipped.
fold_reader = Reader(line_format='item rating user', sep=',',skip_lines=1) 
# fold_files = [(train_set, test_set)]
# Here the data-train file is the training set and the sample-submission file is the test set.
fold_files = [('../data/data_train_surprise.csv', '../data/sampleSubmission_surprise.csv')]
# Rebinds the `fold_data` global that earlier cells created from different files.
fold_data = Dataset.load_from_folds(fold_files, reader=fold_reader)

In [None]:
# train algorithm.
from surprise import SVD
from surprise.accuracy import rmse
from surprise.dump import dump

#algo = SVD(n_factors=params['n_factors'],n_epochs=params['n_epochs'],lr_all=params['lr_all'],reg_all=params['reg_all'])
# NOTE(review): `algo` is not defined in this cell; it is whatever object an
# earlier cell left in the kernel. Define the intended algorithm explicitly
# before this loop so the notebook survives Restart & Run All.
for (trainset, testset) in fold_data.folds():
    print('training algo...')
    algo.train(trainset)
    print('testing algo...')
    # `predictions` is left as a global; a later cell reads it by name.
    predictions = algo.test(testset)
    #rmse(predictions)
    # Written to the same path on every fold.
    dump('../results/dump_algo', predictions, trainset, algo)

## Save submission

In [None]:
# create correct format
def create_submission_dataframe(df_simple):
    """Turn a frame of predictions into the submission layout.

    Args:
        df_simple: DataFrame with the columns 'uid', 'iid', 'rui', 'est' and
            'details'. It is not modified.

    Returns:
        A new DataFrame where those five columns are replaced by
        'Id' ("r<iid>_c<uid>") and 'Prediction' ('est' clipped to [0, 5]).
    """
    # (An earlier, disabled experiment rounded estimates lying within 0.001 of
    # an integer; it is not part of this function.)
    print('Raw: \n',df_simple.head())
    # Build the result on a new frame so the caller's data is left untouched,
    # and drop by keyword: the positional `axis` argument of DataFrame.drop
    # is no longer accepted by pandas >= 2.0.
    df_submission = df_simple.assign(
        Id="r" + df_simple["iid"].map(str) + "_c" + df_simple["uid"].map(str),
        Prediction=df_simple["est"].clip(0,5),
    ).drop(columns=["iid","uid","est","details","rui"])
    print('Submission: \n',df_submission.head())
    return df_submission

In [None]:
import pickle
import pandas as pd
# NOTE(review): the training cell dumps to '../results/dump_algo' while this
# cell reads '../results/dump_SVD' -- confirm which file is intended.
# Only un-pickle dumps produced locally: pickle.load can execute arbitrary code.
with open('../results/dump_SVD', 'rb') as dump_file:
    dump_obj_svd = pickle.load(dump_file)
df_svd = pd.DataFrame(dump_obj_svd['predictions'], columns=['uid', 'iid', 'rui', 'est', 'details'])    
df_svd_submission = create_submission_dataframe(df_svd)

# save submission
output_path = '../results/submission_surprise.csv'
# `df_svd_new` was never defined; the frame to save is the one built just above.
df_svd_submission.to_csv(output_path,columns=["Id","Prediction"],index=False)
print('Submission: \n',df_svd_submission.head())

In [None]:
## Save ratings matrix for all training entries
from surprise import Dataset, Reader
fold_reader = Reader(line_format='item rating user', sep=',',skip_lines=1) 
# fold_files = [(train_set, test_set)]
fold_files = [('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')]
fold_data = Dataset.load_from_folds(fold_files, reader=fold_reader)
from surprise import SVD
from surprise.accuracy import rmse
from surprise.dump import dump
# NOTE(review): `algo` is only tested here, never re-trained; the cell relies
# on the algorithm object fitted by an earlier cell.
for (trainset, testset) in fold_data.folds():
    print('testing SVD on full dataset...')
    predictions_svd = algo.test(testset)
    print('rmse:',rmse(predictions_svd))
import pandas as pd
# Use the predictions computed in this cell, not the stale `predictions`
# global left behind by an earlier cell.
df_svd = pd.DataFrame(predictions_svd, columns=['uid', 'iid', 'rui', 'est', 'details'])    

# The assignment was left unfinished (a syntax error); to_csv below expects
# the 'Id' and 'Prediction' columns that create_submission_dataframe builds.
df_submission = create_submission_dataframe(df_svd)
# save matrix
output_path = '../results/submission_surprise_full.csv'
df_submission.to_csv(output_path,columns=["Id","Prediction"],index=False)