In [1]:
%matplotlib inline

## Load Data

In [2]:
# load data
from surprise import Dataset, Reader
def load_data(file_path):
    reader = Reader(line_format='item rating user', sep=',',skip_lines=1) 
    return Dataset.load_from_file(file_path, reader=reader)
       
def load_folds_data(fold_file_paths):
    reader = Reader(line_format='item rating user', sep=',',skip_lines=1) 
    return Dataset.load_from_folds(fold_file_paths, reader=reader)
    
train_data=load_data('../data/blending_validation_surprise.csv')
fold_data=load_folds_data([('../data/blending_validation_surprise.csv', '../data/blending_test_surprise.csv')])

## Machine Learning Methods

In [3]:
from surprise import SVD
from surprise import KNNBasic
from surprise import BaselineOnly

#params = {'n_factors':12,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
#params = {'n_factors':100,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
def my_SVD(n_factors, n_epochs, lr_all, reg_all, biased):
    algo = SVD(n_factors=n_factors,n_epochs=n_epochs,lr_all=lr_all,reg_all=reg_all)
    algo.bsl_options['biased'] = biased
    return algo

def ALS_BaselineOnly():
    print("ALS_BaselineOnly")
    bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 15,
               'reg_i': 10
               } #1.0004
    return BaselineOnly(bsl_options=bsl_options)

def SGD_BaselineOnly():
    print("SGD_BaselineOnly")
    bsl_options = {'method': 'sgd',
               'learning_rate': .005,
               'reg':0.02
                } #1.0021
    return BaselineOnly(bsl_options=bsl_options)


def KNNBasic_ALS_pearson_baseline_user_based():
    print("ALS_pearson_baseline_user_based")
    bsl_options = {'method': 'als',
                    'n_epochs': 20,
                    'user_based': True  # compute  similarities between users
                   } 
    sim_options = {'name': 'pearson_baseline'}
    return KNNBasic(bsl_options=bsl_options, sim_options=sim_options)


def KNNBasic_ALS_pearson_baseline_item_based():
    print("ALS_pearson_baseline_item_based")
    bsl_options = {'method': 'als',
               'n_epochs': 20,
               } 
    sim_options = {'name': 'pearson_baseline'}
    return KNNBasic(bsl_options=bsl_options, sim_options=sim_options)

def KNNBasic_pearson_baseline_user_based():
    print("pearson_baseline_user_based")
    sim_options = {'name': 'pearson_baseline',
               'shrinkage': 0,  # no shrinkage
                'user_based': True  # compute  similarities between users
               } 
    return KNNBasic(sim_options=sim_options)

def KNNBasic_pearson_baseline_item_based():
    print("pearson_baseline_item_based")
    sim_options = {'name': 'pearson_baseline',
               'shrinkage': 0,  # no shrinkage
                'user_based': False  # compute  similarities between items
               }
    return KNNBasic(sim_options=sim_options)


def KNNBasic_cosine_user_based():
    print("cosine_user_based")
    sim_options = {'name': 'cosine',
                   'user_based': True  # compute  similarities between users
                   }
    return KNNBasic(sim_options=sim_options)

def KNNBasic_cosine_item_based():
    print("cosine_item_based")
    sim_options = {'name': 'cosine',
                   'user_based': False  # compute  similarities between items
                   }
    return KNNBasic(sim_options=sim_options)

def KNNBasic_pearson_user_based():
    print("pearson_user_based")
    sim_options = {'name': 'pearson',
                'user_based': True  # compute  similarities between users
               } 
    return KNNBasic(sim_options=sim_options)

def KNNBasic_pearson_item_based():
    print("pearson_item_based")
    sim_options = {'name': 'pearson',
                'user_based': False  # compute  similarities between items
               }
    return KNNBasic(sim_options=sim_options)

def KNNBasic_msd_user_based():
    print("msd_user_based")
    sim_options = {'name': 'msd',
                'user_based': True  # compute  similarities between users
               } 
    return KNNBasic(sim_options=sim_options)

def KNNBasic_msd_item_based():
    print("msd_item_based")
    sim_options = {'name': 'msd',
                'user_based': False  # compute  similarities between items
               }
    return KNNBasic(sim_options=sim_options)


'''

# nepoch K   lr_all  reg_all  rmse 
# 120    40  0.001   0.00005  0.9984
# 80     40  0.001   0.00005  0.9999 
# 80     40  0.01    0.00005  1.0105 
# 80     40  0.01    0.02     1.0058
# 80     40  0.01    0.2      1.0071 
# 40     40  0.001   0.2      1.0042
# 80     40  0.001   0.2      1.0043
# 120    40  0.001   0.2      1.0047
# 40    100  0.001   0.00005  1.0013     
# 80    100  0.001   0.0005   1.0017 
# 120    100  0.001   0.0005  1.0002 
# 120    100  0.001   0.005   
# 120    40  0.001   0.0005   1.0016 
# 120    40  0.001   0.00001  1.0032 

algo = SVD(n_factors=n_factors,n_epochs=n_epochs,lr_all=lr_all,reg_all=reg_all)
algo.bsl_options['biased'] = biased

params = {'n_factors':12,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
params = {'n_factors':100,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
algo = SVD(n_factors=params['n_factors'],n_epochs=params['n_epochs'],lr_all=params['lr_all'],reg_all=params['reg_all'])
'''


"\n\n# nepoch K   lr_all  reg_all  rmse \n# 120    40  0.001   0.00005  0.9984\n# 80     40  0.001   0.00005  0.9999 \n# 80     40  0.01    0.00005  1.0105 \n# 80     40  0.01    0.02     1.0058\n# 80     40  0.01    0.2      1.0071 \n# 40     40  0.001   0.2      1.0042\n# 80     40  0.001   0.2      1.0043\n# 120    40  0.001   0.2      1.0047\n# 40    100  0.001   0.00005  1.0013     \n# 80    100  0.001   0.0005   1.0017 \n# 120    100  0.001   0.0005  1.0002 \n# 120    100  0.001   0.005   \n# 120    40  0.001   0.0005   1.0016 \n# 120    40  0.001   0.00001  1.0032 \n\nalgo = SVD(n_factors=n_factors,n_epochs=n_epochs,lr_all=lr_all,reg_all=reg_all)\nalgo.bsl_options['biased'] = biased\n\nparams = {'n_factors':12,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}\nparams = {'n_factors':100,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}\nalgo = SVD(n_factors=params['n_factors'],n_epochs=params['n_epochs'],lr_all=params['lr_all'],reg_all=params['reg_all'])\n"

## Cross Validation

In [4]:
# train algorithm.
import numpy as np
from surprise.evaluate import evaluate
from surprise.accuracy import rmse
from surprise.dump import dump

    
def cross_validation(algo, fold_data):
    rmse_list = []
    for trainset, testset in fold_data.folds():
        algo.train(trainset)
        prediction = algo.test(testset)
        rmse_k = rmse(prediction, verbose=False)
        rmse_list.append(rmse_k)
        #dump('../results/dump_algo', prediction, trainset, algo)
        rmse_mean=np.mean(rmse_list)
        print(rmse_mean)
    return rmse_mean, prediction


algo=ALS_BaselineOnly()
#fold_data=load_folds_data([('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')])
#train_data.split(n_folds=3)
mean_rmse, prediction=cross_validation(algo, fold_data)
print("mean_rmse {}".format(mean_rmse))
perf=evaluate(algo, fold_data, measures=['rmse'], with_dump=False, dump_dir=None, verbose=1)

ALS_BaselineOnly
Estimating biases using als...
1.04599861118
mean_rmse 1.0459986111798536
Evaluating RMSE of algorithm BaselineOnly.

------------
Fold 1
Estimating biases using als...
RMSE: 1.0460
------------
------------
Mean RMSE: 1.0460
------------
------------


## Find Best parameters of SVD

In [5]:
def train_SVD():
    
    n_factors_range = np.array([10,15,20])#np.linspace(10,30,21)
    reg_all_range = np.logspace(-1.9,-1,10)
    n_epochs_range = np.arange(10,60,10)
    lr_all=0.005
    biased=True
    
    
    results_path = '../results/SGD_surprise/'
    rmses = np.empty((len(n_factors_range),len(reg_all_range), len(n_epochs_range)))
    
    for i,n_factors in enumerate(n_factors_range):
        print('testing with n_factors={}'.format(n_factors))
        for j,reg_all in enumerate(reg_all_range):
            print('testing with reg_all={}'.format(reg_all))
            for k,n_epochs in enumerate(n_epochs_range):
                print('testing with n_epochs={}'.format(n_epochs))
                algo=my_SVD(int(n_factors), n_epochs, lr_all, reg_all, biased)
                rmses[i,j,k],_=cross_validation(algo, train_data)
                print('rmse={}'.format(rmses[i,j,k]))
            results_name = 'rmse_{}_{}'.format(n_factors, reg_all)
            np.savetxt(results_path + results_name + '.csv', rmses[i,j,:], delimiter=",")
    return rmses

rmses = train_SVD()

testing with n_factors=10
testing with reg_all=0.0125892541179
testing with n_epochs=10
1.0439708018
1.04654341842
1.04526873732
1.05039099401
1.04903610241
rmse=1.04903610241
testing with n_epochs=20
1.04360218027
1.04397673835
1.04217639919
1.04757023967
1.04604159515
rmse=1.04604159515
testing with n_epochs=30
1.0505973366
1.04916194364
1.04694236825
1.05214879714
1.05069232409
rmse=1.05069232409
testing with n_epochs=40


KeyboardInterrupt: 

## Save Submission


In [5]:
import pickle
import pandas as pd

# create correct format
def create_submission_dataframe(df_simple):
    #print('Raw: \n',df_simple.head())
    df_simple["Id"] = "r" + df_simple["iid"].map(str) + "_c" +df_simple["uid"].map(str)
    df_simple["Prediction"] = df_simple["est"].clip(0,5)
    df_submission = df_simple.drop(["iid","uid","est","details","rui"],1)
    #print('Submission: \n',df_submission.head()) 
    return df_submission

def create_submition_csv(prediction, output_path):
    df_svd = pd.DataFrame(prediction, columns=['uid', 'iid', 'rui', 'est', 'details'])    
    df_svd_submission = create_submission_dataframe(df_svd)
    df_svd_submission.to_csv(output_path, columns=["Id","Prediction"],index=False)
    #print('Submission: \n', df_svd_submission.head())

    

## Run on whole data


In [7]:
def run_and_save(algo, fold_files, output_path):
    fold_data=load_folds_data(fold_files)
    mean_rmse, prediction = cross_validation(algo, fold_data)
    create_submition_csv(prediction, output_path)
    return mean_rmse

def run_all_algorithm(fold_files, output_prefix):
    
    rmse_list = []
    rmse_list.append(run_and_save(ALS_BaselineOnly(), fold_files, "../submission/"+output_prefix+"ALS_BaselineOnly.csv"))
    rmse_list.append(run_and_save(SGD_BaselineOnly(), fold_files, "../submission/"+output_prefix+"SGD_BaselineOnly.csv"))
    rmse_list.append(run_and_save(KNNBasic_ALS_pearson_baseline_user_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_ALS_pearson_baseline_user_based.csv"))
    rmse_list.append(run_and_save(KNNBasic_ALS_pearson_baseline_item_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_ALS_pearson_baseline_item_based.csv"))
    rmse_list.append(run_and_save(KNNBasic_pearson_baseline_user_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_pearson_baseline_user_based.csv"))
    #rmse_list.append(run_and_save(KNNBasic_pearson_baseline_item_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_pearson_baseline_item_based.csv"))
    #rmse_list.append(run_and_save(KNNBasic_cosine_user_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_cosine_user_based.csv"))
    #rmse_list.append(run_and_save(KNNBasic_cosine_item_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_cosine_item_based.csv"))
    #rmse_list.append(run_and_save(KNNBasic_pearson_user_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_pearson_user_based.csv"))
    #rmse_list.append(run_and_save(KNNBasic_msd_user_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_msd_user_based.csv"))
    #rmse_list.append(run_and_save(KNNBasic_msd_item_based(), fold_files, "../submission/"+output_prefix+"KNNBasic_msd_item_based.csv"))
    return rmse_list

#fold_files=[('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')]
#output_prefix='training_prediction_'
#rmse_list=run_all_algorithm(fold_files, output_prefix)
#print("rmse_list : ")
#print(rmse_list)

fold_files=[('../data/data_train_surprise.csv', '../data/sampleSubmission_surprise.csv')]
output_prefix='submission_'
rmse_list = run_all_algorithm(fold_files, output_prefix)
print("rmse_list : ")
print(rmse_list)


ALS_BaselineOnly
Estimating biases using als...
0.99595235615
SGD_BaselineOnly
Estimating biases using sgd...
1.00796981244
ALS_pearson_baseline_user_based
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
0.960286964805
ALS_pearson_baseline_item_based
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
0.960286964805
pearson_baseline_user_based
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
0.898054902323
rmse_list : 


NameError: name 'rmse_list' is not defined

In [None]:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
objects = ('ALS_Baseline', 
           'SGD_Baseline', 
           'KNNBasic ALS pearson baseline user based', 
           'KNNBasic_ALS_pearson_baseline_item_based', 
           'KNNBasic_pearson_baseline_user_based', 
           'KNNBasic_pearson_baseline_item_based',
           'KNNBasic_cosine_user_based',
           'KNNBasic_cosine_item_based',
           'KNNBasic_pearson_user_based',
           'KNNBasic_msd_user_based',
           'KNNBasic_msd_item_based')

y_pos=np.arange(11)
performance = [0.990, 0.98, 1.004, 0.990, 0.98, 1.004, 0.990, 0.98, 1.004, 0.990, 0.98]
 
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('RMSE')
plt.title('Method')
 
plt.show()