In [None]:
%matplotlib inline

# Load Data

In [None]:
# load data
from surprise import Dataset, Reader
def load_data(file_path):
    """
        Read a comma-separated ratings file into a surprise Dataset

        input:   file_path      -Path of the ratings file; the reader is configured to
                                 skip the first line and to parse each line as
                                 "item,rating,user"

        output:  dataset        -The object returned by Dataset.load_from_file
    """
    csv_reader = Reader(sep=',', skip_lines=1, line_format='item rating user')
    dataset = Dataset.load_from_file(file_path, reader=csv_reader)
    return dataset
       
def load_folds_data(fold_file_paths):
    """
        Read predefined (train file, test file) pairs into a surprise Dataset

        input:   fold_file_paths  -List of (train_path, test_path) tuples; every file is
                                   read with the first line skipped and each line parsed
                                   as "item,rating,user"

        output:  dataset          -The object returned by Dataset.load_from_folds
    """
    csv_reader = Reader(sep=',', skip_lines=1, line_format='item rating user')
    dataset = Dataset.load_from_folds(fold_file_paths, reader=csv_reader)
    return dataset
    
# Ratings file loaded as a single dataset
train_data = load_data('../data/data_train_surprise.csv')

# One predefined fold: (blending validation file, blending test file)
fold_data = load_folds_data([
    ('../data/blending_validation_surprise.csv',
     '../data/blending_test_surprise.csv'),
])

## Machine Learning Methods

In [None]:
from surprise.prediction_algorithms import NormalPredictor
from surprise.prediction_algorithms import BaselineOnly
from surprise.prediction_algorithms import KNNBasic
from surprise.prediction_algorithms import KNNWithMeans
from surprise.prediction_algorithms import KNNBaseline
from surprise.prediction_algorithms import SVD
#from surprise.prediction_algorithms import SVDpp
#from surprise.prediction_algorithms import SlopeOne
#from surprise.prediction_algorithms import co_clustering

#params = {'n_factors':12,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
#params = {'n_factors':100,'n_epochs':20,'lr_all':0.005,'reg_all':0.0359,'biased':True}
def my_SVD(n_factors, n_epochs, lr_all, reg_all, biased):
    """
        SVD method

        input:  n_factors    - The number of factors
                n_epochs     - The number of iteration of the SGD procedure
                lr_all       - The learning rate for all parameters.
                reg_all      - The regularization term for all parameters
                biased       - Whether to use baselines (or biases)

        output: algo         - SVD algorithm based on specified parameters
    """
    # `biased` has to go to the constructor. The previous code stored it in
    # algo.bsl_options, which SVD does not consult, so the flag had no effect.
    return SVD(n_factors=n_factors,
               n_epochs=n_epochs,
               lr_all=lr_all,
               reg_all=reg_all,
               biased=biased)

def ALS_BaselineOnly():
    """
        Build a BaselineOnly algorithm whose baselines are fitted with ALS

        input:  --      --
        output: algo    -BaselineOnly configured with method 'als', 5 epochs,
                         reg_u=15 and reg_i=10
    """
    print("ALS_BaselineOnly")
    # Original author's note next to these settings: 1.0004
    # (presumably the RMSE obtained with them -- not verifiable from this file)
    als_settings = dict(method='als', n_epochs=5, reg_u=15, reg_i=10)
    return BaselineOnly(bsl_options=als_settings)

def SGD_BaselineOnly():
    """
        Build a BaselineOnly algorithm whose baselines are fitted with SGD

        input:  --      --
        output: algo    -BaselineOnly configured with method 'sgd',
                         learning_rate=0.005 and reg=0.02
    """
    print("SGD_BaselineOnly")
    # Original author's note next to these settings: 1.0021
    # (presumably the RMSE obtained with them -- not verifiable from this file)
    sgd_settings = dict(method='sgd', learning_rate=.005, reg=0.02)
    return BaselineOnly(bsl_options=sgd_settings)


def KNNBasic_ALS_pearson_baseline_user_based():
    """
        KNNBasic method with user-based pearson_baseline similarity and ALS baselines

        input:  --      --
        output: algo    -KNNBasic using ALS baselines (20 epochs) and
                         pearson_baseline similarities computed between users
    """
    print("ALS_pearson_baseline_user_based")
    bsl_options = {'method': 'als',
                   'n_epochs': 20
                   }
    # 'user_based' is a similarity option, so it belongs in sim_options.
    # It was previously placed in bsl_options, where it is not a baseline setting.
    sim_options = {'name': 'pearson_baseline',
                   'user_based': True  # compute  similarities between users
                   }
    return KNNBasic(bsl_options=bsl_options, sim_options=sim_options)


def KNNBasic_ALS_pearson_baseline_item_based():
    """
        KNNBasic method with item-based pearson_baseline similarity and ALS baselines

        input:  --      --
        output: algo    -KNNBasic using ALS baselines (20 epochs) and
                         pearson_baseline similarities computed between items
    """
    print("ALS_pearson_baseline_item_based")
    bsl_options = {'method': 'als',
                   'n_epochs': 20
                   }
    # 'user_based' is a similarity option, so it belongs in sim_options.
    # It was previously placed in bsl_options, so sim_options never asked
    # for item-based similarities.
    sim_options = {'name': 'pearson_baseline',
                   'user_based': False  # compute  similarities between items
                   }
    return KNNBasic(bsl_options=bsl_options, sim_options=sim_options)

def KNNBasic_pearson_baseline_user_based():
    """
        KNNBasic method with pearson_baseline similarity between users

        input:  --      --
        output: algo    -KNNBasic with user-based pearson_baseline similarity, shrinkage 0
    """
    print("pearson_baseline_user_based")
    # shrinkage=0 means no shrinkage; user_based=True compares users
    similarity = dict(name='pearson_baseline', shrinkage=0, user_based=True)
    return KNNBasic(sim_options=similarity)

def KNNBasic_pearson_baseline_item_based():
    """
        KNNBasic method with pearson_baseline similarity between items

        input:  --      --
        output: algo    -KNNBasic with item-based pearson_baseline similarity, shrinkage 0
    """
    print("pearson_baseline_item_based")
    # shrinkage=0 means no shrinkage; user_based=False compares items
    similarity = dict(name='pearson_baseline', shrinkage=0, user_based=False)
    return KNNBasic(sim_options=similarity)


def KNNBasic_cosine_user_based():
    """
        KNNBasic method with cosine similarity between users

        input:  --      --
        output: algo    -KNNBasic with user-based cosine similarity
    """
    print("cosine_user_based")
    similarity = dict(name='cosine', user_based=True)  # True: compare users
    return KNNBasic(sim_options=similarity)

def KNNBasic_cosine_item_based():
    """
        KNNBasic method with cosine similarity between items

        input:  --      --
        output: algo    -KNNBasic with item-based cosine similarity
    """
    print("cosine_item_based")
    similarity = dict(name='cosine', user_based=False)  # False: compare items
    return KNNBasic(sim_options=similarity)

def KNNBasic_pearson_user_based():
    """
        KNNBasic method with pearson similarity between users

        input:  --      --
        output: algo    -KNNBasic with user-based pearson similarity
    """
    print("pearson_user_based")
    similarity = dict(name='pearson', user_based=True)  # True: compare users
    return KNNBasic(sim_options=similarity)

def KNNBasic_pearson_item_based():
    """
        KNNBasic method with pearson similarity between items

        input:  --      --
        output: algo    -KNNBasic with item-based pearson similarity
    """
    print("pearson_item_based")
    similarity = dict(name='pearson', user_based=False)  # False: compare items
    return KNNBasic(sim_options=similarity)

def KNNBasic_msd_user_based():
    """
        KNNBasic method with msd similarity between users

        input:  --      --
        output: algo    -KNNBasic with user-based msd similarity
    """
    print("msd_user_based")
    similarity = dict(name='msd', user_based=True)  # True: compare users
    return KNNBasic(sim_options=similarity)

def KNNBasic_msd_item_based():
    """
        KNNBasic method with msd similarity between items

        input:  --      --
        output: algo    -KNNBasic with item-based msd similarity
    """
    print("msd_item_based")
    similarity = dict(name='msd', user_based=False)  # False: compare items
    return KNNBasic(sim_options=similarity)

def KNNBaseline_pearson_baseline_user_based():
    """
        KNNBaseline method with pearson_baseline similarity between users

        input:  --      --
        output: algo    -KNNBaseline with user-based pearson_baseline similarity, shrinkage 0
    """
    print("pearson_baseline_user_based")
    # shrinkage=0 means no shrinkage; user_based=True compares users
    similarity = dict(name='pearson_baseline', shrinkage=0, user_based=True)
    return KNNBaseline(sim_options=similarity)

def KNNBaseline_pearson_baseline_item_based():
    """
        KNNBaseline method with pearson_baseline similarity between items

        input:  --      --
        output: algo    -KNNBaseline with item-based pearson_baseline similarity, shrinkage 0
    """
    print("pearson_baseline_item_based")
    # shrinkage=0 means no shrinkage; user_based=False compares items
    similarity = dict(name='pearson_baseline', shrinkage=0, user_based=False)
    return KNNBaseline(sim_options=similarity)


def KNNBaseline_cosine_user_based():
    """
        KNNBaseline method with cosine similarity between users

        input:  --      --
        output: algo    -KNNBaseline with user-based cosine similarity
    """
    print("cosine_user_based")
    similarity = dict(name='cosine', user_based=True)  # True: compare users
    return KNNBaseline(sim_options=similarity)

def KNNBaseline_cosine_item_based():
    """
        KNNBaseline method with cosine similarity between items

        input:  --      --
        output: algo    -KNNBaseline with item-based cosine similarity
    """
    print("cosine_item_based")
    similarity = dict(name='cosine', user_based=False)  # False: compare items
    return KNNBaseline(sim_options=similarity)

def KNNBaseline_pearson_user_based():
    """
        KNNBaseline method with pearson similarity between users

        input:  --      --
        output: algo    -KNNBaseline with user-based pearson similarity
    """
    print("pearson_user_based")
    similarity = dict(name='pearson', user_based=True)  # True: compare users
    return KNNBaseline(sim_options=similarity)

def KNNBaseline_pearson_item_based():
    """
        KNNBaseline method with pearson similarity between items

        input:  --      --
        output: algo    -KNNBaseline with item-based pearson similarity
    """
    print("pearson_item_based")
    similarity = dict(name='pearson', user_based=False)  # False: compare items
    return KNNBaseline(sim_options=similarity)

def KNNBaseline_msd_user_based():
    """
        KNNBaseline method with msd similarity between users

        input:  --      --
        output: algo    -KNNBaseline with user-based msd similarity
    """
    print("msd_user_based")
    similarity = dict(name='msd', user_based=True)  # True: compare users
    return KNNBaseline(sim_options=similarity)

def KNNBaseline_msd_item_based():
    """
        KNNBaseline method with msd similarity between items

        input:  --      --
        output: algo    -KNNBaseline with item-based msd similarity
    """
    print("msd_item_based")
    similarity = dict(name='msd', user_based=False)  # False: compare items
    return KNNBaseline(sim_options=similarity)

def KNNWithMeans_pearson_baseline_user_based():
    """
        KNNWithMeans method with pearson_baseline similarity between users

        input:  --      --
        output: algo    -KNNWithMeans with user-based pearson_baseline similarity, shrinkage 0
    """
    print("pearson_baseline_user_based")
    # shrinkage=0 means no shrinkage; user_based=True compares users
    similarity = dict(name='pearson_baseline', shrinkage=0, user_based=True)
    return KNNWithMeans(sim_options=similarity)

def KNNWithMeans_pearson_baseline_item_based():
    """
        KNNWithMeans method with pearson_baseline similarity between items

        input:  --      --
        output: algo    -KNNWithMeans with item-based pearson_baseline similarity, shrinkage 0
    """
    print("pearson_baseline_item_based")
    # shrinkage=0 means no shrinkage; user_based=False compares items
    similarity = dict(name='pearson_baseline', shrinkage=0, user_based=False)
    return KNNWithMeans(sim_options=similarity)


def KNNWithMeans_cosine_user_based():
    """
        KNNWithMeans method with cosine similarity between users

        input:  --      --
        output: algo    -KNNWithMeans with user-based cosine similarity
    """
    print("cosine_user_based")
    similarity = dict(name='cosine', user_based=True)  # True: compare users
    return KNNWithMeans(sim_options=similarity)

def KNNWithMeans_cosine_item_based():
    """
        KNNWithMeans method with cosine similarity between items

        input:  --      --
        output: algo    -KNNWithMeans with item-based cosine similarity
    """
    print("cosine_item_based")
    similarity = dict(name='cosine', user_based=False)  # False: compare items
    return KNNWithMeans(sim_options=similarity)

def KNNWithMeans_pearson_user_based():
    """
        KNNWithMeans method with pearson similarity between users

        input:  --      --
        output: algo    -KNNWithMeans with user-based pearson similarity
    """
    print("pearson_user_based")
    similarity = dict(name='pearson', user_based=True)  # True: compare users
    return KNNWithMeans(sim_options=similarity)

def KNNWithMeans_pearson_item_based():
    """
        KNNWithMeans method with pearson similarity between items

        input:  --      --
        output: algo    -KNNWithMeans with item-based pearson similarity
    """
    print("pearson_item_based")
    similarity = dict(name='pearson', user_based=False)  # False: compare items
    return KNNWithMeans(sim_options=similarity)

def KNNWithMeans_msd_user_based():
    """
        KNNWithMeans method with msd similarity between users

        input:  --      --
        output: algo    -KNNWithMeans with user-based msd similarity
    """
    print("msd_user_based")
    similarity = dict(name='msd', user_based=True)  # True: compare users
    return KNNWithMeans(sim_options=similarity)

def KNNWithMeans_msd_item_based():
    """
        KNNWithMeans method with msd similarity between items

        input:  --      --
        output: algo    -KNNWithMeans with item-based msd similarity
    """
    print("msd_item_based")
    similarity = dict(name='msd', user_based=False)  # False: compare items
    return KNNWithMeans(sim_options=similarity)


## Cross Validation

In [None]:
# train algorithm.
import numpy as np
from surprise.evaluate import evaluate
from surprise.accuracy import rmse
from surprise.dump import dump

    
def cross_validation(algo, fold_data):
    """
        Cross validation on folded data by specified algorithm.

        input:   algo           -Learning algorithm method; it is re-trained on every fold
                 fold_data      -Object whose folds() yields (trainset, testset) pairs

        output:  rmse_mean      -The mean of rmse over all (trainset, testset) folds
                 prediction     -The prediction on the testset of the LAST fold only
                                 (predictions of earlier folds are not kept)

        raises:  ValueError     -If fold_data yields no fold at all
    """

    rmse_list = []
    prediction = None
    for trainset, testset in fold_data.folds():
        # train the learning model on trainset using given algorithm
        algo.train(trainset)
        # predict the result on testset using the trained model
        prediction = algo.test(testset)

        # compute and report the rmse of this fold
        rmse_k = rmse(prediction, verbose=False)
        print(rmse_k)
        rmse_list.append(rmse_k)

    # Without any fold the mean would be NaN and `prediction` would never have
    # been assigned, so fail with an explicit message.
    if not rmse_list:
        raise ValueError("fold_data produced no (trainset, testset) folds")

    rmse_mean = np.mean(rmse_list)
    print(rmse_mean)
    return rmse_mean, prediction


# Build the algorithm under test (user-based cosine KNNWithMeans)
algo=KNNWithMeans_cosine_user_based()
#fold_data=load_folds_data([('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')])
# Split train_data into 3 folds; note that the two calls below are given fold_data, not train_data
train_data.split(n_folds=3)
# Train/score on fold_data with the local helper; `prediction` holds the last fold's predictions
mean_rmse, prediction=cross_validation(algo, fold_data)
print("mean_rmse {}".format(mean_rmse))
# Run surprise's own evaluate() on the same algorithm and folds (RMSE only, no dump)
perf=evaluate(algo, fold_data, measures=['rmse'], with_dump=False, dump_dir=None, verbose=1)

## Find the Best Parameters of SVD

In [None]:
def train_SVD():
    """
        Find best parameters for SVD algorithm.

        Runs cross_validation on the global train_data for every combination of
        n_factors, reg_all and n_epochs, and writes the rmse values of each
        (n_factors, reg_all) pair to '../results/SGD_surprise/rmse_<n_factors>_<reg_all>.csv'.

        input:   --      --

        output:  rmses   -The rmse array of running SVD on all parameters set,
                          indexed as [n_factors, reg_all, n_epochs]
    """
    import os  # local import: only needed to prepare the results directory

    #initialize parameters
    n_factors_range = np.array([10,15,20])    #number of columns
    reg_all_range = np.logspace(-1.9,-1,10)   #The regularization term for all parameters
    n_epochs_range = np.arange(10,60,10)      #The number of iteration of the SGD procedure.
    lr_all=0.005                              #The learning rate for all parameters
    biased=True                               #use baselines (or biases)

    results_path = '../results/SGD_surprise/'
    # Create the output directory before the search starts, so a missing
    # directory cannot make np.savetxt fail after rmse values were computed.
    os.makedirs(results_path, exist_ok=True)

    rmses = np.empty((len(n_factors_range),len(reg_all_range), len(n_epochs_range)))

    for i,n_factors in enumerate(n_factors_range):
        print('testing with n_factors={}'.format(n_factors))
        for j,reg_all in enumerate(reg_all_range):
            print('testing with reg_all={}'.format(reg_all))
            for k,n_epochs in enumerate(n_epochs_range):
                print('testing with n_epochs={}'.format(n_epochs))

                #build SVD for the given parameters; both counts are handed over
                #as plain Python ints rather than numpy integers
                algo=my_SVD(int(n_factors), int(n_epochs), lr_all, reg_all, biased)

                #cross validation on train_data and compute rmse
                rmses[i,j,k],_=cross_validation(algo, train_data)
                print('rmse={}'.format(rmses[i,j,k]))

            #one file per (n_factors, reg_all) pair, one value per n_epochs setting
            results_name = 'rmse_{}_{}'.format(n_factors, reg_all)
            np.savetxt(results_path + results_name + '.csv', rmses[i,j,:], delimiter=",")
    return rmses

# Split train_data into 3 folds; train_SVD() cross-validates on this global train_data
train_data.split(n_folds=3)
# Grid search over the SVD parameters; the result is indexed [n_factors, reg_all, n_epochs]
rmses = train_SVD()

## Save Submission


In [None]:
import pickle
import pandas as pd

# create correct format
def create_submission_dataframe(df_simple):
    """
        Convert a data frame in simple format to a data frame in submission format

        input:   df_simple      -Data frame with at least the columns
                                 'uid', 'iid', 'rui', 'est' and 'details';
                                 it is left unmodified

        output:  df_submission  -New data frame in which those five columns are
                                 replaced by 'Id' ("r<iid>_c<uid>") and
                                 'Prediction' ('est' clipped to the range [0, 5])
    """
    # assign() works on a copy, so the caller's frame does not gain extra columns
    df_with_submission_columns = df_simple.assign(
        Id="r" + df_simple["iid"].map(str) + "_c" + df_simple["uid"].map(str),
        Prediction=df_simple["est"].clip(0, 5),
    )
    # Use the columns= keyword: passing the axis positionally is rejected by pandas 2.x
    df_submission = df_with_submission_columns.drop(
        columns=["iid", "uid", "est", "details", "rui"]
    )
    return df_submission

def create_submition_csv(prediction, output_path):
    """Write the final predictions to a csv file in submission format

       input:   prediction      -Iterable of 5-field records, read in the order
                                 (uid, iid, rui, est, details)
                output_path     -The submission file path

       output:  --              --
    """

    column_names = ['uid', 'iid', 'rui', 'est', 'details']
    raw_frame = pd.DataFrame(prediction, columns=column_names)
    submission_frame = create_submission_dataframe(raw_frame)
    # Only the two submission columns are written, without the index
    submission_frame.to_csv(output_path, columns=["Id","Prediction"], index=False)

    

## Run on whole data


In [None]:
def run_and_save(algo, fold_files, output_path):
    """ 
        Train and test the given algorithm on the folds read from fold_files,
        then write the resulting predictions to output_path

        input:   algo            -Learning algorithm
                 fold_files      -List of file paths of (trainset, testset)
                 output_path     -Output file path

        output:  mean_rmse       -The mean rmse returned by cross_validation
                                  for the loaded folds

    """

    folds = load_folds_data(fold_files)
    # cross_validation returns the mean rmse together with the predictions to save
    rmse_value, predictions_to_save = cross_validation(algo, folds)
    create_submition_csv(predictions_to_save, output_path)
    return rmse_value

def run_all_algorithm(fold_files, output_prefix):
    """ 
        Run every enabled ML method through run_and_save

        Each method writes its predictions to
        "../submission/" + output_prefix + <factory function name> + ".csv".

        input:
                 fold_files      -List of file paths of (trainset, testset)
                 output_prefix   -The prefix of output file name

        output:  rmse_list       -The rmse returned by run_and_save for each
                                  enabled method, in the order listed below

    """

    # Currently disabled factories (add them to the tuple to re-enable):
    #   ALS_BaselineOnly, SGD_BaselineOnly,
    #   KNNBasic_ALS_pearson_baseline_user_based, KNNBasic_ALS_pearson_baseline_item_based,
    #   KNNBasic_pearson_baseline_user_based, KNNBasic_pearson_baseline_item_based,
    #   KNNBasic_cosine_user_based, KNNBasic_cosine_item_based,
    #   KNNBasic_pearson_user_based, KNNBasic_msd_user_based, KNNBasic_msd_item_based,
    #   KNNWithMeans_pearson_baseline_item_based, KNNBaseline_pearson_baseline_item_based
    enabled_factories = (
        KNNWithMeans_pearson_baseline_user_based,
        KNNWithMeans_cosine_user_based,
        KNNWithMeans_cosine_item_based,
        KNNWithMeans_pearson_user_based,
        KNNWithMeans_msd_user_based,
        KNNWithMeans_msd_item_based,
        KNNBaseline_pearson_baseline_user_based,
        KNNBaseline_cosine_user_based,
        KNNBaseline_cosine_item_based,
        KNNBaseline_pearson_user_based,
        KNNBaseline_msd_user_based,
        KNNBaseline_msd_item_based,
    )

    rmse_list = []
    for make_algo in enabled_factories:
        algo = make_algo()
        # The output file is named after the factory function itself
        csv_path = "../submission/" + output_prefix + make_algo.__name__ + ".csv"
        rmse_list.append(run_and_save(algo, fold_files, csv_path))

    return rmse_list

# Two runs of every enabled method, each described by (fold_files, output_prefix):
#   1. train file: blending_train,  test file: data_train        -> "training_prediction_*"
#   2. train file: data_train,      test file: sampleSubmission  -> "submission_*"
for fold_files, output_prefix in (
    ([('../data/blending_train_surprise.csv', '../data/data_train_surprise.csv')],
     'training_prediction_'),
    ([('../data/data_train_surprise.csv', '../data/sampleSubmission_surprise.csv')],
     'submission_'),
):
    rmse_list = run_all_algorithm(fold_files, output_prefix)
    print("rmse_list : ")
    print(rmse_list)


In [None]:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
# Method names shown on the x axis, one per bar
objects = ('ALS_Baseline', 
           'SGD_Baseline', 
           'KNNBasic ALS pearson baseline user based', 
           'KNNBasic_ALS_pearson_baseline_item_based', 
           'KNNBasic_pearson_baseline_user_based', 
           'KNNBasic_pearson_baseline_item_based',
           'KNNBasic_cosine_user_based',
           'KNNBasic_cosine_item_based',
           'KNNBasic_pearson_user_based',
           'KNNBasic_msd_user_based',
           'KNNBasic_msd_item_based')

# Hard-coded RMSE values, listed in the same order as `objects`
rmses = [0.99595235614951694, 1.0079698124442131, 0.96028696480475029, 
         0.96028696480475029, 0.89805490232295371, 0.87317148180816972, 
         1.2899446549009248, 0.9458979465046482, 0.86038596275603796, 1.0858670369478618, 0.95430388112275488]

# Every label needs exactly one value, otherwise plt.bar / plt.xticks would mismatch
assert len(objects) == len(rmses), "each method needs exactly one RMSE value"

# Derive the bar positions from the labels instead of hard-coding their count
y_pos = np.arange(len(objects))

plt.bar(y_pos, rmses, align='center', alpha=0.5)
# The labels are long; rotate them so neighbouring ticks do not overlap
plt.xticks(y_pos, objects, rotation=90)
plt.ylabel('RMSE')
plt.title('Method')

plt.show()