# Recommender System Model Selection
In this notebook, we implement several popular algorithms to build a recommender system.

The whole notebook is based on the Python package [Surprise](http://surpriselib.com/).

In [1]:
import warnings
import time
import random
import numpy as np
import pandas as pd

from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import Dataset
from surprise import evaluate
from surprise import print_perf
from surprise import Reader

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Start of the algorithm: record the wall-clock start time so the final cell
# can report the notebook's total running time as `t_end - t_start`.
t_start = time.time()

# Build recommender system and make predictions

In [3]:
# Describe the layout of the ratings file: each line holds the fields
# user, item, rating and timestamp (in that order), separated by '::'.
reader = Reader(
    sep=r'::',
    line_format='user item rating timestamp',
)

# Load './data/ratings.dat' through that reader into a Surprise Dataset.
data = Dataset.load_from_file('./data/ratings.dat', reader=reader)

In [4]:
# Split into 5 folds for cross-validation.
#
# Seed BOTH random number generators before anything stochastic runs:
#   * Python's `random` is seeded ahead of the shuffled split below;
#   * NumPy's global RNG is seeded as well, because (per the Surprise FAQ on
#     reproducible experiments) both seeds must be fixed for results to be
#     repeatable across runs.
# Re-run the notebook from a fresh kernel to get the same numbers again.
random.seed(2017)
np.random.seed(2017)
data.split(n_folds=5, shuffle=True)

# 1. [Normal Predictor Algorithm](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor)

In [5]:
# Instantiate NormalPredictor with its default settings.
algo1 = NormalPredictor()

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf1 = evaluate(
    algo1,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf1)
print('Mean RMSE is:\t', np.mean(perf1['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    1.5077  1.5084  1.5031  1.5029  1.5114  1.5067  
Mean RMSE is:	 1.50669673281


# 2. [Baseline Only Algorithm](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly)

In [6]:
# Baseline-estimate settings: method 'als', 10 epochs, reg_i=10, reg_u=15.
bsl_options = {
    'method': 'als',
    'n_epochs': 10,
    'reg_i': 10,
    'reg_u': 15,
}

# BaselineOnly configured with the options above.
algo2 = BaselineOnly(bsl_options=bsl_options)

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf2 = evaluate(
    algo2,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf2)
print('Mean RMSE is:\t', np.mean(perf2['rmse']))

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9109  0.9082  0.9059  0.9084  0.9096  0.9086  
Mean RMSE is:	 0.908611796765


# 3. [KNN Basic Algorithm](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic)

In [7]:
# Similarity settings: cosine similarity computed between users.
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'cosine'.
sim_options = {'name': 'cosine',
               'user_based': True}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo3 = KNNBasic(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf3 = evaluate(algo3, data, measures=['RMSE'], verbose=False)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9788  0.9763  0.9739  0.9762  0.9773  0.9765  
Mean RMSE is:	 0.976519164097


In [8]:
# Similarity settings: cosine similarity computed between items
# ('user_based': False).
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'cosine'.
sim_options = {'name': 'cosine',
               'user_based': False}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo3 = KNNBasic(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf3 = evaluate(algo3, data, measures=['RMSE'], verbose=False)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    1.0015  0.9992  0.9962  0.9991  0.9997  0.9991  
Mean RMSE is:	 0.999130903852


In [9]:
# Similarity settings: 'msd' similarity computed between users.
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'msd'.
sim_options = {'name': 'msd',
               'user_based': True}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo3 = KNNBasic(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf3 = evaluate(algo3, data, measures=['RMSE'], verbose=False)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9251  0.9221  0.9201  0.9232  0.9237  0.9228  
Mean RMSE is:	 0.922827165445


In [10]:
# Similarity settings: 'msd' similarity computed between items
# ('user_based': False).
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'msd'.
sim_options = {'name': 'msd',
               'user_based': False}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo3 = KNNBasic(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf3 = evaluate(algo3, data, measures=['RMSE'], verbose=False)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9172  0.9141  0.9115  0.9150  0.9150  0.9145  
Mean RMSE is:	 0.914542128492


In [11]:
# Similarity settings: 'pearson' similarity computed between users.
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on plain 'pearson'.
sim_options = {'name': 'pearson',
               'user_based': True}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo3 = KNNBasic(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf3 = evaluate(algo3, data, measures=['RMSE'], verbose=False)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9646  0.9621  0.9587  0.9618  0.9632  0.9621  
Mean RMSE is:	 0.962065276915


In [12]:
# Similarity settings: 'pearson' similarity computed between items
# ('user_based': False).
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on plain 'pearson'.
sim_options = {'name': 'pearson',
               'user_based': False}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo3 = KNNBasic(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf3 = evaluate(algo3, data, measures=['RMSE'], verbose=False)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9976  0.9941  0.9906  0.9949  0.9965  0.9947  
Mean RMSE is:	 0.994737329049


In [13]:
# Similarity settings: 'pearson_baseline' computed between users,
# with a shrinkage value of 100.
sim_options = {
    'name': 'pearson_baseline',
    'user_based': True,
    'shrinkage': 100,
}

# KNNBasic with neighbourhood parameters k=40, min_k=1.
algo3 = KNNBasic(
    k=40,
    min_k=1,
    sim_options=sim_options,
)

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf3 = evaluate(
    algo3,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9287  0.9268  0.9247  0.9268  0.9287  0.9272  
Mean RMSE is:	 0.927154646769


In [14]:
# Similarity settings: 'pearson_baseline' computed between items
# ('user_based': False), with a shrinkage value of 100.
sim_options = {
    'name': 'pearson_baseline',
    'user_based': False,
    'shrinkage': 100,
}

# KNNBasic with neighbourhood parameters k=40, min_k=1.
algo3 = KNNBasic(
    k=40,
    min_k=1,
    sim_options=sim_options,
)

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf3 = evaluate(
    algo3,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9293  0.9263  0.9248  0.9273  0.9272  0.9270  
Mean RMSE is:	 0.926971165772


# 4. [KNN with Means Algorithm](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans)

In [15]:
# Similarity settings: 'msd' similarity computed between users.
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'msd'.
sim_options = {'name': 'msd',
               'user_based': True}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo4 = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf4 = evaluate(algo4, data, measures=['RMSE'], verbose=False)
print_perf(perf4)
print('Mean RMSE is:\t', np.mean(perf4['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9324  0.9286  0.9250  0.9295  0.9304  0.9292  
Mean RMSE is:	 0.929183217612


In [16]:
# Similarity settings: 'msd' similarity computed between items
# ('user_based': False).
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'msd'.
sim_options = {'name': 'msd',
               'user_based': False}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo4 = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf4 = evaluate(algo4, data, measures=['RMSE'], verbose=False)
print_perf(perf4)
print('Mean RMSE is:\t', np.mean(perf4['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8883  0.8843  0.8832  0.8855  0.8867  0.8856  
Mean RMSE is:	 0.885608554713


# 5. [KNN Baseline Algorithm](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline)

In [17]:
# define parameters
# Similarity: 'msd' computed between users.
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'msd'.
sim_options = {'name': 'msd',
               'user_based': True}

# Baseline-estimate settings: method 'als', 10 epochs, reg_i=10, reg_u=15.
bsl_options = {'method': 'als',
               'n_epochs': 10,
               'reg_i': 10,
               'reg_u': 15}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo5 = KNNBaseline(k=40, min_k=1, sim_options=sim_options, bsl_options=bsl_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf5 = evaluate(algo5, data, measures=['RMSE'], verbose=False)
print_perf(perf5)
print('Mean RMSE is:\t', np.mean(perf5['rmse']))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8976  0.8939  0.8923  0.8952  0.8957  0.8949  
Mean RMSE is:	 0.894939040978


In [18]:
# define parameters
# Similarity: 'msd' computed between items ('user_based': False).
# No `shrinkage` entry: per the Surprise docs that option is only used by the
# 'pearson_baseline' similarity, so it has no effect on 'msd'.
sim_options = {'name': 'msd',
               'user_based': False}

# Baseline-estimate settings: method 'als', 10 epochs, reg_i=10, reg_u=15.
bsl_options = {'method': 'als',
               'n_epochs': 10,
               'reg_i': 10,
               'reg_u': 15}

# build the algorithm (neighbourhood parameters k=40, min_k=1)
algo5 = KNNBaseline(k=40, min_k=1, sim_options=sim_options, bsl_options=bsl_options)

# evaluate the performance of the algorithm: RMSE on each fold of `data`,
# printed as a per-fold table followed by the mean across folds
perf5 = evaluate(algo5, data, measures=['RMSE'], verbose=False)
print_perf(perf5)
print('Mean RMSE is:\t', np.mean(perf5['rmse']))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8880  0.8843  0.8830  0.8854  0.8864  0.8854  
Mean RMSE is:	 0.885442839885


# 6. [NMF Algorithm](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF)

In [25]:
# NMF with n_factors=15 and n_epochs=50, unbiased variant (biased=False).
# Every regularisation, learning-rate and initialisation value is passed
# explicitly so the full configuration is visible in one place.
algo8 = NMF(
    n_factors=15,
    n_epochs=50,
    biased=False,
    reg_pu=.06,
    reg_qi=.06,
    reg_bu=.02,
    reg_bi=.02,
    lr_bu=.005,
    lr_bi=.005,
    init_low=0,
    init_high=1,
    verbose=False,
)

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf8 = evaluate(
    algo8,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf8)
print('Mean RMSE is:\t', np.mean(perf8['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9191  0.9167  0.9160  0.9172  0.9180  0.9174  
Mean RMSE is:	 0.917406759606


# 7. [Slope One Algorithm](http://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne)

In [26]:
# Instantiate SlopeOne; it is constructed without any arguments.
algo9 = SlopeOne()

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf9 = evaluate(
    algo9,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf9)
print('Mean RMSE is:\t', np.mean(perf9['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9093  0.9058  0.9039  0.9063  0.9071  0.9065  
Mean RMSE is:	 0.906460992007


# 8. [Co-Clustering Algorithm](http://surprise.readthedocs.io/en/stable/co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering)

In [27]:
# CoClustering with n_cltr_u=3, n_cltr_i=3 and n_epochs=20.
algo10 = CoClustering(
    n_cltr_u=3,
    n_cltr_i=3,
    n_epochs=20,
    verbose=False,
)

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf10 = evaluate(
    algo10,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf10)
print('Mean RMSE is:\t', np.mean(perf10['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9178  0.9168  0.9113  0.9158  0.9165  0.9156  
Mean RMSE is:	 0.91563420467


# 9. [SVD Algorithm](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)

In [19]:
# SVD with n_factors=100, n_epochs=20 and biased=True.
# One shared learning rate (lr_all) and one shared regularisation term
# (reg_all) are supplied; every per-parameter override is passed as None.
algo6 = SVD(
    n_factors=100,
    n_epochs=20,
    biased=True,
    init_mean=0,
    init_std_dev=.1,
    lr_all=.005,
    reg_all=.02,
    lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None,
    reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None,
    verbose=False,
)

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf6 = evaluate(
    algo6,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf6)
print('Mean RMSE is:\t', np.mean(perf6['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8756  0.8727  0.8727  0.8742  0.8748  0.8740  
Mean RMSE is:	 0.874007308912


# 10. [SVD++ Algorithm](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp)

In [24]:
# SVDpp with n_factors=20 and n_epochs=20.
# One shared learning rate (lr_all) and one shared regularisation term
# (reg_all) are supplied; every per-parameter override is passed as None.
algo7 = SVDpp(
    n_factors=20,
    n_epochs=20,
    init_mean=0,
    init_std_dev=.1,
    lr_all=.007,
    reg_all=.02,
    lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None, lr_yj=None,
    reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None, reg_yj=None,
    verbose=False,
)

# Score it on the folds of `data` using RMSE only, then show the per-fold
# table followed by the mean RMSE across folds.
perf7 = evaluate(
    algo7,
    data,
    measures=['RMSE'],
    verbose=False,
)
print_perf(perf7)
print('Mean RMSE is:\t', np.mean(perf7['rmse']))

# End of the program

In [31]:
# Capture the end timestamp and report how long the notebook took, using the
# `t_start` value recorded in the first timing cell.
t_end = time.time()

print("Program running time\n")
# One print call emits the three report lines, each on its own line.
print(
    'Start time:\t{0:10.0f}'.format(t_start),
    'Stop  time:\t{0:10.0f}'.format(t_end),
    'Total time:\t{0:10.2f} seconds'.format(t_end - t_start),
    sep='\n',
)

Program running time

Start time:	1494021067
Stop  time:	1494038752
Total time:	  17684.45 seconds
