# Recommender System Model Selection
In this notebook, we evaluate several popular algorithms for building a recommender system and compare their cross-validated RMSE.

The whole notebook is based on the Python package [Surprise](http://surpriselib.com/).

In [1]:
import warnings
import time
import random
import numpy as np
import pandas as pd

from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import Dataset
from surprise import evaluate
from surprise import print_perf
from surprise import Reader

# Suppress every warning category from here on. Note that this also hides
# deprecation warnings raised by the imported libraries.
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Record the wall-clock start time (as returned by time.time()) so that the
# total running time can be reported in the final cell of the notebook.
t_start = time.time()

# Build recommender system and make predictions

In [3]:
# Describe how each line of the ratings file is laid out: the fields
# user, item, rating and timestamp, separated by '::'.
reader = Reader(sep=r'::', line_format='user item rating timestamp')

# Load the ratings file through that reader.
data = Dataset.load_from_file('./data/ratings.dat', reader=reader)

In [4]:
# Split the ratings into 5 shuffled folds for cross-validation.
#
# Both Python's `random` module and NumPy's global generator are seeded, as
# the Surprise FAQ recommends for reproducible experiments. Seeding only
# `random` (as was done before) leaves any NumPy-based randomness inside the
# algorithms evaluated below free to vary between runs.
#
# NOTE(review): `Dataset.split` and `evaluate` are marked as deprecated in
# later Surprise releases in favour of `surprise.model_selection`; confirm
# the installed version before re-running this notebook.
random.seed(2017)
np.random.seed(2017)
data.split(n_folds=5, shuffle=True)

# 1. [Normal Predictor Algorithm](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor)

In [5]:
# Normal predictor with its default settings.
algo1 = NormalPredictor()

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf1 = evaluate(algo1, data, verbose=False, measures=['RMSE'])
print_perf(perf1)
print('Mean RMSE is:\t', np.mean(perf1['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    1.5073  1.5055  1.5092  1.5076  1.5052  1.5069  
Mean RMSE is:	 1.50694780837


# 2. [Baseline Only Algorithm](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly)

In [6]:
# Baseline-estimate settings: ALS method, 10 epochs, with separate
# regularisation values for items (reg_i) and users (reg_u).
bsl_options = {'method': 'als', 'n_epochs': 10, 'reg_i': 10, 'reg_u': 15}

# Baseline-only predictor configured with the options above.
algo2 = BaselineOnly(bsl_options=bsl_options)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf2 = evaluate(algo2, data, verbose=False, measures=['RMSE'])
print_perf(perf2)
print('Mean RMSE is:\t', np.mean(perf2['rmse']))

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9079  0.9098  0.9102  0.9089  0.9069  0.9087  
Mean RMSE is:	 0.908747425624


# 3. [KNN Basic Algorithm](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic)

In [7]:
# Similarity settings: cosine measure, computed between users.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'cosine', 'user_based': True, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1.
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9758  0.9784  0.9773  0.9760  0.9757  0.9767  
Mean RMSE is:	 0.97666155559


In [8]:
# Similarity settings: cosine measure, computed between items.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'cosine', 'user_based': False, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1 (replaces the previous algo3).
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    1.0003  0.9988  1.0023  0.9990  0.9965  0.9994  
Mean RMSE is:	 0.999355412231


In [9]:
# Similarity settings: MSD measure, computed between users.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'msd', 'user_based': True, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1 (replaces the previous algo3).
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9225  0.9251  0.9240  0.9223  0.9220  0.9232  
Mean RMSE is:	 0.923177542629


In [10]:
# Similarity settings: MSD measure, computed between items.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'msd', 'user_based': False, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1 (replaces the previous algo3).
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9149  0.9146  0.9168  0.9145  0.9118  0.9145  
Mean RMSE is:	 0.914516975989


In [11]:
# Similarity settings: Pearson measure, computed between users.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'pearson', 'user_based': True, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1 (replaces the previous algo3).
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9610  0.9649  0.9626  0.9615  0.9605  0.9621  
Mean RMSE is:	 0.962100176285


In [12]:
# Similarity settings: Pearson measure, computed between items.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'pearson', 'user_based': False, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1 (replaces the previous algo3).
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9935  0.9950  0.9968  0.9937  0.9937  0.9945  
Mean RMSE is:	 0.994543795367


In [13]:
# Similarity settings: baseline-centred Pearson measure, computed between
# users. According to the Surprise docs, this is the measure the 'shrinkage'
# option applies to.
sim_options = {'name': 'pearson_baseline', 'user_based': True, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1 (replaces the previous algo3).
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9266  0.9291  0.9276  0.9259  0.9256  0.9270  
Mean RMSE is:	 0.926953598489


In [14]:
# Similarity settings: baseline-centred Pearson measure, computed between
# items. According to the Surprise docs, this is the measure the 'shrinkage'
# option applies to.
sim_options = {'name': 'pearson_baseline', 'user_based': False, 'shrinkage': 100}

# Basic k-NN predictor with k=40 and min_k=1 (replaces the previous algo3).
algo3 = KNNBasic(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf3 = evaluate(algo3, data, verbose=False, measures=['RMSE'])
print_perf(perf3)
print('Mean RMSE is:\t', np.mean(perf3['rmse']))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9276  0.9265  0.9285  0.9254  0.9256  0.9267  
Mean RMSE is:	 0.926728766068


# 4. [KNN with Means Algorithm](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans)

In [15]:
# Similarity settings: MSD measure, computed between users.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'msd', 'user_based': True, 'shrinkage': 100}

# Mean-centred k-NN predictor with k=40 and min_k=1.
algo4 = KNNWithMeans(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf4 = evaluate(algo4, data, verbose=False, measures=['RMSE'])
print_perf(perf4)
print('Mean RMSE is:\t', np.mean(perf4['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9281  0.9312  0.9311  0.9288  0.9283  0.9295  
Mean RMSE is:	 0.92952396701


In [16]:
# Similarity settings: MSD measure, computed between items.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'msd', 'user_based': False, 'shrinkage': 100}

# Mean-centred k-NN predictor with k=40 and min_k=1 (replaces the previous
# algo4).
algo4 = KNNWithMeans(sim_options=sim_options, min_k=1, k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf4 = evaluate(algo4, data, verbose=False, measures=['RMSE'])
print_perf(perf4)
print('Mean RMSE is:\t', np.mean(perf4['rmse']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8852  0.8862  0.8873  0.8866  0.8830  0.8857  
Mean RMSE is:	 0.885665349163


# 5. [KNN Baseline Algorithm](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline)

In [17]:
# Similarity settings: MSD measure, computed between users.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'msd', 'user_based': True, 'shrinkage': 100}

# Baseline-estimate settings: ALS method, 10 epochs, with separate
# regularisation values for items (reg_i) and users (reg_u).
bsl_options = {'method': 'als', 'n_epochs': 10, 'reg_i': 10, 'reg_u': 15}

# k-NN predictor that takes baseline ratings into account, with k=40 and
# min_k=1.
algo5 = KNNBaseline(bsl_options=bsl_options,
                    sim_options=sim_options,
                    min_k=1,
                    k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf5 = evaluate(algo5, data, verbose=False, measures=['RMSE'])
print_perf(perf5)
print('Mean RMSE is:\t', np.mean(perf5['rmse']))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8945  0.8963  0.8967  0.8953  0.8937  0.8953  
Mean RMSE is:	 0.895311154651


In [18]:
# Similarity settings: MSD measure, computed between items.
# NOTE(review): the Surprise docs describe 'shrinkage' as only relevant to the
# pearson_baseline measure, so it is presumably ignored here - confirm.
sim_options = {'name': 'msd', 'user_based': False, 'shrinkage': 100}

# Baseline-estimate settings: ALS method, 10 epochs, with separate
# regularisation values for items (reg_i) and users (reg_u).
bsl_options = {'method': 'als', 'n_epochs': 10, 'reg_i': 10, 'reg_u': 15}

# k-NN predictor that takes baseline ratings into account, with k=40 and
# min_k=1 (replaces the previous algo5).
algo5 = KNNBaseline(bsl_options=bsl_options,
                    sim_options=sim_options,
                    min_k=1,
                    k=40)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf5 = evaluate(algo5, data, verbose=False, measures=['RMSE'])
print_perf(perf5)
print('Mean RMSE is:\t', np.mean(perf5['rmse']))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8853  0.8860  0.8871  0.8865  0.8826  0.8855  
Mean RMSE is:	 0.885499526078


# 6. [NMF Algorithm](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF)

In [19]:
# Non-negative matrix factorisation: 15 factors, 50 epochs, unbiased.
# NOTE(review): with biased=False the bias hyper-parameters (reg_bu, reg_bi,
# lr_bu, lr_bi) are presumably inert - the Surprise docs list them as only
# relevant for the biased version. Confirm before tuning them.
algo8 = NMF(
    n_factors=15,
    n_epochs=50,
    biased=False,
    reg_pu=0.06,
    reg_qi=0.06,
    reg_bu=0.02,
    reg_bi=0.02,
    lr_bu=0.005,
    lr_bi=0.005,
    init_low=0,
    init_high=1,
    verbose=False,
)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf8 = evaluate(algo8, data, verbose=False, measures=['RMSE'])
print_perf(perf8)
print('Mean RMSE is:\t', np.mean(perf8['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9150  0.9159  0.9190  0.9186  0.9148  0.9167  
Mean RMSE is:	 0.916670836586


# 7. [Slope One Algorithm](http://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne)

In [20]:
# Slope One predictor; it is constructed without any arguments.
algo9 = SlopeOne()

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf9 = evaluate(algo9, data, verbose=False, measures=['RMSE'])
print_perf(perf9)
print('Mean RMSE is:\t', np.mean(perf9['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9056  0.9074  0.9080  0.9074  0.9050  0.9067  
Mean RMSE is:	 0.906666978348


# 8. [Co-Clustering Algorithm](http://surprise.readthedocs.io/en/stable/co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering)

In [21]:
# Co-clustering predictor: 3 user clusters, 3 item clusters, 20 epochs.
algo10 = CoClustering(
    n_cltr_u=3,
    n_cltr_i=3,
    n_epochs=20,
    verbose=False,
)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf10 = evaluate(algo10, data, verbose=False, measures=['RMSE'])
print_perf(perf10)
print('Mean RMSE is:\t', np.mean(perf10['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9123  0.9176  0.9171  0.9157  0.9125  0.9150  
Mean RMSE is:	 0.915007880837


# 9. [SVD Algorithm](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)

In [22]:
# Biased SVD: 100 factors, 20 epochs. A single learning rate (lr_all) and a
# single regularisation term (reg_all) are given; every per-parameter
# override is left as None.
algo6 = SVD(
    n_factors=100,
    n_epochs=20,
    biased=True,
    init_mean=0,
    init_std_dev=0.1,
    lr_all=0.005,
    reg_all=0.02,
    lr_bu=None,
    lr_bi=None,
    lr_pu=None,
    lr_qi=None,
    reg_bu=None,
    reg_bi=None,
    reg_pu=None,
    reg_qi=None,
    verbose=False,
)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf6 = evaluate(algo6, data, verbose=False, measures=['RMSE'])
print_perf(perf6)
print('Mean RMSE is:\t', np.mean(perf6['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8727  0.8755  0.8755  0.8739  0.8721  0.8739  
Mean RMSE is:	 0.873929205355


# 10. [SVD++ Algorithm](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp)

#### Note: this algorithm is very slow to run

In [23]:
# SVD++: 20 factors, 20 epochs. A single learning rate (lr_all) and a single
# regularisation term (reg_all) are given; every per-parameter override is
# left as None.
algo7 = SVDpp(
    n_factors=20,
    n_epochs=20,
    init_mean=0,
    init_std_dev=0.1,
    lr_all=0.007,
    reg_all=0.02,
    lr_bu=None,
    lr_bi=None,
    lr_pu=None,
    lr_qi=None,
    lr_yj=None,
    reg_bu=None,
    reg_bi=None,
    reg_pu=None,
    reg_qi=None,
    reg_yj=None,
    verbose=False,
)

# Score it on every fold of `data` with RMSE, print the per-fold table,
# then print the average across folds.
perf7 = evaluate(algo7, data, verbose=False, measures=['RMSE'])
print_perf(perf7)
print('Mean RMSE is:\t', np.mean(perf7['rmse']))

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8617  0.8627  0.8627  0.8625  0.8603  0.8620  
Mean RMSE is:	 0.861989454907


# End of the program

In [24]:
# Report how long the notebook took to run: take the stop timestamp, then
# print the raw start/stop values returned by time.time() followed by the
# elapsed time between them in seconds (t_start is set in the first timing
# cell near the top of the notebook).
t_end = time.time()
print("Program running time\n")
print('Start time:\t{0:10.0f}'.format(t_start))
print('Stop  time:\t{0:10.0f}'.format(t_end))
print('Total time:\t{0:10.2f} seconds'.format(t_end - t_start))

Program running time

Start time:	1494039168
Stop  time:	1494066608
Total time:	  27440.50 seconds
