# Date: 29.06.2020
# MovieLens: Full Training (Surprise)
# Coder: Maksym Chernozhukov

In [1]:
'''
This module runs a 5-Fold CV for all the algorithms (default parameters) on
the movielens datasets, and reports average RMSE, MAE, and total computation time.
It is used for making tables in the README.md file.
'''

from __future__ import (absolute_import, division, print_function, unicode_literals)

import random
import numpy as np
from time import time
from operator import itemgetter
from datetime import datetime, timedelta

from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

from surprise import SVD
from surprise import NMF
from surprise import SVDpp
from surprise import KNNBasic
from surprise import SlopeOne
from surprise import KNNBaseline
from surprise import CoClustering
from surprise import BaselineOnly
from surprise import KNNWithMeans
from surprise import NormalPredictor

# Algorithm classes to cross-validate, each instantiated with its default
# parameters. They are run in the order listed here.
classes = (
    SVD,
    SVDpp,
    NMF,
    SlopeOne,
    KNNBasic,
    KNNWithMeans,
    KNNBaseline,
    CoClustering,
    BaselineOnly,
    NormalPredictor,
)

# Lookup table of display names and reference URLs.
# LINK maps a string key (a dataset name or an algorithm class name) to a
# two-item list: [display name, URL].
mll_1m_l = 'http://grouplens.org/datasets/movielens/1m'
mll_100k_l = 'http://grouplens.org/datasets/movielens/100k'

stable = 'http://surprise.readthedocs.io/en/stable/'


def _doc_url(page, module, cls):
    """Return the URL of ``cls`` on documentation page ``page``.

    The anchor has the form ``surprise.prediction_algorithms.<module>.<cls>``
    and is appended to ``<stable><page>.html``.
    """
    anchor = 'surprise.prediction_algorithms.' + module + '.' + cls
    return stable + page + '.html#' + anchor


# Matrix factorisation
SVD_l = _doc_url('matrix_factorization', 'matrix_factorization', 'SVD')
SVDpp_l = _doc_url('matrix_factorization', 'matrix_factorization', 'SVDpp')
NMF_l = _doc_url('matrix_factorization', 'matrix_factorization', 'NMF')
# Slope One
SlopeOne_l = _doc_url('slope_one', 'slope_one', 'SlopeOne')
# k-NN inspired
KNNBasic_l = _doc_url('knn_inspired', 'knns', 'KNNBasic')
KNNWithMeans_l = _doc_url('knn_inspired', 'knns', 'KNNWithMeans')
KNNBaseline_l = _doc_url('knn_inspired', 'knns', 'KNNBaseline')
# Co-clustering
CoClustering_l = _doc_url('co_clustering', 'co_clustering', 'CoClustering')
# Basic algorithms
BaselineOnly_l = _doc_url('basic_algorithms', 'baseline_only', 'BaselineOnly')
NormalPredictor_l = _doc_url('basic_algorithms', 'random_pred', 'NormalPredictor')

LINK = {
    # Datasets
    'ml-1m': ['Movielens 1M', mll_1m_l],
    'ml-100k': ['Movielens 100k', mll_100k_l],

    # Algorithms, keyed by class name
    'SVD': ['SVD', SVD_l],
    'SVDpp': ['SVD++', SVDpp_l],
    'NMF': ['NMF', NMF_l],
    'SlopeOne': ['Slope One', SlopeOne_l],
    'KNNBasic': ['k-NN', KNNBasic_l],
    'KNNWithMeans': ['Centered k-NN', KNNWithMeans_l],
    'KNNBaseline': ['k-NN Baseline', KNNBaseline_l],
    'CoClustering': ['Co-Clustering', CoClustering_l],
    'BaselineOnly': ['Baseline', BaselineOnly_l],
    'NormalPredictor': ['Random', NormalPredictor_l],
}


# Set RNG
# Seed both Python's and NumPy's global generators before any work is done.
random.seed(0)
np.random.seed(0)

# Initialisation of variables
table = []           # one [rmse, mae, time, algorithm name] row per algorithm
tic_all = time()     # start of the whole run; read again for the final timing message
dataset = 'ml-100k'  # key of the built-in dataset to load; must also be a key of LINK

# Fail fast: the dataset's display name is looked up in LINK only after all
# cross-validations have finished, so an unknown key would otherwise waste
# the whole run before raising a KeyError.
if dataset not in LINK:
    raise ValueError('No display name registered in LINK for dataset {!r}'.format(dataset))

# A single splitter with a fixed random_state is passed to every
# cross_validate call.
kf = KFold(random_state=0)
data = Dataset.load_builtin(dataset)

# Derive the summary file name from the dataset key itself, so results are
# never appended to another dataset's summary file.
path = 'summary_{}.txt'.format(dataset)

# Cross-validate each algorithm in turn, record one result row per algorithm
# in `table`, and print that row as soon as it is available.
separator = '-' * 80
three_decimals = '{:.3f}'.format

for klass in classes:
    start = time()
    scores = cross_validate(klass(), data, ['RMSE', 'MAE'], kf)

    # Display name registered for this class in LINK
    label = LINK[klass.__name__][0]

    # Average each measure over the folds, formatted to three decimals
    mae_text = three_decimals(np.mean(scores['test_mae']))
    rmse_text = three_decimals(np.mean(scores['test_rmse']))

    # Wall-clock time for this algorithm, truncated to whole seconds
    elapsed = timedelta(seconds=int(time() - start))

    row = [rmse_text, mae_text, str(elapsed), label]
    table.append(row)

    # Show results of each Algorithm
    print(separator)
    print('\nRMSE: {0}\tMAE: {1}\tTime: {2}\t\tAlgorithm: {3}\n'.format(*row))
    print(separator)

    
# Sort by columns (RMSE)
def _rmse_sort_key(cells):
    """Return a sort key that orders rows by the numeric value of their RMSE.

    ``cells[0]`` holds the RMSE as a formatted string. Comparing those strings
    directly is lexicographic ('10.000' would sort before '9.000'), so the
    value is converted back to a float. NaN rows are placed last.
    """
    value = float(cells[0])
    return (value != value, value)  # value != value is True only for NaN


table = sorted(table, key=_rmse_sort_key)

# Show Whole Results
print(LINK[dataset][0])                                   # dataset display name
print('\n\033[1mRMSE\tMAE\tTime\t\tAlgorithm\033[0m\n')   # header, wrapped in ANSI bold codes
for row in table:
    print('{0}\t{1}\t{2}\t\t{3}\n'.format(*row))

    
# Save Results
# Opened in append mode ('a'), so results already stored in the summary file
# are kept and this run is added after them.
# The handle is not called `file`, which would shadow the Python 2 builtin,
# and it is not closed explicitly because the with-statement closes it on exit.
with open(path, 'a') as summary:
    summary.write('\n\n')
    for row in table:
        summary.write('{0}\t{1}\t{2}\t\t{3}\n'.format(*row))
    summary.write('Trained: ' + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

# Total wall-clock time since tic_all was taken, in minutes
print('\n\033[1mFull run in {:0.2f} min!\033[0m\n'.format((time() - tic_all) / 60.0))


--------------------------------------------------------------------------------

RMSE: 0.936	MAE: 0.738	Time: 0:00:28		Algorithm: SVD

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

RMSE: 0.963	MAE: 0.758	Time: 0:00:29		Algorithm: NMF

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

RMSE: 0.946	MAE: 0.743	Time: 0:00:20		Algorithm: Slope One

--------------------------------------------------------------------------------
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [2]:
# Movielens 100k

#   RMSE     MAE     Time       Algorithm
#   0.922	0.723	0:16:31		SVD++          *
#   0.931	0.733	0:00:32		k-NN Baseline  **
#   0.936	0.738	0:00:28		SVD            ***
#   0.944	0.748	0:00:03		Baseline
#   0.946	0.743	0:00:24		Slope One
#   0.951	0.749	0:00:27		Centered k-NN
#   0.964	0.758	0:00:32		NMF
#   0.965	0.755	0:00:15		Co-Clustering
#   0.980	0.774	0:00:25		k-NN
#   1.523	1.222	0:00:03		Random
# ---------------------------------------------------

# Movielens 1M

#   RMSE     MAE     Time       Algorithm
#  0.862	0.672	5:08:36		SVD++          *
#  0.874	0.686	0:04:37		SVD            **
#  0.895	0.706	0:15:06		k-NN Baseline  ***
#  0.907	0.715	0:06:39		Slope One
#  0.909	0.719	0:00:33		Baseline
#  0.916	0.718	0:02:05		Co-Clustering
#  0.917	0.725	0:04:42		NMF
#  0.923	0.727	0:13:51		k-NN
#  0.929	0.738	0:14:16		Centered k-NN
#  1.506	1.207	0:00:27		Random
# ---------------------------------------------------