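# NOTE: the next three lines are web-page residue from the code host, kept
# as comments so that the file remains valid Python.
# Fetching contributors…
# Cannot retrieve contributors at this time
# 99 lines (87 sloc) 4.31 KB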
'''This module runs a 5-fold cross-validation for all the algorithms (default
parameters) on the MovieLens datasets, and reports the average RMSE, the
average MAE, and the total computation time. Its output is used for making the
markdown benchmark tables.'''
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import time
import datetime
import random

import numpy as np
import six
from tabulate import tabulate

from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
# Prediction algorithms to benchmark. They are cross-validated in exactly
# this order, each one instantiated with its default parameters.
classes = (
    SVD,
    SVDpp,
    NMF,
    SlopeOne,
    KNNBasic,
    KNNWithMeans,
    KNNBaseline,
    CoClustering,
    BaselineOnly,
    NormalPredictor,
)
# Maps algorithm class names and dataset names to the markdown link
# ('[label](url)') shown for them in the result tables.
# NOTE(review): the URL targets were missing from this copy of the file and
# have been restored from the layout of the Surprise documentation and the
# GroupLens dataset pages -- confirm them before publishing the tables.
stable = 'https://surprise.readthedocs.io/en/stable/'
LINK = {
    'SVD': '[{}]({})'.format(
        'SVD',
        stable +
        'matrix_factorization.html'
        '#surprise.prediction_algorithms.matrix_factorization.SVD'),
    'SVDpp': '[{}]({})'.format(
        'SVD++',
        stable +
        'matrix_factorization.html'
        '#surprise.prediction_algorithms.matrix_factorization.SVDpp'),
    'NMF': '[{}]({})'.format(
        'NMF',
        stable +
        'matrix_factorization.html'
        '#surprise.prediction_algorithms.matrix_factorization.NMF'),
    'SlopeOne': '[{}]({})'.format(
        'Slope One',
        stable +
        'slope_one.html'
        '#surprise.prediction_algorithms.slope_one.SlopeOne'),
    'KNNBasic': '[{}]({})'.format(
        'k-NN',
        stable +
        'knn_inspired.html'
        '#surprise.prediction_algorithms.knns.KNNBasic'),
    'KNNWithMeans': '[{}]({})'.format(
        'Centered k-NN',
        stable +
        'knn_inspired.html'
        '#surprise.prediction_algorithms.knns.KNNWithMeans'),
    'KNNBaseline': '[{}]({})'.format(
        'k-NN Baseline',
        stable +
        'knn_inspired.html'
        '#surprise.prediction_algorithms.knns.KNNBaseline'),
    'CoClustering': '[{}]({})'.format(
        'Co-Clustering',
        stable +
        'co_clustering.html'
        '#surprise.prediction_algorithms.co_clustering.CoClustering'),
    'BaselineOnly': '[{}]({})'.format(
        'Baseline',
        stable +
        'basic_algorithms.html'
        '#surprise.prediction_algorithms.baseline_only.BaselineOnly'),
    'NormalPredictor': '[{}]({})'.format(
        'Random',
        stable +
        'basic_algorithms.html'
        '#surprise.prediction_algorithms.random_pred.NormalPredictor'),
    'ml-100k': '[{}]({})'.format(
        'Movielens 100k',
        'https://grouplens.org/datasets/movielens/100k'),
    'ml-1m': '[{}]({})'.format(
        'Movielens 1M',
        'https://grouplens.org/datasets/movielens/1m'),
}
# Seed both the NumPy and the stdlib RNGs so that results are reproducible
# from one run to the next.
np.random.seed(0)
random.seed(0)

# Name of the built-in dataset to benchmark; also the LINK key used for the
# first column header of the final table.
dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms.

# One row per algorithm: [markdown link, mean RMSE, mean MAE, elapsed time].
table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    # Wall-clock duration of the whole cross-validation, truncated to whole
    # seconds and rendered by timedelta (e.g. '0:01:23').
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    # Keep the row so that the summary table printed at the end is complete.
    table.append(new_line)

# Column headers follow the layout of each row built above.
header = [LINK[dataset], 'RMSE', 'MAE', 'Time']
print(tabulate(table, header, tablefmt="pipe"))