In [1]:
import random

import numpy as np

from surprise import Dataset

#Matrix Factorization Algorithms
from surprise import SVD
from surprise import NMF

from surprise.model_selection import GridSearchCV

##CrossValidation
from surprise.model_selection import cross_validate



# Seed Python's and NumPy's global RNGs once, in the very first cell, so that
# the randomly drawn cross-validation folds and the random initialisation of
# the models are the same on every "Restart Kernel -> Run All".
# This is the recipe given in the Surprise FAQ for reproducible experiments.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
# The call is left as the last expression of the cell so that the notebook
# also displays the returned dict of per-fold scores and timings.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9305  0.9335  0.9372  0.9433  0.9317  0.9352  0.0046  
MAE (testset)     0.7357  0.7355  0.7370  0.7423  0.7357  0.7372  0.0026  
Fit time          6.05    5.93    6.29    6.55    7.81    6.52    0.67    
Test time         0.28    0.21    0.28    0.22    0.20    0.24    0.03    


{'test_rmse': array([0.93051888, 0.93354405, 0.93716221, 0.94333692, 0.93167682]),
 'test_mae': array([0.73566134, 0.73550649, 0.73703595, 0.74230855, 0.73565443]),
 'fit_time': (6.053062200546265,
  5.927455902099609,
  6.2852678298950195,
  6.547177076339722,
  7.805340051651001),
 'test_time': (0.2803990840911865,
  0.21345996856689453,
  0.2801809310913086,
  0.216141939163208,
  0.2032921314239502)}

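**Note (TODO: expand):** SVD here is the matrix-factorization model popularised during the Netflix Prize: a rating is predicted as $\hat{r}_{ui} = \mu + b_u + b_i + q_i^T p_u$ (global mean + user bias + item bias + dot product of the latent item and user factors), and all parameters are learned by stochastic gradient descent.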

In [2]:
from surprise import accuracy
from surprise.model_selection import train_test_split

# Fetch MovieLens-100k (it is downloaded on first use).
data = Dataset.load_builtin('ml-100k')

# Random hold-out split: a quarter of the ratings is set aside for testing,
# the remaining three quarters are used for training.
trainset, testset = train_test_split(data, test_size=0.25)

# Fit an SVD model on the training part ...
algo = SVD()
algo.fit(trainset)

# ... and predict every rating of the held-out part.
predictions = algo.test(testset)

# Root Mean Squared Error of those predictions: printed by rmse() itself and,
# as the last expression of the cell, also displayed as the cell's value.
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9434


0.9434142435260771

Note that you can train and test an algorithm with the following one-liner:

In [3]:
# fit() hands back the fitted algorithm, so test() can be chained directly onto it.
# `algo`, `trainset` and `testset` come from the previous cell.
predictions = algo.fit(trainset).test(testset)


## Train on a whole trainset and the predict() method

Obviously, we could also simply fit our algorithm to the whole dataset, rather than running cross-validation. This can be done by using the build_full_trainset() method which will build a trainset object:

In [4]:
from surprise import KNNBasic
from surprise import Dataset

# MovieLens-100k once more, this time without holding anything out.
data = Dataset.load_builtin('ml-100k')

# Turn every available rating into one single training set.
trainset = data.build_full_trainset()

# Create the k-NN model and train it in one go; fit() returns the model
# itself, so the fitted object can be bound to `algo` directly.
algo = KNNBasic().fit(trainset)

# Show the fitted model as the cell's output.
algo

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fae7b682610>

We can now predict ratings by directly calling the predict() method. Let’s say you’re interested in user 196 and item 302 (make sure they’re in the trainset!), and you know that the true rating is $r_{ui} = 4$:

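**Note (TODO: expand):** explain *raw* ids here. A raw id is the id exactly as it appears in the ratings file (so it is a string when the data was read from a file); when a trainset is built, Surprise maps every raw id to an integer *inner* id. `predict()` expects raw ids.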

In [5]:
# Raw ids are the ids exactly as written in the ratings file, which means
# they are **strings**, not integers.
uid = '196'  # raw user id
iid = '302'  # raw item id

# Ask the fitted model for this user's rating of this item. r_ui is the
# known true rating; verbose=True prints the resulting prediction.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


In [7]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import os

# A custom file needs a Reader that describes its layout. Every line of the
# movielens-100k ratings file reads 'user item rating timestamp', with the
# fields separated by '\t' characters.
reader = Reader(line_format='user item rating timestamp', sep='\t')

# Location of the ratings file ('~' is expanded to the user's home directory).
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')

# Parse the file with the reader defined above.
data = Dataset.load_from_file(file_path, reader=reader)

# From here on the dataset can be used like any other one, e.g. handed to
# cross_validate (measures and number of folds are left at their defaults).
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9491  0.9395  0.9472  0.9451  0.9400  0.9442  0.0038  
MAE (testset)     0.7538  0.7433  0.7495  0.7511  0.7449  0.7485  0.0039  
Fit time          0.23    0.25    0.25    0.27    0.25    0.25    0.01    
Test time         0.12    0.13    0.18    0.19    0.12    0.15    0.03    


{'test_rmse': array([0.94913132, 0.93947681, 0.94715078, 0.94507213, 0.93997431]),
 'test_mae': array([0.75375807, 0.74328229, 0.74950678, 0.75109599, 0.74490864]),
 'fit_time': (0.23137831687927246,
  0.252230167388916,
  0.24802803993225098,
  0.26909422874450684,
  0.2515430450439453),
 'test_time': (0.12460207939147949,
  0.12652277946472168,
  0.18208789825439453,
  0.1868448257446289,
  0.12173223495483398)}

##### Or load the ratings from a pandas DataFrame (e.g. one read from a CSV file, as we did for our project)

In [14]:
import os

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


# --- Toy example: build a dataset from a hand-made DataFrame ---------------

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data_ = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# --- Or use the MovieLens ratings.csv --------------------------------------

# Resolve the file relative to the current user's home directory instead of
# hard-coding one particular account's absolute path.
ratings_path = os.path.expanduser('~/Desktop/week_11/data/ratings.csv')
ratings = pd.read_csv(ratings_path)

# Take the rating scale from the data rather than reusing the toy reader's
# (1, 5): predictions are clipped to this range, so a wrong lower bound would
# distort them.
# NOTE(review): the userId/movieId headers suggest a MovieLens release with
# half-star ratings (0.5-5.0) -- confirm against the actual file.
ratings_reader = Reader(rating_scale=(float(ratings['rating'].min()),
                                      float(ratings['rating'].max())))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                            ratings_reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([1.42500869, 1.41873179]),
 'test_mae': array([1.13844198, 1.1330959 ]),
 'fit_time': (0.09530019760131836, 0.09567999839782715),
 'test_time': (0.573645830154419, 0.5580999851226807)}

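**Note (TODO: expand):** explain cross-validation iterators here. Instead of letting `cross_validate` do everything, an iterator such as `KFold` yields one (trainset, testset) pair per fold, so we can write the fit/test loop ourselves.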

In [15]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# The built-in MovieLens-100k ratings.
data = Dataset.load_builtin('ml-100k')

# One SVD instance, re-fitted from scratch on each fold.
algo = SVD()

# Cross-validation iterator producing three (trainset, testset) pairs.
kf = KFold(n_splits=3)

for trainset, testset in kf.split(data):
    # Train on this fold's training part, predict its test part.
    predictions = algo.fit(trainset).test(testset)

    # Report the fold's Root Mean Squared Error.
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9449
RMSE: 0.9458
RMSE: 0.9464


In [16]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Folder that holds the predefined fold files ('~' expands to the home dir).
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# No custom line format this time: the built-in 'ml-100k' reader is used.
reader = Reader('ml-100k')

# File-name templates; %d is replaced by the fold number.
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'

# One (train path, test path) tuple per fold:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
folds_files = [(train_file % fold, test_file % fold) for fold in range(1, 6)]

# Load the folds and iterate over them exactly as they were predefined.
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):
    # Train on this fold's base file, predict its test file.
    predictions = algo.fit(trainset).test(testset)

    # Report the fold's Root Mean Squared Error.
    accuracy.rmse(predictions, verbose=True)


RMSE: 0.9517
RMSE: 0.9397
RMSE: 0.9350
RMSE: 0.9321
RMSE: 0.9328


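**Note (TODO: expand):** explain here how to tune algorithm parameters with `GridSearchCV`: it cross-validates every combination of the values in a parameter grid and reports the best combination for each accuracy measure.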

In [17]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Tune on MovieLens-100k.
data = Dataset.load_builtin('ml-100k')

# Candidate values per hyper-parameter; every combination is evaluated
# (2 x 2 x 2 = 8 settings).
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# The algorithm is passed as a class, not as an instance. Each setting is
# scored with 3-fold cross-validation on both RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# First line: best RMSE score.
# Second line: the parameter combination that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')


0.9646314857211871
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


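#### Result of a previous run (the score varies slightly between runs because the cross-validation folds are drawn at random):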
0.961300130118
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [18]:
# We can now use the algorithm that yields the best rmse:
# (`gs` is the grid search fitted in the previous cell.)
algo = gs.best_estimator['rmse']
# Train it on every rating of the dataset (no hold-out). As the last
# expression of the cell, the return value of fit() is what gets displayed.
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fae81540a90>

#### A bit advanced here. Needs explanation.

In [19]:
# Options that are dictionaries themselves (bsl_options, sim_options) are
# written as nested dicts in which every entry gets its own list of
# candidate values.
param_grid = {
    'k': [10, 20],
    'sim_options': {
        'name': ['msd', 'cosine'],
        'min_support': [1, 5],
        'user_based': [False],
    },
}

# Both kinds of nested options can be combined, for example for the
# KNNBaseline algorithm. (This assignment replaces the grid defined above.)
param_grid = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'reg': [1, 2],
    },
    'k': [2, 3],
    'sim_options': {
        'name': ['msd', 'cosine'],
        'min_support': [1, 5],
        'user_based': [False],
    },
}


For further analysis, the cv_results attribute has all the needed information and can be imported into a pandas DataFrame:

In [20]:
# cv_results maps each result name to one value per parameter combination,
# so every key becomes a column of the DataFrame.
# (`gs` is the fitted grid search; `pd` was imported in an earlier cell.)
results_df = pd.DataFrame(gs.cv_results)

In [2]:
'split0_test_rmse': [1.0, 1.0, 0.97, 0.98, 0.98, 0.99, 0.96, 0.97]
'split1_test_rmse': [1.0, 1.0, 0.97, 0.98, 0.98, 0.99, 0.96, 0.97]
'split2_test_rmse': [1.0, 1.0, 0.97, 0.98, 0.98, 0.99, 0.96, 0.97]
'mean_test_rmse':   [1.0, 1.0, 0.97, 0.98, 0.98, 0.99, 0.96, 0.97]
'std_test_rmse':    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
'rank_test_rmse':   [7 8 3 5 4 6 1 2]
'split0_test_mae':  [0.81, 0.82, 0.78, 0.79, 0.79, 0.8, 0.77, 0.79]
'split1_test_mae':  [0.8, 0.81, 0.78, 0.79, 0.78, 0.79, 0.77, 0.78]
'split2_test_mae':  [0.81, 0.81, 0.78, 0.79, 0.78, 0.8, 0.77, 0.78]
'mean_test_mae':    [0.81, 0.81, 0.78, 0.79, 0.79, 0.8, 0.77, 0.78]
'std_test_mae':     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
'rank_test_mae':    [7 8 2 5 4 6 1 3]
'mean_fit_time':    [1.53, 1.52, 1.53, 1.53, 3.04, 3.05, 3.06, 3.02]
'std_fit_time':     [0.03, 0.04, 0.0, 0.01, 0.04, 0.01, 0.06, 0.01]
'mean_test_time':   [0.46, 0.45, 0.44, 0.44, 0.47, 0.49, 0.46, 0.34]
'std_test_time':    [0.0, 0.01, 0.01, 0.0, 0.03, 0.06, 0.01, 0.08]
'params':           [{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}, {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}, {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}, {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}, {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}, {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}, {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}, {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}]
'param_n_epochs':   [5, 5, 5, 5, 10, 10, 10, 10]
'param_lr_all':     [0.0, 0.0, 0.01, 0.01, 0.0, 0.0, 0.01, 0.01]
'param_reg_all':    [0.4, 0.6, 0.4, 0.6, 0.4, 0.6, 0.4, 0.6]

#### Command line usage

Surprise can also be used from the command line, for example:

Note: try it in a terminal.


surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}" -load-builtin ml-100k -n-folds 3

#### Literature:

 https://surprise.readthedocs.io/en/stable/matrix_factorization.html
 https://surprise.readthedocs.io/en/stable/getting_started.html#load-from-folds-example
 https://towardsdatascience.com/simple-svd-algorithms-13291ad2eef2
 https://medium.com/swlh/eigenvalues-and-eigenvectors-5fbc8b037eed
 https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#prediction-algorithms
 http://surpriselib.com/