# TODO
- Implement the evaluate and benchmarking_pipeline functions
- Extract all the notebook functions in a python script
- Create a new notebook where you will use the extracted benchmarking_pipeline function to do the benchamrking
- Do the benchmarking of the 5 already used models along with NMF and SVD.

http://surpriselib.com

# Load data

## From surprise

In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 356kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1618270 sha256=f7dc36c5d16ba19ec21729186ec67926c6521f0c96be3f9c5e5cce8fcf635d00
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [2]:
from surprise import Dataset

ratings = Dataset.load_builtin('ml-100k')
ratings

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


<surprise.dataset.DatasetAutoFolds at 0x7f6e26832d68>

## From file

In [3]:
from pathlib import Path
from surprise import Reader

ratings_filepath = Path('../content/ratings.csv')
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
ratings = Dataset.load_from_file(ratings_filepath, reader)
ratings

<surprise.dataset.DatasetAutoFolds at 0x7f6e249a6940>

## Modular function

In [4]:
from surprise.dataset import DatasetAutoFolds
from pathlib import Path

def load_ratings_from_surprise() -> DatasetAutoFolds:
    ratings = Dataset.load_builtin('ml-100k')
    return ratings

def load_ratings_from_file(ratings_filepath : Path) -> DatasetAutoFolds:
    reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    ratings = Dataset.load_from_file(ratings_filepath, reader)
    return ratings


def get_ratings(load_from_surprise : bool = True, ratings_filepath : Path = None) -> DatasetAutoFolds:
    if load_from_surprise:
        ratings = load_ratings_from_surprise()
    else:
        ratings = load_ratings_from_file(ratings_filepath)
    return ratings

ratings = get_ratings(load_from_surprise=True)
ratings

<surprise.dataset.DatasetAutoFolds at 0x7f6e22e83320>

# Manual pipeline

## Split data in train and test

In [5]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(ratings, test_size=0.2, random_state=42)
trainset

<surprise.trainset.Trainset at 0x7f6e364a0cf8>

## Train model

In [6]:
from surprise import KNNBasic

model = KNNBasic(sim_options={'user_based': True})

In [7]:
model.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f6e249a6630>

In [52]:
from surprise.trainset import Trainset
from  surprise.prediction_algorithms.algo_base import AlgoBase

from surprise.prediction_algorithms.knns import KNNBasic

def train(model_class: AlgoBase, model_arguments: dict, trainset: Trainset) -> AlgoBase:
    model = model_class(model_arguments)
    model.fit(trainset)
    return model
    
trained(KNNBasic, {'user_based': False, 'name': 'pearson'}, trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f6e052f0d30>

In [53]:
from surprise.prediction_algorithms.matrix_factorization import NMF

train(NMF, 10, trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f6e0ac8f080>

## Evaluation

In [54]:
from surprise import accuracy

predictions = model.test(testset)
predictions[:10]

accuracy.rmse(predictions=predictions)

RMSE: 0.9802


0.980150596704479

In [55]:
accuracy.mae(predictions=predictions)

MAE:  0.7727


0.7726923699816388

In [56]:
from surprise import accuracy

def evaluate(model: AlgoBase, test_set: [(int, int, float)]) -> dict:
    predictions = model.test(test_set)
    metrics_dict = {}
    metrics_dict['RMSE'] = accuracy.rmse(predictions, verbose=False)
    metrics_dict['MAE'] = accuracy.mae(predictions, verbose=False)
    return metrics_dict

## Modular code

In [57]:
from surprise.model_selection import train_test_split


from surprise.prediction_algorithms.knns import KNNBasic

def train_and_evalute_model_pipeline(model_class: AlgoBase, model_kwargs: dict = {},
                                     from_surprise: bool = True,
                                     test_size: float = 0.2) -> (AlgoBase, dict):
    data = get_ratings(from_surprise)
    train_set, test_set = train_test_split(data, test_size, random_state=42)
    model = get_trained_model(model_class, model_kwargs, train_set)
    metrics_dict = evaluate_model(model, test_set)
    return model, metrics_dict

my_model, metrics_dict = train_and_evalute_model_pipeline(KNNBasic)
metrics_dict

Computing the msd similarity matrix...
Done computing similarity matrix.


{'MAE': 0.980150596704479, 'RMSE': 0.980150596704479}

# Benchmarking

In [58]:
from surprise.prediction_algorithms.knns import KNNBasic

benchmark_dict = {}

model_kwargs = {'user_based': True, 'name': 'cosine'}
knn, metrics_dict = train_and_evalute_model_pipeline(KNNBasic, model_kwargs)
benchmark_dict['KNN user based cosine'] = metrics_dict

model_kwargs = {'user_based': True, 'name': 'pearson'}
knn, metrics_dict = train_and_evalute_model_pipeline(KNNBasic, model_kwargs)
benchmark_dict['KNN user based pearson'] = metrics_dict

model_kwargs = {'user_based': False, 'name': 'cosine'}
knn, metrics_dict = train_and_evalute_model_pipeline(KNNBasic, model_kwargs)
benchmark_dict['KNN item based cosine'] = metrics_dict

model_kwargs = {'user_based': False, 'name': 'pearson'}
knn, metrics_dict = train_and_evalute_model_pipeline(KNNBasic, model_kwargs)
benchmark_dict['KNN item based pearson'] = metrics_dict


benchmark_dict

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


{'KNN item based cosine': {'MAE': 1.0264295933767333,
  'RMSE': 1.0264295933767333},
 'KNN item based pearson': {'MAE': 1.041104054968961,
  'RMSE': 1.041104054968961},
 'KNN user based cosine': {'MAE': 1.0193536815834319,
  'RMSE': 1.0193536815834319},
 'KNN user based pearson': {'MAE': 1.0150350905205965,
  'RMSE': 1.0150350905205965}}

In [61]:
benchmark_dict = {}

model_dict_list = [
    {
        'model_name' : 'KNN user based with cosine similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': True, 'name': 'cosine'}
    },
    {
        'model_name' : 'KNN user based with pearson similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': True, 'name': 'pearson'}
    },

    
    {
        'model_name' : 'KNN ratings based with cosine similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': False, 'name': 'cosine'}
    },

    {
        'model_name' : 'KNN ratings based with pearson similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': False, 'name': 'pearson'}
    },
]

for model_dict in model_dict_list:
    model, metrics_dict = train_and_evalute_model_pipeline(
        model_dict['model_class'], model_dict['model_kwargs'])
    benchmark_dict[model_dict['model_name']] = metrics_dict
    model_dict['fitted_model'] = model
    
benchmark_dict

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


{'KNN ratings based with cosine similarity': {'MAE': 1.0264295933767333,
  'RMSE': 1.0264295933767333},
 'KNN ratings based with pearson similarity': {'MAE': 1.041104054968961,
  'RMSE': 1.041104054968961},
 'KNN user based with cosine similarity': {'MAE': 1.0193536815834319,
  'RMSE': 1.0193536815834319},
 'KNN user based with pearson similarity': {'MAE': 1.0150350905205965,
  'RMSE': 1.0150350905205965}}

In [63]:
from surprise.model_selection import cross_validate

cross_validate(model, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0366  1.0432  1.0415  1.0399  1.0449  1.0412  0.0029  
MAE (testset)     0.8321  0.8330  0.8346  0.8348  0.8344  0.8338  0.0011  
Fit time          2.44    2.48    2.46    2.50    2.45    2.47    0.02    
Test time         4.12    4.19    4.15    4.09    4.17    4.14    0.04    


{'fit_time': (2.441161870956421,
  2.4798202514648438,
  2.457812786102295,
  2.4984235763549805,
  2.4494428634643555),
 'test_mae': array([0.83207783, 0.83302573, 0.83464081, 0.83476846, 0.83444066]),
 'test_rmse': array([1.03657392, 1.04321731, 1.04149756, 1.03991275, 1.04494219]),
 'test_time': (4.122808218002319,
  4.187647581100464,
  4.146024942398071,
  4.085706472396851,
  4.1691601276397705)}