In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load the data

In [2]:
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')
ml_df.head(10)

display(HTML(ml_movies_df.head(10).to_html()))

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=1000, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

print("Number of chosen interactions: {}".format(len(ml_ratings_df)))

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Number of chosen interactions: 9692


# Recommender class

Remark: Docstrings written in reStructuredText (reST) used by Sphinx to automatically generate code documentation. It is also used by default by PyCharm (type triple quotes after defining a class or a method and hit enter).

In [3]:
class Recommender(object):
    """
    Base recommender class.
    """
    
    def __init__(self):
        """
        Initialize base recommender params and variables.
        
        :param int seed: Seed for the random number generator.
        """
        pass
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        pass
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        
        for ix, user in users_df.iterrows():
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': [-1] * n_recommendations,
                                                 'score': [3.0] * n_recommendations})

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations

# Evaluation measures

## Explicit feedback - ratings

### MAE - Mean Absolute error

<center>
$$
    MAE = \frac{\sum_{i}^N |\hat{r}_i - r_i|}{N}
$$
</center>

where $\hat{r}_i$ are the predicted ratings and $r_i$ are the real ratings and $N$ is the number of items in the test set.

**Task 1.** Implement MAE.

In [4]:
def mae(r_pred, r_real):
    return np.sum(np.abs(r_pred - r_real)) / len(r_pred)


# Test

# Very small differences
print("MAE = {:.3f}".format(mae(np.array([2.99, 1.98, 3.99, 4.97, 1.01]), np.array([3, 2, 4, 5, 1]))))
# Small differences
print("MAE = {:.3f}".format(mae(np.array([2.8, 1.7, 3.8, 4.6, 1.6]), np.array([3, 2, 4, 5, 1]))))
# Large differences
print("MAE = {:.3f}".format(mae(np.array([1.1, 4.2, 2.8, 3.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Medium differences with one large difference
print("MAE = {:.3f}".format(mae(np.array([2.1, 1.2, 3.8, 4.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Small differences with one very large difference
print("MAE = {:.3f}".format(mae(np.array([2.8, 1.7, 3.8, 4.6, 4.6]), np.array([3, 2, 4, 5, 1]))))

MAE = 0.016
MAE = 0.340
MAE = 1.940
MAE = 1.060
MAE = 0.940


### RMSE - Root Mean Squared Error

<center>
$$
    RMSE = \sqrt{\frac{\sum_{i}^N (\hat{r}_i - r_i)^2}{N}}
$$
</center>

where $\hat{r}_i$ are the predicted ratings and $r_i$ are the real ratings and $N$ is the number of items in the test set.

**Task 2.** Implement RMSE.

In [5]:
def rmse(r_pred, r_real):
    return np.sqrt(np.sum(np.power(r_pred - r_real, 2)) / len(r_pred))


# Test

# Very small differences
print("RMSE = {:.3f}".format(rmse(np.array([2.99, 1.98, 3.99, 4.97, 1.01]), np.array([3, 2, 4, 5, 1]))))
# Small differences
print("RMSE = {:.3f}".format(rmse(np.array([2.8, 1.7, 3.8, 4.6, 1.6]), np.array([3, 2, 4, 5, 1]))))
# Large differences
print("RMSE = {:.3f}".format(rmse(np.array([1.1, 4.2, 2.8, 3.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Medium differences with one large difference
print("RMSE = {:.3f}".format(rmse(np.array([2.1, 1.2, 3.8, 4.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Small differences with one very large difference
print("RMSE = {:.3f}".format(rmse(np.array([2.8, 1.7, 3.8, 4.6, 4.6]), np.array([3, 2, 4, 5, 1]))))

RMSE = 0.018
RMSE = 0.371
RMSE = 1.994
RMSE = 1.333
RMSE = 1.630


### MRE - Mean Relative Error

<center>
$$
    MRE = \frac{1}{N} \sum_{i}^N \frac{|\hat{r}_i - r_i|}{|r_i|}
$$
</center>

where $\hat{r}_i$ are the predicted ratings and $r_i$ are the real ratings and $N$ is the number of items in the test set.

**Task 3.** Implement MRE.

In [6]:
def mre(r_pred, r_real):
    return np.sum(np.abs(r_pred - r_real) / np.abs(r_real)) / len(r_pred)


# Test

# Very small differences
print("MRE = {:.3f}".format(mre(np.array([2.99, 1.98, 3.99, 4.97, 1.01]), np.array([3, 2, 4, 5, 1]))))
# Small differences
print("MRE = {:.3f}".format(mre(np.array([2.8, 1.7, 3.8, 4.6, 1.6]), np.array([3, 2, 4, 5, 1]))))
# Large differences
print("MRE = {:.3f}".format(mre(np.array([1.1, 4.2, 2.8, 3.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Medium differences with one large difference
print("MRE = {:.3f}".format(mre(np.array([2.1, 1.2, 3.8, 4.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Small differences with one very large difference for a small base value
print("MRE = {:.3f}".format(mre(np.array([2.8, 1.7, 3.8, 4.6, 4.6]), np.array([3, 2, 4, 5, 1]))))

MRE = 0.006
MRE = 0.189
MRE = 0.999
MRE = 0.702
MRE = 0.789


### TRE - Total Relative Error

<center>
$$
    TRE = \frac{\sum_{i}^N |\hat{r}_i - r_i|}{\sum_{i}^N |r_i|}
$$
</center>

where $\hat{r}_i$ are the predicted ratings and $r_i$ are the real ratings and $N$ is the number of items in the test set.

**Task 4.** Implement TRE.

In [7]:
def tre(r_pred, r_real):
      return np.sum(np.abs(r_pred - r_real)) / np.sum(np.abs(r_real))


# Test

# Very small differences
print("TRE = {:.3f}".format(tre(np.array([2.99, 1.98, 3.99, 4.97, 1.01]), np.array([3, 2, 4, 5, 1]))))
# Small differences
print("TRE = {:.3f}".format(tre(np.array([2.8, 1.7, 3.8, 4.6, 1.6]), np.array([3, 2, 4, 5, 1]))))
# Large differences
print("TRE = {:.3f}".format(tre(np.array([1.1, 4.2, 2.8, 3.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Medium differences with one large difference
print("TRE = {:.3f}".format(tre(np.array([2.1, 1.2, 3.8, 4.2, 3.6]), np.array([3, 2, 4, 5, 1]))))
# Small differences with one very large difference for a small base value
print("TRE = {:.3f}".format(tre(np.array([2.8, 1.7, 3.8, 4.6, 4.6]), np.array([3, 2, 4, 5, 1]))))

TRE = 0.005
TRE = 0.113
TRE = 0.647
TRE = 0.353
TRE = 0.313


## Implicit feedback - binary indicators of interactions

### HR@n - Hit Ratio 
How many hits did we score in the first n recommendations.
<br/>
<br/>
<center>
$$
    \text{HR@}n = \frac{\sum_{u} \sum_{i \in I_u} r_{u, i} \cdot 1_{\hat{D}_n(u)}(i)}{M}
$$
</center>

where:
  * $r_{u, i}$ is $1$ if there was an interaction between user $u$ and item $i$ in the test set and $0$ otherwise, 
  * $\hat{D}_n$ is the set of the first $n$ recommendations for user $u$, 
  * $1_{\hat{D}_n}(i)$ is $1$ if and only if $i \in \hat{D}_n$, otherwise it's equal to $0$,
  * $M$ is the number of users.

**Task 5.** Implement HR.

In [8]:
def hr(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.
    """
    hr = 0
    uniqIds = real_interactions["user_id"].unique()
    for id in uniqIds:
        interactionsById = real_interactions.loc[real_interactions["user_id"] == id]
        recommendationsById = recommendations.loc[recommendations["user_id"] == id]
        for idx, row in recommendationsById.head(n).iterrows():
            if row["item_id"] in interactionsById["item_id"].unique():
                hr+=1
    hr /= uniqIds.size
    return hr

In [9]:
# Case 1
    
real_interactions = pd.DataFrame(
    [
        [1, 45],
        [1, 22],
        [1, 77],
        [2, 13],
        [2, 77]
        
    ], columns=['user_id', 'item_id'])


    
recommendations = pd.DataFrame(
    [
        [1, 45, 0.9],
        [1, 13, 0.8],
        [1, 22, 0.71],
        [1, 77, 0.55],
        [1, 9, 0.52],
        [2, 11, 0.85],
        [2, 13, 0.69],
        [2, 25, 0.64],
        [2, 6, 0.60],
        [2, 77, 0.53]
        
    ], columns=['user_id', 'item_id', 'score'])


    
print("HR@3 = {:.4f}".format(hr(recommendations, real_interactions, n=3)))

HR@3 = 1.5000


In [10]:
# Case 2
    
real_interactions = pd.DataFrame(
    [
        [1, 45],
        [1, 22],
        [1, 77],
        [2, 13],
        [2, 77]
        
    ], columns=['user_id', 'item_id'])

display(real_interactions)
    
recommendations = pd.DataFrame(
    [
        [1, 13, 0.9],
        [1, 45, 0.8],
        [1, 22, 0.71],
        [1, 77, 0.55],
        [1, 9, 0.52],
        [2, 11, 0.85],
        [2, 13, 0.69],
        [2, 25, 0.64],
        [2, 6, 0.60],
        [2, 77, 0.53]
        
    ], columns=['user_id', 'item_id', 'score'])

display(recommendations)
    
print("HR@3 = {:.4f}".format(hr(recommendations, real_interactions, n=3)))

Unnamed: 0,user_id,item_id
0,1,45
1,1,22
2,1,77
3,2,13
4,2,77


Unnamed: 0,user_id,item_id,score
0,1,13,0.9
1,1,45,0.8
2,1,22,0.71
3,1,77,0.55
4,1,9,0.52
5,2,11,0.85
6,2,13,0.69
7,2,25,0.64
8,2,6,0.6
9,2,77,0.53


HR@3 = 1.5000


### NDCG@n - Normalized Discounted Cumulative Gain

How many hits did we score in the first n recommendations discounted by the position of each recommendation.
<br/>
<br/>
<center>
$$
    \text{NDCG@}n = \frac{\sum_{u} \sum_{i \in I_u} \frac{r_{u, i}}{log\left(1 + v_{\hat{D}_n(u)}(i)\right)}}{M}
$$
</center>

where:
  * $r_{u, i}$ is $1$ if there was an interaction between user $u$ and item $i$ in the test set and $0$ otherwise, 
  * $\hat{D}_n(u)$ is the set of the first $n$ recommendations for user $u$, 
  * $v_{\hat{D}_n(u)}(i)$ is the position of item $i$ in recommendations $\hat{D}_n$,
  * $M$ is the number of users.


**Task 6.** Implement NDCG.

In [11]:
def ndcg(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.
    """
    ndcg = 0
    uniqIds = real_interactions["user_id"].unique()
    for id in uniqIds:
        interactionsById = real_interactions.loc[real_interactions["user_id"] == id]
        recommendationsById = recommendations.loc[recommendations["user_id"] == id].reset_index()
        for idx, row in recommendationsById.head(n).iterrows():
            if row["item_id"] in interactionsById["item_id"].unique():
                ndcg+=1/np.log2(idx+2)
    ndcg /= len(uniqIds)
    
    
    return ndcg

In [12]:
# Case 1
    
real_interactions = pd.DataFrame(
    [
        [1, 45],
        [1, 22],
        [1, 77],
        [2, 13],
        [2, 77]
        
    ], columns=['user_id', 'item_id'])

display(real_interactions)
    
recommendations = pd.DataFrame(
    [
        [1, 45, 0.9],
        [1, 13, 0.8],
        [1, 22, 0.71],
        [1, 77, 0.55],
        [1, 9, 0.52],
        [2, 11, 0.85],
        [2, 13, 0.69],
        [2, 25, 0.64],
        [2, 6, 0.60],
        [2, 77, 0.53]
        
    ], columns=['user_id', 'item_id', 'score'])

display(recommendations)
    
print("NDCG@3 = {:.4f}".format(ndcg(recommendations, real_interactions, n=3)))

Unnamed: 0,user_id,item_id
0,1,45
1,1,22
2,1,77
3,2,13
4,2,77


Unnamed: 0,user_id,item_id,score
0,1,45,0.9
1,1,13,0.8
2,1,22,0.71
3,1,77,0.55
4,1,9,0.52
5,2,11,0.85
6,2,13,0.69
7,2,25,0.64
8,2,6,0.6
9,2,77,0.53


NDCG@3 = 1.0655


In [13]:
# Case 2
    
real_interactions = pd.DataFrame(
    [
        [1, 45],
        [1, 22],
        [1, 77],
        [2, 13],
        [2, 77]
        
    ], columns=['user_id', 'item_id'])

display(real_interactions)
    
recommendations = pd.DataFrame(
    [
        [1, 13, 0.9],
        [1, 45, 0.8],
        [1, 22, 0.71],
        [1, 77, 0.55],
        [1, 9, 0.52],
        [2, 11, 0.85],
        [2, 13, 0.69],
        [2, 25, 0.64],
        [2, 6, 0.60],
        [2, 77, 0.53]
        
    ], columns=['user_id', 'item_id', 'score'])

display(recommendations)
    
print("NDCG@3 = {:.4f}".format(ndcg(recommendations, real_interactions, n=3)))

Unnamed: 0,user_id,item_id
0,1,45
1,1,22
2,1,77
3,2,13
4,2,77


Unnamed: 0,user_id,item_id,score
0,1,13,0.9
1,1,45,0.8
2,1,22,0.71
3,1,77,0.55
4,1,9,0.52
5,2,11,0.85
6,2,13,0.69
7,2,25,0.64
8,2,6,0.6
9,2,77,0.53


NDCG@3 = 0.8809


# Testing routines (offline)

## Train and test set split

### Explicit feedback

**Task 7.** Implement a method performing train-test split evaluation for explicit feedback for a given recommender.

In [14]:
def evaluate_train_test_split_explicit(recommender, interactions_df, items_df, seed=6789):
    rng = np.random.RandomState(seed=seed)
    
    if type(interactions_df) == dict:
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:    
        # Otherwise split the dataset into train and test
        items_index = np.arange(len(interactions_df))
        rng.shuffle(items_index)
        split_ratio = 0.8
        split_index = int(len(interactions_df) * split_ratio)
        train_interactions_dataset = interactions_df.iloc[items_index[:split_index]]
        test_interactions_dataset = interactions_df.iloc[items_index[split_index:]]
    # Train the recommender
    recommender.fit(train_interactions_dataset, None, items_df)

    # Gather predictions
    
    r_pred = []
    
    for idx, row in test_interactions_dataset.iterrows():
        users_df = pd.DataFrame([row['user_id']], columns=['user_id'])
        eval_items_df = pd.DataFrame([row['item_id']], columns=['item_id'])
        eval_items_df = pd.merge(eval_items_df, items_df, on='item_id')
        recommendations = recommender.recommend(users_df, eval_items_df, n_recommendations=1)

        r_pred.append(recommendations.iloc[0]['score'])
    
    # Gather real ratings
    
    r_real = np.array(test_interactions_dataset['rating'].tolist())
    
    
    # Return evaluation metrics
    
    return rmse(r_pred, r_real), mre(r_pred, r_real), tre(r_pred, r_real)

recommender = Recommender()

results = [['BaseRecommender'] + list(evaluate_train_test_split_explicit(
    recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(HTML(results.to_html()))

Unnamed: 0,Recommender,RMSE,MRE,TRE
0,BaseRecommender,1.170155,0.349264,0.271796


### Implicit feedback

**Task 8.** Implement a method performing train-test split evaluation for implicit feedback for a given recommender.

In [17]:
def evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789):
    rng = np.random.RandomState(seed=seed)
    
    if type(interactions_df) == 'dict':
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:    
        # Otherwise split the dataset into train and test

        items_index = np.arange(len(interactions_df))
        rng.shuffle(items_index)
        split_ratio = 0.8
        split_index = int(split_ratio * len(interactions_df))
        interactions_df_train = interactions_df.iloc[items_index[:split_index]]
        interactions_df_test = interactions_df.iloc[items_index[split_index:]]
        
    
    hr_1 = []
    hr_3 = []
    hr_5 = []
    hr_10 = []
    ndcg_1 = []
    ndcg_3 = []
    ndcg_5 = []
    ndcg_10 = []
    
    # Train the recommender
    
    recommender.fit(interactions_df_train, None, items_df)
    
    
    # Make recommendations for each user in the test set and calculate the metric 
    # against all items of that user in the test set
    
    interactions_by_user = interactions_df_test.groupby(by='user_id')
    
    for user_id, user_interactions in interactions_by_user:
        recommendations = recommender.recommend(pd.DataFrame([user_id], columns=['user_id']), 
                                                items_df, n_recommendations=10)
        hr_1.append(hr(recommendations, user_interactions, n=1))
        hr_3.append(hr(recommendations, user_interactions, n=3))
        hr_5.append(hr(recommendations, user_interactions, n=5))
        hr_10.append(hr(recommendations, user_interactions, n=10))
        ndcg_1.append(ndcg(recommendations, user_interactions, n=1))
        ndcg_3.append(ndcg(recommendations, user_interactions, n=3))
        ndcg_5.append(ndcg(recommendations, user_interactions, n=5))
        ndcg_10.append(ndcg(recommendations, user_interactions, n=10))
                
    hr_1 = np.mean(hr_1)
    hr_3 = np.mean(hr_3)
    hr_5 = np.mean(hr_5)
    hr_10 = np.mean(hr_10)
    ndcg_1 = np.mean(ndcg_1)
    ndcg_3 = np.mean(ndcg_3)
    ndcg_5 = np.mean(ndcg_5)
    ndcg_10 = np.mean(ndcg_10)
        
    return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10

recommender = Recommender()

results = [['BaseRecommender'] + list(evaluate_train_test_split_implicit(
    recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(results.to_html()))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,BaseRecommender,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Leave-one-out, leave-k-out, cross-validation

### Explicit feedback

**Task 9.** Implement a method performing leave one out evaluation for explicit feedback for a given recommender.

In [31]:
def evaluate_leave_one_out_explicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    rng = np.random.RandomState(seed=seed)
    
    # Prepare splits of the datasets
    kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
    
    
    # For each split of the dataset train the recommender, generate recommendations and evaluate
    
    r_pred = []
    r_real = []
    n_eval = 1
    
    for train_index, test_index in kf.split(interactions_df.index):
        interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
        interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
        
        recommender.fit(interactions_df_train, None, items_df)
        recommendations = recommender.recommend(interactions_df_test.loc[:, ['user_id']], 
                                                items_df.loc[items_df['item_id'] == interactions_df_test.iloc[0]['item_id']])
        
        r_pred.append(recommendations.iloc[0]['score'])
        r_real.append(interactions_df_test.iloc[0]['rating'])
        
        if n_eval == max_evals:
            break
        n_eval += 1
        
    r_pred = np.array(r_pred)
    r_real = np.array(r_real)
        
    # Return evaluation metrics
    
    return rmse(r_pred, r_real), mre(r_pred, r_real), tre(r_pred, r_real)

recommender = Recommender()

results = [['BaseRecommender'] + list(evaluate_leave_one_out_explicit(
    recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(HTML(results.to_html()))

Unnamed: 0,Recommender,RMSE,MRE,TRE
0,BaseRecommender,1.161536,0.382929,0.267866


### Implicit feedback

**Task 10.** Implement a method performing leave one out evaluation for implicit feedback for a given recommender.

In [35]:
def evaluate_leave_one_out_implicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    rng = np.random.RandomState(seed=seed)
    
    # Prepare splits of the datasets
    kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
    
    
    hr_1 = []
    hr_3 = []
    hr_5 = []
    hr_10 = []
    ndcg_1 = []
    ndcg_3 = []
    ndcg_5 = []
    ndcg_10 = []
    
    # For each split of the dataset train the recommender, generate recommendations and evaluate
    
    n_eval = 1
    for train_index, test_index in kf.split(interactions_df.index):
        interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
        interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
        recommender.fit(interactions_df_train, None, items_df)
        recommendations = recommender.recommend(interactions_df_test.loc[:, ['user_id']], 
                                                items_df,n_recommendations=10)
        
        print(recommendations)
        hr_1.append(hr(recommendations, interactions_df_test, n=1))
        hr_3.append(hr(recommendations, interactions_df_test, n=3))
        hr_5.append(hr(recommendations, interactions_df_test, n=5))
        hr_10.append(hr(recommendations, interactions_df_test, n=10))
        ndcg_1.append(ndcg(recommendations, interactions_df_test, n=1))
        ndcg_3.append(ndcg(recommendations, interactions_df_test, n=3))
        ndcg_5.append(ndcg(recommendations, interactions_df_test, n=5))
        ndcg_10.append(ndcg(recommendations, interactions_df_test, n=10))
        
        if n_eval == max_evals:
            break
        n_eval += 1
        
    hr_1 = np.mean(hr_1)
    hr_3 = np.mean(hr_3)
    hr_5 = np.mean(hr_5)
    hr_10 = np.mean(hr_10)
    ndcg_1 = np.mean(ndcg_1)
    ndcg_3 = np.mean(ndcg_3)
    ndcg_5 = np.mean(ndcg_5)
    ndcg_10 = np.mean(ndcg_10)
    
    return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10

recommender = Recommender()

results = [['BaseRecommender'] + list(evaluate_leave_one_out_implicit(
    recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(results.to_html()))

  user_id item_id  score
0     288      -1    3.0
1     288      -1    3.0
2     288      -1    3.0
3     288      -1    3.0
4     288      -1    3.0
5     288      -1    3.0
6     288      -1    3.0
7     288      -1    3.0
8     288      -1    3.0
9     288      -1    3.0
  user_id item_id  score
0     474      -1    3.0
1     474      -1    3.0
2     474      -1    3.0
3     474      -1    3.0
4     474      -1    3.0
5     474      -1    3.0
6     474      -1    3.0
7     474      -1    3.0
8     474      -1    3.0
9     474      -1    3.0
  user_id item_id  score
0     448      -1    3.0
1     448      -1    3.0
2     448      -1    3.0
3     448      -1    3.0
4     448      -1    3.0
5     448      -1    3.0
6     448      -1    3.0
7     448      -1    3.0
8     448      -1    3.0
9     448      -1    3.0
  user_id item_id  score
0     462      -1    3.0
1     462      -1    3.0
2     462      -1    3.0
3     462      -1    3.0
4     462      -1    3.0
5     462      -1    3.0


  user_id item_id  score
0     576      -1    3.0
1     576      -1    3.0
2     576      -1    3.0
3     576      -1    3.0
4     576      -1    3.0
5     576      -1    3.0
6     576      -1    3.0
7     576      -1    3.0
8     576      -1    3.0
9     576      -1    3.0
  user_id item_id  score
0     290      -1    3.0
1     290      -1    3.0
2     290      -1    3.0
3     290      -1    3.0
4     290      -1    3.0
5     290      -1    3.0
6     290      -1    3.0
7     290      -1    3.0
8     290      -1    3.0
9     290      -1    3.0
  user_id item_id  score
0     154      -1    3.0
1     154      -1    3.0
2     154      -1    3.0
3     154      -1    3.0
4     154      -1    3.0
5     154      -1    3.0
6     154      -1    3.0
7     154      -1    3.0
8     154      -1    3.0
9     154      -1    3.0
  user_id item_id  score
0     153      -1    3.0
1     153      -1    3.0
2     153      -1    3.0
3     153      -1    3.0
4     153      -1    3.0
5     153      -1    3.0


  user_id item_id  score
0     597      -1    3.0
1     597      -1    3.0
2     597      -1    3.0
3     597      -1    3.0
4     597      -1    3.0
5     597      -1    3.0
6     597      -1    3.0
7     597      -1    3.0
8     597      -1    3.0
9     597      -1    3.0
  user_id item_id  score
0     462      -1    3.0
1     462      -1    3.0
2     462      -1    3.0
3     462      -1    3.0
4     462      -1    3.0
5     462      -1    3.0
6     462      -1    3.0
7     462      -1    3.0
8     462      -1    3.0
9     462      -1    3.0
  user_id item_id  score
0     414      -1    3.0
1     414      -1    3.0
2     414      -1    3.0
3     414      -1    3.0
4     414      -1    3.0
5     414      -1    3.0
6     414      -1    3.0
7     414      -1    3.0
8     414      -1    3.0
9     414      -1    3.0
  user_id item_id  score
0     177      -1    3.0
1     177      -1    3.0
2     177      -1    3.0
3     177      -1    3.0
4     177      -1    3.0
5     177      -1    3.0


  user_id item_id  score
0     465      -1    3.0
1     465      -1    3.0
2     465      -1    3.0
3     465      -1    3.0
4     465      -1    3.0
5     465      -1    3.0
6     465      -1    3.0
7     465      -1    3.0
8     465      -1    3.0
9     465      -1    3.0
  user_id item_id  score
0     438      -1    3.0
1     438      -1    3.0
2     438      -1    3.0
3     438      -1    3.0
4     438      -1    3.0
5     438      -1    3.0
6     438      -1    3.0
7     438      -1    3.0
8     438      -1    3.0
9     438      -1    3.0
  user_id item_id  score
0     366      -1    3.0
1     366      -1    3.0
2     366      -1    3.0
3     366      -1    3.0
4     366      -1    3.0
5     366      -1    3.0
6     366      -1    3.0
7     366      -1    3.0
8     366      -1    3.0
9     366      -1    3.0
  user_id item_id  score
0     599      -1    3.0
1     599      -1    3.0
2     599      -1    3.0
3     599      -1    3.0
4     599      -1    3.0
5     599      -1    3.0


  user_id item_id  score
0     105      -1    3.0
1     105      -1    3.0
2     105      -1    3.0
3     105      -1    3.0
4     105      -1    3.0
5     105      -1    3.0
6     105      -1    3.0
7     105      -1    3.0
8     105      -1    3.0
9     105      -1    3.0
  user_id item_id  score
0     103      -1    3.0
1     103      -1    3.0
2     103      -1    3.0
3     103      -1    3.0
4     103      -1    3.0
5     103      -1    3.0
6     103      -1    3.0
7     103      -1    3.0
8     103      -1    3.0
9     103      -1    3.0
  user_id item_id  score
0     200      -1    3.0
1     200      -1    3.0
2     200      -1    3.0
3     200      -1    3.0
4     200      -1    3.0
5     200      -1    3.0
6     200      -1    3.0
7     200      -1    3.0
8     200      -1    3.0
9     200      -1    3.0
  user_id item_id  score
0     218      -1    3.0
1     218      -1    3.0
2     218      -1    3.0
3     218      -1    3.0
4     218      -1    3.0
5     218      -1    3.0


  user_id item_id  score
0     599      -1    3.0
1     599      -1    3.0
2     599      -1    3.0
3     599      -1    3.0
4     599      -1    3.0
5     599      -1    3.0
6     599      -1    3.0
7     599      -1    3.0
8     599      -1    3.0
9     599      -1    3.0
  user_id item_id  score
0     490      -1    3.0
1     490      -1    3.0
2     490      -1    3.0
3     490      -1    3.0
4     490      -1    3.0
5     490      -1    3.0
6     490      -1    3.0
7     490      -1    3.0
8     490      -1    3.0
9     490      -1    3.0
  user_id item_id  score
0     599      -1    3.0
1     599      -1    3.0
2     599      -1    3.0
3     599      -1    3.0
4     599      -1    3.0
5     599      -1    3.0
6     599      -1    3.0
7     599      -1    3.0
8     599      -1    3.0
9     599      -1    3.0
  user_id item_id  score
0     462      -1    3.0
1     462      -1    3.0
2     462      -1    3.0
3     462      -1    3.0
4     462      -1    3.0
5     462      -1    3.0


  user_id item_id  score
0     509      -1    3.0
1     509      -1    3.0
2     509      -1    3.0
3     509      -1    3.0
4     509      -1    3.0
5     509      -1    3.0
6     509      -1    3.0
7     509      -1    3.0
8     509      -1    3.0
9     509      -1    3.0
  user_id item_id  score
0     260      -1    3.0
1     260      -1    3.0
2     260      -1    3.0
3     260      -1    3.0
4     260      -1    3.0
5     260      -1    3.0
6     260      -1    3.0
7     260      -1    3.0
8     260      -1    3.0
9     260      -1    3.0
  user_id item_id  score
0     212      -1    3.0
1     212      -1    3.0
2     212      -1    3.0
3     212      -1    3.0
4     212      -1    3.0
5     212      -1    3.0
6     212      -1    3.0
7     212      -1    3.0
8     212      -1    3.0
9     212      -1    3.0
  user_id item_id  score
0     448      -1    3.0
1     448      -1    3.0
2     448      -1    3.0
3     448      -1    3.0
4     448      -1    3.0
5     448      -1    3.0


  user_id item_id  score
0      54      -1    3.0
1      54      -1    3.0
2      54      -1    3.0
3      54      -1    3.0
4      54      -1    3.0
5      54      -1    3.0
6      54      -1    3.0
7      54      -1    3.0
8      54      -1    3.0
9      54      -1    3.0
  user_id item_id  score
0     307      -1    3.0
1     307      -1    3.0
2     307      -1    3.0
3     307      -1    3.0
4     307      -1    3.0
5     307      -1    3.0
6     307      -1    3.0
7     307      -1    3.0
8     307      -1    3.0
9     307      -1    3.0
  user_id item_id  score
0     541      -1    3.0
1     541      -1    3.0
2     541      -1    3.0
3     541      -1    3.0
4     541      -1    3.0
5     541      -1    3.0
6     541      -1    3.0
7     541      -1    3.0
8     541      -1    3.0
9     541      -1    3.0
  user_id item_id  score
0     187      -1    3.0
1     187      -1    3.0
2     187      -1    3.0
3     187      -1    3.0
4     187      -1    3.0
5     187      -1    3.0


  user_id item_id  score
0     517      -1    3.0
1     517      -1    3.0
2     517      -1    3.0
3     517      -1    3.0
4     517      -1    3.0
5     517      -1    3.0
6     517      -1    3.0
7     517      -1    3.0
8     517      -1    3.0
9     517      -1    3.0
  user_id item_id  score
0     187      -1    3.0
1     187      -1    3.0
2     187      -1    3.0
3     187      -1    3.0
4     187      -1    3.0
5     187      -1    3.0
6     187      -1    3.0
7     187      -1    3.0
8     187      -1    3.0
9     187      -1    3.0
  user_id item_id  score
0     422      -1    3.0
1     422      -1    3.0
2     422      -1    3.0
3     422      -1    3.0
4     422      -1    3.0
5     422      -1    3.0
6     422      -1    3.0
7     422      -1    3.0
8     422      -1    3.0
9     422      -1    3.0
  user_id item_id  score
0     448      -1    3.0
1     448      -1    3.0
2     448      -1    3.0
3     448      -1    3.0
4     448      -1    3.0
5     448      -1    3.0


  user_id item_id  score
0     561      -1    3.0
1     561      -1    3.0
2     561      -1    3.0
3     561      -1    3.0
4     561      -1    3.0
5     561      -1    3.0
6     561      -1    3.0
7     561      -1    3.0
8     561      -1    3.0
9     561      -1    3.0
  user_id item_id  score
0     368      -1    3.0
1     368      -1    3.0
2     368      -1    3.0
3     368      -1    3.0
4     368      -1    3.0
5     368      -1    3.0
6     368      -1    3.0
7     368      -1    3.0
8     368      -1    3.0
9     368      -1    3.0
  user_id item_id  score
0     514      -1    3.0
1     514      -1    3.0
2     514      -1    3.0
3     514      -1    3.0
4     514      -1    3.0
5     514      -1    3.0
6     514      -1    3.0
7     514      -1    3.0
8     514      -1    3.0
9     514      -1    3.0
  user_id item_id  score
0     474      -1    3.0
1     474      -1    3.0
2     474      -1    3.0
3     474      -1    3.0
4     474      -1    3.0
5     474      -1    3.0


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,BaseRecommender,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Examples of evaluation

## Explicit feedback

### Train-test split test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()

recommenders = [highest_rated_recommender, lr_recommender, svr_recommender]

all_results = []

t0 = time.time()

for recommender in recommenders:
    results = [[type(recommender).__name__] + list(evaluate_train_test_split_explicit(
        recommender, ml_ratings_df, ml_movies_df))]

    results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])
    all_results.append(results)

    display(results)
    
all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.time() - t0))

### Leave-one-out test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()

recommenders = [highest_rated_recommender, lr_recommender, svr_recommender]

all_results = []

t0 = time.time()

for recommender in recommenders:
    results = [[type(recommender).__name__] + list(evaluate_leave_one_out_explicit(
        recommender, ml_ratings_df, ml_movies_df))]

    results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])
    all_results.append(results)

    display(results)
    
all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.time() - t0))

## Implicit feedback

### Train-test split test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

most_popular_recommender = MostPopularRecommender()
highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=30)

recommenders = [most_popular_recommender, highest_rated_recommender, lr_recommender, svr_recommender, ibcnn_recommender]

all_results = []

t0 = time.time()

for recommender in recommenders:
    results = [[type(recommender).__name__] + list(evaluate_train_test_split_implicit(
        recommender, ml_ratings_df, ml_movies_df))]

    results = pd.DataFrame(results, 
                           columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
    all_results.append(results)

    display(results)
    
all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.time() - t0))

### Leave-one-out test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

most_popular_recommender = MostPopularRecommender()
highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=30)

recommenders = [most_popular_recommender, highest_rated_recommender, lr_recommender, svr_recommender, ibcnn_recommender]

all_results = []

t0 = time.time()

for recommender in recommenders:
    results = [[type(recommender).__name__] + list(evaluate_leave_one_out_implicit(
        recommender, ml_ratings_df, ml_movies_df))]

    results = pd.DataFrame(results, 
                           columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
    all_results.append(results)

    display(results)
    
all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.time() - t0))