# Neural Collaborative Filtering

Neural Collaborative Filtering (NCF) is an algorithm based on deep neural networks to tackle collaborative filtering on the basis of implicit feedback. Since we are using neural networks to find relation between users and items, we can easily scale the solution to large datasets. Thus making this method better than item based collaborative filtering.

NCF works by first representing users and items as vectors in a latent space. These vectors are then used to calculate a score for each user-item pair. The score is then used to predict whether the user will interact with the item. NCF is useful because it can learn non-linear relationships between users and items. This makes it a more powerful model than traditional matrix factorization methods.

Reference: [https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_collaborative_filtering/ncf_deep_dive.ipynb]

## Load the Dataset

In [1]:
import sys
import os
import shutil

import pandas as pd
import numpy as np

from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_chrono_split, python_stratified_split

from recommenders.models.ncf.dataset import Dataset as NCFDataset

# Importing the NCF model class from the recommenders library
from recommenders.models.ncf.ncf_singlenode import NCF

# importing the evaluation metrics
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k,
                                                     recall_at_k, get_top_k_items,
                                                     catalog_coverage, distributional_coverage, novelty, diversity, serendipity)
from recommenders.utils.constants import SEED as DEFAULT_SEED

import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))




System version: 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
Pandas version: 1.4.2


In [2]:
df_full = pd.read_csv('users_feature_eng.csv')[['Username', 'track_name', 'artist_name',
                                                    'rank',
                                                    # 'playcount'
                                                    ]]
df_full['track'] = df_full['track_name'] + ' ' + df_full['artist_name']
df_full['itemID'] = df_full.groupby('track').ngroup() + 1
df_full['userID'] = df_full.groupby('Username').ngroup() + 1

# df = df_full.copy()
df_full.rename(columns={'rank': 'rating'}, inplace=True)
# df.rename(columns={'playcount': 'rating'}, inplace=True)
df_full = df_full.drop(['track', 'track_name', 'artist_name', 'Username'], axis = 1)

# # using a subset of data to reduce runtime to manageable duration, select users who have more than 48 top songs
threshold = 48
df_full = df_full[df_full.groupby('userID')['userID'].transform('size') > threshold]
df_full = df_full[['userID', 'itemID', 'rating']]
df_full.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 51548 entries, 0 to 393030
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   userID  51548 non-null  int64
 1   itemID  51548 non-null  int64
 2   rating  51548 non-null  int64
dtypes: int64(3)
memory usage: 1.6 MB


Stratified Split: this is similar to random sampling, but the splits are stratified, for example if the datasets are split by user, the splitting approach will attempt to maintain the same ratio of items used in both training and test splits. The converse is true if splitting by item.


In [3]:
#Split the dataset into 75% train and 25% test

# header = {
#     "col_user": "userID",
#     "col_item": "itemID",
#     "col_rank": "rank",
#     # "col_rank": 'playcount',
#     "col_prediction": "Prediction",
# }


train, test = python_stratified_split(
    df_full, ratio=0.8,
    #   col_user="userID", col_item="itemID", seed=42
)

# Filtering out users and items in the test set that do not appear in the training set.
# This is done so that we can see if our model has learnt user's previous item interactions and can recommend relevant items.
test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]

# Creating a test set which only contains the last interaction for each user. Remaining data of the user is used in the train set
leave_one_out_test = test.groupby("userID").last().reset_index()

test.head()

Unnamed: 0,userID,itemID,rating
119014,6,92294,33
185573,6,18824,3
153328,10,121378,17
219164,10,83514,10
270950,10,129097,32


In [4]:
# top k items to recommend
TOP_K = 30

# Model parameters
# Number of iterations during the training process
EPOCHS = 100
# Batch size means how many user-item pairs you want to predict at once
BATCH_SIZE = 256

# Setting seed to remove any stochasticity and reproduce results
SEED = DEFAULT_SEED  # Set None for non-deterministic results

In [5]:
# Writing the data into csv files
train_file = "train.csv"
test_file = "test.csv"
leave_one_out_test_file = "leave_one_out_test.csv"

In [6]:
train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)
leave_one_out_test.to_csv(leave_one_out_test_file, index=False)

In [7]:
data = NCFDataset(train_file=train_file, test_file=leave_one_out_test_file, seed=SEED, overwrite_test_file_full=True)

INFO:recommenders.models.ncf.dataset:Indexing train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing leave_one_out_test.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file leave_one_out_test_full.csv ...
100%|██████████| 997/997 [00:13<00:00, 73.23it/s]
INFO:recommenders.models.ncf.dataset:Indexing leave_one_out_test_full.csv ...


## Training the NCF Model

NCF parameters:

`n_users`, number of users. We are one hot encoding our user data. Therefore the input size of the model will be number of users.

`n_items`, number of items. Same logic as `n_users`.

`batch_size`, number of examples you want the model to process at a time. Higher value will consume more memory.

`learning_rate`, this can be thought of as how much you want the model to change after one iteration. Large value will lead to unstability and very small values will take more time to converge.

`n_factors`, which controls the dimension of the latent space. Usually, the quality of the training set predictions grows with as n_factors gets higher.

`layer_sizes`, sizes of input layer (and hidden layers) of MLP, input type is list. We have set it to [32,16,8,4], [64,32,16,8,4], [64, 32, 16, 8] as from training and testing, higher values gave better results. We have explored running the model with different layer sizes, finally deciding on [64, 32, 16, 8] due to performance and time cost.

`n_epochs`, which defines the number of iteration of the SGD procedure. Note that both parameter also affect the training time.

`model_type`, we can train single "MLP", "GMF" or combined model "NCF" by changing the type of model.

[Reference: https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_collaborative_filtering/ncf_deep_dive.ipynb]

In [8]:
model = NCF (
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[64,32,16,8],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)





In [9]:
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [8.85s]: train_loss = 0.087081 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [8.54s]: train_loss = 0.035409 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [8.73s]: train_loss = 0.020464 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [8.64s]: train_loss = 0.015794 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [8.79s]: train_loss = 0.012869 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [8.78s]: train_loss = 0.010372 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [8.59s]: train_loss = 0.009050 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [8.67s]: train_loss = 0.008748 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [8.64s]: train_loss = 0.007571 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [8.69s]: train_loss = 0.006530 


Took 868.1762786999999 seconds for training.


In [10]:
# save the model
# https://github.com/recommenders-team/recommenders/issues/1735
dir_path = 'NCF_model_trained'
model.save(dir_path)
# # and then while loading depending on the type of your model in this case neumf pass it that dir parameter
# model.load(neumf_dir='dir_path')

In [9]:
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[64,32,16,8],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

dir_path = 'NCF_model_trained'
model.load(neumf_dir = dir_path)

model.user2id = data.user2id
model.item2id = data.item2id
model.id2user = data.id2user
model.id2item = data.id2item



INFO:tensorflow:Restoring parameters from NCF_model_trained\model.ckpt


## Prediction

After fitting the model, we can call `predict` to get some predictions. `predict` returns an internal object Prediction which can be easily converted back to a dataframe.

In [None]:
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
               for (_, row) in test.iterrows()]

predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
predictions.head()

In [12]:
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item)
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 35.29185000000007 seconds for prediction.


## General Evaluation

We remove songs that are already users' top songs in the top k recommendations. To compute ranking metrics, we need predictions on all user, item pairs. We do not want to recommend the same item again to the user.

- Ranking Metrics: These are used to evaluate how relevant recommendations are for users

MAP - It is the average precision for each user normalized over all users.

Normalized Discounted Cumulative Gain (NDCG) - evaluates how well the predicted items for a user are ranked based on relevance

Precision - this measures the proportion of recommended items that are relevant

Recall - this measures the proportion of relevant items that are recommended


- Rating Metrics: These are used to evaluate how accurate a recommender is at predicting ratings that users gave to items

Root Mean Square Error (RMSE) - measure of average error in predicted ratings

R Squared (R2) - essentially how much of the total variation is explained by the model

Mean Absolute Error (MAE) - similar to RMSE but uses absolute value instead of squaring and taking the root of the average

Explained Variance - how much of the variance in the data is explained by the model


- Non accuracy based metrics: These do not compare predictions against ground truth but instead evaluate the following properties of the recommendations

Novelty - measures of how novel recommendation items are by calculating their recommendation frequency among users

Diversity - measures of how different items in a set are with respect to each other

Serendipity - measures of how surprising recommendations are to to a specific user by comparing them to the items that the user has already interacted with

Coverage - measures related to the distribution of items recommended by the system.

In [13]:
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
print(f"Precision: {eval_precision} \n Recall: {eval_recall}")

Precision: 0.0197592778335005 
 Recall: 0.08884550476827306


In [14]:
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
print(f"MAP@K: {eval_map}")

MAP@K: 0.02211773740976093


In [15]:
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
print(f"NDCG@K: {eval_ndcg}")

NDCG@K: 0.05895727619663889


In [16]:
# Prepare for diversity based evaluations

# Merge all_predictions with train on userID and itemID
merged_df = pd.merge(all_predictions, train, left_on=['userID', 'itemID'], right_on=['userID', 'itemID'], how='outer')

# Filter out the rows where train.rating is null
top_all = merged_df[merged_df['rating'].isnull()]
top_all = top_all[['userID', 'itemID', 'prediction']]
print(top_all.shape[0])

# Sort top_all DataFrame by 'prediction' column within each 'userID' group in descending order
top_all_sorted = top_all.sort_values(by=['userID', 'prediction'], ascending=[True, False])

# Group by 'userID' and take the top_k items for each group
top_k_reco = top_all_sorted.groupby('userID').head(TOP_K)
print(top_k_reco.shape[0])

26609537
31350


In [17]:
eval_diversity = diversity(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"Diversity: {eval_diversity}")

eval_novelty = novelty(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"Novelty: {eval_novelty}")

eval_distributional_coverage = distributional_coverage(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"distributional_coverage: {eval_distributional_coverage}")

eval_catalog_coverage = catalog_coverage(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"catalog_coverage: {eval_catalog_coverage}")

eval_serendipity = serendipity(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"serendipity: {eval_serendipity}")

Diversity: 0.9534591327062832
Novelty: 12.78564042556168
distributional_coverage: 11.260640133456693
catalog_coverage: 0.2012704387719092
serendipity: 0.9810615281060834


### Summary of Ranking Metrics

<center>

|Metric|Range|Selection criteria|Limitation|
|------|-------------------------------|---------|----------|
|Precision|$\geq 0$ and $\leq 1$|Higher the better.|Only for hits in recommendations.|
|Recall|$\geq 0$ and $\leq 1$|Higher the better.|Only for hits in the ground truth.|
|NDCG|$\geq 0$ and $\leq 1$|Higher the better.|Does not penalize for bad/missing items, and does not perform for several equally good items.|
|MAP|$\geq 0$ and $\leq 1$|Higher the better.|Depend on variable distributions.|

</center>

## "Leave-one-out" Evaluation
For each item in test data, we randomly samples 100 items that are not interacted by the user, ranking the test item among the 101 items (1 positive item and 100 negative items). The performance of a ranked list is judged by Hit Ratio (HR) and Normalized Discounted Cumulative Gain (NDCG). Finally, we average the values of those ranked lists to obtain the overall HR and NDCG on test data.

We truncated the ranked list at 10 for both metrics. As such, the HR intuitively measures whether the test item is present on the top-10 list, and the NDCG accounts for the position of the hit by assigning higher scores to hits at top ranks.

In [18]:
k = TOP_K

ndcgs = []
hit_ratio = []

for b in data.test_loader():
    user_input, item_input, labels = b
    output = model.predict(user_input, item_input, is_list=True)

    output = np.squeeze(output)
    rank = sum(output >= output[0])
    if rank <= k:
        ndcgs.append(1 / np.log(rank + 1))
        hit_ratio.append(1)
    else:
        ndcgs.append(0)
        hit_ratio.append(0)

eval_ndcg = np.mean(ndcgs)
eval_hr = np.mean(hit_ratio)

print("HR:\t%f" % eval_hr)
print("NDCG:\t%f" % eval_ndcg)

HR:	0.751254
NDCG:	0.549389


## Pre-train with GMF and MLP

To get better performance of NeuMF, we can adopt pre-training strategy. We train GMF and MLP and then use their model parameters as the initialization for the corresponding parts of NeuMF’s parameters. 

We then evaluate pre-trained model using the same evaluation metrics. Compared with not pre-trained NMF, all evaluation metrics had slight improvements. The performance of pre-trained NCF is better than the not pre-trained.


In [19]:
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="GMF",
    n_factors=4,
    layer_sizes=[64,32,16,8],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training GMF.".format(train_time.interval))

model.save(dir_name=".pretrain/GMF")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [6.98s]: train_loss = 0.367336 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [6.55s]: train_loss = 0.281015 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [7.27s]: train_loss = 0.242760 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [6.57s]: train_loss = 0.227738 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [6.48s]: train_loss = 0.218232 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [6.98s]: train_loss = 0.211604 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [6.60s]: train_loss = 0.206801 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [6.83s]: train_loss = 0.201693 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [7.12s]: train_loss = 0.197851 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [6.67s]: train_loss = 0.196545 


Took 677.2953388999999 seconds for training GMF.


In [20]:
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="MLP",
    n_factors=4,
    layer_sizes=[64,32,16,8],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training MLP.".format(train_time.interval))

model.save(dir_name=".pretrain/MLP")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [9.18s]: train_loss = 0.140024 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [8.63s]: train_loss = 0.057390 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [8.98s]: train_loss = 0.036796 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [8.67s]: train_loss = 0.026088 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [8.51s]: train_loss = 0.019848 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [8.95s]: train_loss = 0.015442 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [8.68s]: train_loss = 0.013119 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [8.83s]: train_loss = 0.010964 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [8.45s]: train_loss = 0.009063 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [8.35s]: train_loss = 0.008663 


Took 879.0392510000001 seconds for training MLP.


## Training and evaluating pre-trained model

In [22]:
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[64,32,16,8],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

model.load(gmf_dir=".pretrain/GMF", mlp_dir=".pretrain/MLP", alpha=0.5)

model.user2id = data.user2id
model.item2id = data.item2id
model.id2user = data.id2user
model.id2item = data.id2item

with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training pre-trained NeuMF.".format(train_time.interval))

INFO:tensorflow:Restoring parameters from .pretrain/GMF\model.ckpt
INFO:tensorflow:Restoring parameters from .pretrain/MLP\model.ckpt
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [8.90s]: train_loss = 0.006516 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [8.80s]: train_loss = 0.005565 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [8.95s]: train_loss = 0.005321 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [9.13s]: train_loss = 0.005083 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [8.95s]: train_loss = 0.004585 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [8.87s]: train_loss = 0.004313 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [9.42s]: train_loss = 0.004628 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [8.69s]: train_loss = 0.003939 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [8.72s]: train_loss = 0.003665 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [8.45s]: train_loss = 0.003519 


Took 886.9909207000001 seconds for training pre-trained NeuMF.


In [23]:
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item)
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 35.10330669999985 seconds for prediction.


In [24]:
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
print(f"Precision: {eval_precision} \n Recall: {eval_recall}")

Precision: 0.022801738548980273 
 Recall: 0.09940217477830315


In [25]:
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
print(f"MAP@K: {eval_map}")

MAP@K: 0.02549432604837352


In [26]:
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
print(f"NDCG@K: {eval_ndcg}")

NDCG@K: 0.06705734850071839


In [27]:
# Prepare for diversity based evaluations

# Merge all_predictions with train on userID and itemID
merged_df = pd.merge(all_predictions, train, left_on=['userID', 'itemID'], right_on=['userID', 'itemID'], how='outer')

# Filter out the rows where train.rating is null
top_all = merged_df[merged_df['rating'].isnull()]
top_all = top_all[['userID', 'itemID', 'prediction']]
print(top_all.shape[0])

# Sort top_all DataFrame by 'prediction' column within each 'userID' group in descending order
top_all_sorted = top_all.sort_values(by=['userID', 'prediction'], ascending=[True, False])

# Group by 'userID' and take the top_k items for each group
top_k_reco = top_all_sorted.groupby('userID').head(TOP_K)
print(top_k_reco.shape[0])

26609537
31350


In [28]:
eval_diversity = diversity(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"Diversity: {eval_diversity}")

eval_novelty = novelty(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"Novelty: {eval_novelty}")

eval_distributional_coverage = distributional_coverage(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"distributional_coverage: {eval_distributional_coverage}")

eval_catalog_coverage = catalog_coverage(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"catalog_coverage: {eval_catalog_coverage}")

eval_serendipity = serendipity(train, top_k_reco, col_user='userID', col_item='itemID')
print(f"serendipity: {eval_serendipity}")

Diversity: 0.9493022304063957
Novelty: 12.691264663902116
distributional_coverage: 11.13977560085456
catalog_coverage: 0.18774261851546878
serendipity: 0.9783733991118525


In [29]:
# "Leave-one-out" Evaluation
k = TOP_K

ndcgs = []
hit_ratio = []

for b in data.test_loader():
    user_input, item_input, labels = b
    output = model.predict(user_input, item_input, is_list=True)

    output = np.squeeze(output)
    rank = sum(output >= output[0])
    if rank <= k:
        ndcgs.append(1 / np.log(rank + 1))
        hit_ratio.append(1)
    else:
        ndcgs.append(0)
        hit_ratio.append(0)

eval_ndcg = np.mean(ndcgs)
eval_hr = np.mean(hit_ratio)

print("HR:\t%f" % eval_hr)
print("NDCG:\t%f" % eval_ndcg)

HR:	0.743230
NDCG:	0.563786
