In [None]:
!pip install recommenders recommenders[examples] tf_slim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting recommenders
  Downloading recommenders-1.1.0-py3-none-manylinux1_x86_64.whl (335 kB)
[K     |████████████████████████████████| 335 kB 5.5 MB/s 
[?25hCollecting tf_slim
  Downloading tf_slim-1.1.0-py2.py3-none-any.whl (352 kB)
[K     |████████████████████████████████| 352 kB 26.5 MB/s 
[?25hCollecting category-encoders<2,>=1.3.0
  Downloading category_encoders-1.3.0-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 4.6 MB/s 
Collecting transformers<5,>=2.5.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 38.2 MB/s 
Collecting nltk<4,>=3.4
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 47.7 MB/s 
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Collecting pandera[strategies]>=0.6.5
  Downloading pandera-0.9.0-py3

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import random
import sklearn

import recommenders
import recommenders.datasets.movielens
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

## Dataset creation

### Random split
RandomDatasetCreator ensures that every item and user is present at least once in the training set.

In [None]:
class RandomDatasetCreator:
    """Random train/test splitter that keeps every user and every item in the training set.

    On construction the frame is optionally shuffled, and the first row (in frame order)
    of each user and of each item is collected into ``minimal_df``. These rows always go
    to the training set; the rest (``remaining_df``) is what ``train_test_split`` samples from.

    Args:
        df: ratings DataFrame. It must not contain a column named ``"index"``, and
            ``reset_index()`` must produce an ``"index"`` column (i.e. the index is unnamed).
        user_id: name of the user column.
        item_id: name of the item column.
        rating: name of the rating column (stored, not otherwise used by this class).
        shuffle: shuffle the rows with ``sklearn.utils.shuffle`` before picking the minimal rows.
        copy: work on a copy of the (possibly shuffled) frame.
        seed: seed for ``random``, ``numpy.random``, the shuffle and the sampling;
            ``None`` leaves the global generators untouched.

    Attributes:
        n_users, n_items: number of distinct users / items.
        n_ratings: number of rows.
        n_max_ratings: ``n_users * n_items``.
        max_sparsity: ``n_ratings / n_max_ratings``.
        min_sparsity: ``len(minimal_indexes) / n_max_ratings``.
        minimal_indexes: index labels of the rows that must stay in the training set.
    """

    def __init__(self, df, user_id="userID", item_id="itemID", rating="rating", shuffle=True, copy=False, seed=42):
        self.seed = seed
        if seed is not None:
            random.seed(self.seed)
            np.random.seed(seed=self.seed)
        if shuffle:
            self.df = sklearn.utils.shuffle(df, random_state=self.seed)
        else:
            self.df = df
        if copy:
            self.df = self.df.copy()
        self.user_id = user_id
        self.item_id = item_id
        self.rating = rating
        self.n_users, self.n_items = self.df[self.user_id].nunique(), self.df[self.item_id].nunique()
        self.n_max_ratings = self.n_users * self.n_items
        self.n_ratings = self.df.shape[0]
        self.max_sparsity = self.n_ratings / self.n_max_ratings
        self.minimal_indexes = self.__get_minimal_indexes()
        self.min_sparsity = self.minimal_indexes.shape[0] / self.n_max_ratings
        self.minimal_df = self.df.loc[self.minimal_indexes]
        self.remaining_df = self.df[~self.df.index.isin(self.minimal_indexes)]

    def __get_minimal_indexes(self):
        """Return the index labels of the first row of every user and of every item.

        Raises:
            ValueError: if the frame already has an ``"index"`` column, which would clash
                with the column created by ``reset_index()``.
        """
        if "index" in self.df.columns:
            raise ValueError("Dataframe can not contain 'index' column name.")
        df_with_index_as_column = self.df.reset_index()
        first_per_user = df_with_index_as_column.groupby(self.user_id).first()["index"]
        first_per_item = df_with_index_as_column.groupby(self.item_id).first()["index"]
        # A row can be the first one for both its user and its item, hence unique().
        return pd.concat([first_per_user, first_per_item]).unique()

    def train_test_split(self, train_size=0.8):
        """Split the ratings into a training and a test frame.

        Args:
            train_size: fraction of all ratings that should end up in the training set.

        Returns:
            Always a ``(train_df, test_df)`` tuple:

            * if the requested size is not larger than the minimal set, the training
              frame is the minimal set and the test frame holds every other row;
            * if the requested size covers the whole data set, the training frame is
              the full frame and the test frame is empty (same columns);
            * otherwise the minimal set is topped up with randomly sampled rows.
        """
        sparsity = self.max_sparsity * train_size
        if sparsity <= self.min_sparsity:
            return self.minimal_df.copy(), self.remaining_df.copy()
        if sparsity >= self.max_sparsity:
            return self.df, self.df.iloc[0:0]
        num_of_additional_ratings = max(int(train_size * self.n_ratings) - self.minimal_indexes.shape[0], 0)
        # Sample with the instance seed so the split does not depend on whatever else
        # has consumed the global NumPy generator since construction.
        additional_df = self.remaining_df.sample(n=num_of_additional_ratings, random_state=self.seed)
        train_df = pd.concat([self.minimal_df, additional_df])
        test_df = self.df[~self.df.index.isin(train_df.index)]
        return train_df, test_df

In [None]:
# Load MovieLens 100k with explicit column names, then build a random 80/20 split.
df = recommenders.datasets.movielens.load_pandas_df(size='100k', header=('userID', 'itemID', 'rating'))
dataset_generator = RandomDatasetCreator(df=df)
train_df, test_df = dataset_generator.train_test_split(train_size=0.8)

100%|██████████| 4.81k/4.81k [00:00<00:00, 8.96kKB/s]


### Time split
Some of the users or items might not be present in the training set.

In [None]:
# Load MovieLens 100k including timestamps. The rating column is named 'rating' so that the
# saving, Libffm, training and evaluation cells below can read it from this split as well.
df = recommenders.datasets.movielens.load_pandas_df('100k', ('userID', 'itemID', 'rating', 'timestamp'))
random.seed(42)
np.random.seed(seed=42)
# Chronological split: ratio=0.8 of the interactions go to train_df, the rest to test_df.
train_df, test_df = python_chrono_split(df, ratio=0.8, col_user='userID', col_item='itemID', col_timestamp='timestamp')

100%|██████████| 4.81k/4.81k [00:00<00:00, 9.69kKB/s]


### Leave-one-out user
LeaveOneOutUser ensures that every item and user is present at least once in the training set.

In [None]:
class LeaveOneOutUser:
    """Leave-one-out splitter that holds out the latest eligible rating of each user.

    On construction the frame is optionally shuffled and then sorted by timestamp. The first
    row of each user and of each item (in that order) forms ``minimal_df`` and is never
    held out, so every user and item stays represented in the training set.

    Args:
        df: ratings DataFrame. It must not contain a column named ``"index"``, and
            ``reset_index()`` must produce an ``"index"`` column (i.e. the index is unnamed).
        user_id: name of the user column.
        item_id: name of the item column.
        rating: name of the rating column (stored, not otherwise used by this class).
        timestamp: name of the timestamp column used for ordering.
        shuffle: shuffle the rows with ``sklearn.utils.shuffle`` before sorting by timestamp.
        copy: work on a copy of the sorted frame.
        seed: seed for ``random``, ``numpy.random`` and the shuffle;
            ``None`` leaves the global generators untouched.
    """

    def __init__(self, df, user_id="userID", item_id="itemID", rating="rating", timestamp="timestamp", shuffle=True, copy=False, seed=42):
        self.seed = seed
        if seed is not None:
            random.seed(self.seed)
            np.random.seed(seed=self.seed)
        if shuffle:
            self.df = sklearn.utils.shuffle(df, random_state=self.seed)
        else:
            self.df = df
        self.df = self.df.sort_values(timestamp)
        if copy:
            self.df = self.df.copy()
        self.user_id = user_id
        self.item_id = item_id
        self.rating = rating
        self.timestamp = timestamp
        self.n_users, self.n_items = self.df[self.user_id].nunique(), self.df[self.item_id].nunique()
        self.n_max_ratings = self.n_users * self.n_items
        self.n_ratings = self.df.shape[0]
        self.max_sparsity = self.n_ratings / self.n_max_ratings
        self.minimal_indexes = self.__get_minimal_indexes()
        self.min_sparsity = self.minimal_indexes.shape[0] / self.n_max_ratings
        self.minimal_df = self.df.loc[self.minimal_indexes]
        self.remaining_df = self.df[~self.df.index.isin(self.minimal_indexes)]

    def __get_minimal_indexes(self):
        """Return the index labels of the first row of every user and of every item.

        Raises:
            ValueError: if the frame already has an ``"index"`` column, which would clash
                with the column created by ``reset_index()``.
        """
        if "index" in self.df.columns:
            raise ValueError("Dataframe can not contain 'index' column name.")
        df_with_index_as_column = self.df.reset_index()
        first_per_user = df_with_index_as_column.groupby(self.user_id).first()["index"]
        first_per_item = df_with_index_as_column.groupby(self.item_id).first()["index"]
        # A row can be the first one for both its user and its item, hence unique().
        return pd.concat([first_per_user, first_per_item]).unique()

    def train_test_split(self):
        """Hold out, for every user, the latest row that is not part of the minimal set.

        ``tail(1)`` is used instead of ``GroupBy.last()`` because ``last()`` picks the last
        non-null value of each column separately; with missing values it would build a
        test row out of several real rows while only one of them is removed from train.
        ``tail(1)`` returns the actual last row with its index label and column order intact.

        Returns:
            ``(train_df, test_df)``. ``test_df`` has at most one row per user, ordered by
            user id; users whose rows all belong to the minimal set have no test row.
        """
        test_df = (
            self.remaining_df
            .sort_values(self.timestamp)
            .groupby(self.user_id)
            .tail(1)
            .sort_values(self.user_id)
        )
        train_df = self.df[~self.df.index.isin(test_df.index)]
        return train_df, test_df

In [None]:
# Load MovieLens 100k with timestamps and hold out one rating per user.
df = recommenders.datasets.movielens.load_pandas_df(size='100k', header=('userID', 'itemID', 'rating', 'timestamp'))
dataset_generator = LeaveOneOutUser(df=df)
train_df, test_df = dataset_generator.train_test_split()

100%|██████████| 4.81k/4.81k [00:00<00:00, 25.9kKB/s]


### Leave-one-out item
LeaveOneOutItem ensures that every item and user is present at least once in the training set.

In [None]:
class LeaveOneOutItem:
    """Leave-one-out splitter that holds out the latest eligible rating of each item.

    On construction the frame is optionally shuffled and then sorted by timestamp. The first
    row of each user and of each item (in that order) forms ``minimal_df`` and is never
    held out, so every user and item stays represented in the training set.

    Args:
        df: ratings DataFrame. It must not contain a column named ``"index"``, and
            ``reset_index()`` must produce an ``"index"`` column (i.e. the index is unnamed).
        user_id: name of the user column.
        item_id: name of the item column.
        rating: name of the rating column (stored, not otherwise used by this class).
        timestamp: name of the timestamp column used for ordering.
        shuffle: shuffle the rows with ``sklearn.utils.shuffle`` before sorting by timestamp.
        copy: work on a copy of the sorted frame.
        seed: seed for ``random``, ``numpy.random`` and the shuffle;
            ``None`` leaves the global generators untouched.
    """

    def __init__(self, df, user_id="userID", item_id="itemID", rating="rating", timestamp="timestamp", shuffle=True, copy=False, seed=42):
        self.seed = seed
        if seed is not None:
            random.seed(self.seed)
            np.random.seed(seed=self.seed)
        if shuffle:
            self.df = sklearn.utils.shuffle(df, random_state=self.seed)
        else:
            self.df = df
        self.df = self.df.sort_values(timestamp)
        if copy:
            self.df = self.df.copy()
        self.user_id = user_id
        self.item_id = item_id
        self.rating = rating
        self.timestamp = timestamp
        self.n_users, self.n_items = self.df[self.user_id].nunique(), self.df[self.item_id].nunique()
        self.n_max_ratings = self.n_users * self.n_items
        self.n_ratings = self.df.shape[0]
        self.max_sparsity = self.n_ratings / self.n_max_ratings
        self.minimal_indexes = self.__get_minimal_indexes()
        self.min_sparsity = self.minimal_indexes.shape[0] / self.n_max_ratings
        self.minimal_df = self.df.loc[self.minimal_indexes]
        self.remaining_df = self.df[~self.df.index.isin(self.minimal_indexes)]

    def __get_minimal_indexes(self):
        """Return the index labels of the first row of every user and of every item.

        Raises:
            ValueError: if the frame already has an ``"index"`` column, which would clash
                with the column created by ``reset_index()``.
        """
        if "index" in self.df.columns:
            raise ValueError("Dataframe can not contain 'index' column name.")
        df_with_index_as_column = self.df.reset_index()
        first_per_user = df_with_index_as_column.groupby(self.user_id).first()["index"]
        first_per_item = df_with_index_as_column.groupby(self.item_id).first()["index"]
        # A row can be the first one for both its user and its item, hence unique().
        return pd.concat([first_per_user, first_per_item]).unique()

    def train_test_split(self):
        """Hold out, for every item, the latest row that is not part of the minimal set.

        ``tail(1)`` is used instead of ``GroupBy.last()`` for two reasons: ``last()`` picks
        the last non-null value of each column separately (so with missing values a test row
        could mix several real rows while only one is removed from train), and grouping by
        the item column moved it in front of the user column. ``tail(1)`` returns the actual
        last row, keeping the index label and the column order of the input frame.

        Returns:
            ``(train_df, test_df)``. ``test_df`` has at most one row per item, ordered by
            item id; items whose rows all belong to the minimal set have no test row.
        """
        test_df = (
            self.remaining_df
            .sort_values(self.timestamp)
            .groupby(self.item_id)
            .tail(1)
            .sort_values(self.item_id)
        )
        train_df = self.df[~self.df.index.isin(test_df.index)]
        return train_df, test_df

In [None]:
# Load MovieLens 100k with timestamps and hold out one rating per item.
df = recommenders.datasets.movielens.load_pandas_df(size='100k', header=('userID', 'itemID', 'rating', 'timestamp'))
dataset_generator = LeaveOneOutItem(df=df)
train_df, test_df = dataset_generator.train_test_split()

100%|██████████| 4.81k/4.81k [00:00<00:00, 21.8kKB/s]


### Saving files

In [None]:
# Output locations of the two splits; the evaluation cells read the files back from here.
TRAIN_FILE_PATH = "./train.csv"
TEST_FILE_PATH = "./test.csv"

# Write each split ordered by user id, without the DataFrame index column.
for split_df, csv_path in ((train_df, TRAIN_FILE_PATH), (test_df, TEST_FILE_PATH)):
    split_df.sort_values("userID").to_csv(csv_path, index=False)

### Libffm format
If your model requires input in Libffm format you can use [recommenders LibffmConverter](https://microsoft-recommenders.readthedocs.io/en/latest/datasets.html#recommenders.datasets.pandas_df_utils.LibffmConverter):

In [None]:
from recommenders.datasets.pandas_df_utils import LibffmConverter

# Fit on the training split with 'rating' as the label column, then convert that split.
converter = LibffmConverter().fit(train_df, col_rating='rating')
limffm_train_df = converter.transform(train_df)

# The converter is fitted again on the test split before converting it.
converter = converter.fit(test_df, col_rating='rating')
limffm_test_df = converter.transform(test_df)
limffm_test_df

## Model training

This is the part that has to be written specifically for each model.

In [None]:
from surprise import SVD, SVDpp, NMF, Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
import pandas as pd
import os

In [None]:
# Surprise needs a Reader describing the rating scale and frames with exactly
# the (user, item, rating) columns, in that order.
reader = Reader(rating_scale=(1, 5))
surprise_columns = ['userID', 'itemID', 'rating']
train_data = Dataset.load_from_df(train_df[surprise_columns], reader)
test_data = Dataset.load_from_df(test_df[surprise_columns], reader)

In [None]:
# Train Surprise's SVD with the library's default hyperparameters on the whole training split.
# To try a hand-tuned configuration instead, construct the model as e.g.
# SVD(lr_all=.001, reg_all=0.005, n_epochs=20, n_factors=15, verbose=True).
svd = SVD()
svd.fit(train_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3a03844490>

In [None]:
# Train SVD++ on the whole training split; verbose=True prints one line per epoch.
svdpp = SVDpp(n_factors=15, n_epochs=20, lr_all=.001, reg_all=0.005, verbose=True)
full_trainset = train_data.build_full_trainset()
svdpp.fit(full_trainset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing epoch 47
 p

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f2c85af6990>

In [None]:
import numpy as np

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.datasets import fetch_movielens

In [None]:
# Build LightFM's internal id mappings from the users and items of the training split.
data = Dataset()
data.fit(train_df.userID.unique(), train_df.itemID.unique())
user_id_mapping, _, item_id_mapping, _ = data.mapping()

# Select the (user, item, weight) columns by name: this works for every split above
# (including those without a 'timestamp' column) and does not depend on column order.
interactions = list(train_df[['userID', 'itemID', 'rating']].itertuples(index=False, name=None))
interactions_matrix, weights_matrix = data.build_interactions(interactions)

# Fixed random_state so that repeated runs train the same model.
lightfm = LightFM(loss='warp', random_state=42)
lightfm.fit(interactions_matrix, sample_weight=weights_matrix)

<lightfm.lightfm.LightFM at 0x7f3a09239a90>

## Evaluation

In [None]:
# Model wrapper for custom model

class ModelWrapper:
    """Adapter exposing a uniform ``predict(users, items)`` interface for the evaluation cells.

    This version targets LightFM and relies on the module-level ``user_id_mapping`` and
    ``item_id_mapping`` dictionaries (raw id -> LightFM internal index).
    """

    def __init__(self, model):
        self.model = model

    def predict(self, users, items):
        """Score ``(users[k], items[k])`` pairs.

        Args:
            users: sequence of raw user ids, one per pair.
            items: sequence of raw item ids, one per pair (same length as ``users``).

        Returns:
            Whatever ``self.model.predict`` returns for the mapped index arrays; an empty
            float array when there is nothing to score.

        Raises:
            KeyError: if a user or item id is missing from the id mappings.
        """
        # Place to implement creating predictions with your model

        # Surprise models:
        # return [self.model.predict(str(user), str(item)).est for user, item in zip(users, items)]
        # NOTE(review): the raw ids passed to Surprise presumably have to match the dtype used
        # when the training Dataset was built — confirm whether the str() casts are wanted.

        # LightFM
        if len(items) == 0:
            # Nothing to score; also avoids indexing into an empty user list.
            return np.array([], dtype=np.float32)
        # Map every pair individually so that mixed-user batches are scored for the right user.
        user_indexes = np.array([user_id_mapping[int(user)] for user in users], dtype=np.int32)
        item_indexes = np.array([item_id_mapping[item] for item in items], dtype=np.int32)
        return self.model.predict(user_indexes, item_indexes)

In [None]:
# Example implementation

class ModelWrapper:
    """Example wrapper for a custom model exposing ``eval()`` and ``train_step()``.

    NOTE(review): this class reuses the name ``ModelWrapper``. Running this cell rebinds
    the name, so any ``ModelWrapper`` defined earlier in the session is replaced by this one.
    """

    def __init__(self, model):
        # Keep a reference to the model that will produce the predictions.
        self.model = model

    def predict(self, users, items):
        """Return one prediction per ``(user, item)`` pair taken from ``zip(users, items)``."""
        # presumably a PyTorch-style module switched to inference mode here — TODO confirm
        self.model.eval()
        # `n_users` is not defined in this class; it must exist as a global when this runs.
        # The item id is offset by `n_users` before being passed to the model, and the first
        # element of the resulting array is taken as the prediction for the pair.
        return [self.model.train_step(user, n_users+item).cpu().detach().numpy()[0] for user, item in zip(users, items)]

In [None]:
# Wrap the trained LightFM model; this uses whichever ModelWrapper class was defined last.
modelwrapper = ModelWrapper(lightfm)

In [None]:
# Reload both splits from the CSV files written in the "Saving files" section.
train = pd.read_csv(TRAIN_FILE_PATH)
test = pd.read_csv(TEST_FILE_PATH)

In [None]:
# Candidate items are the distinct items that occur in the test split.
all_items = list(test.itemID.unique())
# (itemID, userID) pairs already present in the training split; these are never scored.
train_item_user_pairs = {tuple(arr) for arr in train[["itemID", "userID"]].values}

users, items, preds = [], [], []
for user in test.userID.unique():
    # Keep only the candidates this user has not interacted with in training.
    item_arr = []
    for candidate in all_items:
        if (candidate, user) in train_item_user_pairs:
            continue
        item_arr.append(candidate)
    user_arr = [user] * len(item_arr)
    users += user_arr
    items += item_arr
    preds += list(modelwrapper.predict(user_arr, item_arr))

# Long-format frame with one row per scored (user, item) pair.
all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})
all_predictions

Unnamed: 0,userID,itemID,prediction
0,1,74,-3.064796
1,1,281,-0.692533
2,1,317,-1.029688
3,1,457,-2.835075
4,1,341,-3.283792
...,...,...,...
435087,943,458,-1.407171
435088,943,246,-1.386418
435089,943,863,-2.491826
435090,943,788,-3.056770


In [None]:
# Ranking metrics to report, keyed by the prefix used in the result names.
metric_functions = {
    "map": map_at_k,
    "ndcg": ndcg_at_k,
    "precision": precision_at_k,
    "recall": recall_at_k,
}

# Evaluate every metric at k=5 and then at k=20; keys look like "map@5", "recall@20".
metrics = {
    f"{metric_name}@{k}": metric_fn(test, all_predictions, col_prediction='prediction', k=k)
    for k in (5, 20)
    for metric_name, metric_fn in metric_functions.items()
}

In [None]:
# Persist the metrics as pretty-printed UTF-8 JSON next to the notebook.
with open('metrics.json', mode='w', encoding='utf-8') as metrics_file:
    metrics_file.write(json.dumps(metrics, ensure_ascii=False, indent=4))

In [None]:
# Display the computed metrics as the cell output.
metrics

{'map@20': 0.040046829953187375,
 'map@5': 0.030858960763520675,
 'ndcg@20': 0.06544530590240354,
 'ndcg@5': 0.03774510732147239,
 'precision@20': 0.007900318133616116,
 'precision@5': 0.011664899257688223,
 'recall@20': 0.15800636267232238,
 'recall@5': 0.05832449628844114}