In [None]:
# Install the recommenders package (plus its "examples" extra) and tf_slim; --quiet trims pip's output.
!pip install recommenders recommenders[examples] tf_slim --quiet

[?25l[K     |█                               | 10 kB 25.7 MB/s eta 0:00:01[K     |██                              | 20 kB 32.0 MB/s eta 0:00:01[K     |███                             | 30 kB 36.0 MB/s eta 0:00:01[K     |████                            | 40 kB 40.4 MB/s eta 0:00:01[K     |████▉                           | 51 kB 28.9 MB/s eta 0:00:01[K     |█████▉                          | 61 kB 32.5 MB/s eta 0:00:01[K     |██████▉                         | 71 kB 30.5 MB/s eta 0:00:01[K     |███████▉                        | 81 kB 31.3 MB/s eta 0:00:01[K     |████████▉                       | 92 kB 33.2 MB/s eta 0:00:01[K     |█████████▊                      | 102 kB 35.4 MB/s eta 0:00:01[K     |██████████▊                     | 112 kB 35.4 MB/s eta 0:00:01[K     |███████████▊                    | 122 kB 35.4 MB/s eta 0:00:01[K     |████████████▊                   | 133 kB 35.4 MB/s eta 0:00:01[K     |█████████████▊                  | 143 kB 35.4 MB/s eta 0:

In [None]:
import recommenders
import recommenders.datasets.movielens
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.utils.timer import Timer

from tqdm import tqdm
import sklearn.utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Random split
RandomDatasetCreator ensures that every item and user is present at least once in the training set.

In [None]:
class RandomDatasetCreator:
    """Random train/test splitter that keeps every user and every item in the training set.

    A "minimal" set of rows (the first row seen for each user and for each item)
    is always assigned to the training split; the rest of the training split is
    drawn at random from the remaining rows.

    Note: the ``*_sparsity`` attributes hold the *filled* fraction of the
    user x item matrix (``n_ratings / (n_users * n_items)``).

    Args:
        df: Interactions DataFrame. Its index labels are used to tell rows apart,
            and it must not contain a column named ``"index"``.
        user_id: Name of the user id column.
        item_id: Name of the item id column.
        rating: Name of the rating column (stored, not used by the split itself).
        shuffle: Shuffle the rows (with ``random_state=seed``) before picking the
            minimal set.
        copy: Work on a copy of the (possibly shuffled) frame.
        seed: Seed for the shuffle. When not ``None`` it also seeds the global
            ``random`` and ``numpy.random`` generators.

    Raises:
        ValueError: If ``df`` already has a column named ``"index"``.
    """

    def __init__(self, df, user_id="userID", item_id="itemID", rating="rating", shuffle=True, copy=False, seed=42):
        self.seed = seed
        if seed is not None:
            # Global seeding: DataFrame.sample() in train_test_split draws from numpy's global RNG.
            random.seed(self.seed)
            np.random.seed(seed=self.seed)
        if shuffle:
            self.df = sklearn.utils.shuffle(df, random_state=self.seed)
        else:
            self.df = df
        if copy:
            self.df = self.df.copy()
        self.user_id = user_id
        self.item_id = item_id
        self.rating = rating
        self.n_users, self.n_items = self.df[self.user_id].nunique(), self.df[self.item_id].nunique()
        self.n_max_ratings = self.n_users * self.n_items
        self.n_ratings = self.df.shape[0]
        # Filled fraction of the user x item matrix when all ratings are used.
        self.max_sparsity = self.n_ratings / self.n_max_ratings
        self.minimal_indexes = self.__get_minimal_indexes()
        # Filled fraction when only the mandatory (minimal) rows are used.
        self.min_sparsity = self.minimal_indexes.shape[0] / self.n_max_ratings
        self.minimal_df = self.df.loc[self.minimal_indexes]
        self.remaining_df = self.df[~self.df.index.isin(self.minimal_indexes)]

    def __get_minimal_indexes(self):
        """Return the index labels of the first row of every user and of every item."""
        if "index" in self.df.columns:
            # reset_index() below materialises the index as a column called "index".
            raise ValueError("Dataframe can not contain 'index' column name.")
        df_with_index_as_column = self.df.reset_index()
        first_per_user = df_with_index_as_column.groupby(self.user_id).first()["index"]
        first_per_item = df_with_index_as_column.groupby(self.item_id).first()["index"]
        # A row can be "first" for both its user and its item; unique() drops the repeat.
        return pd.concat([first_per_user, first_per_item]).unique()

    def train_test_split(self, train_size=0.8):
        """Split the ratings into a training and a test DataFrame.

        Args:
            train_size: Target fraction of all ratings to put in the training split.

        Returns:
            ``(train_df, test_df)`` -- always a 2-tuple.
            If the target is not larger than the minimal set, the training split is
            exactly the minimal set and the test split is everything else.
            If the target covers all ratings, the test split is empty.
        """
        sparsity = self.max_sparsity * train_size
        if sparsity <= self.min_sparsity:
            # Cannot go below the minimal set without losing a user or an item.
            return self.minimal_df.copy(), self.remaining_df.copy()
        if sparsity >= self.max_sparsity:
            # Everything goes to training; keep the columns on the empty test frame.
            return self.df, self.df.iloc[0:0]
        num_of_additional_ratings = max(int(train_size * self.n_ratings) - self.minimal_indexes.shape[0], 0)
        additional_df = self.remaining_df.sample(n=num_of_additional_ratings)
        train_df = pd.concat([self.minimal_df, additional_df])
        test_df = self.df[~self.df.index.isin(train_df.index)]
        return train_df, test_df

In [None]:
# Load MovieLens 100k with the three columns the random split needs.
df = recommenders.datasets.movielens.load_pandas_df(
    size='100k',
    header=('userID', 'itemID', 'rating'),
)
# Build the splitter with its default options and send 80% of the ratings to training.
dataset_generator = RandomDatasetCreator(df=df)
train_df, test_df = dataset_generator.train_test_split(train_size=0.8)

INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|██████████| 4.81k/4.81k [00:00<00:00, 8.49kKB/s]


# Time split
Some of the users or items might not be present in the training set.

In [None]:
# Load MovieLens 100k including timestamps. The rating column is named 'rating'
# (not 'ratings') so that it matches the other split cells and the
# col_rating='rating' argument used in the Libffm cell.
df = recommenders.datasets.movielens.load_pandas_df('100k', ('userID', 'itemID', 'rating', 'timestamp'))
random.seed(42)
np.random.seed(seed=42)
# Chronological split with an 80% training ratio.
train_df, test_df = python_chrono_split(df, ratio=0.8, col_user='userID', col_item='itemID', col_timestamp='timestamp')

INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|██████████| 4.81k/4.81k [00:00<00:00, 13.3kKB/s]


# Leave-one-out user
LeaveOneOutUser ensures that every item and user is present at least once in the training set.

In [None]:
class LeaveOneOutUser:
    """Leave-one-out splitter that holds out the latest interaction of each user.

    The earliest row of every user and of every item (the "minimal" set) is
    always kept in the training split, so every user and item appears there at
    least once. The test split is taken only from the remaining rows.

    Note: the ``*_sparsity`` attributes hold the *filled* fraction of the
    user x item matrix (``n_ratings / (n_users * n_items)``).

    Args:
        df: Interactions DataFrame. Its index labels are used to tell rows apart,
            and it must not contain a column named ``"index"``.
        user_id: Name of the user id column.
        item_id: Name of the item id column.
        rating: Name of the rating column (stored, not used by the split itself).
        timestamp: Name of the column the rows are ordered by.
        shuffle: Shuffle the rows (with ``random_state=seed``) before the
            timestamp sort.
        copy: Work on a copy of the sorted frame.
        seed: Seed for the shuffle. When not ``None`` it also seeds the global
            ``random`` and ``numpy.random`` generators.

    Raises:
        ValueError: If ``df`` already has a column named ``"index"``.
    """

    def __init__(self, df, user_id="userID", item_id="itemID", rating="rating", timestamp="timestamp", shuffle=True, copy=False, seed=42):
        self.seed = seed
        if seed is not None:
            random.seed(self.seed)
            np.random.seed(seed=self.seed)
        if shuffle:
            self.df = sklearn.utils.shuffle(df, random_state=self.seed)
        else:
            self.df = df
        # Oldest rows first, so "first" below means "earliest".
        self.df = self.df.sort_values(timestamp)
        if copy:
            self.df = self.df.copy()
        self.user_id = user_id
        self.item_id = item_id
        self.rating = rating
        self.timestamp = timestamp
        self.n_users, self.n_items = self.df[self.user_id].nunique(), self.df[self.item_id].nunique()
        self.n_max_ratings = self.n_users * self.n_items
        self.n_ratings = self.df.shape[0]
        self.max_sparsity = self.n_ratings / self.n_max_ratings
        self.minimal_indexes = self.__get_minimal_indexes()
        self.min_sparsity = self.minimal_indexes.shape[0] / self.n_max_ratings
        self.minimal_df = self.df.loc[self.minimal_indexes]
        self.remaining_df = self.df[~self.df.index.isin(self.minimal_indexes)]

    def __get_minimal_indexes(self):
        """Return the index labels of the first row of every user and of every item."""
        if "index" in self.df.columns:
            # reset_index() below materialises the index as a column called "index".
            raise ValueError("Dataframe can not contain 'index' column name.")
        df_with_index_as_column = self.df.reset_index()
        first_per_user = df_with_index_as_column.groupby(self.user_id).first()["index"]
        first_per_item = df_with_index_as_column.groupby(self.item_id).first()["index"]
        # A row can be "first" for both its user and its item; unique() drops the repeat.
        return pd.concat([first_per_user, first_per_item]).unique()

    def train_test_split(self):
        """Hold out the latest non-minimal row of each user.

        Whole rows are selected with ``drop_duplicates(keep="last")``. Unlike
        ``groupby().last()``, which takes the last non-null value of each column
        separately, this never mixes values from different rows, and it keeps
        the original column order and index labels.

        Returns:
            ``(train_df, test_df)``. ``test_df`` has at most one row per user,
            ordered by user id; users with no row outside the minimal set have
            no test row. ``train_df`` is every other row.
        """
        candidates = self.remaining_df.sort_values(self.timestamp)
        test_df = candidates.drop_duplicates(subset=self.user_id, keep="last").sort_values(self.user_id)
        train_df = self.df[~self.df.index.isin(test_df.index)]
        return train_df, test_df

In [None]:
# Load MovieLens 100k with timestamps, which the leave-one-out split orders by.
df = recommenders.datasets.movielens.load_pandas_df(
    size='100k',
    header=('userID', 'itemID', 'rating', 'timestamp'),
)
# Splitter with default options; the split itself takes no arguments.
dataset_generator = LeaveOneOutUser(df=df)
train_df, test_df = dataset_generator.train_test_split()

INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|██████████| 4.81k/4.81k [00:00<00:00, 18.9kKB/s]


# Leave-one-out item
LeaveOneOutItem ensures that every item and user is present at least once in the training set.

In [None]:
class LeaveOneOutItem:
    """Leave-one-out splitter that holds out the latest interaction of each item.

    The earliest row of every user and of every item (the "minimal" set) is
    always kept in the training split, so every user and item appears there at
    least once. The test split is taken only from the remaining rows.

    Note: the ``*_sparsity`` attributes hold the *filled* fraction of the
    user x item matrix (``n_ratings / (n_users * n_items)``).

    Args:
        df: Interactions DataFrame. Its index labels are used to tell rows apart,
            and it must not contain a column named ``"index"``.
        user_id: Name of the user id column.
        item_id: Name of the item id column.
        rating: Name of the rating column (stored, not used by the split itself).
        timestamp: Name of the column the rows are ordered by.
        shuffle: Shuffle the rows (with ``random_state=seed``) before the
            timestamp sort.
        copy: Work on a copy of the sorted frame.
        seed: Seed for the shuffle. When not ``None`` it also seeds the global
            ``random`` and ``numpy.random`` generators.

    Raises:
        ValueError: If ``df`` already has a column named ``"index"``.
    """

    def __init__(self, df, user_id="userID", item_id="itemID", rating="rating", timestamp="timestamp", shuffle=True, copy=False, seed=42):
        self.seed = seed
        if seed is not None:
            random.seed(self.seed)
            np.random.seed(seed=self.seed)
        if shuffle:
            self.df = sklearn.utils.shuffle(df, random_state=self.seed)
        else:
            self.df = df
        # Oldest rows first, so "first" below means "earliest".
        self.df = self.df.sort_values(timestamp)
        if copy:
            self.df = self.df.copy()
        self.user_id = user_id
        self.item_id = item_id
        self.rating = rating
        self.timestamp = timestamp
        self.n_users, self.n_items = self.df[self.user_id].nunique(), self.df[self.item_id].nunique()
        self.n_max_ratings = self.n_users * self.n_items
        self.n_ratings = self.df.shape[0]
        self.max_sparsity = self.n_ratings / self.n_max_ratings
        self.minimal_indexes = self.__get_minimal_indexes()
        self.min_sparsity = self.minimal_indexes.shape[0] / self.n_max_ratings
        self.minimal_df = self.df.loc[self.minimal_indexes]
        self.remaining_df = self.df[~self.df.index.isin(self.minimal_indexes)]

    def __get_minimal_indexes(self):
        """Return the index labels of the first row of every user and of every item."""
        if "index" in self.df.columns:
            # reset_index() below materialises the index as a column called "index".
            raise ValueError("Dataframe can not contain 'index' column name.")
        df_with_index_as_column = self.df.reset_index()
        first_per_user = df_with_index_as_column.groupby(self.user_id).first()["index"]
        first_per_item = df_with_index_as_column.groupby(self.item_id).first()["index"]
        # A row can be "first" for both its user and its item; unique() drops the repeat.
        return pd.concat([first_per_user, first_per_item]).unique()

    def train_test_split(self):
        """Hold out the latest non-minimal row of each item.

        Whole rows are selected with ``drop_duplicates(keep="last")``. Unlike
        ``groupby().last()``, which takes the last non-null value of each column
        separately and moves the group key to the first column, this never mixes
        values from different rows and keeps the same column order as the
        training split.

        Returns:
            ``(train_df, test_df)``. ``test_df`` has at most one row per item,
            ordered by item id; items with no row outside the minimal set have
            no test row. ``train_df`` is every other row.
        """
        candidates = self.remaining_df.sort_values(self.timestamp)
        test_df = candidates.drop_duplicates(subset=self.item_id, keep="last").sort_values(self.item_id)
        train_df = self.df[~self.df.index.isin(test_df.index)]
        return train_df, test_df

In [None]:
# Load MovieLens 100k with timestamps, which the leave-one-out split orders by.
df = recommenders.datasets.movielens.load_pandas_df(
    size='100k',
    header=('userID', 'itemID', 'rating', 'timestamp'),
)
# Splitter with default options; the split itself takes no arguments.
dataset_generator = LeaveOneOutItem(df=df)
train_df, test_df = dataset_generator.train_test_split()

INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|██████████| 4.81k/4.81k [00:00<00:00, 21.2kKB/s]


# Saving dataset to file
To save the dataset to files, use the code below:

In [None]:
# Output locations, relative to the notebook's working directory.
TRAIN_FILE_PATH = "./train.csv"
TEST_FILE_PATH = "./test.csv"

# Write each split ordered by user id; the DataFrame index is left out of the files.
train_df.sort_values(by="userID").to_csv(path_or_buf=TRAIN_FILE_PATH, index=False)
test_df.sort_values(by="userID").to_csv(path_or_buf=TEST_FILE_PATH, index=False)

# Libffm format
If your model requires input in Libffm format you can use [recommenders LibffmConverter](https://microsoft-recommenders.readthedocs.io/en/latest/datasets.html#recommenders.datasets.pandas_df_utils.LibffmConverter):

In [None]:
from recommenders.datasets.pandas_df_utils import LibffmConverter

# Fit the converter once, on the training split only.
converter = LibffmConverter()
converter = converter.fit(train_df, col_rating='rating')
limffm_train_df = converter.transform(train_df)

# Reuse the already-fitted converter for the test split instead of re-fitting it
# on test data, so both splits are encoded against the schema learned from train.
# NOTE(review): assumes test_df has the same columns and dtypes as train_df -- confirm.
limffm_test_df = converter.transform(test_df)
limffm_test_df