In [1]:
import numpy as np
import pytest
import torch
from torch import nn

from typing import Callable, List, Optional, Union

### Implementing MLP network


In [2]:
class MLP(nn.Module):
    """A multi-layer perceptron module.
    This module is a sequence of linear layers plus activation functions.
    The user can optionally add normalization and/or dropout to each of the layers.
    
    Code used from https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
    
    :param in_dim: Input dimension.
    :type in_dim: int
    :param out_dim: Output dimension.
    :type out_dim: int
    :param hidden_dims: Output dimension for each hidden layer.
    :type hidden_dims: Optional[Union[int, List[int]]] 
    :param dropout: Probability for dropout layers between each hidden layer.
    :type dropout: float
    :param activation: Which activation function to use. 
        Supports module type or partial.
    :type activation: Callable[..., nn.Module]
    """
    def __init__(
        self, 
        in_dim: int, 
        out_dim: int,
        hidden_dims: Optional[Union[int, List[int]]] = None,
        dropout: float = 0.5,
        activation: Callable[..., nn.Module] = nn.ReLU,
    ):
        super().__init__()

        layers = nn.ModuleList()

        if hidden_dims is None:
            hidden_dims = []

        if isinstance(hidden_dims, int):
            hidden_dims = [hidden_dims]

        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(activation())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, out_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

In [3]:
EMBEDDING_SIZE = 3
N_USERS = 5
N_ITEMS = 10
a = MLP(2*EMBEDDING_SIZE, N_ITEMS, [6, 4, 2], dropout=0)

## Implementing the Neural Architecture



In [4]:
class NMFModule(nn.Module):
    """Model that encodes the Neural Matrix Factorization Network.
    
    Implements the 3 tiered network defined in the He et al. paper.

    :param predictive_powers: size of the last hidden layer in MLP.
        Embedding sizes computed as 2 * predictive powers.
    :type predictive_powers: int
    :param n_users: number of users in the network
    :type n_users: int
    :param n_items: number of items in the network
    :type n_items: int
    :param hidden_dims: dimensions of the MLP hidden layers.
    :type hidden_dims: Union[int, List[int]]
    :param dropout: Dropout chance between layers of the MLP
    :type dropout: float
    """
    def __init__(
        self, predictive_powers: int, n_users: int, n_items: int, dropout: float
    ):
        super().__init__()
        num_components = 2 * predictive_powers
        
        self.user_embedding = nn.Embedding(n_users, num_components)
        self.item_embedding = nn.Embedding(n_items, num_components)

        # we use a three tiered MLP as described in the experiments of the paper.
        hidden_dims = [
            4 * predictive_powers, 
            2 * predictive_powers, 
            predictive_powers
        ]

        # Output is always 1, since we need a single score for u,i
        self.mlp = MLP(4 * predictive_powers, 1, 
                       hidden_dims, dropout=dropout)

        self.final = nn.Sigmoid()

        # weight initialization
        self.user_embedding.weight.data.normal_(0, 
            1.0 / self.user_embedding.embedding_dim)
        self.item_embedding.weight.data.normal_(0, 
            1.0 / self.item_embedding.embedding_dim)
        
    def forward(self, users: torch.LongTensor, items: torch.LongTensor) -> torch.FloatTensor:
        """Predict scores for the user item pairs obtained 
        by zipping together the two inputs

        :param users: 1D tensor with user ids
        :type users: torch.LongTensor
        :param items: 1D tensor with item ids
        :type items: torch.LongTensor
        :return: 1D tensor with predicted similarities.
            Position i is the similarity between 
            `users[i]` and `items[i]`
        :rtype: torch.FloatTensor
        """

        # Embedding lookups
        user_emb = self.user_embedding(users)
        item_emb = self.item_embedding(items)

        # Pass concatenated through MLP and apply sigmoid
        return self.final(
            self.mlp(
                torch.hstack([user_emb, item_emb])
            )
        )

In [7]:
def test_output_shapes_NMF(
    predictive_factors, num_users, num_items
):
    """Check that no mather the inner settings of the network, the output is always correct"""
    mod = NMFModule(predictive_factors, num_users, num_items, 0.0)
    
    user_tensor = torch.LongTensor([1, 2])
    item_tensor = torch.LongTensor([1, 2])
    
    res = mod(user_tensor, item_tensor) # predict scores for items given the users
    
    assert res.shape == (2, 1)

    assert (res.detach().numpy() <= 1).all()
    assert (res.detach().numpy() >= 0).all()


test_output_shapes_NMF(5, 10, 10)
test_output_shapes_NMF(5, 3, 10)
test_output_shapes_NMF(1, 3, 3)



In [8]:
from typing import List, Union, Optional

import pandas as pd
from recpack.algorithms.base import TorchMLAlgorithm
from recpack.algorithms.samplers import PositiveNegativeSampler
from recpack.algorithms.util import get_users
from recpack.matrix import InteractionMatrix
from scipy.sparse import csr_matrix, lil_matrix


## Implementing the algorithm

In [9]:
class NeuMF(TorchMLAlgorithm):
    """Implementation of Neural Matrix Factoration.

    Neural Matrix Factorization based on MLP architecture
    as presented in Figure 2 in He, Xiangnan, et al. 
    "Neural collaborative filtering."
    In Proceedings of the 26th international conference on world wide web. 2017.

    Represents the users and items using an embedding, 
    similarity between the two is modelled using a neural network.

    The network consists of an embedding for both users and items.
    To compute similarity those two embeddings are 
    concatenated and passed through the MLP
    Finally the similarity is transformed to the [0,1] domain
    using a sigmoid function.

    As in the paper, the sum of square errors is used as loss function.
    Positive items should get a prediction close to 1, 
    while sampled negatives should get a value close to 0.

    The MLP has 3 layers, as suggested in the experiments section.
    Bottom layer has dimension `4 * predictive_powers`, 
    middle layer `2 * predictive_powers`
    and the top layer has `predictive_powers`.

    :param predictive_powers: Size of the last hidden layer in the MLP network.
        Embedding size is 2 * predictive_powers
    :type predictive_powers: int
    :param batch_size: How many samples to use in each update step.
        Higher batch sizes make each epoch more efficient,
        but increases the amount of epochs needed to converge to the optimum,
        by reducing the amount of updates per epoch.
        Defaults to 512.
    :type batch_size: Optional[int]
    :param max_epochs: The max number of epochs to train.
        If the stopping criterion uses early stopping, less epochs could be used.
        Defaults to 10.
    :type max_epochs: Optional[int]
    :param learning_rate: How much to update the weights at each update. Defaults to 0.01
    :type learning_rate: Optional[float]
    :param stopping_criterion: Name of the stopping criterion to use for training.
        For available values,
        check :meth:`recpack.algorithms.stopping_criterion.StoppingCriterion.FUNCTIONS`
        Defaults to 'ndcg'
    :type stopping_criterion: Optional[str]
    :param stop_early: If True, early stopping is enabled,
        and after ``max_iter_no_change`` iterations where improvement of loss function
        is below ``min_improvement`` the optimisation is stopped,
        even if max_epochs is not reached.
        Defaults to False
    :type stop_early: bool, optional
    :param max_iter_no_change: If early stopping is enabled,
        stop after this amount of iterations without change.
        Defaults to 5
    :type max_iter_no_change: int, optional
    :param min_improvement: If early stopping is enabled, no change is detected,
        if the improvement is below this value.
        Defaults to 0.01
    :type min_improvement: float, optional
    :param seed: Seed to the randomizers, useful for reproducible results,
        defaults to None
    :type seed: int, optional
    :param save_best_to_file: If true, the best model will be saved after training,
        defaults to False
    :type save_best_to_file: bool, optional
    :param keep_last: Retain last model, rather than best
        (according to stopping criterion value on validation data), defaults to False
    :type keep_last: bool, optional
    :param predict_topK: The topK recommendations to keep per row in the matrix.
        Use when the user x item output matrix would become too large for RAM.
        Defaults to None, which results in no filtering.
    :type predict_topK: int, optional
    :param n_negatives_per_positive: Amount of negatives to sample for each positive example, defaults to 1
    :type n_negatives_per_positive: int, optional
    :param dropout: Dropout parameter used in MLP, defaults to 0.0
    :type dropout: float, optional
    :param exact_sampling: Enable or disable exact checks while sampling. 
        With exact sampling the sampled negatives are guaranteed to not have been visited by the user. 
        Non exact sampling assumes that the space for item selection is large enough, 
        such that most items are likely not seen before.
        Defaults to False,
    :type exact_sampling: bool, optional
    """
    def __init__(
        self,
        predictive_factors: int,
        batch_size: Optional[int] = 512,
        max_epochs: Optional[int] = 10,
        learning_rate: Optional[float] = 0.01,
        stopping_criterion: Optional[str] = "ndcg",
        stop_early: Optional[bool] = False,
        max_iter_no_change: Optional[int] = 5,
        min_improvement: Optional[float] = 0.0,
        seed: Optional[int] = None,
        save_best_to_file: Optional[bool] = False,
        keep_last: Optional[bool] = False,
        predict_topK: Optional[int] = None,
        n_negatives_per_positive: Optional[int] = 1,
        exact_sampling: Optional[bool] = False,
        dropout: Optional[float] = 0.0,
    ):
        print(batch_size, max_epochs, learning_rate, stopping_criterion)
        super().__init__(batch_size, max_epochs, learning_rate,
            stopping_criterion, stop_early, max_iter_no_change,
            min_improvement, seed, save_best_to_file, keep_last,
            predict_topK,
        )

        self.predictive_factors = predictive_factors

        self.n_negatives_per_positive = n_negatives_per_positive
        self.dropout = dropout
        self.exact_sampling = exact_sampling

        self.sampler = PositiveNegativeSampler(
            U=self.n_negatives_per_positive, replace=False, exact=exact_sampling, 
            batch_size=self.batch_size
        )

    def _init_model(self, X: csr_matrix):
        num_users, num_items = X.shape
        self.model_ = NMFModule(
            self.predictive_factors, num_users, num_items, self.dropout
        ).to(self.device)

        self.optimizer = torch.optim.Adam(
            self.model_.parameters(), lr=self.learning_rate
        )
        
    def _train_epoch(self, X: csr_matrix) -> List[int]:
        losses = []
        for users, positives, negatives in self.sampler.sample(X):

            self.optimizer.zero_grad()

            # Predict for the positives
            positive_scores = self.model_(
                users.to(self.device), positives.to(self.device))
            # Predict for the negatives
            negative_scores = self.model_(
                *self._construct_negative_prediction_input(
                    users.to(self.device), negatives.to(self.device))
            )

            loss = self._compute_loss(
                positive_scores, negative_scores)

            # Backwards propagation of the loss
            loss.backward()
            self.optimizer.step()

            losses.append(loss.item())

        return losses

    def _compute_loss(
        self, positive_scores: torch.FloatTensor, negative_scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        """Compute the Square Error loss given recommendations 
        for positive items, and sampled negatives.
        """

        mse = nn.MSELoss(reduction="sum")
        return mse(positive_scores, torch.ones_like(positive_scores, dtype=torch.float)) + mse(
            negative_scores, torch.zeros_like(negative_scores, dtype=torch.float)
        )

    def _construct_negative_prediction_input(self, users, negatives):
        """Construct the prediction input given a 1D user tensor and a 2D negatives tensor.
        
        Since negatives has shape |batch| x U, and users is a 1d vector,
        these need to be turned into two 1D vectors of shape |batch| * U

        First the users as a row are stacked U times and transposed,
        so that this is also a batch x U tensor.
        Then both are reshaped to remove the 2nd dimension, 
        resulting in a single long 1d vector.
        """
        return (
            users.repeat(self.n_negatives_per_positive, 1).T.reshape(-1), 
            negatives.reshape(-1)
        )
    
    def _batch_predict(
        self, X: csr_matrix, users: List[int]
    ) -> csr_matrix:
        """Generate recommendations for each of the users."""

        X_pred = lil_matrix(X.shape)
        if users is None:
            users = get_users(X)

        _, n_items = X.shape
        n_users = len(users)

        # Create tensors such that each user, item pair gets a score.
        # The user tensor contains the users in order 
        # (eg. [1, 1, 2, 2]), 
        # item indices are repeated (eg. [0, 1, 2, 0, 1, 2]).
        user_tensor = torch.LongTensor(users).repeat(
            n_items, 1).T.reshape(-1).to(self.device)
        item_tensor = torch.arange(n_items).repeat(
            n_users).to(self.device)

        X_pred[users] = self.model_(
            user_tensor, item_tensor
        ).detach().cpu().numpy().reshape(n_users, n_items)
        return X_pred.tocsr()

In [15]:
TIMESTAMP_IX = 'ts'
ITEM_IX = 'iid'
USER_IX = 'uid'

data = {
    TIMESTAMP_IX: [3, 2, 1, 4, 0, 1, 2, 4, 0, 1, 2],
    ITEM_IX: [0, 1, 2, 3, 0, 1, 2, 4, 0, 1, 2],
    USER_IX: [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5],
}
df = pd.DataFrame.from_dict(data)

mat = InteractionMatrix(df, ITEM_IX, USER_IX, timestamp_ix=TIMESTAMP_IX)


In [None]:
def test_negative_input_construction(users, negatives, U):
    
    a = NeuMF(
        predictive_factors=8, 
        n_negatives_per_positive=U
    )
    
    num_users = users.shape[0]
    users_input, negatives_input = a._construct_negative_prediction_input(users, negatives)
    assert users_input.shape == negatives_input.shape
    assert len(users_input.shape) == 1 # 1d vectors
    
    # Check that both are in the right order (each user is repeated U times before the next user is present)
    for ix in range(users_input.shape[0]):
        assert users_input[ix] == users[ix // U]
        assert negatives_input[ix] == negatives[ix // U, ix % U]

test_negative_input_construction(torch.LongTensor([4, 5, 6]), torch.LongTensor([[1, 2], [1, 2], [1, 2]]), U=2)
test_negative_input_construction(torch.LongTensor([4, 5, 6]), torch.LongTensor([[1], [1], [1]]), U=1)


In [None]:
def test_overfit(mat):
    m = NeuMF(
        predictive_factors=5,
        batch_size=1,
        max_epochs=20,
        learning_rate=0.02,
        stopping_criterion="ndcg",
        n_negatives_per_positive=1,
    )

    # set sampler to exact sampling
    m.sampler.exact = True
    m.fit(mat, (mat, mat))
    bin_mat = mat.binary_values
    pred = m.predict(mat.binary_values).toarray()
    for user in mat.active_users:
        # The model should have overfitted, so that the visited items have the highest similarities
        positives = bin_mat[user].nonzero()[1]
        negatives = list(set(range(mat.shape[1])) - set(positives))

        for item in positives:
            assert (pred[user][negatives] < pred[user, item]).all()
            
test_overfit(mat)
    

## Experiment

Use RecPack Pipeline to compare the newly implemented algorithm to frequently used baselines

In [17]:
from recpack.pipelines import PipelineBuilder
from recpack.datasets import MovieLens25M
from recpack.scenarios import WeakGeneralization

In [18]:
DATASET_PATH = '/home/robinverachtert/datasets'

In [19]:
dataset = MovieLens25M(
    path=DATASET_PATH
)
data = dataset.load()

  0%|          | 0/12415224 [00:00<?, ?it/s]

  0%|          | 0/12415224 [00:00<?, ?it/s]

In [20]:
# Subsample to 1000 users to make it faster
# import numpy as np

# users = np.random.choice(list(data.active_users), 1000)
# data = data.users_in(users)

In [21]:
scenario = WeakGeneralization(frac_data_in=0.8, validation=True)
scenario.split(data)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [22]:
from recpack.pipelines import ALGORITHM_REGISTRY
ALGORITHM_REGISTRY.register('NeuMF', NeuMF)

In [31]:
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

builder.add_metric('NDCGK', K=10)
builder.add_metric('CoverageK', K=10)

builder.add_algorithm(
    algorithm = 'NeuMFMLPOnly', 
    params = {
        'batch_size': 128,
        'max_epochs': 10,
        'learning_rate': 0.001,
        'stopping_criterion': 'ndcg',
        'predict_topK': 50,
        'n_negatives_per_positive': 4,
        'dropout': 0.0
    },
    grid = {
        'predictive_factors': [8, 16, 32],
    }
)

builder.add_algorithm('Popularity', params={'K': 50})
builder.add_algorithm(
    'ItemKNN', 
    grid={'similarity': ['conditional_probability', 'cosine']}
)
builder.set_optimisation_metric('NDCGK', K=10)

pipeline = builder.build()

pipeline.run()

  0%|          | 0/3 [00:00<?, ?it/s]

2022-07-01 16:02:50,165 - base - recpack - INFO - Processed epoch 0 in 228.77 s.Batch Training Loss = 43.9709
2022-07-01 16:31:45,497 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.13242382393503446, which is better than previous iterations.
2022-07-01 16:31:45,498 - base - recpack - INFO - Model improved. Storing better model.
2022-07-01 16:31:45,518 - base - recpack - INFO - Evaluation at end of 0 took 1735.35 s.
2022-07-01 16:35:34,852 - base - recpack - INFO - Processed epoch 1 in 229.33 s.Batch Training Loss = 36.0092
2022-07-01 17:05:18,048 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.14472784895769647, which is better than previous iterations.
2022-07-01 17:05:18,049 - base - recpack - INFO - Model improved. Storing better model.
2022-07-01 17:05:18,070 - base - recpack - INFO - Evaluation at end of 1 took 1783.22 s.
2022-07-01 17:09:10,961 - base - recpack - INFO - Processed epoch 2 in 232.89 s.Batch Training Loss = 33.9820
2022-0

2022-07-02 03:26:00,475 - base - recpack - INFO - Model improved. Storing better model.
2022-07-02 03:26:00,512 - base - recpack - INFO - Evaluation at end of 8 took 1778.94 s.
2022-07-02 03:31:11,043 - base - recpack - INFO - Processed epoch 9 in 310.53 s.Batch Training Loss = 27.6428
2022-07-02 04:00:46,243 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.1783149841683678, which is better than previous iterations.
2022-07-02 04:00:46,244 - base - recpack - INFO - Model improved. Storing better model.
2022-07-02 04:00:46,281 - base - recpack - INFO - Evaluation at end of 9 took 1775.24 s.
2022-07-02 04:00:46,299 - base - recpack - INFO - Fitting NeuMFMLPOnly complete - Took 2.13e+04s
2022-07-02 04:40:27,948 - base - recpack - INFO - Processed epoch 0 in 492.59 s.Batch Training Loss = 40.9445
2022-07-02 05:12:24,653 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.13997920180943083, which is better than previous iterations.
2022-07-02 05:12:24,

2022-07-02 15:59:22,012 - base - recpack - INFO - Processed epoch 7 in 316.95 s.Batch Training Loss = 28.2006
2022-07-02 16:30:47,891 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.17964142990428575, which is better than previous iterations.
2022-07-02 16:30:47,892 - base - recpack - INFO - Model improved. Storing better model.
2022-07-02 16:30:47,933 - base - recpack - INFO - Evaluation at end of 7 took 1885.92 s.
2022-07-02 16:36:04,405 - base - recpack - INFO - Processed epoch 8 in 316.47 s.Batch Training Loss = 27.7895
2022-07-02 17:06:36,013 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.18007371065396469, which is better than previous iterations.
2022-07-02 17:06:36,014 - base - recpack - INFO - Model improved. Storing better model.
2022-07-02 17:06:36,053 - base - recpack - INFO - Evaluation at end of 8 took 1831.65 s.
2022-07-02 17:11:46,899 - base - recpack - INFO - Processed epoch 9 in 310.84 s.Batch Training Loss = 27.4160
2022-0

  self._set_arrayXarray(i, j, x)


2022-07-02 18:11:58,956 - base - recpack - INFO - Fitting ItemKNN complete - Took 16.9s


  self._set_arrayXarray(i, j, x)


2022-07-02 18:12:32,603 - base - recpack - INFO - Fitting ItemKNN complete - Took 13.7s




2022-07-02 18:13:29,405 - base - recpack - INFO - Fitting ItemKNN complete - Took 16.6s


In [32]:
pipeline.get_metrics(short=True)

Unnamed: 0,ndcgk_10,coveragek_10
NeuMFMLPOnly,0.115335,0.176857
Popularity,0.084736,0.000502
ItemKNN,0.160481,0.147765


In [33]:
pipeline.optimisation_results

Unnamed: 0,identifier,params,NDCGK
0,"NeuMFMLPOnly(batch_size=128,dropout=0.0,exact_...","{'predictive_factors': 8, 'batch_size': 128, '...",0.095286
1,"NeuMFMLPOnly(batch_size=128,dropout=0.0,exact_...","{'predictive_factors': 16, 'batch_size': 128, ...",0.099083
2,"NeuMFMLPOnly(batch_size=128,dropout=0.0,exact_...","{'predictive_factors': 32, 'batch_size': 128, ...",0.097915
3,"ItemKNN(K=200,normalize=False,normalize_X=Fals...",{'similarity': 'conditional_probability'},0.100922
4,"ItemKNN(K=200,normalize=False,normalize_X=Fals...",{'similarity': 'cosine'},0.136519


In [None]:
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

builder.add_metric('NDCGK', K=10)
builder.add_metric('CoverageK', K=10)

builder.add_algorithm(
    algorithm = 'NMF', 
    grid = {
        'num_components': [16, 32, 64, 128],
    }
)

builder.set_optimisation_metric('NDCGK', K=10)

pipeline = builder.build()

pipeline.run()

In [None]:
pipeline.get_metrics(short=True)

In [None]:
pipeline.optimisation_results

In [None]:
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

builder.add_metric('NDCGK', K=10)
builder.add_metric('CoverageK', K=10)

builder.add_algorithm(
    algorithm = 'BPRMF', 
    params = {
        'batch_size': 128,
        'max_epochs': 10,
        'learning_rate': 0.001,
        'stopping_criterion': 'ndcg',
        'predict_topK': 50,
        'lambda_h': 0.1,
        'lambda_w': 0.1,
    },
    grid = {
        'num_components': [16, 32, 64, 128],
    }
)

builder.set_optimisation_metric('NDCGK', K=10)

pipeline = builder.build()

pipeline.run()

In [None]:
pipeline.get_metrics(short=True)

In [None]:
pipeline.optimisation_results