# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.auto import tqdm
from scipy import sparse
import torch

import warnings
warnings.filterwarnings('ignore')

# Data

## Load data

In [5]:
DATA_DIR = os.path.join("..", "data", "movielens")

In [86]:
interactions = pd.read_csv(os.path.join(DATA_DIR, "ratings.csv"))
interactions = interactions[["userId", "movieId", "rating"]]
users_to_consider = np.arange(0, 1000)
interactions = interactions[interactions["userId"].isin(users_to_consider)]

movies_to_consider = np.arange(0, 1000)
interactions = interactions[interactions["movieId"].isin(movies_to_consider)]
interactions.reset_index(drop=True, inplace=True)
movies = pd.read_csv(os.path.join(DATA_DIR, "movies.csv"))
movies = movies[movies["movieId"].isin(movies_to_consider)]
movies.set_index("movieId", inplace=True)

print(f"Number of ratings: {len(interactions)}")
print(f"Number of unique users: {interactions['userId'].nunique()}")
print(f"Number of books: {interactions['movieId'].nunique()}")
print(f"Shape of interactions: {interactions.shape}")

Number of ratings: 33184
Number of unique users: 983
Number of books: 847
Shape of interactions: (33184, 3)


In [87]:
m = 1000
n = 1000
print(f"There are {m} unique users and {n} unique books in this data set")

There are 1000 unique users and 1000 unique books in this data set


## Split

In [88]:
def split_dataframe(df, holdout_fraction=0.1):
  """Splits a DataFrame into training and test sets.
  Args:
    df: a dataframe.
    holdout_fraction: fraction of dataframe rows to use in the test set.
  Returns:
    train: dataframe for training
    test: dataframe for testing
  """
  test = df.sample(frac=holdout_fraction, replace=False)
  train = df[~df.index.isin(test.index)]
  return train, test

train_interactions, test_interactions = split_dataframe(interactions)

# Theory

Using neural networks instead of matrix factorization (MF) for recommendation systems has a number of advantages. See the notes section for detail. Here, we will be providing the model architecture that will be used for the recommendation system.

The model will consist of two steps:

## The Embedding Layer

This layer will take the user and item IDs in one-hot encoded form along with any other user and item feature and will pass it through a fully connected layer. The output of this layer will be the latent representation of the user and item. Let use denote $\mathbf{u}$ as the user and $\mathbf{v}$ as the item. Their dimensions will be $m+m_{uf}$ and $n+n_{if}$ respectively where $m$ and $n$ are the number of users and items and $m_{uf}$ and $n_{if}$ are the number of user and item features respectively. The output of the embedding layer will be $\mathbf{u} \in \mathbb{R}^d$ and $\mathbf{v} \in \mathbb{R}^d$ where $d$ is the dimension of the latent space.

We will have two different layers, one for users and the other of items. This is required because the number of users and items are different and we want to learn different embeddings for them. The user embedding layer will have $m+m_{uf}$ neurons and the item embedding layer will have $n+n_{if}$ neurons.

## CF Layers

CF layers, or collaborative filtering layers are made up of one or more layers of fully connected layers. The input to these layers will be the concatenation of the user and item latent representations. The output of the CF layers will be the predicted rating.

![](images/dl_01.png)

## Loss Function

The loss function can either be MSE or cross entropy. We will be experimenting with both.

## Data

We will be using all the positive data and a random sample of the negative data. The ratio of positive to negative data will be decided by a parameter.

# Implementation

## Dataset

The model will be trained using the one-hot representation of the user and item ids along with the user and item features. The output will be the rating. So, the input will be two vectors of length $M = m+m_{uf}$ and $N = n+n_{if}$ and the output will be a scalar. This means that we will be using pointwise approach.

### Negative Sampling

We will be using negative sampling to train the model. This is important, as we want our model to learn that the rating of a user-item pair is zero if the user has not rated the item. We will define a variable `negative_samples_ratio` that can be used to control the ratio of positive to negative samples. The negative samples will be randomly sampled from the negative data. We can set `negative_samples_ratio` to 0.3-0.5. This means that for every positive sample, we will be using 0.3-0.5 negative samples.

### Dataset Class

Here is a class that creates a dataset using the one-hot representation of the user and item ids along with the user and item features.

In [89]:
class BookDataset(torch.utils.data.Dataset):
    """Dataset class for the model."""""
    def __init__(self, interactions, negative_samples_ratio=0.5):
        """Initializes the BookDataset.
        
        Parameters
        ----------
        users: pandas.DataFrame
            DataFrame containing user features.
        books: pandas.DataFrame
            DataFrame containing book features.
        interactions: pandas.DataFrame
            DataFrame containing user-book interactions.
        negative_samples_ratio: float
            Ratio of negative samples to positive samples. Must be between 0 and 1.
        """
        self.interactions = interactions
        self.m = m
        self.n = n
        self.m_f = 0
        self.n_f = 0
        # print(f"There are {self.m} unique users and {self.n} unique books in this data set")
        # print(f"There are {self.m_f} user features and {self.n_f} book features")
        if negative_samples_ratio < 0 or negative_samples_ratio > 1:
            raise ValueError("negative_samples_ratio must be between 0 and 1.")
        self.negative_samples_ratio = negative_samples_ratio

    def __len__(self):
        # tried this but this will give error as the interaction dataframe will become out of index
        # num_times = 1+self.negative_samples_ratio
        num_times = 1
        return int(len(self.interactions)*num_times)
    
    def get_positive_sample(self, idx):
        """Gets a positive sample from the interactions dataframe."""
        row = self.interactions.iloc[idx]
        userId = row["userId"]
        bookId = row["movieId"]
        rating = row["rating"]
        return userId, bookId, rating
    
    def get_negative_sample(self, idx):
        """Gets a negative sample from the interactions dataframe."""""
        row = self.interactions.iloc[idx]
        userId = row["userId"]
        negative_bookId = np.random.choice(movies.index.values)
        while negative_bookId in self.interactions[self.interactions["userId"] == userId]["movieId"].values:
            negative_bookId = np.random.choice(movies.index.values)

        rating = 0
        return userId, negative_bookId, rating
    
    def get_one_sample(self, idx):
        """Gets one sample from the dataset. Uses negative sampling with probability `negative_samples_ratio`."""
        if np.random.random() < self.negative_samples_ratio:
            return self.get_negative_sample(idx)
        else:
            return self.get_positive_sample(idx)

    def __getitem__(self, idx):
        """Gets one sample from the dataset."""
        # A workaround to make interaction dataframe circular when using negative sampling
        # with `num_times` > 1. Leaving it as this should not be necessary here.
        # if idx >= len(self.interactions):
        #     idx = idx%len(self.interactions)
        user_input, book_input, rating = self.get_one_sample(idx)
        user_input = torch.tensor(user_input, dtype=torch.float32)
        book_input = torch.tensor(book_input, dtype=torch.float32)
        targets = torch.tensor(rating, dtype=torch.float32)
        return user_input, book_input, targets

In [73]:
book_dataset = BookDataset(interactions, 0.5)
book_dataset_batched = torch.utils.data.DataLoader(book_dataset, batch_size=32, shuffle=True)
user_input, book_input, targets = next(iter(book_dataset_batched))
(targets==0).sum()

tensor(12)

> When we use `negative_samples_ratio = 0.5` we usually get 17-20 negative samples for each positive sample. This suggests that we can get away with a lower ratio.

Let use see if the negative sampling is working as expected.

In [74]:
negative_samples = np.where(targets.numpy()==0)
for idx in negative_samples[0][:5]:
    rating = targets[idx].item()
    print(f"Rating: {rating}")
    user_id = user_input[idx].item()
    book_id = book_input[idx].item()
    display(interactions[(interactions["userId"] == user_id) & (interactions["movieId"] == book_id)])

Rating: 0.0


Unnamed: 0,userId,movieId,rating


Rating: 0.0


Unnamed: 0,userId,movieId,rating


Rating: 0.0


Unnamed: 0,userId,movieId,rating


Rating: 0.0


Unnamed: 0,userId,movieId,rating


Rating: 0.0


Unnamed: 0,userId,movieId,rating


In [75]:
positive_samples = np.where(targets.numpy()!=0)
for idx in positive_samples[0][:5]:
    rating = targets[idx].item()
    print(f"Rating: {rating}")
    user_id = user_input[idx].item()
    book_id = book_input[idx].item()
    display(interactions[(interactions["userId"] == user_id) & (interactions["movieId"] == book_id)])

Rating: 4.5


Unnamed: 0,userId,movieId,rating
28013,871,32,4.5


Rating: 2.0


Unnamed: 0,userId,movieId,rating
22231,714,608,2.0


Rating: 4.0


Unnamed: 0,userId,movieId,rating
4032,136,32,4.0


Rating: 4.0


Unnamed: 0,userId,movieId,rating
3886,132,1,4.0


Rating: 5.0


Unnamed: 0,userId,movieId,rating
19694,636,780,5.0


It is working as expected.

We will create the final train and test datasets.

In [36]:
negative_samples_ratio = 0.4
batch_size = 32
train_dataset = BookDataset(train_interactions, negative_samples_ratio=negative_samples_ratio)
test_dataset = BookDataset(test_interactions, negative_samples_ratio=negative_samples_ratio)

train_df = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_df = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

Now, we will create the model.

## Model

We will use $d = 40$ for the embedding layer. We will use 2 layers for the CF layers. We can treat these as hyperparameters and tune them later.

In [76]:
m_f = 0
n_f = 0

In [123]:
class BookModel(torch.nn.Module):
    """Model class for the BookNet model."""
    def __init__(self, m=m, n=n, m_f=m_f, n_f=n_f, embedding_dim=40, cf_layer_neurons = [128, 128]):
        """Initializes the BookNet model.
        
        Parameters
        ----------
        m: int
            Number of users.
        n: int
            Number of books.
        m_f: int
            Number of user features.
        n_f: int
            Number of book features.
        embedding_dim: int
            Hidden dimension for the hidden layer.
        cf_layer_neurons: list
            List of integers specifying the number of neurons in each layer of the collaborative filtering part of the model.
        """
        super(BookModel, self).__init__()
        self.m = m
        self.n = n
        self.m_f = m_f
        self.n_f = n_f
        self.embedding_dim = embedding_dim
        self.cf_layer_neurons = cf_layer_neurons

        self.user_embedding, self.book_embedding = self.create_embedding_layer()
        # self._init_embedding_weights(self.user_embedding)
        # self.cf_layer = self.create_CF_layer()
        # self.init_weights()
        self.fc_layers = torch.nn.ModuleList()
        for i in range(len(self.cf_layer_neurons)):
            if i == 0:
                self.fc_layers.append(torch.nn.Linear(self.embedding_dim*2, self.cf_layer_neurons[i]))
            else:
                self.fc_layers.append(torch.nn.Linear(self.cf_layer_neurons[i-1], self.cf_layer_neurons[i]))
        self.affine_output = torch.nn.Linear(in_features=self.cf_layer_neurons[-1], out_features=1)
        self.logistic = torch.nn.ReLU()



    def create_embedding_layer(self):
        """Creates the embedding layer"""
        user_in_shape = self.m
        book_in_shape = self.n
        out_shape = self.embedding_dim
        user_embedding = torch.nn.Embedding(num_embeddings=user_in_shape, embedding_dim=out_shape)
        book_embedding = torch.nn.Embedding(num_embeddings=book_in_shape, embedding_dim=out_shape)
        return user_embedding, book_embedding
    
    def _init_embedding_weights(self, embedding_layer):
        """Initializes the embedding layer weights with a uniform distribution."""
        embedding_layer.weight.data.uniform_(0, 1)
    
    # def init_weights(self):
    #     """Initializes the weights of the model."""
    #     self._init_embedding_weights(self.user_embedding)
    #     self._init_embedding_weights(self.book_embedding)
    #     for layer in self.cf_layer:
    #         if isinstance(layer, torch.nn.Linear):
    #             torch.nn.init.xavier_uniform_(layer.weight)
    
    def create_CF_layer(self):
        """Creates the collaborative filtering layers. Uses the number of neurons specified in `cf_layer_neurons`."""
        num_layers = len(self.cf_layer_neurons)
        activation = torch.nn.ReLU()
        layers = []
        for i in range(num_layers):
            if i == 0:
                layers.append(torch.nn.Linear(self.embedding_dim*2, self.cf_layer_neurons[i]))
            else:
                layers.append(torch.nn.Linear(self.cf_layer_neurons[i-1], self.cf_layer_neurons[i]))
            layers.append(activation)
        layers.append(torch.nn.Linear(self.cf_layer_neurons[-1], 1))
        layers.append(activation)
        return torch.nn.Sequential(*layers)
    
    def forward(self, user_input, book_input):
        """Forward pass of the model.
        
        Parameters
        ----------
        user_input: torch.Tensor
            Tensor containing the user input.
        book_input: torch.Tensor
            Tensor containing the book input.
        """
        user_index = user_input.long()
        user_embedded = self.user_embedding(user_index)
        book_index = book_input.long()
        book_embedded = self.book_embedding(book_index)
        # Concatenate the user and book embeddings to form one vector.
        x = torch.cat([user_embedded, book_embedded], dim=1)
        # x = self.cf_layer(x)
        for layer in self.fc_layers:
            x = layer(x)
            x = torch.nn.ReLU()(x)
        x = self.affine_output(x)
        x = self.logistic(x)
        return x

In [145]:
class MLP(torch.nn.Module):
    def __init__(self, config):
        super(MLP, self).__init__()
        self.config = config
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']

        self.embedding_user = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim)
        self.embedding_item = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim)

        self.fc_layers = torch.nn.ModuleList()
        for idx, (in_size, out_size) in enumerate(zip(config['layers'][:-1], config['layers'][1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        self.affine_output = torch.nn.Linear(in_features=config['layers'][-1], out_features=1)
        self.logistic = torch.nn.ReLU()

    def forward(self, user_indices, item_indices):
        user_input = user_indices.long()
        item_input = item_indices.long()
        user_embedding = self.embedding_user(user_input)
        item_embedding = self.embedding_item(item_input)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)  # the concat latent vector
        for idx, _ in enumerate(range(len(self.fc_layers))):
            vector = self.fc_layers[idx](vector)
            vector = torch.nn.ReLU()(vector)
            # vector = torch.nn.BatchNorm1d()(vector)
            # vector = torch.nn.Dropout(p=0.5)(vector)
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

    def init_weight(self):
        pass

In [146]:
config = {'num_users': m,
            'num_items': n,
            'latent_dim': 40,
            'layers': [80, 40, 20]}
model = MLP(config)

In [124]:
model = BookModel(m=m, n=n, m_f=m_f, n_f=n_f, embedding_dim=40, cf_layer_neurons=[128, 128])

In [125]:
model

BookModel(
  (user_embedding): Embedding(1000, 40)
  (book_embedding): Embedding(1000, 40)
  (fc_layers): ModuleList(
    (0): Linear(in_features=80, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=128, bias=True)
  )
  (affine_output): Linear(in_features=128, out_features=1, bias=True)
  (logistic): ReLU()
)

We have our model. Next, we will define a loss function and an optimizer. Then we will train the model.

### Loss Function

We will use MSE loss. We will also add the l1 and l2 regularization terms. This way, we can experiment with different regularization parameters.

In [147]:
class MSE_L1L2Loss(torch.nn.Module):
    def __init__(self, model, l1_weight=0, l2_weight=0):
        """Initializes the loss function.

        Parameters
        ----------
        model: torch.nn.Module
            The model to use for the loss function.
        l1_weight: float
            Weight for the L1 regularization term.
        l2_weight: float
            Weight for the L2 regularization term.
        """
        super().__init__()
        self.model = model
        self.l1_weight = l1_weight
        self.l2_weight = l2_weight

    def forward(self, y_hat, y):
        """The forward pass of the loss function."""
        mse_loss = torch.nn.functional.mse_loss(y_hat, y)
        l2_regularization = torch.tensor(0.)
        l1_regularization = torch.tensor(0.)
        for param in self.model.parameters():
            l2_regularization += torch.norm(param, 2)
            l1_regularization += torch.norm(param, 1)
        l1_regularization *= self.l1_weight
        l2_regularization *= self.l2_weight
        loss = mse_loss + l1_regularization + l2_regularization
        return loss

### Optimizer

We can use the Adam optimizer. We will also create a learning rate scheduler.

In [148]:
l1_weight = 0
l2_weight = 0
optimizer_str = "adam"
lr = 0.005
scheduler_str = "plateau"
embedding_dim = 100

negative_samples_ratio = 0
batch_size = 256
train_dataset = BookDataset(train_interactions, negative_samples_ratio=negative_samples_ratio)
test_dataset = BookDataset(test_interactions, negative_samples_ratio=negative_samples_ratio)

train_df = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_df = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

config = {'num_users': m,
            'num_items': n,
            'latent_dim': 40,
            'layers': [80, 40, 20]}
model = MLP(config)
loss_func = MSE_L1L2Loss(model, l1_weight=l1_weight, l2_weight=l2_weight)
if optimizer_str == "adam":
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
elif optimizer_str == "sgd":
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

if scheduler_str == "plateau":
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, verbose=True, min_lr=1e-7)
elif scheduler_str == "step":
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)


In [149]:
model

MLP(
  (embedding_user): Embedding(1000, 40)
  (embedding_item): Embedding(1000, 40)
  (fc_layers): ModuleList(
    (0): Linear(in_features=80, out_features=40, bias=True)
    (1): Linear(in_features=40, out_features=20, bias=True)
  )
  (affine_output): Linear(in_features=20, out_features=1, bias=True)
  (logistic): ReLU()
)

### Training

Excellent! We have our model. Now, we will train it.

In [150]:
def train_step(user, item, rating, optimizer):
    optimizer.zero_grad()
    prediction = model(user, item)
    loss = loss_func(prediction, rating)
    loss.backward()
    optimizer.step()
    return loss.item(), prediction

def test_step(user, item, rating):
    prediction = model(user, item)
    loss = loss_func(prediction, rating)
    return loss.item()

In [151]:
epochs = 10
train_losses = []
test_losses = []
for epoch in tqdm(range(epochs)):
    batch = 0
    train_loss = 0
    test_loss = 0
    for user, item, rating in train_df:
        batch += 1
        loss, prediction = train_step(user, item, rating, optimizer)
        train_loss += loss
        print(f"Epoch: {epoch}, Batch: {batch}/{len(train_df)}, Loss: {loss}", end = "\r")
        # if batch % 10 == 0:
        #     print(rating)
        #     print(rating.mean())
    train_loss /= len(train_df)
    train_losses.append(train_loss)
    for user, item, rating in test_df:
        loss = test_step(user, item, rating)
        test_loss += loss
    test_loss /= len(test_df)
    test_losses.append(test_loss)
    scheduler.step(test_loss)
    print(f"Epoch: {epoch}, Train loss: {train_loss}, Test loss: {test_loss}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 0, Train loss: 1.3086674771804114, Test loss: 1.1599612150054712
Epoch: 1, Train loss: 1.15400496903783, Test loss: 1.1445719940731158
Epoch: 2, Train loss: 1.1425855527980935, Test loss: 1.1361437485768244
Epoch: 3, Train loss: 1.1357976405919237, Test loss: 1.1609616222289891
Epoch: 4, Train loss: 1.1268075398574293, Test loss: 1.13704321705378
Epoch: 5, Train loss: 1.1205213466854627, Test loss: 1.1368224362914379
Epoch: 6, Train loss: 1.113202959011913, Test loss: 1.129157162629641
Epoch: 7, Train loss: 1.11125224793621, Test loss: 1.1241886724646275
Epoch: 8, Train loss: 1.109127842671611, Test loss: 1.125126754435209
Epoch: 9, Train loss: 1.1098112429171099, Test loss: 1.1375093448620577


In [130]:
user_input, book_input, targets = next(iter(test_df))

In [131]:
pred = model(user_input, book_input)