### Collaborative Filtering using 3 hidden layers

In [1]:
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import torch.nn as nn
import matplotlib.pyplot as plt

!pip install pytorch-lightning

# To print all outputs to console
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.9.4-py3-none-any.whl (827 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m827.8/827.8 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.3-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.6/518.6 KB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning-utilities>=0.6.0.post0
  Downloading lightning_utilities-0.7.1-py3-none-any.whl (18 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.7.1 pytorch-lightning-1.9.4 torchmetrics-0.11.3


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load datasets
movies = pd.read_csv('/content/drive/MyDrive/dataset/movie.csv')
ratings = pd.read_csv('/content/drive/MyDrive/small_dataset/rating.csv')
train_ratings = pd.read_csv('/content/drive/MyDrive/small_dataset/train_dataset.csv')
test_ratings = pd.read_csv('/content/drive/MyDrive/small_dataset/test_dataset.csv')

# Augmented dataset has been created using the `Data Augmentation` notebook attached
augmented_dataset = pd.read_csv('/content/drive/MyDrive/small_dataset/augmented_dataset.csv')

augmented_dataset.head()

Unnamed: 0,userId,movieId,label
0,31,1,1.0
1,31,110,1.0
2,31,260,1.0
3,31,364,1.0
4,31,527,1.0


In [4]:
# Note the reduction of data to 7M. I had to reduce the data dimensionality given the compute and session-time contraints
augmented_dataset['label'].value_counts()

0.0    5727144
1.0    1431786
Name: label, dtype: int64

In [5]:
movies_with_id = movies.set_index('movieId')
movies_with_id.index.name = None
movies_with_id.head()
print(f'Shape: {movies_with_id.shape}')

Unnamed: 0,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


Shape: (27278, 2)


In [6]:
from torch.utils.data import Dataset, DataLoader

class TrainingData(Dataset):
  def __init__(self, train_ratings):
    # The input has been separated into separate variables so that the later model can use these items to convert to embeddings. 
    # Also, each input is sent as a tensor to the successive models
    self.users, self.movies, self.labels = self.get_data(train_ratings)

  def __len__(self):
    # The __len__ function returns the number of samples in our dataset
    return len(self.users)

  def __getitem__(self, idx):
    # Return both inputs (user and item) and the output (target indicating whether the user interacted with the item or not)
    return self.users[idx], self.movies[idx], self.labels[idx]
  
  def get_data(self, ratings):
    return torch.tensor(ratings['userId']), torch.tensor(ratings['movieId']), torch.tensor(ratings['label'])

In [7]:
data_loader = DataLoader(TrainingData(augmented_dataset), batch_size=2048, num_workers=0)

In [10]:
import pytorch_lightning as pl

# PyTorch Lightning is an open-source Python library that provides a high-level interface for PyTorch
class CollaborativeFiltering(pl.LightningModule):
  def __init__(self, train_ratings, dataloader):
    super().__init__()
    self.train_ratings = train_ratings
    self.dataloader = dataloader

    # Tried with len() first, it fails when the IDs are not in order or exceed length
    # This is because embedding is just a lookup table we are building for n items
    self.number_of_users = train_ratings['userId'].max() + 1
    self.number_of_items = train_ratings['movieId'].max() + 1

    # Longer embedding vectors don't add more valuable information and smaller ones don't represent the semantics well enough
    # The rule of thumb for determining the embedding size is the cardinality size divided by 2, but no bigger than 50
    # I have chosen 16 here, as the cardinality is too huge
    self.user_embedding = nn.Embedding(num_embeddings=self.number_of_users, embedding_dim=8)
    self.item_embedding = nn.Embedding(num_embeddings=self.number_of_items, embedding_dim=8)
    
    # Tower pattern is implemented, where the bottom layer is the widest and each successive layer has a smaller number of neurons
    # The reference paper halves the neurons by half each time, but I have tried a more generalized model
    self.layer1 = nn.Linear(in_features=16, out_features=32)
    self.layer2 = nn.Linear(in_features=32, out_features=16)
    self.layer3 = nn.Linear(in_features=16, out_features=8)

    # Reference: https://stats.stackexchange.com/questions/207049/neural-network-for-binary-classification-use-1-or-2-output-neurons
    self.output_layer = nn.Linear(in_features=8, out_features=1)

  def forward(self, user_input, item_input):
    dense_user = self.user_embedding(user_input)
    dense_item = self.item_embedding(item_input)
    vector = torch.cat([dense_user, dense_item], dim=-1)

    # Results from various posts and research papers
    # The sigmoid function restricts each neuron to be in (0,1), which may limit the model’s performance; and it is known to suffer from saturation, where neurons stop learning when their output is near either 0 or 1
    # Even though tanh is a better choice and has been widely adopted it only alleviates the issues of sigmoid to a certain extent, since it can be seen as a rescaled version of sigmoid (tanh(x/2) = 2σ(x) − 1)
    # ReLU, which is more plausible and proven to be non-saturated, it encourages sparse activations, making the model less likely to be overfitting.
    vector = nn.ReLU()(self.layer1(vector))
    vector = nn.ReLU()(self.layer2(vector))
    vector = nn.ReLU()(self.layer3(vector))

    # sigmoid is the same as softmax. The better choice for the binary classification is to use one output unit with sigmoid instead of softmax with two output units, because it will update faster
    pred = nn.Sigmoid()(self.output_layer(vector))

    return pred
    
  def training_step(self, batch, batch_idx):
    user_input, item_input, labels = batch
    predicted_labels = self(user_input, item_input)
    
    # Binary Cross-Entropy/Log Loss
    bce_loss_obj = nn.BCELoss()
    loss = bce_loss_obj(predicted_labels, labels.view(-1, 1).float())
    
    return loss

  def configure_optimizers(self):
    # The results of the Adam optimizer are generally better than every other optimization algorithm, have faster computation time, and require fewer parameters for tuning. 
    # Because of all that, Adam is recommended as the default optimizer for most of the applications
    # Adam combines the best properties of the AdaGrad and RMSProp algorithms to provide an optimization algorithm that can handle sparse gradients on noisy problems
    return torch.optim.Adam(self.parameters(), capturable=True)
  
  def train_dataloader(self):
    return self.dataloader

In [11]:
# Used for progress bar display
from pytorch_lightning.callbacks import TQDMProgressBar

# Creating the model and the trainer
model = CollaborativeFiltering(augmented_dataset, data_loader)
trainer = pl.Trainer(gpus=1,
                     max_epochs=20,
                     enable_progress_bar=True)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [12]:
trainer.fit(model)
trainer.save_checkpoint('/content/drive/MyDrive/small_dataset/checkpoint_2layer.ckpt')

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 1.1 M 
1 | item_embedding | Embedding | 1.1 M 
2 | layer1         | Linear    | 544   
3 | layer2         | Linear    | 528   
4 | layer3         | Linear    | 136   
5 | output_layer   | Linear    | 9     
---------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.636     T

Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


In [13]:
# Save model for reuse
torch.save(model.state_dict(), '/content/drive/MyDrive/small_dataset/model_2layer.pth')

In [14]:
from torch.utils.data import Dataset, DataLoader

class TestingData(Dataset):
  def __init__(self, test_ratings):
    # The input has been separated into separate variables so that the later model can use these items to convert to embeddings. 
    # Also, each input is sent as a tensor to the successive models
    self.users, self.movies = self.get_data(test_ratings)

  def __len__(self):
    # The __len__ function returns the number of samples in our dataset
    return len(self.users)

  def __getitem__(self, idx):
    # Return both inputs (user and item) and the output (target indicating whether the user interacted with the item or not)
    return self.users[idx], self.movies[idx]
  
  def get_data(self, ratings):
    return torch.tensor(ratings['userId']), torch.tensor(ratings['movieId'])

In [15]:
from functools import partial
tqdm = partial(tqdm, position=0, leave=True)

test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))
test_dataset = pd.read_csv('/content/drive/MyDrive/small_dataset/augmented_test_dataset.csv')

hits = []
user_ids = test_dataset['userId'].unique()
for user_id in tqdm(user_ids):
    test_item = test_ratings[test_ratings['userId']==user_id]['movieId'].iloc[0]
    user_df = test_dataset[test_dataset['userId'] == user_id].reset_index()
    data_loader = DataLoader(TestingData(user_df), batch_size=100, num_workers=4, shuffle=False)

    # Returns a list of dictionaries, one for each provided dataloader containing their respective predictions
    predictions = model(torch.tensor(user_df['userId']), torch.tensor(user_df['movieId']))
    # To convert to numpy array and solve issue: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead
    predictions = predictions.detach().numpy()

    # To solve : Buffer has wrong number of dimensions (expected 1, got 2) because dimensions of predictions are (100, 1)
    # Reference: https://deeplizard.com/learn/video/fCVuiW9AFzY
    predictions = np.squeeze(predictions)

    # Since we need the movieId, 
    top_10 = set(user_df.iloc[np.argsort(predictions)[::-1][:10]]['movieId'])
    
    hits.append(1) if test_item in top_10 else hits.append(0)

print(f'Hit Ratio @ 10 is {np.average(hits)}')

100%|██████████| 14315/14315 [01:09<00:00, 206.72it/s]

Hit Ratio @ 10 is 0.5249039469088369



