In [None]:
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")

In [None]:
import sys

sys.path.append("../src")

import constants
from utils import train_test_split

# Read data

In [None]:
# Load the ratings table from the path configured in constants.RATINGS_PATH_SANDBOX;
# the "timestamp" column is parsed into datetimes while reading.
ratings = pd.read_csv(constants.RATINGS_PATH_SANDBOX, parse_dates=["timestamp"])

# Data preprocessing

In [None]:
# # For training purposes we will use only 30% of the full ratings dataset
# rand_userIds = np.random.choice(
#     ratings["userId"].unique(),
#     size=int(len(ratings["userId"].unique()) * 0.3),
#     replace=False,
# )

# ratings = ratings.loc[ratings["userId"].isin(rand_userIds)]
# print("There are {} rows of data from {} users".format(len(ratings), len(rand_userIds)))

### Train-test split

In [None]:
# Split the ratings into a train part and a test part with the project-local helper
# from utils (its splitting strategy is defined outside this notebook).
train_ratings, test_ratings = train_test_split(ratings)

In [None]:
# Peek at five random training rows as a sanity check.
train_ratings.sample(5)

## Converting the dataset into an implicit feedback dataset

In [None]:
# Binarise the ratings: every observed interaction counts as a positive (rating = 1).
# assign() returns new frames instead of writing in place, so re-running the cell is
# safe and the write cannot leak into a parent frame the splits may be slices of.
train_ratings = train_ratings.assign(rating=1)
test_ratings = test_ratings.assign(rating=1)

train_ratings.sample(5)

The code below generates 4 negative samples for each row of data. In other words, the ratio of negative to positive samples is 4:1. This ratio is chosen arbitrarily but I found that it works rather well (feel free to find the best ratio yourself!)

In [None]:
# Every distinct movie ID present in the ratings table
all_movieIds = ratings["movieId"].unique()

# Negatives drawn per positive sample, i.e. a 4:1 negative-to-positive ratio
num_negatives = 4

# Distinct (user, movie) pairs observed in the training split
user_item_set = set(zip(train_ratings["userId"], train_ratings["movieId"]))

users, items, labels = [], [], []

for user, movie in tqdm(user_item_set):
    # The observed interaction itself is the positive sample
    users.append(user)
    items.append(movie)
    labels.append(1)

    accepted = 0
    while accepted < num_negatives:
        negative_item = np.random.choice(all_movieIds)
        # Redraw when the user has already interacted with the candidate
        if (user, negative_item) in user_item_set:
            continue
        users.append(user)
        items.append(negative_item)
        labels.append(0)  # a movie without an interaction is a negative
        accepted += 1

# My custom dataset (MovieLens)

In [None]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch dataset of positive interactions plus sampled negatives.

    Every distinct (userId, movieId) pair found in ``ratings`` becomes one
    positive sample (label 1). For each positive, ``num_negatives`` movies the
    user has no interaction with in ``ratings`` are drawn at random from
    ``all_movieIds`` and added as negative samples (label 0).

    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings; only the
            "userId" and "movieId" columns are read
        all_movieIds (array-like): All movieIds that negatives may be drawn from
        is_training (bool): Default is True. Only selects the progress-bar caption
        num_negatives (int): Negative samples drawn per positive. Default is 4

    Note:
        Negatives are found by redrawing until a non-interacted movie comes up,
        so every user needs at least one movie in ``all_movieIds`` without an
        interaction in ``ratings``; otherwise the sampling loop never ends.
    """

    def __init__(self, ratings, all_movieIds, is_training: bool = True, num_negatives: int = 4):
        self.is_training = is_training
        self.num_negatives = num_negatives
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        """Return the total number of samples (positives and negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the (user, item, label) triple stored at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        """Build the user, item and label tensors.

        Args:
            ratings (pd.DataFrame): Source of the observed (userId, movieId) pairs
            all_movieIds (array-like): Pool of movieIds to sample negatives from

        Returns:
            tuple: ``(users, items, labels)`` as three equally long integer tensors
        """
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings["userId"], ratings["movieId"]))

        for u, i in tqdm(
            user_item_set,
            desc=f"Generating negative sample for {'training' if self.is_training else 'validating'}",
        ):
            # The observed interaction is the positive sample
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(self.num_negatives):
                negative_item = np.random.choice(all_movieIds)
                # Redraw until the candidate is a movie this user has not interacted with
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        # Explicit dtype keeps the tensors integer-typed even when no samples exist
        return (
            torch.tensor(users, dtype=torch.long),
            torch.tensor(items, dtype=torch.long),
            torch.tensor(labels, dtype=torch.long),
        )

# Model Architecture

In [None]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF)

    User and item IDs are looked up in two 8-dimensional embedding tables, the
    embeddings are concatenated and passed through two ReLU dense layers, and a
    sigmoid output unit yields a score in (0, 1) that is trained with binary
    cross-entropy against the interaction labels.

    Args:
        num_users (int): Number of rows in the user embedding table
        num_items (int): Number of rows in the item embedding table
        ratings (pd.DataFrame): Dataframe containing the movie ratings for training
        all_movieIds (list): List containing all movieIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()

        # Kept on the module so train_dataloader() can rebuild the training set
        self.ratings = ratings
        self.all_movieIds = all_movieIds

        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        # 16 input features = 8-dim user embedding + 8-dim item embedding
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)

    def forward(self, user_input, item_input):
        """Score user/item ID pairs.

        Args:
            user_input (torch.Tensor): Integer user IDs
            item_input (torch.Tensor): Integer item IDs, same shape as ``user_input``

        Returns:
            torch.Tensor: Sigmoid scores with a trailing dimension of size 1
        """
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Dense layers; functional activations avoid building a Module per call
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        # Output layer
        return torch.sigmoid(self.output(vector))

    def training_step(self, batch, batch_idx):
        """Compute and log the binary cross-entropy loss for one batch."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        # Labels are reshaped to (N, 1) floats to line up with the model output
        loss = nn.functional.binary_cross_entropy(
            predicted_labels, labels.view(-1, 1).float()
        )
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (learning rate 1e-3)."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def train_dataloader(self):
        """Build a training DataLoader over a newly generated MovieLensTrainDataset.

        Negatives are sampled again on every call. The dataset stores each
        positive directly followed by its negatives for the same user, so
        batches are shuffled to mix users and labels.
        """
        return DataLoader(
            MovieLensTrainDataset(self.ratings, self.all_movieIds, is_training=True),
            batch_size=constants.NCF_BATCH_SIZE,
            shuffle=True,
            num_workers=0,
            persistent_workers=False,
        )

In [None]:
# Pool of distinct movie IDs that training negatives are sampled from
all_movieIds = ratings["movieId"].unique()

# Raw IDs index the embedding tables directly, so each table needs max(ID) + 1 rows
num_items = ratings["movieId"].max() + 1
num_users = ratings["userId"].max() + 1

model = NCF(
    num_users=num_users,
    num_items=num_items,
    ratings=train_ratings,
    all_movieIds=all_movieIds,
)

In [None]:
# NCF has no validation loop and logs only "train_loss", so that is the metric the
# checkpoint callback has to monitor. Monitoring "val_loss" raises as soon as
# checkpointing is active, because the key is never logged.
checkpoint_callback = ModelCheckpoint(
    dirpath=r"sandbox/weights/",
    filename="{epoch}-{train_loss:.2f}",
    monitor="train_loss",
)
trainer = pl.Trainer(
    # Smoke-test mode: Lightning runs a single batch and disables checkpoint
    # callbacks; set to False for a real training run.
    fast_dev_run=True,
    max_epochs=5,
    # Rebuild the train dataloader each epoch so negatives are sampled again
    reload_dataloaders_every_n_epochs=1,
    devices="auto",
    accelerator="auto",
    logger=False,
    callbacks=[checkpoint_callback],
)

In [None]:
# Run training; the model supplies its own data through NCF.train_dataloader().
trainer.fit(model)

---DEBUG ZONE---

In [None]:
# Write the trainer's current state to a manually named checkpoint file.
trainer.save_checkpoint(r"sandbox/weights/NCF_FINAL_epoch.ckpt")

In [None]:
# Restore an NCF model from a previously saved checkpoint. The constructor arguments
# are passed explicitly because NCF.__init__ does not call save_hyperparameters().
# NOTE(review): this path points to ../src/weights/, not the sandbox/weights/ directory
# the cells above write to — confirm the file exists before running.
loaded_model = NCF.load_from_checkpoint(
    r"../src/weights/epoch=2-train_loss=0.12.ckpt",
    num_users=num_users,
    num_items=num_items,
    ratings=train_ratings,
    all_movieIds=all_movieIds,
)

--- END OF DEBUG ---

# Validation

In [None]:
# Check which columns the test split exposes before grouping it.
test_ratings.columns

In [None]:
# Bucket the test rows by user, keeping only the columns needed for evaluation
test_rows_by_user = test_ratings.groupby("userId")[["movieId", "rating"]]

# One list of [movieId, rating] pairs per user
pairs_per_user = test_rows_by_user.apply(lambda user_rows: user_rows.values.tolist())

# Mapping: userId -> list of [movieId, rating] pairs
grouped = pairs_per_user.to_dict()


In [None]:
# Switch the model to evaluation mode before scoring the test interactions.
model.eval()

In [None]:
from sklearn.metrics import ndcg_score

# The held-out test interactions all carry relevance 1, and NDCG over uniformly
# relevant items equals 1.0 for any ranking. Each user's held-out positives are
# therefore ranked against sampled movies the user never interacted with
# (relevance 0), which makes the score depend on the model's ordering.
NUM_EVAL_NEGATIVES = 99
NDCG_K = 20

# Dedicated seeded generator so the sampled candidates are reproducible
eval_rng = np.random.default_rng(42)

# Movies each user interacted with anywhere in the data (train + test)
interacted_by_user = ratings.groupby("userId")["movieId"].unique().to_dict()

ndcg_scores = []

for user_id, items_targets in tqdm(grouped.items()):
    # Held-out positives of this user (first element of every [movieId, rating] pair)
    positive_items = [int(pair[0]) for pair in items_targets]

    # Candidate negatives: catalogue movies without any interaction by this user
    seen_items = list(interacted_by_user.get(user_id, [])) + positive_items
    candidate_negatives = np.setdiff1d(all_movieIds, seen_items)
    if len(candidate_negatives) == 0:
        # Nothing to rank the positives against; the metric would be degenerate
        continue
    negative_items = eval_rng.choice(
        candidate_negatives,
        size=min(NUM_EVAL_NEGATIVES, len(candidate_negatives)),
        replace=False,
    )

    items = positive_items + [int(item) for item in negative_items]
    true_targets = [1] * len(positive_items) + [0] * len(negative_items)

    # Model predictions for this user, computed on whichever device the model lives on
    items_tensor = torch.tensor(items, dtype=torch.long, device=model.device)
    user_tensor = torch.full_like(items_tensor, int(user_id))

    with torch.no_grad():
        predictions = model(user_tensor, items_tensor).flatten().cpu().numpy()

    # NDCG@20 for this user
    ndcg_scores.append(ndcg_score([true_targets], [predictions], k=NDCG_K))


average_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0

print("Средний NDCG@20:", average_ndcg)
