In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.preprocessing import MultiLabelBinarizer


In [None]:
import sys
sys.path.append('../src')

import constants
from utils import train_test_split

# Read data

In [None]:
# Load the ratings table from constants.RATINGS_PATH_SANDBOX;
# the 'timestamp' column is parsed into datetimes while reading.
ratings = pd.read_csv(constants.RATINGS_PATH_SANDBOX, parse_dates=['timestamp'])

# Data preprocessing

In [None]:
# For training purposes we keep only 30% of the users (with all of their ratings).
# A seeded generator makes the sampled subset identical after every
# "Restart Kernel -> Run All".
# NOTE: `ratings` is overwritten below, so re-running this cell alone without
# re-reading the CSV subsamples the already reduced frame again.
unique_user_ids = ratings['userId'].unique()
rng = np.random.default_rng(42)
rand_userIds = rng.choice(unique_user_ids,
                          size=int(len(unique_user_ids) * 0.3),
                          replace=False)

ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]
print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

### Train-test split

In [None]:
# Split the sampled ratings into train and test parts using the project helper
# `utils.train_test_split` (the split strategy is defined in ../src/utils and is
# not visible in this notebook).
train_ratings, test_ratings = train_test_split(ratings)

In [None]:
# Peek at five random rows of the training split.
train_ratings.sample(5)

# Our custom MovieLens Dataset

In [None]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training

    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        movies (pd.DataFrame): Dataframe containing information on all the movies

    """

    def __init__(self, ratings: pd.DataFrame, movies: pd.DataFrame):
        self.users, self.items, self.genre, self.release_date, self.labels = self.get_dataset(ratings, movies)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.genre[idx], self.release_date[idx], self.labels[idx]

    @staticmethod
    def get_release_date_from_title(title: str) -> int | None:
        match = re.search(r'\((\d{4})\)', title)
        if match:
            return int(match.group(1))
        return None

    def get_dataset(self, ratings: pd.DataFrame, movies: pd.DataFrame):
        ratings = ratings.copy()
        movies = movies.copy()

        # Work with ratings dataframe
        users = ratings['userId'].tolist()
        items = ratings['movieId'].tolist()
        labels = ratings['rating'].tolist()

        # Work with movies dataframe
        movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
        # One-Hot Encoding for genres
        mlb = MultiLabelBinarizer()
        genres_encoded = mlb.fit_transform(movies['genres'])
        genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=movies['movieId'])
        # Map genres to rating
        genres = [genres_df.loc[movie_id].values for movie_id in items]

        # Regex for release_date
        movies['release_date'] = movies['title'].apply(self.get_release_date_from_title)
        release_dates_df = movies.set_index('movieId')['release_date'].fillna(-1).astype(int)
        # Map release dates to ratings
        release_dates = [release_dates_df.loc[movie_id] for movie_id in items]

        return (
            torch.tensor(users),
            torch.tensor(items),
            torch.tensor(genres),
            torch.tensor(release_dates),
            torch.tensor(labels)
        )

    def get_num_users(self) -> int:
        return len(self.users)

    def get_num_items(self) -> int:
        return len(self.items)

    def get_num_genres(self) -> int:
        return self.genre.shape[1]

### Example + test

In [None]:
# Tiny hand-made ratings table: 8 ratings from 4 users over 6 movies.
exmpl_ratings = pd.DataFrame({
    'userId': [1, 1, 2, 2, 2, 3, 3, 4],
    'movieId': [101, 102, 101, 103, 104, 101, 105, 106],
    'rating': [5, 3, 4, 5, 2, 3, 4, 5]
})

# Matching movies table. "Movie F" deliberately has no "(YYYY)" in its title,
# which exercises the missing-release-year path of the dataset.
exmpl_movies = pd.DataFrame({
    'movieId': [101, 102, 103, 104, 105, 106],
    'title': ["Movie A (2000)", "Movie B (2001)", "Movie C (2002)", "Movie D (1999)", "Movie E (2000)", "Movie F"],
    'genres': ["Action|Adventure|Thriller", "Drama|Romance", "Comedy|Romance", "Action|Drama", "Thriller", "Adventure|Drama"]
})

In [None]:
# Build the dataset from the toy frames defined above.
exmpl_dataset = MovieLensTrainDataset(exmpl_ratings, exmpl_movies)

In [None]:
# Print every sample tuple (user, item, genre vector, release year, rating)
# so the encoding can be checked by eye.
for i in range(len(exmpl_dataset)):
    print(exmpl_dataset[i])

# Model Architecture

In [None]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)

        A user embedding is concatenated with an item representation (the sum of
        the movie-id, genre and release-year embeddings) and fed to a small MLP
        that outputs one rating per sample.

        Args:
            dataset (MovieLensTrainDataset): Dataset for training; it provides the
                embedding table sizes and is served by ``train_dataloader``.
    """

    # Width of every embedding vector
    EMBEDDING_DIM = 8
    # Release years covered by the date embedding (both ends inclusive)
    MIN_RELEASE_YEAR = 1891
    MAX_RELEASE_YEAR = 2015
    # Valid rating range used to clamp predictions at inference time
    MIN_RATING = 1.0
    MAX_RATING = 5.0

    def __init__(self, dataset: MovieLensTrainDataset):
        super().__init__()
        dim = self.EMBEDDING_DIM
        self.user_embedding = nn.Embedding(num_embeddings=dataset.get_num_users(), embedding_dim=dim)
        self.item_embedding = nn.Embedding(num_embeddings=dataset.get_num_items(), embedding_dim=dim)
        # Genres arrive as a multi-hot vector, so they are projected with a linear
        # layer (nn.Linear takes in_features/out_features, not the nn.Embedding kwargs).
        self.genre_embedding = nn.Linear(in_features=dataset.get_num_genres(), out_features=dim)
        # One row per year in [MIN_RELEASE_YEAR, MAX_RELEASE_YEAR] plus row 0,
        # which is reserved for unknown / out-of-range years.
        num_years = self.MAX_RELEASE_YEAR - self.MIN_RELEASE_YEAR + 1
        self.date_embedding = nn.Embedding(num_embeddings=num_years + 1, embedding_dim=dim)

        # forward() concatenates the user vector with the summed item vector,
        # so the first dense layer receives 2 * dim features.
        self.fc1 = nn.Linear(in_features=2 * dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.dataset = dataset

    def _year_to_index(self, date_input):
        """Map raw release years to rows of ``date_embedding``.

        Years inside [MIN_RELEASE_YEAR, MAX_RELEASE_YEAR] map to 1..N; anything
        else (including -1) maps to row 0.
        """
        in_range = (date_input >= self.MIN_RELEASE_YEAR) & (date_input <= self.MAX_RELEASE_YEAR)
        shifted = date_input - self.MIN_RELEASE_YEAR + 1
        return torch.where(in_range, shifted, torch.zeros_like(shifted))

    def forward(self, user_input, item_input, genre_input, date_input):
        """Predict ratings for a batch.

        Args:
            user_input: Integer tensor of user ids.
            item_input: Integer tensor of movie ids.
            genre_input: Multi-hot genre vectors, one row per sample.
            date_input: Integer tensor of release years (-1 when unknown).

        Returns:
            Tensor with one predicted rating per sample in the last dimension.
            In eval mode the prediction is clamped to [MIN_RATING, MAX_RATING].
        """
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)
        # nn.Linear needs floating-point input; genre vectors may arrive as integers
        genre_embedded = self.genre_embedding(genre_input.float())
        date_embedded = self.date_embedding(self._year_to_index(date_input))

        # Item embedding = movie_id + movie_genre + movie_release_date
        item_embedded = item_embedded + genre_embedded + date_embedded

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layers
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        pred = self.output(vector)
        # Clamp only outside of training: torch.clamp has zero gradient for values
        # outside [min, max], so clamping during training blocks learning for every
        # prediction that starts outside the rating range.
        if not self.training:
            pred = torch.clamp(pred, min=self.MIN_RATING, max=self.MAX_RATING)

        return pred

    def training_step(self, batch, batch_idx):
        """Compute the MSE between predicted and true ratings for one batch."""
        user_input, item_input, genre_input, date_input, labels = batch
        predicted_labels = self(user_input, item_input, genre_input, date_input)
        loss = nn.functional.mse_loss(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        """Adam over all model parameters with PyTorch's default settings."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """DataLoader over the training dataset, reshuffled every epoch."""
        # If you run this on Google Colab you can try num_workers=5 (NOT TESTED).
        # It is 0 for now because a Jupyter notebook might not work properly with
        # multiprocessing (https://stackoverflow.com/a/71193241/16733101)
        return DataLoader(self.dataset, batch_size=512, shuffle=True, num_workers=0)

In [None]:
# Load the movies table from constants.MOVIE_PATH_SANDBOX and build the
# training dataset from the train split of the ratings.
movies = pd.read_csv(constants.MOVIE_PATH_SANDBOX)
train_dataset = MovieLensTrainDataset(train_ratings, movies)

In [None]:
# Show the three sizes the model uses to dimension its user, item and genre layers.
print(train_dataset.get_num_users(), train_dataset.get_num_items(), train_dataset.get_num_genres())

In [None]:
# Instantiate the model; layer sizes are taken from `train_dataset`, which the
# model also keeps as the source for its training DataLoader.
model = NCF(train_dataset)