In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.metrics import ndcg_score

In [None]:
import sys
sys.path.append('../src')

import config as cfg

In [None]:
# %%time

# Load the raw tables from cfg.DATASET_PATH.
# Forward slashes are used as the separator because they are accepted on
# Windows as well as on POSIX systems, whereas a literal backslash only works
# on Windows.
df_train = pd.read_csv(f'{cfg.DATASET_PATH}/train.csv')
df_test = pd.read_csv(f'{cfg.DATASET_PATH}/test.csv')

df_songs = pd.read_csv(f'{cfg.DATASET_PATH}/songs.csv')
df_members = pd.read_csv(f'{cfg.DATASET_PATH}/members.csv')

In [None]:
# Train on the interactions of a random 30% of the users found in df_train.
# A locally seeded generator makes the sample reproducible across kernel
# restarts without touching NumPy's global random state.
train_msno = df_train['msno'].unique()
rand_userIds = np.random.default_rng(42).choice(train_msno,
                                                size=int(len(train_msno) * 0.3),
                                                replace=False)

# .copy() detaches the subset from df_train so that later column assignments
# operate on an independent frame instead of a view/slice.
df = df_train.loc[df_train['msno'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(df), len(rand_userIds)))

In [None]:
# Display five randomly chosen rows of the sampled training subset.
df.sample(5)

In [None]:
# Vocabularies: every distinct user in members.csv and song in songs.csv.
ALL_USERS = df_members['msno'].unique().tolist()
ALL_ITEMS = df_songs['song_id'].unique().tolist()

# Integer index -> raw identifier.
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# Raw identifier -> integer index (inverse of the dictionaries above).
user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

# .assign returns a new frame, so nothing is written into a slice of df_train.
df = df.assign(user_id=df['msno'].map(user_map),
               item_id=df['song_id'].map(item_map))

# Identifiers missing from the vocabularies map to NaN. Drop such rows for BOTH
# columns and cast to int64 so the values are valid embedding indices.
df = (df.dropna(subset=['user_id', 'item_id'])
        .astype({'user_id': 'int64', 'item_id': 'int64'}))

In [None]:
# Keep only the encoded user index, encoded item index and the label column.
df = df[['user_id', 'item_id', 'target']]

In [None]:
class MusicTrainDataset(Dataset):
    """PyTorch dataset of (user, item, label) training samples.

    Args:
        ratings (pd.DataFrame): Dataframe which contains the 'user_id' and
            'item_id' columns and, normally, a 'target' column with the label
            of each interaction.

    Each distinct (user_id, item_id) pair yields exactly one sample. When the
    'target' column is present it supplies the label (for a repeated pair the
    last occurrence in row order wins); when it is absent every sample is
    labelled 1.
    """

    def __init__(self, ratings):
        self.users, self.items, self.labels = self.get_dataset(ratings)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings):
        """Build the user, item and label tensors from ``ratings``.

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: three 1-D int64
            tensors of equal length (users, items, labels).
        """
        has_target = 'target' in ratings.columns
        columns = ['user_id', 'item_id'] + (['target'] if has_target else [])

        # De-duplicate on the pair only; samples keep the frame's row order,
        # which is deterministic (unlike iterating over a set).
        pairs = ratings[columns].drop_duplicates(subset=['user_id', 'item_id'],
                                                 keep='last')

        users = torch.tensor(pairs['user_id'].to_numpy(), dtype=torch.long)
        items = torch.tensor(pairs['item_id'].to_numpy(), dtype=torch.long)
        if has_target:
            # Use the observed label instead of a constant 1; with a constant
            # label the loss would only ever see a single class.
            labels = torch.tensor(pairs['target'].to_numpy(), dtype=torch.long)
        else:
            labels = torch.ones(len(pairs), dtype=torch.long)

        return users, items, labels

In [None]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)

        A user embedding and an item embedding (8 dimensions each) are
        concatenated and passed through two ReLU dense layers and a sigmoid
        output unit, producing one score in (0, 1) per (user, item) pair.

        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the ratings for training
    """

    def __init__(self, num_users, num_items, ratings: DataFrame):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        # in_features=16 is the two concatenated 8-dimensional embeddings.
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings

    def forward(self, user_input, item_input):
        """Score (user, item) index pairs.

        Args:
            user_input (torch.Tensor): integer user indices.
            item_input (torch.Tensor): integer item indices, same shape.

        Returns:
            torch.Tensor: sigmoid outputs with a trailing dimension of size 1.
        """
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layers (functional activations: no module is
        # re-created on every forward call)
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        # Output layer
        return torch.sigmoid(self.output(vector))

    def training_step(self, batch, batch_idx):
        """Return the binary cross-entropy loss of one (users, items, labels) batch."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        return nn.functional.binary_cross_entropy(predicted_labels,
                                                  labels.view(-1, 1).float())

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build the training DataLoader from ``self.ratings``."""
        # If you run this on Google Colab you may set num_workers=5 (NOT TESTED).
        # It is 0 for now because a Jupyter notebook might not work properly
        # with multiprocessing (https://stackoverflow.com/a/71193241/16733101).
        # shuffle=True: training batches should not follow the storage order
        # of the samples.
        return DataLoader(MusicTrainDataset(self.ratings),
                          batch_size=512, shuffle=True, num_workers=0)

In [None]:
# Embedding table sizes: one row per user / item in the full vocabularies
# (ALL_USERS / ALL_ITEMS), not only those present in the training sample.
num_users = len(ALL_USERS)
num_items = len(ALL_ITEMS)

# `df` is kept on the model; NCF.train_dataloader() builds the dataset from it.
model = NCF(num_users, num_items, df)

In [None]:
# Train for at most 5 epochs with experiment logging turned off.
trainer = pl.Trainer(max_epochs=5, logger=False)

# No dataloader is passed here; the model supplies its own via train_dataloader().
trainer.fit(model)

---DEBUG ZONE---

In [None]:
# Trainer.save_checkpoint requires the destination file path; calling it with
# no argument raises a TypeError.
trainer.save_checkpoint("checkpoints/ncf_manual.ckpt")

In [None]:
# Restore the model from a saved checkpoint. The constructor arguments are
# passed explicitly alongside the checkpoint path.
# NOTE(review): the file name encodes an epoch and a global step; the step count
# presumably depends on how many training batches the sampled data produced, so
# confirm that this exact file exists for the current run.
model = NCF.load_from_checkpoint(r"checkpoints/epoch=4-step=21555.ckpt", num_users=num_users, num_items=num_items, ratings=df)

--- END OF DEBUG ---

# Validation

In [None]:
# df_test has no `target` column, so the recommender cannot be evaluated on it.
# Instead, hold out the interactions of another 30% of the df_train users as a
# validation set.

# Only users that were NOT sampled for training (rand_userIds) are eligible;
# otherwise validation rows would overlap the training rows and the metric
# would be inflated by leakage.
train_msno = df_train['msno'].unique()
heldout_msno = train_msno[~pd.Index(train_msno).isin(rand_userIds)]

# Same 30% target size as before, capped by the number of eligible users.
val_size = min(int(len(train_msno) * 0.3), len(heldout_msno))
val_userIds = np.random.default_rng(7).choice(heldout_msno,
                                              size=val_size,
                                              replace=False)

# .copy() so that later column assignments do not write into a slice of df_train.
val_data = df_train.loc[df_train['msno'].isin(val_userIds)].copy()

In [None]:
# Encode raw identifiers with the same vocabularies that were used for training.
# .assign returns a new frame, so nothing is written into a slice of df_train.
val_data = val_data.assign(user_id=val_data['msno'].map(user_map),
                           item_id=val_data['song_id'].map(item_map))

# Identifiers missing from the vocabularies map to NaN. Drop such rows for BOTH
# columns and cast to int64 so the values are valid embedding indices.
val_data = (val_data.dropna(subset=['user_id', 'item_id'])
                    .astype({'user_id': 'int64', 'item_id': 'int64'}))

In [None]:
# Build the (n_rows, 2) index tensor straight from the underlying NumPy data
# instead of materialising a Python list of tuples first.
val_pairs_np = val_data[['user_id', 'item_id']].to_numpy()

# A NaN here means an identifier was not mapped; casting it to an integer would
# silently yield a meaningless embedding index, so fail loudly instead.
if pd.isna(val_pairs_np).any():
    raise ValueError('val_data contains unmapped user_id/item_id values; '
                     'drop those rows before building the input tensors')

# Explicit int64: nn.Embedding requires integer indices.
val_user_item_pairs = torch.tensor(val_pairs_np.astype('int64'))
user_inputs = val_user_item_pairs[:, 0]
item_inputs = val_user_item_pairs[:, 1]


In [None]:
# Switch the module to evaluation mode and score every validation pair in a
# single forward pass; gradients are not tracked during inference.
model.eval()
with torch.no_grad():
    # The model returns one column per pair; flatten to a 1-D vector of scores.
    predictions = model(user_inputs, item_inputs).flatten()

In [None]:
# First 20 encoded user indices fed to the model.
user_inputs[:20]

In [None]:
# First 20 encoded item indices fed to the model.
item_inputs[:20]

In [None]:
# First 20 predicted scores, in the same order as the inputs above.
predictions[:20]

In [None]:
# Mean per-user NDCG@20 on the validation set.
# The model scores in `predictions` are in the same row order as `val_data`
# (both were derived from it), so they can be attached positionally.
pred_scores = predictions.detach().cpu().numpy()
if len(pred_scores) != len(val_data):
    raise ValueError('predictions and val_data are out of sync: '
                     f'{len(pred_scores)} scores for {len(val_data)} rows')

val_scored = val_data[['user_id', 'target']].assign(score=pred_scores)

average_ndcg_scores = []

# One groupby pass instead of filtering the whole frame once per user.
for user_id, user_df in tqdm(val_scored.groupby('user_id', sort=False)):
    true_relevance = user_df['target'].to_numpy()

    # ndcg_score needs at least two documents, and NDCG carries no information
    # for a user without a single relevant item (the ideal DCG is zero).
    if len(true_relevance) > 1 and true_relevance.sum() > 0:
        ndcg_value = ndcg_score([true_relevance], [user_df['score'].to_numpy()], k=20)
        average_ndcg_scores.append(ndcg_value)

average_ndcg = np.mean(average_ndcg_scores) if average_ndcg_scores else 0
print('Средний NDCG@20:', average_ndcg)

In [None]:
# y_true = []
# y_score = []

# for user_id in tqdm(sorted(val_data['user_id'].unique())):
#     user_items = val_data[val_data['user_id'] == user_id]
#     true_relevance = user_items['target'].tolist()
#     predicted_scores = [predictions[i] for i in range(len(user_items))]

#     y_true.append(true_relevance)
#     y_score.append(predicted_scores)

# y_score = [[int((t >= 0.5).item()) for t in sublist] for sublist in y_score]
# # Compute the NDCG score
# ndcg = ndcg_score(y_true, y_score, k=20)
# print('Оценка NDCG:', ndcg)