In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.metrics import ndcg_score

In [2]:
import sys
sys.path.append('../src')

import config as cfg

In [3]:
# %%time

df_train = pd.read_csv(f'{cfg.DATASET_PATH}\\train.csv')
df_test = pd.read_csv(f'{cfg.DATASET_PATH}\\test.csv')

df_songs = pd.read_csv(f'{cfg.DATASET_PATH}\\songs.csv')
df_members = pd.read_csv(f'{cfg.DATASET_PATH}\\members.csv')

In [4]:
rand_userIds = np.random.choice(df_train['msno'].unique(),
                                size=int(len(df_train['msno'].unique())*0.3),
                                replace=False)

df = df_train.loc[df_train['msno'].isin(rand_userIds)]

print('There are {} rows of data from {} users'.format(len(df), len(rand_userIds)))

There are 2280464 rows of data from 9226 users


In [5]:
df.sample(5)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
2173217,07+a5Wl5ujG0rOjTrZ1nsd3VxpLuAsgEwN/pyXHwvRc=,MNX4VRUyD8fchsTo9zmvhlDR2Fnmuu+YpD9zY3v9VYE=,radio,Radio,radio,0
2574452,k4hXX5Ry0IOXwrxkf0umgrJeyPG5uT5XyVKJ5Vf9oPs=,FnAI7SkF8cquJEJw8wx6m/BGn56+OEXLdtZHf9Hl1vQ=,my library,Local playlist more,local-playlist,0
4103469,lyyEz2p7AEEvU4a1xdlk7P8foomLKPRvaZBX0Mt7iu0=,uZF+pQaLTyHBXuPU71bhv2TrcZNivkCC7KEhNAqRQRg=,my library,Album more,album,0
3962489,uszq2ylKoWwlEZTGgMxfZkbElMaf4L0FYCoDMiS2Goc=,Umpx3/OIYe/mrDn2ZFSoEkz4ckfhAJsmu3tw5p1ZPLs=,my library,Local playlist more,local-library,1
3630366,tgZ23S9kfDbKRvRHCNOWozw1BAlDi7EIgvDVx91ZZb8=,3VkD5ekIf5duJm1hmYTZlXjyl0zqV8wCzuAh3uocfCg=,discover,Online playlist more,online-playlist,1


In [6]:
ALL_USERS = df_members['msno'].unique().tolist()
ALL_ITEMS = df_songs['song_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

df['user_id'] = df['msno'].map(user_map)
df['item_id'] = df['song_id'].map(item_map)

df.dropna(subset=['item_id'], inplace=True)
df['item_id'] = df['item_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = df['msno'].map(user_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['item_id'] = df['song_id'].map(item_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['item_id'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [7]:
df = df[['user_id', 'item_id', 'target']]

In [8]:
class MusicTrainDataset(Dataset):
    """MusicTrainDataset PyTorch Dataset for Training

    Args:
        ratings (pd.DataFrame): Dataframe which contains the 'user_id', 'item_id', 'target' columns

    """

    def __init__(self, ratings):
        self.users, self.items, self.labels = self.get_dataset(ratings)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['user_id'], ratings['item_id']))

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [9]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)

        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the ratings for training
    """

    def __init__(self, num_users, num_items, ratings: DataFrame):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings

    def forward(self, user_input, item_input):

        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred

    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MusicTrainDataset(self.ratings),
                          batch_size=512, num_workers=0)
        # Если вы запускаете код на Google colab то можете выставить num_workers=5 (НЕ ПРОВЕРЕННО). В данный момент стоит 0, т.к:
        # jupyter notebook might not work properly with multiprocessing as documented (https://stackoverflow.com/a/71193241/16733101)

In [10]:
num_users = len(ALL_USERS)
num_items = len(ALL_ITEMS)

model = NCF(num_users, num_items, df)

In [None]:
trainer = pl.Trainer(max_epochs=5, logger=False)

trainer.fit(model)

---DEBUG ZONE---

In [None]:
trainer.save_checkpoint()

In [11]:
model = NCF.load_from_checkpoint(r"checkpoints/epoch=4-step=21555.ckpt", num_users=num_users, num_items=num_items, ratings=df)

--- END OF DEBUG ---

# Validation

In [12]:
# Т.к. в df_test нет target, то мы не можем оценить нашу рекоменадетльную систему на этом датафрейме. Поэтому я возьму еще 30% от df_train и они будут в качестве валидации

rand_userIds = np.random.choice(df_train['msno'].unique(),
                                size=int(len(df_train['msno'].unique())*0.3),
                                replace=False)

val_data = df_train.loc[df_train['msno'].isin(rand_userIds)]

In [13]:
val_data['user_id'] = val_data['msno'].map(user_map)
val_data['item_id'] = val_data['song_id'].map(item_map)
val_data.dropna(subset=['item_id'], inplace=True)
val_data['item_id'] = val_data['item_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['user_id'] = val_data['msno'].map(user_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['item_id'] = val_data['song_id'].map(item_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data.dropna(subset=['item_id'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [18]:
grouped = val_data.groupby('user_id')[['item_id', 'target']].apply(
    lambda x: x.values.tolist()).to_dict()

In [26]:
model.eval()

NCF(
  (user_embedding): Embedding(34403, 8)
  (item_embedding): Embedding(2296320, 8)
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
)

In [28]:
ndcg_scores = []

for user_id, items_targets in tqdm(grouped.items()):
    items, true_targets = zip(*items_targets)

    # Прогнозы модели для данного пользователя
    items_tensor = torch.tensor(items, dtype=torch.long)
    user_tensor = torch.tensor([user_id] * len(items), dtype=torch.long)
    with torch.no_grad():
        predictions = model(user_tensor, items_tensor).flatten().numpy()

    # Рассчитываем NDCG@20 для пользователя
    if len(true_targets) > 1:  # NDCG не имеет смысла для одного элемента
        ndcg_val = ndcg_score([true_targets], [predictions], k=20)
        ndcg_scores.append(ndcg_val)

average_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0
print('Средний NDCG@20:', average_ndcg)

100%|██████████| 9226/9226 [00:07<00:00, 1209.33it/s]

Средний NDCG@20: 0.47137306496113224



