In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.tensorboard import SummaryWriter

from torch_geometric import seed_everything
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import RandomLinkSplit

%config InlineBackend.figure_format='retina'

# Datasets

In [3]:
class PlainData(Data):
    """
    Thin ``Data`` subclass that switches off index offsetting during batching.

    ``__inc__`` is overridden to return 0 for every attribute, so collating several ``PlainData``
    objects in a PyG DataLoader leaves the stored indices (e.g. ``edge_index``) untouched, which is
    closer to how the plain PyTorch DataLoader behaves.
    Background: https://pytorch-geometric.readthedocs.io/en/latest/notes/batching.html
    """
    def __inc__(self, key, value, *args, **kwargs):
        # Same (zero) increment regardless of which attribute is being collated.
        no_offset = 0
        return no_offset

class MyDataset(Dataset):
    """
    Dataset of supervision/evaluation edges, grouped by source node.

    The DataLoader uses it to load batches of edges on which the loss or the evaluation metrics are
    computed. Item ``i`` holds ALL edges of ``edge_index`` whose source is the ``i``-th distinct
    source id (distinct ids are kept in ascending order). Keeping all of a user's positive edges in
    one item is required for metrics such as recall@k.

    Args:
        root: Root directory forwarded to ``Dataset.__init__``.
        edge_index: Edge tensor indexed as ``edge_index[0, :]`` (sources) / ``edge_index[1, :]``.
        transform: Forwarded to ``Dataset.__init__``.
        pre_transform: Forwarded to ``Dataset.__init__``.
    """
    def __init__(self, root, edge_index, transform=None, pre_transform=None):
        self.edge_index = edge_index
        # torch.unique returns the ids sorted; the position in this list is the dataset index.
        self.unique_idxs = torch.unique(edge_index[0, :]).tolist()
        self.num_nodes = len(self.unique_idxs)
        super().__init__(root, transform, pre_transform)

    def len(self):
        """Number of distinct source nodes, i.e. number of items in the dataset."""
        return self.num_nodes

    def get(self, idx):
        """Return every edge whose source is the ``idx``-th distinct source id."""
        # Translate the dataset position into the real node id. Using ``idx`` directly is only
        # correct when the source ids happen to be exactly 0..len-1; otherwise some items come
        # back empty and sources with id >= len() are never returned.
        source_id = self.unique_idxs[idx]
        edge_index = self.edge_index[:, self.edge_index[0, :] == source_id]
        return PlainData(edge_index=edge_index)

In [25]:
class MovieLens:
    """
    Loads MovieLens ratings, keeps the positive interactions and builds a bipartite user-item graph.

    Users are re-indexed to ``0 .. num_users - 1`` and items to ``num_users .. num_nodes - 1`` so
    that both live in one node-id space. The graph is then split into train/val/test link sets.

    Args:
        data_path: Tab-separated ratings file; the first three columns are read as
            user id, item id and rating.
        min_rating: Ratings greater than or equal to this value are kept as positive interactions.

    Attributes set by the constructor: ``df``, ``num_users``, ``num_items``, ``num_nodes``,
    ``edges``, ``graph``, ``user_encoder``, ``item_encoder``, ``train_split``, ``val_split``,
    ``test_split``.
    """
    def __init__(self, data_path='ml-100k/u.data', min_rating=4):
        ratings = pd.read_table(data_path, header=None, usecols=[0, 1, 2])
        ratings.columns = ('UserId', 'ItemId', 'Rating')
        # .copy() gives an independent frame, so the column assignments below do not go through a
        # filtered view of ``ratings`` (which triggers pandas' SettingWithCopyWarning).
        df = ratings[ratings['Rating'] >= min_rating].copy()

        # Separate encoders are kept so encoded ids can be mapped back to the raw ids later.
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        df['UserId'] = self.user_encoder.fit_transform(df['UserId'])
        df['ItemId'] = self.item_encoder.fit_transform(df['ItemId'])

        self.num_users = df['UserId'].nunique()
        self.num_items = df['ItemId'].nunique()
        # Shift item ids past the user ids. After label encoding the user ids are 0..num_users-1,
        # so this offset equals ``max(UserId) + 1``.
        df['ItemId'] += self.num_users

        self.df = df
        self._build_graph()
        self._train_test_split()

    def _build_edge_index(self):
        """Return a (2, 2 * n_interactions) long tensor with every edge in both directions."""
        users = torch.tensor(self.df['UserId'].values, dtype=torch.long)
        items = torch.tensor(self.df['ItemId'].values, dtype=torch.long)
        # First half: user -> item, second half: item -> user.
        source = torch.cat([users, items])
        target = torch.cat([items, users])
        return torch.stack([source, target], dim=0)

    def _build_graph(self):
        """Build the full graph and record its edge tensor and node count."""
        edges = self._build_edge_index()
        self.num_nodes = len(edges.unique())
        self.edges = edges
        self.graph = Data(edge_index=edges, num_nodes=self.num_nodes)

    def _train_test_split(self, val_ratio=0.15, test_ratio=0.15):
        """Randomly split the graph's links into train/val/test without negative samples."""
        splitter = RandomLinkSplit(is_undirected=True, add_negative_train_samples=False,
                                   neg_sampling_ratio=0, num_val=val_ratio, num_test=test_ratio)
        train_split, val_split, test_split = splitter(self.graph)
        # Sanity check: all three splits must report the same node count.
        assert train_split.num_nodes == val_split.num_nodes and train_split.num_nodes == test_split.num_nodes

        self.train_split = train_split
        self.val_split = val_split
        self.test_split = test_split

    def _split_to_datasets(self, split):
        """Turn one split into ``(edges to score, graph to propagate over)``."""
        # edge_label_index -> edges the loss/metrics are computed on;
        # edge_index       -> edges available for message passing in this split.
        ev = MyDataset('movielens', edge_index=split.edge_label_index)
        mp = Data(edge_index=split.edge_index)
        return ev, mp

    def get_train(self):
        """Return ``(train_ev, train_mp)`` for the training split."""
        return self._split_to_datasets(self.train_split)

    def get_val(self):
        """Return ``(val_ev, val_mp)`` for the validation split."""
        return self._split_to_datasets(self.val_split)

    def get_test(self):
        """Return ``(test_ev, test_mp)`` for the test split."""
        return self._split_to_datasets(self.test_split)

In [None]:
# RandomLinkSplit draws a random split; seed every RNG first so that
# Restart Kernel -> Run All reproduces the same train/val/test edges.
SEED = 42
seed_everything(SEED)

mov = MovieLens()

train_ev, train_mp = mov.get_train()
val_ev, val_mp = mov.get_val()
test_ev, test_mp = mov.get_test()

# LightGCN
__TODO:__
- Переписать метод `recommend`, чтобы была возможность рекомендовать только ранее не виденные айтемы. Оставить и обычное поведение тоже, чтобы была возможность тестировать на трейне.


- Добавить в `__init__()` поля с количеством юзеров и айтемов. Подумать, в какой класс это добавить.

In [None]:
from torch_geometric.nn import LightGCN

class CustomLightGCN(LightGCN):
    """LightGCN whose embedding table is initialised with Xavier-uniform weights."""
    def reset_parameters(self):
        """Apply Xavier-uniform init (gain 1) to the embeddings, then reset each conv layer."""
        embedding_weights = self.embedding.weight
        torch.nn.init.xavier_uniform_(embedding_weights, gain=1)
        for layer in self.convs:
            layer.reset_parameters()

In [None]:
class Trainer:
    """
    Runs training epochs and recall@k evaluation for a LightGCN-style recommender.

    The model must support ``model(edge_index, edge_label_index)`` (one score per labelled edge) and
    ``model.recommend(edge_index, src_index=..., dst_index=..., k=...)`` (top-k item ids per user).
    Per-batch loss and recall are written to TensorBoard under ``./logs``.

    Args:
        model: The recommender; moved to the current CUDA device when one is available.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_ev: Dataset of training supervision edges (one item per user).
        val_ev: Dataset of validation edges (one item per user).
        batch_size: Number of users per batch.
    """
    def __init__(self, model, optimizer, train_ev, val_ev, batch_size=256):
        self.model = model
        self.opt = optimizer
        self.train_loader = DataLoader(train_ev, batch_size=batch_size, shuffle=True)
        self.val_loader = DataLoader(val_ev, batch_size=batch_size, shuffle=False)
        self.batch_size = batch_size

        self.device = 'cpu'
        if torch.cuda.is_available():
            # current_device() returns the integer index of the active CUDA device.
            self.device = torch.cuda.current_device()
            self.model = self.model.to(self.device)

        # Separate step counters so train and validation curves each get their own x-axis.
        self.global_train_step = 0
        self.global_val_step = 0
        self.writer = SummaryWriter("./logs")

    @staticmethod
    def sample_negative_edges(batch, data_mp, num_users, num_nodes):
        """
        Draw one random item for every positive edge in ``batch``.

        Items are sampled uniformly from ``[num_users, num_nodes)``. No check is made that a sampled
        item is not an actual positive of the user; ``data_mp`` is accepted but not used.

        Returns:
            ``Data`` with ``edge_index`` of shape (2, n_edges) (row 0: users of the batch, row 1:
            sampled items) and an all-zero long label vector ``y``.
        """
        users = batch.edge_index[0, :]
        assert bool((users < num_users).all()), "edge sources must be user ids"
        num_edges = users.numel()
        # One vectorised draw instead of a Python loop with one randint call per edge.
        neg_items = torch.randint(num_users, num_nodes, (num_edges,), device=users.device)
        edge_index_negs = torch.stack([users, neg_items], dim=0)
        labels = torch.zeros(num_edges, dtype=torch.long, device=users.device)
        return Data(edge_index=edge_index_negs, y=labels)

    @staticmethod
    def _recall_per_user(users, recoms, edge_label_index):
        """Return, for each user, the fraction of their edges in ``edge_label_index`` hit by ``recoms``."""
        sources, targets = edge_label_index[0], edge_label_index[1]
        recalls = []
        for user, rec in zip(users, recoms):
            true_items = targets[sources == user].cpu().numpy()
            hits = len(np.intersect1d(rec.cpu().numpy(), true_items))
            recalls.append(hits / len(true_items))
        return recalls

    @staticmethod
    def _mean_or_nan(values):
        """Mean of ``values`` as a float, or NaN for an empty sequence (no numpy warning)."""
        return float(np.mean(values)) if len(values) else float('nan')

    def train(self, data_mp, k, num_users, num_nodes):
        """
        Run one training epoch over ``self.train_loader``.

        Args:
            data_mp: Graph whose ``edge_index`` is used for message passing.
            k: Number of recommendations per user for the recall@k metric.
            num_users: Number of user nodes; user ids are ``0 .. num_users - 1``.
            num_nodes: Total number of nodes; item ids are ``num_users .. num_nodes - 1``.

        Returns:
            Loss of the last batch as a float (NaN if the loader yielded no batch).
        """
        model = self.model
        opt = self.opt
        data_mp = data_mp.to(self.device)
        mp_edge_index = data_mp['edge_index']
        # Created on the model's device: recommend() indexes this tensor with device-side indices.
        items = torch.arange(num_users, num_nodes, device=self.device)
        recall_all = []
        last_loss = float('nan')

        model.train()
        for batch in self.train_loader:
            negs = self.sample_negative_edges(batch, data_mp, num_users, num_nodes)
            pos_index = batch.edge_index.to(self.device)
            neg_index = negs.edge_index.to(self.device)

            opt.zero_grad()
            pos_scores = model(mp_edge_index, pos_index)
            neg_scores = model(mp_edge_index, neg_index)

            # The metric needs no gradients; computed before the optimizer step, i.e. with the same
            # parameters that produced the scores above.
            with torch.no_grad():
                users = pos_index[0].unique()
                # Keywords on purpose: the positional order of recommend() differs between PyG
                # releases (newer ones take edge_weight as the second parameter).
                recoms = model.recommend(mp_edge_index, src_index=users, dst_index=items, k=k)
            recall_batch = self._recall_per_user(users, recoms, pos_index)
            recall_all.extend(recall_batch)

            # Pairwise ranking loss: softplus(neg - pos) == -log(sigmoid(pos - neg)).
            loss = torch.mean(torch.nn.functional.softplus(neg_scores - pos_scores))
            loss.backward()
            opt.step()
            last_loss = loss.item()

            self.writer.add_scalar("Recall/train", self._mean_or_nan(recall_batch),
                                   global_step=self.global_train_step)
            self.writer.add_scalar("Loss/train", last_loss, global_step=self.global_train_step)
            self.global_train_step += 1

        self.writer.flush()
        print(f'Train avg recall = {round(self._mean_or_nan(recall_all), 4)}')
        print(f'Current loss = {last_loss}')
        return last_loss

    def test(self, data_mp, k, num_users, num_nodes, train_split=None):
        """
        Compute recall@k over ``self.val_loader``.

        Args:
            data_mp: Graph whose ``edge_index`` is used for message passing.
            k: Number of recommendations per user.
            num_users: Number of user nodes; user ids are ``0 .. num_users - 1``.
            num_nodes: Total number of nodes; item ids are ``num_users .. num_nodes - 1``.
            train_split: Currently unused. Items a user has already interacted with are NOT
                removed from the candidate list, so recommendations may include seen items.

        Returns:
            Average recall@k over all evaluated users (NaN if there were none).
        """
        model = self.model
        items = torch.arange(num_users, num_nodes, device=self.device)
        recall_all = []

        model.eval()
        with torch.no_grad():
            data_mp = data_mp.to(self.device)
            mp_edge_index = data_mp['edge_index']

            for batch in self.val_loader:
                eval_index = batch.edge_index.to(self.device)
                users = eval_index[0].unique()
                recoms = model.recommend(mp_edge_index, src_index=users, dst_index=items, k=k)

                recall_batch = self._recall_per_user(users, recoms, eval_index)
                recall_all.extend(recall_batch)

                self.writer.add_scalar("Recall/valid", self._mean_or_nan(recall_batch),
                                       global_step=self.global_val_step)
                self.global_val_step += 1

        self.writer.flush()
        avg_recall = self._mean_or_nan(recall_all)
        print(f'Valid avg recall = {round(avg_recall, 4)}')
        return avg_recall

In [4]:
# Register the TensorBoard IPython extension so the %tensorboard magic is available.
%load_ext tensorboard

In [5]:
# Open the TensorBoard UI inline, reading from logs/ (the directory the Trainer's SummaryWriter writes to).
%tensorboard --logdir logs/

Launching TensorBoard...