In [1]:
!pip -q install torch_geometric rectools
!pip -q install comet_ml
!pip -q install python-dotenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.0/208.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m93.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.1/727.1 kB[0m [

In [2]:
import comet_ml
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

from dotenv import load_dotenv
import os

In [None]:
# Read variables from the local .env file into the process environment.
load_dotenv(dotenv_path=".env")

True

In [4]:
# Open a Comet experiment for this run. The API key is taken from the
# API_KEY environment variable so it never appears in the notebook.
experiment = Experiment(
    api_key=os.environ.get('API_KEY'),
    project_name="gnn-recommender",
    workspace="annanet",
    log_code=True,
)

# Run name and tags used to find this run in the Comet UI.
experiment.set_name('baseline-movielens')
experiment.add_tags(['movielens', 'leave-n-out'])

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/annanet/gnn-recommender/4b3f9c68f8684549869e2e3830603d43

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


In [5]:
# Run configuration. The whole dict is sent to Comet through
# experiment.log_parameters, and the training cell reads its values from here.
hyperparameters = {
    # Seed applied to torch, random and numpy.
    'seed': 42,
    # NOTE(review): the second label contains a Cyrillic 'с' (U+0441) instead of
    # an ASCII 'c'. The same spelling is written into train['target'], so both
    # places have to be changed together if the label is ever normalised.
    'types_of_feedback': ["explicit_positive", "expliсit_negative",
                          "implicit_positive", "implicit_negative"],
    # NOTE(review): read into `edge_type` before training, but not passed on to
    # train_simple_model.
    'train_edge_type': ('item','to_feedback_explicit_positive','explicit_positive'),
    'train_num_epochs': 100,
    'train_lr': 8e-5,
    # Number of (user, positive item) pairs per optimisation step.
    'train_batch_size': 16384,
    # Print the loss every N steps / run evaluation every N steps (per epoch).
    'train_print_every': 10,  
    'train_test_every': 50,
    # Cut-off K for the ranking metrics and user batch size during evaluation.
    'test_topk': 10,
    'test_batch_size': 8192
}

In [6]:
import os
# Sanity check: show which split files are available in the input directory
# (the same directory is stored in `rootpath` when the data is loaded).
os.listdir('/kaggle/input/data/leave-n-out/mvln')

['train.csv', 'test.csv']

In [7]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, SAGEConv, GATConv

from sklearn.preprocessing import LabelEncoder

from rectools import Columns
from rectools.metrics import MAP, Precision, Recall, NDCG, calc_metrics

import gc
import random

In [8]:
# Seed every random number generator used in this notebook with one value.
SEED = hyperparameters['seed']
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(SEED)

In [9]:
# Load the training split and add a datetime column derived from the
# `timestamp` column (interpreted as seconds).
rootpath = '/kaggle/input/data/leave-n-out/mvln/'
train = pd.read_csv(f'{rootpath}train.csv')
train['date'] = pd.to_datetime(train['timestamp'], unit='s')
print(train.head())

   user_id  movie_id  rating  timestamp                date
0        1      3186       4  978300019 2000-12-31 22:00:19
1        1      1270       5  978300055 2000-12-31 22:00:55
2        1      1721       4  978300055 2000-12-31 22:00:55
3        1      1022       5  978300055 2000-12-31 22:00:55
4        1      2340       3  978300103 2000-12-31 22:01:43


In [10]:
# Row labels of the explicit signals: rating == 5 is positive, rating <= 2 is
# negative. The variable name `explisit_negative` (sic) is kept because the
# labelling cell refers to it.
explicit_positive = train.loc[train["rating"] == 5].index
explisit_negative = train.loc[train["rating"] <= 2].index

explicit_combined_feedback = explicit_positive.union(explisit_negative)
print('Количество explicit позитивного фидбека', len(explicit_positive))
print('Количество explicit негативного фидбека', len(explisit_negative))

Количество explicit позитивного фидбека 211802
Количество explicit негативного фидбека 153484


In [11]:
# Row labels of the weaker signals: rating == 4 is treated as implicit
# positive, rating == 3 as implicit negative.
implicit_positive = train.loc[train["rating"] == 4].index
implicit_negative = train.loc[train["rating"] == 3].index

implicit_combined_feedback = implicit_positive.union(implicit_negative)
print('Количество implicit позитивного фидбека', len(implicit_positive))
print('Количество implicit негативного фидбека', len(implicit_negative))

Количество implicit позитивного фидбека 327987
Количество implicit негативного фидбека 246536


In [12]:
# Attach a feedback label to every interaction, then keep only the columns
# that are used further on.
# NOTE(review): "expliсit_negative" is spelled with a Cyrillic 'с' (U+0441).
# It matches the spelling in hyperparameters['types_of_feedback'], so change
# both together if the label is ever normalised to ASCII.
train.loc[:, "target"] = ""
for row_labels, feedback_name in (
    (explicit_positive, "explicit_positive"),
    (explisit_negative, "expliсit_negative"),
    (implicit_positive, "implicit_positive"),
    (implicit_negative, "implicit_negative"),
):
    train.loc[row_labels, "target"] = feedback_name

train = train[['user_id','movie_id','target','date']]
train.head()

Unnamed: 0,user_id,movie_id,target,date
0,1,3186,implicit_positive,2000-12-31 22:00:19
1,1,1270,explicit_positive,2000-12-31 22:00:55
2,1,1721,implicit_positive,2000-12-31 22:00:55
3,1,1022,explicit_positive,2000-12-31 22:00:55
4,1,2340,implicit_negative,2000-12-31 22:01:43


In [13]:
# Order interactions by user and time, renumber the rows, and switch to the
# generic `item_id` column name.
train = (
    train
    .sort_values(by=["user_id", "date"])
    .reset_index(drop=True)
    .rename(columns={'movie_id': 'item_id'})
)

In [14]:
# Load the held-out split and add a datetime column derived from the
# `timestamp` column (interpreted as seconds).
test = pd.read_csv(f'{rootpath}test.csv')
test['date'] = pd.to_datetime(test['timestamp'], unit='s')
print(test.head())

   user_id  movie_id  rating  timestamp                date
0        1      2687       3  978824268 2001-01-06 23:37:48
1        1       745       3  978824268 2001-01-06 23:37:48
2        1       588       4  978824268 2001-01-06 23:37:48
3        1         1       5  978824268 2001-01-06 23:37:48
4        1      2355       5  978824291 2001-01-06 23:38:11


In [15]:
# Keep only the identifier and time columns and use the same `item_id` naming
# as the training frame.
test = (
    test[['user_id', 'movie_id', 'date']]
    .rename(columns={'movie_id': 'item_id'})
)
test.head()

Unnamed: 0,user_id,item_id,date
0,1,2687,2001-01-06 23:37:48
1,1,745,2001-01-06 23:37:48
2,1,588,2001-01-06 23:37:48
3,1,1,2001-01-06 23:37:48
4,1,2355,2001-01-06 23:38:11


# MVP model v2

In [16]:
# Keep only test rows whose user and item both occur in train; ids unseen in
# training have no embedding and cannot be scored.
known_user = test.user_id.isin(train.user_id)
known_item = test.item_id.isin(train.item_id)
test = test[known_user & known_item].copy()
test.shape

(60394, 3)

In [17]:
# 2. Id re-encoding. Always done, so that every id lies in 0..N-1 without gaps
# and can be used directly as a node / embedding index later on.
# Encoders are fitted on train only; test is transformed with the same mapping.
user_encoder = LabelEncoder()
video_encoder = LabelEncoder()

train['user_id'] = user_encoder.fit_transform(train['user_id']).astype(int)
train['item_id'] = video_encoder.fit_transform(train['item_id']).astype(int)

test['user_id'] = user_encoder.transform(test['user_id']).astype(int)
test['item_id'] = video_encoder.transform(test['item_id']).astype(int)

In [18]:
# With dense ids the number of distinct values also tells us the range of
# user_id and item_id.
num_videos = len(train['item_id'].unique())
num_users = len(train['user_id'].unique())

print('Количество уникальных item_id', num_videos)
print('Количество уникальных user_id', num_users)

Количество уникальных item_id 3700
Количество уникальных user_id 6040


In [19]:
def prepare_hetero_data(df) -> HeteroData:
    """
    Turn an interaction table into a bipartite heterogeneous graph.

    The graph holds 'user' and 'item' node types and one directed relation,
    ('item', 'interacts', 'user'), with one edge per row of ``df``.

    Args:
        df: DataFrame with columns 'item_id' and 'user_id'. The number of nodes
            of each type is taken as ``max(id) + 1``.

    Returns:
        HeteroData with a ``node_id`` tensor per node type and an
        ``edge_index`` whose first row holds item ids and second row user ids.
    """
    graph = HeteroData()

    # Node counts come from the largest id, not from the number of distinct ids.
    distinct_users = torch.from_numpy(df['user_id'].unique())
    distinct_items = torch.from_numpy(df['item_id'].unique())
    graph['user'].node_id = torch.arange(int(distinct_users.max().item()) + 1)
    graph['item'].node_id = torch.arange(int(distinct_items.max().item()) + 1)

    # One edge per interaction, pointing from the item to the user.
    sources = torch.LongTensor(df['item_id'].values)
    targets = torch.LongTensor(df['user_id'].values)
    graph['item', 'interacts', 'user'].edge_index = torch.stack((sources, targets), dim=0)

    return graph


In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, HeteroConv

class SimpleItemUserGNN(nn.Module):
    """
    Two-layer attention GNN over a bipartite graph with a single relation,
    ('item', 'interacts', 'user').

    Both layers only produce output for the 'user' node type. Items enter each
    layer as their raw embedding, and ``forward`` returns user vectors only.

    Args:
        num_users: size of the user embedding table.
        num_items: size of the item embedding table.
        emb_dim: width of the embeddings and of the returned user vectors.
        hidden_dim: per-head output width of the first GAT layer.
        heads: number of attention heads in the first GAT layer.
        dropout: dropout probability applied after the first layer.

    NOTE(review): the second layer is created with
    ``in_channels=hidden_dim * heads`` but also receives raw item embeddings of
    width ``emb_dim``; this appears to require ``emb_dim == hidden_dim * heads``
    (true for the defaults) - confirm before changing either value.
    """
    def __init__(self,
                 num_users: int,
                 num_items: int,
                 emb_dim: int = 32,
                 hidden_dim: int = 16,
                 heads: int = 2,
                 dropout: float = 0.2):
        super().__init__()
        relation = ('item', 'interacts', 'user')
        wide_dim = hidden_dim * heads  # concatenated multi-head width

        # Learnable id embeddings for both node types.
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)

        # Attention layers; no self loops because source and target node
        # types differ.
        first_gat = GATConv(
            in_channels=emb_dim,
            out_channels=hidden_dim,
            heads=heads,
            add_self_loops=False
        )
        second_gat = GATConv(
            in_channels=wide_dim,
            out_channels=emb_dim,
            heads=1,
            add_self_loops=False
        )
        self.conv1 = HeteroConv({relation: first_gat}, aggr='mean')
        self.conv2 = HeteroConv({relation: second_gat}, aggr='mean')

        # Per-node-type layer norms. Only the 'user' entries are used in
        # forward; the 'item' entries are still registered as submodules.
        self.norm1 = nn.ModuleDict({
            'user': nn.LayerNorm(wide_dim),
            'item': nn.LayerNorm(emb_dim)
        })
        self.norm2 = nn.ModuleDict({
            'user': nn.LayerNorm(emb_dim),
            'item': nn.LayerNorm(emb_dim)
        })
        self.dropout = nn.Dropout(dropout)

    def forward(self, data):
        """Return the refined user representations, one row per user node."""
        item_ids = data['item'].node_id
        edges = data.edge_index_dict

        # Layer 1: raw embeddings in, wide user representation out.
        features = {
            'user': self.user_emb(data['user'].node_id),
            'item': self.item_emb(item_ids)
        }
        user_hidden = self.conv1(features, edges)['user']
        user_hidden = self.dropout(F.elu(self.norm1['user'](user_hidden)))

        # Layer 2: items are fed in again as raw embeddings.
        second_input = {'user': user_hidden, 'item': self.item_emb(item_ids)}
        user_out = self.conv2(second_input, edges)['user']

        return self.norm2['user'](user_out)

In [21]:
# Build the training graph from the encoded train interactions and display
# its summary (node counts and edge_index shape).
data = prepare_hetero_data(train)
data

HeteroData(
  user={ node_id=[6040] },
  item={ node_id=[3700] },
  (item, interacts, user)={ edge_index=[2, 939809] }
)

In [22]:
# Sanity check: number of distinct item ids together with the smallest and
# largest id, to see whether the encoded ids are dense.
train.item_id.nunique(), train.item_id.min(), train.item_id.max()

(3700, 0, 3699)

In [23]:
# Size the embedding tables from the encoded training ids and create the
# model with its default architecture settings.
num_users = train['user_id'].nunique()
num_items = train['item_id'].max() + 1
model = SimpleItemUserGNN(num_users=num_users, num_items=num_items)



In [24]:
# Display the module tree of the freshly created model.
model

SimpleItemUserGNN(
  (user_emb): Embedding(6040, 32)
  (item_emb): Embedding(3700, 32)
  (conv1): HeteroConv(num_relations=1)
  (conv2): HeteroConv(num_relations=1)
  (norm1): ModuleDict(
    (user): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (item): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  )
  (norm2): ModuleDict(
    (user): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (item): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
)

In [25]:
# Ground-truth test interactions, with columns named via the rectools
# `Columns` constants.
test_df = test.loc[:, ['user_id', 'item_id']]
interactions = test_df.rename(
    columns={'user_id': Columns.User, 'item_id': Columns.Item}
)

# user_id -> set of item_ids the user interacted with in train; used to
# exclude already-seen items from the recommendations.
viewed_items = train.groupby("user_id")["item_id"].agg(set).to_dict()

In [26]:
def evaluate(model, train_data,
             test_batch_size, top_k,
             viewed_items, interactions,
             device, test_step):
    """
    Evaluate the model on all users and report ranking metrics.

    Steps:
      1. run the GNN once on the training graph to get user representations;
      2. score every item per user as the dot product with the raw item
         embeddings (``model.item_emb``);
      3. exclude items the user already interacted with in train;
      4. take the top-K remaining items and compute MAP / Precision / Recall /
         NDCG @ K with rectools.

    Metrics are printed and logged to the module-level Comet ``experiment``.

    Args:
        model: model exposing ``item_emb`` and returning user representations
            from ``model(data=...)``.
        train_data: graph passed to the model; its 'user' ``node_id`` defines
            the number of users that get recommendations.
        test_batch_size: number of users scored per batch.
        top_k: number of recommendations per user and cut-off of the metrics.
        viewed_items: mapping user id -> collection of item ids to exclude.
        interactions: ground-truth interactions passed to ``calc_metrics``.
        device: device on which scoring is performed.
        test_step: step value attached to the logged metrics.

    Returns:
        The metric dictionary returned by ``calc_metrics``.
    """
    was_training = model.training
    model.eval()
    model.to(device)
    train_data = train_data.to(device)

    num_users = train_data['user'].node_id.shape[0]
    num_items = model.item_emb.num_embeddings
    # topk raises when k exceeds the number of items, so never ask for more.
    k = min(top_k, num_items)

    users_list, items, ranks = [], [], []
    with torch.no_grad():
        item_emb_t = model.item_emb.weight.t()
        # The forward pass does not depend on the user batch: compute it once.
        user_e = model(data=train_data)

        for start in range(0, num_users, test_batch_size):
            stop = min(start + test_batch_size, num_users)
            rating = torch.mm(user_e[start:stop], item_emb_t)

            # Mask already-seen items before ranking, so every user gets up to
            # top_k unseen items no matter how many items they have seen.
            rows, cols = [], []
            for local_row, u in enumerate(range(start, stop)):
                seen = viewed_items.get(u)
                if seen:
                    rows.extend([local_row] * len(seen))
                    cols.extend(int(it) for it in seen)
            if rows:
                row_idx = torch.tensor(rows, dtype=torch.long, device=rating.device)
                col_idx = torch.tensor(cols, dtype=torch.long, device=rating.device)
                rating[row_idx, col_idx] = float('-inf')

            scores, topk = torch.topk(rating, k=k, dim=1)
            # Positions still at -inf are masked items (the user has fewer
            # than k unseen items); they must not be recommended.
            keep = (scores != float('-inf')).cpu().numpy()
            topk = topk.cpu().numpy()

            for local_row, u in enumerate(range(start, stop)):
                recs = topk[local_row][keep[local_row]]
                users_list.extend([u] * len(recs))
                items.extend(recs.tolist())
                ranks.extend(range(1, len(recs) + 1))

            del rating, scores, topk, keep
        del user_e, item_emb_t

    reco_df = pd.DataFrame({
        'user_id': users_list,
        'item_id': items,
        'rank': ranks
    })

    metrics = {
        f'map@{top_k}': MAP(k=top_k),
        f'precision@{top_k}': Precision(k=top_k),
        f'recall@{top_k}': Recall(k=top_k),
        f'ndcg@{top_k}': NDCG(k=top_k)
    }
    results = calc_metrics(metrics=metrics,
                           reco=reco_df,
                           interactions=interactions)
    print(f"Step {test_step} — Test metrics:")
    for name, val in results.items():
        print(f"  {name}: {val:.9f}")
        experiment.log_metric(f"Test {name} vs step", val, step=test_step)

    # Put the model back into the mode it was in when evaluation started.
    model.train(was_training)
    return results

In [27]:
import torch
import torch.nn.functional as F
import gc

def train_simple_model(model,
                       data: HeteroData,
                       num_epochs: int = 10,
                       lr: float = 1e-3,
                       batch_size: int = 1024,
                       device: str = None,
                       print_every: int = 100,
                       test_every: int = 100,
                      top_k: int = 10,
                      test_batch_size: int = 2048):
    """
    Train a SimpleItemUserGNN on item->user interactions with BPR loss.

    Every step samples a batch of observed (user, item) edges, draws one
    uniformly random item per edge as the negative, runs the model on the
    whole graph and optimises the pairwise BPR objective with Adam.

    The function relies on module-level names: the Comet ``experiment`` (loss
    logging) and ``evaluate`` / ``viewed_items`` / ``interactions`` (periodic
    evaluation).

    Args:
        model: SimpleItemUserGNN instance
        data: HeteroData containing 'item','interacts','user' edges
        num_epochs: number of epochs
        lr: learning rate
        batch_size: number of positive edges (and sampled negatives) per step
        device: 'cpu' or 'cuda'; chosen automatically when None
        print_every: print stats every N steps (and on the first step of
            every epoch)
        test_every: run evaluation every N steps (and on the first step of
            every epoch)
        top_k: cut-off K passed to ``evaluate``
        test_batch_size: user batch size passed to ``evaluate``

    Returns:
        The trained model (same object, moved to ``device``).
    """
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # extract positive edge indices (row 0 = items, row 1 = users)
    src, dst = data['item', 'interacts', 'user'].edge_index
    num_train = src.size(0)
    num_items = model.item_emb.num_embeddings
    print(f"Num of training interactions: {num_train}")

    global_step = 0
    for epoch in range(1, num_epochs + 1):
        model.train()
        perm = torch.randperm(num_train, device=device)
        total_loss = 0.0
        seen_examples = 0

        for step, start in enumerate(range(0, num_train, batch_size), 1):
            idx = perm[start:start + batch_size]
            pos_items = src[idx]
            users = dst[idx]
            # Uniform negatives; they are not checked against the user's
            # history, so some may be items the user actually interacted with.
            neg_items = torch.randint(
                0,
                num_items,
                size=pos_items.size(),
                device=device
            )

            optimizer.zero_grad()

            # forward pass: get updated embeddings
            embeddings = model(data)
            user_embs = embeddings[users]
            pos_embs = model.item_emb.weight[pos_items]
            neg_embs = model.item_emb.weight[neg_items]

            # BPR loss; logsigmoid is the numerically stable form of
            # log(sigmoid(x)) and needs no epsilon.
            pos_scores = (user_embs * pos_embs).sum(dim=1)
            neg_scores = (user_embs * neg_embs).sum(dim=1)
            loss = -F.logsigmoid(pos_scores - neg_scores).mean()
            loss.backward()
            optimizer.step()

            # Read the scalar once: every .item() call synchronises the device.
            loss_value = loss.item()
            experiment.log_metric('Train BPR Loss vs step', loss_value, step=global_step)

            batch_len = users.size(0)
            total_loss += loss_value * batch_len
            seen_examples += batch_len

            if step % print_every == 0 or step == 1:
                # Divide by the examples actually processed; the last batch of
                # an epoch can be smaller than batch_size.
                avg_loss = total_loss / seen_examples
                print(f"Epoch {epoch} Step {step} Loss: {loss_value:.4f}, Avg Loss: {avg_loss:.4f}")

            if step % test_every == 0 or step == 1:
                evaluate(model, data,
                         test_batch_size, top_k,
                         viewed_items, interactions,
                         device, test_step=global_step)

            # Drop references to the step's tensors so their memory can be reused.
            del embeddings, user_embs, pos_embs, neg_embs, pos_scores, neg_scores, loss

            global_step += 1

        # Heavy cleanup once per epoch instead of once per step.
        gc.collect()
        torch.cuda.empty_cache()

        epoch_loss = total_loss / num_train
        print(f"Epoch {epoch} completed. Train BPR Loss: {epoch_loss:.4f}\n")

    return model


In [28]:
# Record the run configuration dict on the Comet experiment.
experiment.log_parameters(hyperparameters)

In [29]:
import warnings
# Silence every RuntimeWarning for the rest of the session.
# NOTE(review): this is a blanket filter, so unrelated numeric warnings
# (for example from numpy) are hidden as well.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [30]:
# Pick the device, unpack the run configuration and launch training.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
(
    edge_type,
    num_epochs,
    lr,
    batch_size,
    print_every,
    test_every,
    top_k,
    test_batch_size,
) = (
    hyperparameters[key]
    for key in (
        'train_edge_type',
        'train_num_epochs',
        'train_lr',
        'train_batch_size',
        'train_print_every',
        'train_test_every',
        'test_topk',
        'test_batch_size',
    )
)
# `edge_type` is read for completeness but is not passed to the trainer.
model = train_simple_model(
    model=model,
    data=data,
    num_epochs=num_epochs,
    lr=lr,
    batch_size=batch_size,
    device=device,
    print_every=print_every,
    test_every=test_every,
    top_k=top_k,
    test_batch_size=test_batch_size,
)

Num of training interactions: 939809
Epoch 1 Step 1 Loss: 3.2878, Avg Loss: 3.2878
Step 0 — Test metrics:
  precision@10: 0.002500000
  recall@10: 0.002500000
  ndcg@10: 0.002433203
  map@10: 0.000693735
Epoch 1 Step 10 Loss: 3.2169, Avg Loss: 3.2378
Epoch 1 Step 20 Loss: 3.2794, Avg Loss: 3.2405
Epoch 1 Step 30 Loss: 3.1672, Avg Loss: 3.2269
Epoch 1 Step 40 Loss: 3.1380, Avg Loss: 3.2173
Epoch 1 Step 50 Loss: 3.1365, Avg Loss: 3.2060
Step 49 — Test metrics:
  precision@10: 0.002682119
  recall@10: 0.002683959
  ndcg@10: 0.002452411
  map@10: 0.000653441
Epoch 1 completed. Train BPR Loss: 3.1952

Epoch 2 Step 1 Loss: 3.1270, Avg Loss: 3.1270
Step 58 — Test metrics:
  precision@10: 0.002682119
  recall@10: 0.002683959
  ndcg@10: 0.002460874
  map@10: 0.000657759
Epoch 2 Step 10 Loss: 3.1583, Avg Loss: 3.1245
Epoch 2 Step 20 Loss: 3.0870, Avg Loss: 3.1119
Epoch 2 Step 30 Loss: 3.0536, Avg Loss: 3.0989
Epoch 2 Step 40 Loss: 3.0678, Avg Loss: 3.0906
Epoch 2 Step 50 Loss: 3.0485, Avg Loss: 

In [31]:
# Serialise the whole trained module object to disk.
# NOTE(review): saving the module itself (rather than model.state_dict())
# pickles the object, so loading the file needs the SimpleItemUserGNN class
# definition to be available - consider saving the state_dict instead.
torch.save(model, "gnn_model_mvl.model")
from IPython.display import FileLink

# Render a link to the saved file in the notebook output.
FileLink('gnn_model_mvl.model')

In [32]:
# `del model` is left disabled: the model is still passed to log_model in a
# later cell.
# del model
# Run the garbage collector and release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [33]:
# Attach the trained model to the Comet experiment under the name "GNN".
log_model(
    experiment=experiment,
    model=model,
    model_name="GNN",
)

In [34]:
# Close the Comet experiment; the run summary is printed below.
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline-movielens
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/annanet/gnn-recommender/4b3f9c68f8684549869e2e3830603d43
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     Test map@10 vs step [200]       : (0.0004927204877536003, 0.006801199674130139)
[1;38;5;39mCOMET INFO:[0m     Test ndcg@10 vs step [200]      : (0.0018364915583727217, 0.022687986491322955)
[1;38;5;39mCOMET INFO:[0m     Test precision@10 vs step [200] : (0.002036423841059603, 0.02427152317880795)
[1;38;5;39mCOME