In [1]:
!pip -q install torch_geometric rectools
!pip -q install comet_ml
!pip -q install python-dotenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.0/208.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m103.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.1/727.1 kB[0m [31m17.1 M

In [2]:
import comet_ml
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

from dotenv import load_dotenv
import os

In [None]:
load_dotenv(".env")

True

In [4]:
experiment = Experiment(
  api_key=os.getenv('API_KEY'),
  project_name="gnn-recommender",
  workspace="annanet",
  log_code=True
)

experiment.set_name('baseline-beauty')
experiment.add_tags(['beauty', 'leave-n-out'])

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/annanet/gnn-recommender/7989efbe77bc41bc94656aab3b575aa5

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


In [5]:
hyperparameters = {
    'seed': 42,
    'types_of_feedback': ["explicit_positive", "expliсit_negative",
                          "implicit_positive", "implicit_negative"],
    'train_edge_type': ('item','to_feedback_explicit_positive','explicit_positive'),
    'train_num_epochs': 100,
    'train_lr': 8e-5,
    'train_batch_size': 16384, 
    'train_print_every': 10,  
    'train_test_every': 25,
    'test_topk': 10,
    'test_batch_size': 8192
}

In [6]:
import os
os.listdir('/kaggle/input/data/leave-n-out/beauty')

['train.csv', 'test.csv']

In [7]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, SAGEConv, GATConv

from sklearn.preprocessing import LabelEncoder

from rectools import Columns
from rectools.metrics import MAP, Precision, Recall, NDCG, calc_metrics

import gc
import random

In [8]:
SEED = hyperparameters['seed']
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [9]:
rootpath = '/kaggle/input/data/leave-n-out/beauty/'
train = pd.read_csv(
    rootpath+'train.csv'
)
train['date'] = pd.to_datetime(train['unix_time'])
print(train.head())

                 user_id     item_id  rating  \
0  A00414041RD0BXM6WK0GX  B007IY97U0     3.0   
1  A00414041RD0BXM6WK0GX  B00870XLDS     2.0   
2  A00414041RD0BXM6WK0GX  B008MIRO88     1.0   
3  A00414041RD0BXM6WK0GX  B00BQYYMN0     3.0   
4  A00414041RD0BXM6WK0GX  B00GRTQBTM     5.0   

                                         review_text   unix_time       date  
0  Good quality wig, but the blonde is much more ...  2014-07-14 2014-07-14  
1  Very thin and not as long as the photos :( Aft...  2014-07-14 2014-07-14  
2  Very thin and not as long as the photos :( Aft...  2014-07-14 2014-07-14  
3  This is a great quality wig, however it is a m...  2014-07-14 2014-07-14  
4  This is my absolute favorite wig! I have purch...  2014-07-14 2014-07-14  


In [10]:
explicit_positive = train[(train["rating"] == 5)].index
explisit_negative = train[(train["rating"] <= 2)].index

explicit_combined_feedback = explicit_positive.union(explisit_negative)
print('Количество explicit позитивного фидбека', explicit_positive.shape[0])
print('Количество explicit негативного фидбека', explisit_negative.shape[0])

Количество explicit позитивного фидбека 90800
Количество explicit негативного фидбека 17504


In [11]:
implicit_positive = train[(train["rating"] == 4)].index
implicit_negative = train[(train["rating"] == 3)].index

implicit_combined_feedback = implicit_positive.union(implicit_negative)
print('Количество implicit позитивного фидбека', implicit_positive.shape[0])
print('Количество implicit негативного фидбека', implicit_negative.shape[0])

Количество implicit позитивного фидбека 30668
Количество implicit негативного фидбека 17110


In [12]:
train.loc[:, "target"] = ""
train.loc[explicit_positive, "target"] = "explicit_positive"
train.loc[explisit_negative, "target"] = "expliсit_negative"
train.loc[implicit_positive, "target"] = "implicit_positive"
train.loc[implicit_negative, "target"] = "implicit_negative"

train = train[['user_id','item_id','target','date']]
train.head()

Unnamed: 0,user_id,item_id,target,date
0,A00414041RD0BXM6WK0GX,B007IY97U0,implicit_negative,2014-07-14
1,A00414041RD0BXM6WK0GX,B00870XLDS,expliсit_negative,2014-07-14
2,A00414041RD0BXM6WK0GX,B008MIRO88,expliсit_negative,2014-07-14
3,A00414041RD0BXM6WK0GX,B00BQYYMN0,implicit_negative,2014-07-14
4,A00414041RD0BXM6WK0GX,B00GRTQBTM,explicit_positive,2014-07-14


In [13]:
train = train.sort_values(by=["user_id", "date"]).reset_index(drop=True)
train.columns = ['user_id', 'item_id', 'target', 'date']

In [14]:
test = pd.read_csv(
    rootpath+'test.csv'
)
test['date'] = pd.to_datetime(test['unix_time'])
print(test.head())

                 user_id     item_id  rating  \
0  A02155413BVL8D0G7X6DN  B0089JVEPO     5.0   
1  A02155413BVL8D0G7X6DN  B001G2LWDK     5.0   
2  A02155413BVL8D0G7X6DN  B005Z41P28     5.0   
3  A02155413BVL8D0G7X6DN  B0055MYJ0U     5.0   
4  A02155413BVL8D0G7X6DN  B00117CH5M     3.0   

                                         review_text   unix_time       date  
0  leaves my skin clean and smooth. it is creamy ...  2012-10-25 2012-10-25  
1  Works great, smells good, there is a result. I...  2012-12-06 2012-12-06  
2  it works for my hair. smells like almond. made...  2013-01-17 2013-01-17  
3  got this in the mail from China today! holds m...  2013-04-22 2013-04-22  
4  if you like strong smell of honeysuckles and h...  2013-05-01 2013-05-01  


In [15]:
test = test[['user_id','item_id', 'date']]
test.columns = ['user_id', 'item_id', 'date']
test.head()

Unnamed: 0,user_id,item_id,date
0,A02155413BVL8D0G7X6DN,B0089JVEPO,2012-10-25
1,A02155413BVL8D0G7X6DN,B001G2LWDK,2012-12-06
2,A02155413BVL8D0G7X6DN,B005Z41P28,2013-01-17
3,A02155413BVL8D0G7X6DN,B0055MYJ0U,2013-04-22
4,A02155413BVL8D0G7X6DN,B00117CH5M,2013-05-01


# MVP model v2

In [16]:
test = test[(test.user_id.isin(train.user_id)) & (test.item_id.isin(train.item_id))].copy()
test.shape

(42378, 3)

In [17]:
# 2. Преобразование данных - для куарека не особо нужно, но для других - напоминалка
# делаем всегда! чтобы не сломать ничего дальше и чтобы все индексы были от 0 до N без пропусков
user_encoder = LabelEncoder()
video_encoder = LabelEncoder()

train.loc[:, 'user_id'] = user_encoder.fit_transform(train['user_id'])
train.loc[:, 'item_id'] = video_encoder.fit_transform(train['item_id'])

test.loc[:, 'user_id'] = user_encoder.transform(test['user_id'])
test.loc[:, 'item_id'] = video_encoder.transform(test['item_id'])

train['user_id'] = train['user_id'].astype(int)
train['item_id'] = train['item_id'].astype(int)
test['user_id'] = test['user_id'].astype(int)
test['item_id'] = test['item_id'].astype(int)

In [18]:
# т.е. сразу знаем количество и в каких пределах изменяется user_id и video_id
num_videos = train['item_id'].nunique()
num_users = train['user_id'].nunique()

print('Количество уникальных item_id', num_videos)
print('Количество уникальных user_id', num_users)

Количество уникальных item_id 12095
Количество уникальных user_id 22363


In [33]:
def prepare_hetero_data(df) -> HeteroData:
    """
    Build a simple hetero-graph with only item->user edges based on interactions in df.
    df must contain columns 'item_id' and 'user_id'.
    """
    data = HeteroData()

    # Create user and item nodes
    users = torch.from_numpy(df['user_id'].unique())
    items = torch.from_numpy(df['item_id'].unique())
    num_users = int(users.max().item()) + 1
    num_items = int(items.max().item()) + 1

    data['user'].node_id = torch.arange(num_users)
    data['item'].node_id = torch.arange(num_items)

    # Build item -> user edge index from interactions
    item_ids = torch.LongTensor(df['item_id'].values)
    user_ids = torch.LongTensor(df['user_id'].values)
    edge_index = torch.stack([item_ids, user_ids], dim=0)

    data['item', 'interacts', 'user'].edge_index = edge_index
    # data['user', 'interacts_rev', 'item'].edge_index = edge_index.flip(0)

    return data


In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, HeteroConv

class SimpleItemUserGNN(nn.Module):
    """
    Heterogeneous GNN for a bipartite graph with single edge type item->user.
    """
    def __init__(self,
                 num_users: int,
                 num_items: int,
                 emb_dim: int = 32,
                 hidden_dim: int = 16,
                 heads: int = 2,
                 dropout: float = 0.2):
        super().__init__()
        # Embeddings
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)

        # Two-layer HeteroConv with one relation: ('item','interacts','user')
        conv1 = {
            ('item', 'interacts', 'user'): GATConv(
                in_channels=emb_dim,
                out_channels=hidden_dim,
                heads=heads,
                add_self_loops=False
            ),
            # ('user', 'interacts_rev', 'item'): GATConv(
            #     in_channels=emb_dim,
            #     out_channels=hidden_dim,
            #     heads=heads,
            #     add_self_loops=False
            # ),
        }
        conv2 = {
            ('item', 'interacts', 'user'): GATConv(
                in_channels=hidden_dim * heads,
                out_channels=emb_dim,
                heads=1,
                add_self_loops=False
            ),
            # ('user', 'interacts_rev', 'item'): GATConv(
            #     in_channels=hidden_dim * heads,
            #     out_channels=emb_dim,
            #     heads=1,
            #     add_self_loops=False
            # ),
        }
        self.conv1 = HeteroConv(conv1, aggr='mean')
        self.conv2 = HeteroConv(conv2, aggr='mean')

        # LayerNorm & Dropout
        self.norm1 = nn.ModuleDict({
            'user': nn.LayerNorm(hidden_dim * heads),
            'item': nn.LayerNorm(emb_dim)
        })
        self.norm2 = nn.ModuleDict({
            'user': nn.LayerNorm(emb_dim),
            'item': nn.LayerNorm(emb_dim)
        })
        self.dropout = nn.Dropout(dropout)

    def forward(self, data):
        # Initial node features
        x = {
            'user': self.user_emb(data['user'].node_id),
            'item': self.item_emb(data['item'].node_id)
        }
        # First hetero-conv
        h1 = self.conv1(x, data.edge_index_dict)
        # Apply activation, norm, dropout
        h1_user = F.elu(self.norm1['user'](h1['user']))
        h1_user = self.dropout(h1_user)
        h1 = {'user': h1_user, 'item': self.item_emb(data['item'].node_id)}

        # Second hetero-conv
        h2 = self.conv2(h1, data.edge_index_dict)
        # Final normalization
        h2_user = self.norm2['user'](h2['user'])

        return h2_user

In [35]:
data = prepare_hetero_data(train)
data

HeteroData(
  user={ node_id=[22363] },
  item={ node_id=[12095] },
  (item, interacts, user)={ edge_index=[2, 156082] }
)

In [36]:
train.item_id.nunique(), train.item_id.min(), train.item_id.max()

(12095, 0, 12094)

In [37]:
num_users = len(train['user_id'].unique())
num_items = train['item_id'].max() + 1
model = SimpleItemUserGNN(num_users, num_items)



In [38]:
model

SimpleItemUserGNN(
  (user_emb): Embedding(22363, 32)
  (item_emb): Embedding(12095, 32)
  (conv1): HeteroConv(num_relations=1)
  (conv2): HeteroConv(num_relations=1)
  (norm1): ModuleDict(
    (user): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (item): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  )
  (norm2): ModuleDict(
    (user): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (item): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
)

In [39]:
test_df = test[['user_id', 'item_id']]
interactions = test_df.rename(columns={
    'user_id': Columns.User,
    'item_id': Columns.Item,
})

viewed_items = train.groupby("user_id")["item_id"].agg(set).to_dict()

In [40]:
def evaluate(model, train_data,
             test_batch_size, top_k,
             viewed_items, interactions,
             device, test_step):
    """
    Оцениваем модель по всем пользователям:
    - строим топ-K рекомендации
    - фильтруем уже просмотренные
    - считаем recall@K, precision@K, map@K
    """
    model.eval()
    model.to(device)
    num_users = train_data['user'].node_id.shape[0]
    test_top_k = top_k * 150

    item_emb = model.item_emb.weight
    item_emb_t = item_emb.t().detach()
    del item_emb
    gc.collect()

    all_scores = []
    with torch.no_grad():
        for i in range(0, num_users, test_batch_size):
            end = min(i + test_batch_size, num_users)
            batch_users = torch.arange(i, end).to(device)
            user_e = model(
                data=train_data.to(device)
            )
            rating = torch.mm(user_e[batch_users].detach(), item_emb_t)
            _, topk = torch.topk(rating, k=test_top_k, dim=1)
            all_scores.append(topk)

            del user_e, rating
            gc.collect()
    all_scores = torch.cat(all_scores, dim=0).cpu().numpy()

    users_list, items, ranks = [], [], []
    for u in range(num_users):
        seen = viewed_items.get(u, set())
        recs = all_scores[u]
        mask = ~np.isin(recs, list(seen))
        filtered = recs[mask][:top_k]
        for rank, it in enumerate(filtered, 1):
            users_list.append(u)
            items.append(int(it))
            ranks.append(rank)
    reco_df = pd.DataFrame({
        'user_id': users_list,
        'item_id': items,
        'rank': ranks
    })

    metrics = {
        f'map@{top_k}': MAP(k=top_k),
        f'precision@{top_k}': Precision(k=top_k),
        f'recall@{top_k}': Recall(k=top_k),
        f'ndcg@{top_k}': NDCG(k=top_k)
    }
    results = calc_metrics(metrics=metrics,
                           reco=reco_df,
                           interactions=interactions)
    print(f"Step {test_step} — Test metrics:")
    for name, val in results.items():
        print(f"  {name}: {val:.9f}")
        experiment.log_metric(f"Test {name} vs step", val, step=test_step)
    del all_scores
    gc.collect()

    model.to(device)
    train_data.to(device)
    model.train()
    return results

In [41]:
import torch
import torch.nn.functional as F
import gc

def train_simple_model(model,
                       data: HeteroData,
                       num_epochs: int = 10,
                       lr: float = 1e-3,
                       batch_size: int = 1024,
                       device: str = None,
                       print_every: int = 100,
                       test_every: int = 100,
                      top_k: int = 10,
                      test_batch_size: int = 2048):
    """
    Train a SimpleItemUserGNN on item->user interactions with BPR loss.

    Args:
        model: SimpleItemUserGNN instance
        data: HeteroData containing 'item','interacts','user' edges
        num_epochs: number of epochs
        lr: learning rate
        batch_size: negative sampling batch size
        device: 'cpu' or 'cuda'
        print_every: print stats every N steps
    """
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # extract positive edge indices
    src, dst = data['item', 'interacts', 'user'].edge_index
    num_train = src.size(0)
    print(f"Num of training interactions: {num_train}")

    global_step = 0
    for epoch in range(1, num_epochs + 1):
        model.train()
        perm = torch.randperm(num_train, device=device)
        total_loss = 0.0

        for step, start in enumerate(range(0, num_train, batch_size), 1):
            idx = perm[start:start + batch_size]
            pos_items = src[idx]
            users = dst[idx]
            neg_items = torch.randint(
                0,
                model.item_emb.num_embeddings,
                size=pos_items.size(),
                device=device
            )

            optimizer.zero_grad()

            # forward pass: get updated embeddings
            embeddings = model(data)
            user_embs = embeddings[users]
            pos_embs = model.item_emb.weight[pos_items]
            neg_embs = model.item_emb.weight[neg_items]

            # BPR loss
            pos_scores = (user_embs * pos_embs).sum(dim=1)
            neg_scores = (user_embs * neg_embs).sum(dim=1)
            loss = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-15).mean()
            loss.backward()
            optimizer.step()

            experiment.log_metric('Train BPR Loss vs step', loss.item(), step=global_step)

            total_loss += loss.item() * users.size(0)

            if step % print_every == 0 or step == 1:
                avg_loss = total_loss / (step * batch_size)
                print(f"Epoch {epoch} Step {step} Loss: {loss.item():.4f}, Avg Loss: {avg_loss:.4f}")

            if step % test_every == 0 or step == 1:
                evaluate(model, data,
                         test_batch_size, top_k,
                         viewed_items, interactions,
                         device, test_step=global_step)

            # cleanup
            del embeddings, user_embs, pos_embs, neg_embs, pos_scores, neg_scores
            gc.collect()
            torch.cuda.empty_cache()

            global_step += 1

        epoch_loss = total_loss / num_train
        print(f"Epoch {epoch} completed. Train BPR Loss: {epoch_loss:.4f}\n")

    return model


In [42]:
experiment.log_parameters(hyperparameters)

In [43]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
edge_type = hyperparameters['train_edge_type']
num_epochs = hyperparameters['train_num_epochs']
lr = hyperparameters['train_lr']
batch_size = hyperparameters['train_batch_size']
print_every = hyperparameters['train_print_every']
test_every = hyperparameters['train_test_every']
top_k = hyperparameters['test_topk']
test_batch_size = hyperparameters['test_batch_size']
model = train_simple_model(model,
                    data,
                    num_epochs=num_epochs,
                    lr=lr,
                    batch_size=batch_size,
                    device=device,
                    print_every=print_every,
                    test_every=test_every,
                    top_k=top_k,
                    test_batch_size=test_batch_size)

Num of training interactions: 156082
Epoch 1 Step 1 Loss: 3.3044, Avg Loss: 3.3044
Step 0 — Test metrics:
  precision@10: 0.000801509
  recall@10: 0.000801509
  ndcg@10: 0.000872591
  map@10: 0.000273942
Epoch 1 Step 10 Loss: 3.3230, Avg Loss: 3.1279
Epoch 1 completed. Train BPR Loss: 3.2833

Epoch 2 Step 1 Loss: 3.2855, Avg Loss: 3.2855
Step 10 — Test metrics:
  precision@10: 0.000848656
  recall@10: 0.000848656
  ndcg@10: 0.000889109
  map@10: 0.000272277
Epoch 2 Step 10 Loss: 3.2272, Avg Loss: 3.0963
Epoch 2 completed. Train BPR Loss: 3.2502

Epoch 3 Step 1 Loss: 3.2744, Avg Loss: 3.2744
Step 20 — Test metrics:
  precision@10: 0.000872230
  recall@10: 0.000872230
  ndcg@10: 0.000904285
  map@10: 0.000274728
Epoch 3 Step 10 Loss: 3.2938, Avg Loss: 3.0678
Epoch 3 completed. Train BPR Loss: 3.2203

Epoch 4 Step 1 Loss: 3.1721, Avg Loss: 3.1721
Step 30 — Test metrics:
  precision@10: 0.000848656
  recall@10: 0.000848656
  ndcg@10: 0.000875080
  map@10: 0.000264251
Epoch 4 Step 10 Loss: 

In [45]:
torch.save(model, "gnn_model_mvl.model")
from IPython.display import FileLink

FileLink('gnn_model_mvl.model')

In [46]:
# del model
gc.collect()
torch.cuda.empty_cache()

In [47]:
log_model(
    experiment=experiment,
    model=model,
    model_name="GNN",
)

In [48]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline-beauty
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/annanet/gnn-recommender/7989efbe77bc41bc94656aab3b575aa5
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     Test map@10 vs step [100]       : (0.00021774881569789633, 0.0002747281530874176)
[1;38;5;39mCOMET INFO:[0m     Test ndcg@10 vs step [100]      : (0.0007708170150711486, 0.0010080580744078137)
[1;38;5;39mCOMET INFO:[0m     Test precision@10 vs step [100] : (0.0007779349363507779, 0.001107967939651108)
[1;38;5;39mCO