In [1]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
from pathlib import Path
from utils import *


import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Silence sklearn's UndefinedMetricWarning for the whole session
# (presumably raised by the f1_score calls in evaluate() — TODO confirm).
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# ==========================================================
# CONFIGURATION
# ==========================================================
# Seed the NumPy and PyTorch global RNGs so repeated runs draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"ðŸ”§ Device: {device}")

# Paths
# Input corpora and class list, all relative to the current working directory.
ROOT = Path("Amazon_products")
TRAIN_CORPUS_PATH = ROOT / "train" / "train_corpus.txt"
TEST_CORPUS_PATH  = ROOT / "test" / "test_corpus.txt"
CLASS_PATH        = ROOT / "classes.txt"

# Pre-computed embedding files; both are read with torch.load in the next cell.
EMB_DIR          = Path("Embeddings")
X_ALL_PATH       = EMB_DIR / "X_train_test_mpn.pt"
LABEL_EMB_PATH   = EMB_DIR / "labels_hierarchical_new_mpn.pt"

# Checkpoint directory, created on first run.
# NOTE(review): MODEL_PATH is not referenced by any other cell visible in this notebook.
MODEL_SAVE = Path("Models")
MODEL_SAVE.mkdir(exist_ok=True)
MODEL_PATH = MODEL_SAVE / "silver_classifier.pt"

# ==========================================================
# LOAD IDS
# ==========================================================
def load_ids(path):
    """Read the integer document ids from a tab-separated corpus file.

    Each non-empty line is expected to look like ``<id>\\t<text>``; only the
    part before the first tab is used.

    Blank lines are skipped, and a row whose text part is empty (``"12\\t"``)
    still yields its id. Previously both cases raised ``ValueError`` because
    ``strip()`` removed the trailing tab before the two-way unpack.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        list[int]: The ids in file order.

    Raises:
        ValueError: If the first field of a non-empty line is not an integer.
    """
    ids = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Keep only the id column; the text column is not needed here.
            pid = line.strip().split("\t", 1)[0]
            if not pid:
                continue  # blank / whitespace-only line
            ids.append(int(pid))
    return ids

# Document ids of both corpora, in file order. The counts are used in the next
# cell to split the stacked embedding matrix into its train and test parts.
train_ids = load_ids(TRAIN_CORPUS_PATH)
test_ids  = load_ids(TEST_CORPUS_PATH)
n_train = len(train_ids)
n_test  = len(test_ids)

print(f"Train IDs: {n_train} | Test IDs: {n_test}")

ðŸ”§ Device: cuda
Train IDs: 29487 | Test IDs: 19658


In [2]:
# ==========================================================
# LOAD SILVER LABELS
# ==========================================================
# The JSON file maps a product id (stored as a string key) to a record whose
# "labels" entry holds that product's class ids.
with open("Silver/silver_train_new_mpn.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

# Re-key by integer product id so lookups match the ids returned by load_ids().
silver_labels = {int(pid): data["labels"] for pid, data in raw.items()}

# ==========================================================
# LOAD X_all
# ==========================================================
print("\nðŸ§  Loading X_all.pt ...")
# map_location=device lets a file that was saved from GPU tensors load on a
# CPU-only machine (the notebook otherwise supports the CPU fallback).
# NOTE(review): weights_only=False unpickles arbitrary Python objects — only
# load embedding files that come from a trusted source.
data = torch.load(X_ALL_PATH, map_location=device, weights_only=False)

# The file may hold a NumPy array, a list of per-document tensors, or a tensor.
if isinstance(data, np.ndarray):
    data = torch.from_numpy(data)
elif isinstance(data, list):
    data = torch.stack(data)

X_all = data.float().to(device)
# Rows are split positionally below, so the row count must match the two corpora.
assert X_all.shape[0] == n_train + n_test, (
    f"X_all has {X_all.shape[0]} rows, expected {n_train + n_test}"
)

# First n_train rows are the training documents, the remainder the test documents.
X_train = X_all[:n_train]
X_test  = X_all[n_train:]
print(f"âœ“ X_train: {X_train.shape} | X_test: {X_test.shape}")

# ==========================================================
# LOAD LABEL EMBEDDINGS
# ==========================================================
# map_location=device keeps the load working on CPU-only machines even when the
# file was written from GPU tensors.
# NOTE(review): weights_only=False unpickles arbitrary Python objects — only
# load files from a trusted source.
tmp = torch.load(LABEL_EMB_PATH, map_location=device, weights_only=False)

# Convert numpy -> tensor if necessary
if isinstance(tmp, np.ndarray):
    tmp = torch.from_numpy(tmp)

label_emb = tmp.float().to(device)
print(f"âœ“ Label embeddings: {label_emb.shape}")

# ==========================================================
# LOAD CLASS NAMES
# ==========================================================
# classes.txt holds one "<class_id>\t<class_name>" pair per line.
classes = {}
with open(CLASS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing empty lines instead of crashing on unpack
        cid, cname = line.split("\t")
        classes[int(cid)] = cname

n_classes = len(classes)

# Product id -> row index inside X_train (rows follow the order of train_ids).
pid2idx = {pid: i for i, pid in enumerate(train_ids)}


ðŸ§  Loading X_all.pt ...
âœ“ X_train: torch.Size([29487, 768]) | X_test: torch.Size([19658, 768])
âœ“ Label embeddings: torch.Size([531, 768])


In [3]:
class MultiLabelDataset(Dataset):
    """Map product ids to (embedding, multi-hot target) training samples.

    The dataset keeps only the ids and the id -> labels mapping. Embeddings and
    the class count are read from the module-level ``X_train``, ``pid2idx`` and
    ``n_classes`` at access time.
    """

    def __init__(self, pids, labels_dict):
        # pids: indexable sequence of product ids; labels_dict: pid -> iterable of class ids
        self.pids = pids
        self.labels = labels_dict

    def __len__(self):
        return len(self.pids)

    def __getitem__(self, idx):
        product_id = self.pids[idx]
        features = X_train[pid2idx[product_id]]

        # Multi-hot target; class ids outside [0, n_classes) are silently dropped.
        target = torch.zeros(n_classes)
        for class_id in self.labels[product_id]:
            if class_id < 0 or class_id >= n_classes:
                continue
            target[class_id] = 1.0

        return {"X": features, "y": target}

# Hold out 20% of the silver-labelled product ids for validation; the fixed
# random_state makes the split reproducible.
train_p, val_p = train_test_split(
    list(silver_labels.keys()), test_size=0.2, random_state=42
)

# Both datasets share the same label mapping and differ only in their id lists.
train_dataset = MultiLabelDataset(train_p, silver_labels)
val_dataset   = MultiLabelDataset(val_p, silver_labels)

# Only the training loader shuffles; validation batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=64)
    

In [4]:
def evaluate(model, loader, thr=0.25):
    """Compute sample-averaged and macro-averaged F1 of ``model`` on ``loader``.

    The model is switched to eval mode (and left in it). Each batch is moved to
    the device the model's parameters live on, so the function no longer
    depends on the dataset already holding tensors on that device. Targets are
    brought back to the CPU before the NumPy conversion.

    Args:
        model: Module returning one logit per class for a batch ``X``.
        loader: Iterable of dicts with keys ``"X"`` (inputs) and ``"y"``
            (multi-hot targets).
        thr: A class is predicted when its sigmoid probability is strictly
            greater than this value.

    Returns:
        tuple: ``(f1_samples, f1_macro)`` as returned by ``f1_score``.
    """
    model.eval()

    # Device of the model's parameters; None for a parameter-free model, in
    # which case the inputs are passed through untouched.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    preds, labels = [], []

    with torch.no_grad():
        for batch in loader:
            X = batch["X"]
            if model_device is not None:
                X = X.to(model_device)
            # .cpu() is a no-op for CPU tensors and required for GPU ones.
            y = batch["y"].cpu().numpy()

            prob = torch.sigmoid(model(X)).cpu().numpy()
            pred = (prob > thr).astype(int)

            # Accumulate one row per sample.
            preds.extend(pred)
            labels.extend(y)

    f1s = f1_score(labels, preds, average="samples")
    f1m = f1_score(labels, preds, average="macro")
    return f1s, f1m

In [5]:
class LabelGCN(nn.Module):
    """Stack of residual graph-convolution layers applied to label embeddings.

    Every layer computes ``H <- H + (A_hat @ H) @ W`` with its own square
    weight matrix ``W``. ReLU and dropout follow every layer except the last.
    """

    def __init__(self, emb_dim, num_layers=1, dropout=0.5):
        super().__init__()
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.dropout = dropout

        # One Xavier-initialised (emb_dim x emb_dim) weight matrix per layer.
        self.W_list = nn.ParameterList(
            [
                nn.Parameter(nn.init.xavier_uniform_(torch.empty(emb_dim, emb_dim)))
                for _ in range(num_layers)
            ]
        )

    def forward(self, H, A_hat):
        """Propagate ``H`` through all layers using adjacency ``A_hat``."""
        for layer_idx, weight in enumerate(self.W_list):
            # Neighbourhood aggregation, linear map, then residual connection.
            H = H + (A_hat @ H) @ weight

            is_last_layer = not (layer_idx < self.num_layers - 1)
            if is_last_layer:
                continue

            # Non-linearity + dropout between layers only.
            H = F.dropout(F.relu(H), p=self.dropout, training=self.training)

        return H


class GCNEnhancedClassifier(nn.Module):
    """Multi-label classifier scoring documents against GCN-refined label embeddings.

    Documents are linearly projected into the label-embedding space. Trainable
    label embeddings are refined by a ``LabelGCN`` over the adjacency ``A_hat``.
    The logit of a (document, class) pair is the dot product of the two vectors.
    """

    def __init__(self, input_dim, label_init_emb, A_hat, num_layers=1, dropout=0.2):
        super().__init__()
        label_dim = label_init_emb.size(1)
        self.dropout = dropout

        # Document projection: input space -> label space.
        self.proj = nn.Linear(input_dim, label_dim)

        # GNN over the labels.
        self.encoder = LabelGCN(label_dim, num_layers=num_layers, dropout=dropout)

        # Trainable label embeddings, initialised from a copy of the given matrix.
        self.label_emb = nn.Parameter(label_init_emb.clone())

        # Adjacency matrix: a buffer (follows .to(device), is in the state dict) and not a parameter.
        self.register_buffer("A_hat", A_hat)

    def forward(self, x, use_dropout=True):
        """Return the (batch, n_labels) logits for the document batch ``x``."""
        # 1) refine the label embeddings
        refined_labels = self.encoder(self.label_emb, self.A_hat)

        # 2) project the documents; dropout only when requested and in training mode
        doc_vecs = self.proj(x)
        if use_dropout:
            doc_vecs = F.dropout(doc_vecs, p=self.dropout, training=self.training)

        # 3) logits = dot product between documents and labels
        return doc_vecs @ refined_labels.T

In [6]:
def build_adj_from_hierarchy(class2hierarchy, n_classes, w_parent=1.0, w_sibling=0.1):
    """Build the symmetrically normalised adjacency ``A_hat`` used by the label GCN.

    Edges come exclusively from ``class2hierarchy``:

    - parent <-> child : weight ``w_parent``
    - siblings (children listed under the same parent) : weight ``w_sibling``
    - self-loop : 1.0 (standard GCN)

    When a pair of classes is both parent/child and siblings (possible when a
    class has several parents), the parent/child weight always wins. Previously
    the result depended on the iteration order of ``class2hierarchy``.
    Self-referencing or duplicated ids are ignored so every self-loop is
    exactly 1.0.

    Args:
        class2hierarchy: Mapping ``parent_id -> sequence of child ids``.
        n_classes: Number of classes; all ids must index into ``[0, n_classes)``.
        w_parent: Weight of a parent/child edge.
        w_sibling: Weight of a sibling edge.

    Returns:
        torch.Tensor: ``(n_classes, n_classes)`` float tensor
        ``D^{-1/2} (A + I) D^{-1/2}`` where ``D`` is the row-sum of ``A + I``.
    """
    A = torch.zeros((n_classes, n_classes))

    # ---- pass 1: sibling edges (children of the same parent) ----
    for children in class2hierarchy.values():
        for i in range(len(children)):
            for j in range(i + 1, len(children)):
                c1, c2 = children[i], children[j]
                if c1 == c2:
                    continue  # duplicated child id: not a sibling edge
                A[c1, c2] = w_sibling
                A[c2, c1] = w_sibling

    # ---- pass 2: parent/child edges, written last so they take precedence ----
    for parent, children in class2hierarchy.items():
        for c in children:
            if c == parent:
                continue  # the self-loop is added once below
            A[parent, c] = w_parent
            A[c, parent] = w_parent

    # ---- self-loops ----
    A = A + torch.eye(n_classes)

    # ---- GCN normalisation: D^{-1/2} A D^{-1/2} ----
    D = A.sum(dim=1)
    D_inv_sqrt = torch.pow(D, -0.5)
    D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0.0  # guard against zero-degree rows

    # Row/column scaling by broadcasting; same result as diag(d) @ A @ diag(d)
    # without materialising the diagonal matrices.
    A_hat = D_inv_sqrt[:, None] * A * D_inv_sqrt[None, :]
    return A_hat

def load_multilabel(path):
    """Load a parent/child edge list into a ``{parent: [child, ...]}`` mapping.

    The file holds one ``parent_id \\t child_id`` pair per line. Blank lines
    (for example a trailing empty line) are skipped instead of crashing the
    two-way unpack.

    Args:
        path: Path of the UTF-8 encoded edge-list file.

    Returns:
        dict[int, list[int]]: Children of every parent, in file order.

    Raises:
        ValueError: If a non-empty line does not consist of exactly two
            tab-separated integers.
    """
    mapping = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            p, c = line.split("\t")
            mapping.setdefault(int(p), []).append(int(c))

    return mapping

In [7]:
# ---------- LOAD HIERARCHY ----------
CLASS_HIERARCHY_PATH = ROOT / "class_hierarchy.txt"
class2hierarchy = load_multilabel(CLASS_HIERARCHY_PATH)

# Normalised label adjacency with the default edge weights, moved to the training device.
A_hat = build_adj_from_hierarchy(class2hierarchy, n_classes).to(device)

# Sanity printout: shape, number of strictly positive entries, dtype and device.
print("A_hat:", A_hat.shape, "Non-zero =", (A_hat > 0).sum().item())
print("A_hat type:", A_hat.dtype)
print("A_hat device:", A_hat.device)


A_hat: torch.Size([531, 531]) Non-zero = 7611
A_hat type: torch.float32
A_hat device: cuda:0


In [8]:
import copy

# ----------------------------
# Hyperparameters
# ----------------------------
epochs = 100           # maximum number of epochs; also T_max of the cosine LR schedule below
patience = 5           # epochs without a new best validation F1 before early stopping
wait = 0               # epochs elapsed since the last improvement
best_f1 = 0            # best validation sample-F1 seen so far

alpha_ema = 0.99       # EMA decay: share of the previous teacher weights kept at every step
lambda_cons = 0.5     # weight for consistency loss
noise_std = 0.05        # std of the Gaussian noise added to the student input

# ----------------------------
# Init student + teacher
# ----------------------------
# Both networks share architecture and initial label embeddings; only the
# dropout rate differs.
student = GCNEnhancedClassifier(
    input_dim=X_train.size(1),
    label_init_emb=label_emb,
    A_hat=A_hat,
    num_layers=3,
    dropout=0.2        # student = noisy
).to(device)

teacher = GCNEnhancedClassifier(
    input_dim=X_train.size(1),
    label_init_emb=label_emb,
    A_hat=A_hat,
    num_layers=3,
    dropout=0.0        # teacher = STABLE
).to(device)

# The teacher starts as an exact copy of the student's weights.
teacher.load_state_dict(student.state_dict())

# Only the student is optimised by gradient descent; the teacher is updated
# through the EMA step inside the training loop.
optimizer = torch.optim.AdamW(student.parameters(), lr=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

# In-memory snapshot of the best teacher weights (restored after training).
best_teacher = copy.deepcopy(teacher.state_dict())

# ----------------------------
# Consistency loss
# ----------------------------
def consistency_loss(log_s, log_t):
    """Mean squared error between student and teacher sigmoid probabilities.

    Args:
        log_s: Student logits.
        log_t: Teacher logits.

    Returns:
        Scalar tensor: MSE between ``sigmoid(log_s)`` and ``sigmoid(log_t)``.
    """
    student_prob, teacher_prob = torch.sigmoid(log_s), torch.sigmoid(log_t)
    return F.mse_loss(student_prob, teacher_prob)

# ----------------------------
# Training loop
# ----------------------------
for epoch in range(1, epochs + 1):
    # The student learns by gradient descent; the teacher is only ever evaluated.
    student.train()
    teacher.eval()

    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        X = batch["X"].to(device)
        y = batch["y"].to(device)

        # The student sees a noisy copy of the input, the teacher the clean one.
        noisy_X = X + noise_std * torch.randn_like(X)

        # student forward
        logits_s = student(noisy_X)

        # teacher forward (no gradient)
        with torch.no_grad():
            logits_t = teacher(X)

        # supervised = main objective (BCE against the silver labels)
        loss_sup = F.binary_cross_entropy_with_logits(logits_s, y)

        # consistency = stability objective (student should agree with teacher)
        loss_cons = consistency_loss(logits_s, logits_t)

        # total loss
        loss = loss_sup + lambda_cons * loss_cons

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # EMA update (student -> teacher):
        #   teacher <- alpha_ema * teacher + (1 - alpha_ema) * student
        # Done in place under no_grad instead of rebinding `.data`, so the
        # teacher keeps its parameter storage and autograd is never involved.
        with torch.no_grad():
            for t_param, s_param in zip(teacher.parameters(), student.parameters()):
                t_param.copy_(alpha_ema * t_param + (1 - alpha_ema) * s_param)

        total_loss += loss.item()

    # One scheduler step per epoch.
    scheduler.step()

    # Model selection is driven by the teacher's sample-averaged F1 on the validation split.
    teacher.eval()
    f1_sample, f1_macro = evaluate(teacher, val_loader)

    print(f"[Epoch {epoch}] Loss={total_loss/len(train_loader):.4f} | F1={f1_sample:.4f}")

    if f1_sample > best_f1:
        best_f1 = f1_sample
        # NOTE: the snapshot is kept in memory only; nothing is written to disk here.
        best_teacher = copy.deepcopy(teacher.state_dict())
        wait = 0
        print(f"New best model saved (F1={best_f1:.4f})")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered")
            break


# Restore the best teacher weights before inference.
teacher.load_state_dict(best_teacher)
print("\nðŸŽ‰ Final best F1:", best_f1)

Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 110.55it/s]


[Epoch 1] Loss=0.0493 | F1=0.2003
New best model saved (F1=0.2003)


Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 116.95it/s]


[Epoch 2] Loss=0.0189 | F1=0.4704
New best model saved (F1=0.4704)


Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 121.72it/s]


[Epoch 3] Loss=0.0164 | F1=0.5287
New best model saved (F1=0.5287)


Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 128.60it/s]


[Epoch 4] Loss=0.0153 | F1=0.5602
New best model saved (F1=0.5602)


Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 121.20it/s]


[Epoch 5] Loss=0.0145 | F1=0.5771
New best model saved (F1=0.5771)


Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 79.87it/s] 


[Epoch 6] Loss=0.0138 | F1=0.5920
New best model saved (F1=0.5920)


Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:05<00:00, 64.70it/s]


[Epoch 7] Loss=0.0134 | F1=0.6048
New best model saved (F1=0.6048)


Epoch 8: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 89.71it/s] 


[Epoch 8] Loss=0.0130 | F1=0.6142
New best model saved (F1=0.6142)


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 125.18it/s]


[Epoch 9] Loss=0.0127 | F1=0.6215
New best model saved (F1=0.6215)


Epoch 10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 114.32it/s]


[Epoch 10] Loss=0.0125 | F1=0.6267
New best model saved (F1=0.6267)


Epoch 11: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 123.50it/s]


[Epoch 11] Loss=0.0123 | F1=0.6348
New best model saved (F1=0.6348)


Epoch 12: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 125.14it/s]


[Epoch 12] Loss=0.0120 | F1=0.6372
New best model saved (F1=0.6372)


Epoch 13: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 126.95it/s]


[Epoch 13] Loss=0.0119 | F1=0.6411
New best model saved (F1=0.6411)


Epoch 14: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 127.88it/s]


[Epoch 14] Loss=0.0118 | F1=0.6470
New best model saved (F1=0.6470)


Epoch 15: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 127.54it/s]


[Epoch 15] Loss=0.0116 | F1=0.6473
New best model saved (F1=0.6473)


Epoch 16: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 122.91it/s]


[Epoch 16] Loss=0.0115 | F1=0.6502
New best model saved (F1=0.6502)


Epoch 17: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 99.20it/s] 


[Epoch 17] Loss=0.0114 | F1=0.6543
New best model saved (F1=0.6543)


Epoch 18: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 94.82it/s] 


[Epoch 18] Loss=0.0113 | F1=0.6543
New best model saved (F1=0.6543)


Epoch 19: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 95.60it/s] 


[Epoch 19] Loss=0.0113 | F1=0.6565
New best model saved (F1=0.6565)


Epoch 20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 90.61it/s]


[Epoch 20] Loss=0.0112 | F1=0.6598
New best model saved (F1=0.6598)


Epoch 21: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 86.17it/s]


[Epoch 21] Loss=0.0111 | F1=0.6597


Epoch 22: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 83.62it/s]


[Epoch 22] Loss=0.0110 | F1=0.6608
New best model saved (F1=0.6608)


Epoch 23: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 97.01it/s] 


[Epoch 23] Loss=0.0109 | F1=0.6622
New best model saved (F1=0.6622)


Epoch 24: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 92.25it/s]


[Epoch 24] Loss=0.0110 | F1=0.6638
New best model saved (F1=0.6638)


Epoch 25: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 97.76it/s] 


[Epoch 25] Loss=0.0109 | F1=0.6661
New best model saved (F1=0.6661)


Epoch 26: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 95.08it/s] 


[Epoch 26] Loss=0.0108 | F1=0.6661
New best model saved (F1=0.6661)


Epoch 27: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 106.79it/s]


[Epoch 27] Loss=0.0108 | F1=0.6658


Epoch 28: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 87.53it/s] 


[Epoch 28] Loss=0.0108 | F1=0.6674
New best model saved (F1=0.6674)


Epoch 29: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 104.18it/s]


[Epoch 29] Loss=0.0107 | F1=0.6683
New best model saved (F1=0.6683)


Epoch 30: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 95.24it/s] 


[Epoch 30] Loss=0.0107 | F1=0.6686
New best model saved (F1=0.6686)


Epoch 31: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 94.53it/s] 


[Epoch 31] Loss=0.0106 | F1=0.6706
New best model saved (F1=0.6706)


Epoch 32: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 85.48it/s]


[Epoch 32] Loss=0.0105 | F1=0.6719
New best model saved (F1=0.6719)


Epoch 33: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 94.86it/s] 


[Epoch 33] Loss=0.0105 | F1=0.6731
New best model saved (F1=0.6731)


Epoch 34: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 83.32it/s] 


[Epoch 34] Loss=0.0104 | F1=0.6721


Epoch 35: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 77.29it/s]


[Epoch 35] Loss=0.0104 | F1=0.6733
New best model saved (F1=0.6733)


Epoch 36: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 99.58it/s] 


[Epoch 36] Loss=0.0104 | F1=0.6716


Epoch 37: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 96.58it/s] 


[Epoch 37] Loss=0.0104 | F1=0.6740
New best model saved (F1=0.6740)


Epoch 38: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 81.72it/s] 


[Epoch 38] Loss=0.0103 | F1=0.6745
New best model saved (F1=0.6745)


Epoch 39: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 83.02it/s]


[Epoch 39] Loss=0.0103 | F1=0.6718


Epoch 40: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 77.16it/s]


[Epoch 40] Loss=0.0103 | F1=0.6743


Epoch 41: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 88.73it/s]


[Epoch 41] Loss=0.0103 | F1=0.6755
New best model saved (F1=0.6755)


Epoch 42: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 91.67it/s] 


[Epoch 42] Loss=0.0102 | F1=0.6752


Epoch 43: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 108.77it/s]


[Epoch 43] Loss=0.0102 | F1=0.6763
New best model saved (F1=0.6763)


Epoch 44: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 83.93it/s] 


[Epoch 44] Loss=0.0102 | F1=0.6766
New best model saved (F1=0.6766)


Epoch 45: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 102.76it/s]


[Epoch 45] Loss=0.0102 | F1=0.6777
New best model saved (F1=0.6777)


Epoch 46: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 86.48it/s] 


[Epoch 46] Loss=0.0101 | F1=0.6789
New best model saved (F1=0.6789)


Epoch 47: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 84.61it/s]


[Epoch 47] Loss=0.0101 | F1=0.6778


Epoch 48: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 88.78it/s]


[Epoch 48] Loss=0.0101 | F1=0.6770


Epoch 49: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 108.57it/s]


[Epoch 49] Loss=0.0100 | F1=0.6766


Epoch 50: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 97.57it/s] 


[Epoch 50] Loss=0.0100 | F1=0.6787


Epoch 51: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 124.81it/s]


[Epoch 51] Loss=0.0100 | F1=0.6796
New best model saved (F1=0.6796)


Epoch 52: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 124.91it/s]


[Epoch 52] Loss=0.0100 | F1=0.6791


Epoch 53: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 122.53it/s]


[Epoch 53] Loss=0.0100 | F1=0.6805
New best model saved (F1=0.6805)


Epoch 54: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 123.42it/s]


[Epoch 54] Loss=0.0100 | F1=0.6804


Epoch 55: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 101.35it/s]


[Epoch 55] Loss=0.0099 | F1=0.6819
New best model saved (F1=0.6819)


Epoch 56: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 110.91it/s]


[Epoch 56] Loss=0.0099 | F1=0.6822
New best model saved (F1=0.6822)


Epoch 57: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 108.56it/s]


[Epoch 57] Loss=0.0099 | F1=0.6804


Epoch 58: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 109.35it/s]


[Epoch 58] Loss=0.0098 | F1=0.6823
New best model saved (F1=0.6823)


Epoch 59: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 105.94it/s]


[Epoch 59] Loss=0.0098 | F1=0.6826
New best model saved (F1=0.6826)


Epoch 60: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 105.03it/s]


[Epoch 60] Loss=0.0098 | F1=0.6804


Epoch 61: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 116.05it/s]


[Epoch 61] Loss=0.0098 | F1=0.6825


Epoch 62: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 121.30it/s]


[Epoch 62] Loss=0.0098 | F1=0.6807


Epoch 63: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 118.46it/s]


[Epoch 63] Loss=0.0098 | F1=0.6813


Epoch 64: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 120.14it/s]


[Epoch 64] Loss=0.0097 | F1=0.6827
New best model saved (F1=0.6827)


Epoch 65: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 122.24it/s]


[Epoch 65] Loss=0.0097 | F1=0.6836
New best model saved (F1=0.6836)


Epoch 66: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 105.25it/s]


[Epoch 66] Loss=0.0097 | F1=0.6848
New best model saved (F1=0.6848)


Epoch 67: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 116.56it/s]


[Epoch 67] Loss=0.0096 | F1=0.6848
New best model saved (F1=0.6848)


Epoch 68: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 91.99it/s] 


[Epoch 68] Loss=0.0097 | F1=0.6850
New best model saved (F1=0.6850)


Epoch 69: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 115.86it/s]


[Epoch 69] Loss=0.0097 | F1=0.6831


Epoch 70: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 106.83it/s]


[Epoch 70] Loss=0.0096 | F1=0.6841


Epoch 71: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 106.92it/s]


[Epoch 71] Loss=0.0096 | F1=0.6848


Epoch 72: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 117.35it/s]


[Epoch 72] Loss=0.0096 | F1=0.6854
New best model saved (F1=0.6854)


Epoch 73: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 113.05it/s]


[Epoch 73] Loss=0.0096 | F1=0.6860
New best model saved (F1=0.6860)


Epoch 74: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 113.96it/s]


[Epoch 74] Loss=0.0096 | F1=0.6862
New best model saved (F1=0.6862)


Epoch 75: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 106.87it/s]


[Epoch 75] Loss=0.0096 | F1=0.6870
New best model saved (F1=0.6870)


Epoch 76: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 95.53it/s] 


[Epoch 76] Loss=0.0096 | F1=0.6850


Epoch 77: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 90.08it/s] 


[Epoch 77] Loss=0.0095 | F1=0.6853


Epoch 78: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:04<00:00, 86.00it/s]


[Epoch 78] Loss=0.0095 | F1=0.6863


Epoch 79: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 115.85it/s]


[Epoch 79] Loss=0.0095 | F1=0.6865


Epoch 80: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:03<00:00, 107.33it/s]


[Epoch 80] Loss=0.0095 | F1=0.6857
Early stopping triggered

ðŸŽ‰ Final best F1: 0.6870291590027093


In [9]:
import csv
import numpy as np
from pathlib import Path

print("\nGenerating submission...")

# Inference uses the teacher (restored to its best weights in the previous cell) in eval mode.
teacher.eval()

# Make sure the test embeddings live on the same device as the model.
X_test = X_test.to(device)

def select_k(prob, min_k=2, max_k=3, rel_thr=0.25):
    """Pick between ``min_k`` and ``max_k`` label indices for one sample.

    Labels are ranked by descending probability. The top ``min_k`` are always
    kept. Each further label (up to ``max_k``) is kept only while its
    probability is at least ``rel_thr`` times that of the label ranked just
    above it.

    With the defaults this is the original rule: return the top 3, unless the
    third probability is below a quarter of the second, in which case return
    the top 2. Unlike the original, ``min_k`` is honoured, ``max_k`` may be
    smaller than 3, and inputs with fewer than ``max_k`` classes no longer
    raise ``IndexError``.

    Args:
        prob: 1-D array of per-class probabilities.
        min_k: Minimum number of labels to return (at least one is always
            returned when ``prob`` is non-empty).
        max_k: Maximum number of labels to return.
        rel_thr: Relative drop threshold between consecutive ranks.

    Returns:
        numpy.ndarray: Selected class indices, most probable first.
    """
    idx = np.argsort(prob)[::-1]  # descending

    max_k = min(max_k, len(idx))
    k = max(min(min_k, max_k), 0)

    while k < max_k:
        # Stop extending as soon as the next candidate falls too far below the
        # previously kept label (the k >= 1 guard covers min_k <= 0).
        if k >= 1 and prob[idx[k]] < rel_thr * prob[idx[k - 1]]:
            break
        k += 1

    return idx[:k]


# One list of label-id strings per test document, in X_test row order.
preds = []

with torch.no_grad():
    # Score the test embeddings in slices of 64 rows.
    for start in tqdm(range(0, len(X_test), 64)):
        batch = X_test[start:start+64]
        logits = teacher(batch, use_dropout=False)

        probs = torch.sigmoid(logits).cpu().numpy()

        # select_k picks the label indices for each document; they are stored
        # as strings so they can be joined into one CSV field below.
        for p in probs:
            labels = select_k(p)
            preds.append([str(x) for x in labels])


# ==========================================================
# SAVE CSV
# ==========================================================

# Output location; the directory is created on first run.
OUT_DIR = Path("Submission")
OUT_DIR.mkdir(exist_ok=True)
OUT_PATH = OUT_DIR / "submission_GNN.csv"

# Two columns: the document id and its predicted labels joined by commas.
# Ids and predictions are paired positionally.
# NOTE(review): this assumes the rows of X_test follow the order of test_ids — confirm.
with open(OUT_PATH, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["id", "label"])
    for pid, labels in zip(test_ids, preds):
        w.writerow([pid, ",".join(labels)])

print(f"ðŸŽ‰ Submission saved â†’ {OUT_PATH}")


Generating submission...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 308/308 [00:00<00:00, 416.84it/s]

ðŸŽ‰ Submission saved â†’ Submission\submission_GNN.csv



