In [1]:
import random
import numpy as np
import torch
import json
import os
from tqdm import tqdm
from pathlib import Path
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import copy

# Seed pour reproductibilit√©
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

Device: cuda


In [2]:
ROOT = Path("Amazon_products")
TRAIN_DIR = ROOT / "train"
TEST_DIR = ROOT / "test"

TEST_CORPUS_PATH = TEST_DIR / "test_corpus.txt"
TRAIN_CORPUS_PATH = TRAIN_DIR / "train_corpus.txt"

CLASS_HIERARCHY_PATH = ROOT / "class_hierarchy.txt"
CLASS_RELATED_PATH = ROOT / "class_related_keywords.txt"
CLASS_PATH = ROOT / "classes.txt"

SUBMISSION_PATH = "Submission/submission.csv"

NUM_CLASSES = 531
MIN_LABELS = 2
MAX_LABELS = 3


In [3]:
def load_corpus(path):
    """Load corpus into {id: text} dictionary."""
    id2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                id, text = parts
                id2text[id] = text
    return id2text

def load_multilabel(path):
    """Load multi-label data into {id: [labels]} dictionary."""
    id2labels = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 2:
                pid, label = parts
                pid = int(pid)
                label = int(label)
                if pid not in id2labels:
                    id2labels[pid] = []
                id2labels[pid].append(label)
    return id2labels

def load_class_keywords(path):
    """Load class keywords into {class_name: [keywords]} dictionary."""
    class2keywords = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if ":" not in line:
                continue
            classname, keywords = line.strip().split(":", 1)
            keyword_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
            class2keywords[classname] = keyword_list
    return class2keywords


In [4]:
id2text_test = load_corpus(TEST_CORPUS_PATH)
id2text_train = load_corpus(TRAIN_CORPUS_PATH)

# Classes
id2class = load_corpus(CLASS_PATH)
class2hierarchy = load_multilabel(CLASS_HIERARCHY_PATH)
class2related = load_class_keywords(CLASS_RELATED_PATH)

# Silver labels (RoBERTa - les meilleurs)

with open("Silver/silver_fixed.json", "r", encoding="utf-8") as f:
    pid2labelids_silver = json.load(f)


print(f"Train: {len(id2text_train)} samples")
print(f"Test: {len(id2text_test)} samples")
print(f"Classes: {len(id2class)}")

Train: 29487 samples
Test: 19658 samples
Classes: 531


In [5]:
import json

# Transformation
result = {k: v["labels"] for k, v in pid2labelids_silver["silver_labels"].items()}
pid2labelids_silver = result

# Affichage du r√©sultat format√© en JSON
print(json.dumps(result, indent=2))


{
  "0": [
    0,
    23,
    179
  ],
  "1": [
    23,
    166,
    212
  ],
  "2": [
    166,
    179,
    212
  ],
  "3": [
    56,
    166,
    212
  ],
  "4": [
    56,
    166,
    212
  ],
  "5": [
    166,
    179,
    212
  ],
  "6": [
    3,
    23,
    56
  ],
  "7": [
    56,
    166,
    212
  ],
  "8": [
    40,
    166,
    212
  ],
  "9": [
    56,
    166,
    212
  ],
  "10": [
    56,
    166,
    212
  ],
  "11": [
    166,
    179,
    212
  ],
  "12": [
    56,
    166,
    212
  ],
  "13": [
    56,
    166,
    212
  ],
  "14": [
    3,
    56,
    166
  ],
  "15": [
    56,
    166,
    212
  ],
  "16": [
    10,
    166,
    234
  ],
  "17": [
    56,
    166,
    212
  ],
  "18": [
    56,
    166,
    212
  ],
  "19": [
    56,
    166,
    212
  ],
  "20": [
    56,
    166,
    212
  ],
  "21": [
    56,
    166,
    212
  ],
  "22": [
    166,
    212,
    234
  ],
  "23": [
    3,
    56,
    234
  ],
  "24": [
    56,
    166,
    212
  ],
  "25": [
   

In [6]:
# Embeddings
X_train = torch.load("Embeddings/X_train.pt").to(device)
label_emb = torch.load("Embeddings/label_emb.pt").to(device)
test_ids = list(id2text_test.keys())
train_ids = list(id2text_train.keys())

print(f"Train embeddings: {X_train.shape}")
print(f"Label embeddings: {label_emb.shape}")

# Index mapping
pid2idx = {pid: i for i, pid in enumerate(train_ids)}

input_dim = X_train.size(1)
num_classes = NUM_CLASSES

print(f"Input dimension: {input_dim}")
print(f"Num classes: {num_classes}")

Train embeddings: torch.Size([29487, 768])
Label embeddings: torch.Size([531, 768])
Input dimension: 768
Num classes: 531


In [7]:
class ProductCategoryDataset(Dataset):
    """Dataset using pre-calculated embeddings (train or test compatible)"""
    def __init__(self, pid2label, pid2idx, embeddings, num_classes=531):
        self.pid2label = pid2label
        self.pid2idx = pid2idx
        self.embeddings = embeddings
        self.num_classes = num_classes

        if self.pid2label is not None:
            self.pids = list(pid2label.keys())
            self.has_labels = True
        else:
            self.pids = list(pid2idx.keys())
            self.has_labels = False

        self.indices = [pid2idx[pid] for pid in self.pids]

    def __len__(self):
        return len(self.pids)

    def __getitem__(self, idx):
        emb = self.embeddings[self.indices[idx]]

        if self.has_labels:
            y = torch.zeros(self.num_classes, dtype=torch.float32)
            for label in self.pid2label[self.pids[idx]]:
                y[label] = 1.0
            return {"X": emb, "y": y}
        else:
            return {"X": emb}


In [8]:
def evaluate(model, dataloader, device="cpu", threshold=0.5):
    """√âvalue le mod√®le sur un dataloader"""
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            X = batch["X"].to(device)
            y = batch["y"].cpu().numpy()
            
            logits = model(X)
            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs > threshold).astype(int)
            
            all_preds.append(preds)
            all_labels.append(y)

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    f1_macro = f1_score(all_labels, all_preds, average="macro", zero_division=0)
    f1_micro = f1_score(all_labels, all_preds, average="micro", zero_division=0)
    f1_samples = f1_score(all_labels, all_preds, average="samples", zero_division=0)
    
    return {"f1_macro": f1_macro, "f1_micro": f1_micro, "f1_samples" : f1_samples}

In [9]:
class MLPClassifier(nn.Module):
    """Simple MLP for classification multi-label"""
    def __init__(self, input_dim, num_classes, hidden_dim=512, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        return self.net(x)


In [10]:
# Split train/val
silver_pids = list(pid2labelids_silver.keys())
train_pids, val_pids = train_test_split(silver_pids, test_size=0.2, random_state=42)

train_labels = {pid: pid2labelids_silver[pid] for pid in train_pids}
val_labels = {pid: pid2labelids_silver[pid] for pid in val_pids}

print(f"Train: {len(train_labels)} | Val: {len(val_labels)}")

# Datasets
train_dataset = ProductCategoryDataset(train_labels, pid2idx, X_train, num_classes=NUM_CLASSES)
val_dataset = ProductCategoryDataset(val_labels, pid2idx, X_train, num_classes=NUM_CLASSES)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


Train: 23589 | Val: 5898


In [11]:
import copy
import torch.nn.functional as F
from tqdm import tqdm

# === Init Models ===
student = MLPClassifier(input_dim, num_classes).to(device)
teacher = copy.deepcopy(student).to(device)

optimizer = torch.optim.AdamW(student.parameters(), lr=2e-4, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20)
criterion = nn.BCEWithLogitsLoss()

# === Self-training parameters ===
alpha_ema = 0.995
lambda_cons = 0.5
pseudo_update_freq = 3
pseudo_threshold = 0.9
patience = 10
EPOCHS = 50

best_f1 = 0.0
best_model = copy.deepcopy(student.state_dict())
wait = 0

def ema_update(teacher, student, alpha):
    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
        t_param.data = alpha * t_param.data + (1 - alpha) * s_param.data

def consistency_loss(logits_s, logits_t):
    return F.mse_loss(torch.sigmoid(logits_s), torch.sigmoid(logits_t))

def generate_pseudo_labels(model, loader, threshold=0.7):
    model.eval()
    pseudo_dict = {}
    with torch.no_grad():
        for X, _ in tqdm(loader, desc="Generating pseudo-labels", leave=False):
            X = X.to(device)
            logits = model(X)
            probs = torch.sigmoid(logits).cpu().numpy()
            for i, prob in enumerate(probs):
                confident_labels = [j for j, p in enumerate(prob) if p > threshold]
                if 2 <= len(confident_labels) <= 3:
                    pass
    return pseudo_dict

# === TRAINING LOOP ===
for epoch in range(1, EPOCHS + 1):
    print(f"\n=== Epoch {epoch}/{EPOCHS} ===")

    student.train()
    teacher.eval()
    total_loss = total_sup = total_cons = 0.0

    for X, y in tqdm(train_loader, desc=f"Train Epoch {epoch}", leave=False):
        X, y = X.to(device), y.to(device)

        logits_s = student(X)
        with torch.no_grad():
            logits_t = teacher(X)

        loss_sup = criterion(logits_s, y)
        loss_cons = consistency_loss(logits_s, logits_t)
        loss = loss_sup + lambda_cons * loss_cons

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        ema_update(teacher, student, alpha_ema)

        total_loss += loss.item()
        total_sup += loss_sup.item()
        total_cons += loss_cons.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)
    avg_sup = total_sup / len(train_loader)
    avg_cons = total_cons / len(train_loader)

    print(f"Loss: {avg_loss:.4f} (sup: {avg_sup:.4f}, cons: {avg_cons:.4f})")

    val_metrics = evaluate(teacher, val_loader, device)
    print(f"Val F1-macro={val_metrics['f1_macro']:.4f} | "
          f"F1-micro={val_metrics['f1_micro']:.4f}| "
          f"F1-samples={val_metrics['f1_samples']:.4f}")

    if val_metrics['f1_samples'] > best_f1:
        best_f1 = val_metrics['f1_samples']
        best_model = copy.deepcopy(teacher.state_dict())
        print(f"New best model (f1_samples={best_f1:.4f})")
        wait = 0
    else:
        wait += 1
        print(f"No improvement: {wait}/{patience}")
        if wait >= patience:
            print("Early stopping triggered")
            break

teacher.load_state_dict(best_model)

print("\n" + "="*50)
print("FINAL EVALUATION")
print("="*50)

test_result = evaluate(teacher, val_loader, device)
print(f"Best Val F1-macro: {test_result['f1_macro']:.4f}")
print(f"Best Val F1-micro: {test_result['f1_micro']:.4f}")
print(f"Best Val F1-samples: {test_result['f1_samples']:.4f}")



=== Epoch 1/50 ===


                                                      

AttributeError: 'str' object has no attribute 'to'

In [None]:
# Save the best model
torch.save(teacher.state_dict(), "Model/mlp_selftraining_best.pt")
print("\nBest model saved to Models/mlp_selftraining_best.pt")


Best model saved to Models/mlp_selftraining_best.pt


In [None]:
# ------------------------
# üîÆ Generate predictions from precomputed X_test embeddings
# ------------------------
import csv
import os
from tqdm import tqdm
import torch
import numpy as np

# --- ‚öôÔ∏è Config ---
TEST_CORPUS_PATH = "Amazon_products/test/test_corpus.txt"
OUTPUT_PATH = "Submission/GNN.csv"
os.makedirs("Submission", exist_ok=True)

THRESHOLD = 0.5
MIN_LABELS = 2      # minimum labels per sample
MAX_LABELS = 3      # maximum labels per sample

# --- üßæ Reload test IDs (from corpus) ---
def load_corpus(path):
    pids, texts = [], []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                pids.append(pid)
                texts.append(text)
    return pids, texts

test_ids, _ = load_corpus(TEST_CORPUS_PATH)
print(f"üìÑ Loaded {len(test_ids)} test IDs.")

# --- üß© Load precomputed embeddings ---
X_test = torch.load("Embeddings/X_test.pt").to(device)
print(f"‚úÖ X_test loaded: {X_test.shape}")

# --- üßÆ Generate predictions ---
teacher.eval()
all_pids, all_pred_labels = [], []

with torch.no_grad():
    for start in tqdm(range(0, len(X_test), 64), desc="Generating predictions"):
        end = start + 64
        batch = X_test[start:end]
        batch_pids = test_ids[start:end]

        logits = teacher(batch)
        probs = torch.sigmoid(logits).cpu().numpy()

        for pid, prob in zip(batch_pids, probs):
            pred_row = (prob > THRESHOLD).astype(int)

            # --- Aucun label : on prend les top-2
            if pred_row.sum() == 0:
                topk_idx = np.argsort(prob)[-MIN_LABELS:][::-1]
                pred_row = np.zeros_like(pred_row)
                pred_row[topk_idx] = 1

            # --- Un seul label : on ajoute le 2·µâ plus probable
            elif pred_row.sum() == 1:
                top2_idx = np.argsort(prob)[-2:][::-1]
                pred_row = np.zeros_like(pred_row)
                pred_row[top2_idx] = 1

            # --- Trop de labels : garde seulement top-3
            elif pred_row.sum() > MAX_LABELS:
                topk_idx = np.argsort(prob)[-MAX_LABELS:][::-1]
                pred_row = np.zeros_like(pred_row)
                pred_row[topk_idx] = 1

            labels = [str(j) for j, v in enumerate(pred_row) if v == 1]
            all_pids.append(pid)
            all_pred_labels.append(labels)

print(f"‚úÖ {len(all_pids)} predictions generated.")

# --- üíæ Save submission CSV ---
with open(OUTPUT_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "label"])
    for pid, labels in zip(all_pids, all_pred_labels):
        writer.writerow([pid, ",".join(labels)])

print(f"üìÅ Submission file saved: {OUTPUT_PATH}")
print(f"üî¢ Example: {all_pids[0]} -> {all_pred_labels[0]}")


üìÑ Loaded 19658 test IDs.
‚úÖ X_test loaded: torch.Size([19658, 768])


Generating predictions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 308/308 [00:01<00:00, 240.17it/s]

‚úÖ 19658 predictions generated.
üìÅ Submission file saved: Submission/GNN.csv
üî¢ Example: 0 -> ['166', '212']



