In [40]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
from pathlib import Path

# ==========================================================
# CONFIGURATION
# ==========================================================
# Fix the NumPy and PyTorch RNG seeds so repeated runs are comparable.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"ðŸ”§ Device: {device}")

# Corpus and class-name files
ROOT = Path("Amazon_products")
TRAIN_CORPUS_PATH = ROOT.joinpath("train", "train_corpus.txt")
TEST_CORPUS_PATH = ROOT.joinpath("test", "test_corpus.txt")
CLASS_PATH = ROOT.joinpath("classes.txt")

# Precomputed embeddings
EMB_DIR = Path("Embeddings")
X_ALL_PATH = EMB_DIR.joinpath("X_train_test.pt")  # Train + Test embeddings
LABEL_EMB_PATH = EMB_DIR.joinpath("labels_hierarchical.pt")

# Checkpoint directory (created on first run) and checkpoint file
MODEL_SAVE = Path("Models")
MODEL_SAVE.mkdir(exist_ok=True)
MODEL_PATH = MODEL_SAVE.joinpath("silver_classifier.pt")

# ==========================================================
# LOAD IDS
# ==========================================================
def load_ids(path):
    """Read the integer ID column from a tab-separated corpus file.

    Every non-blank line is expected to start with an integer ID, optionally
    followed by a TAB and free text. Only the part before the first TAB is
    used, so the text may be empty or may itself contain TABs.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        list[int]: The IDs in file order.

    Raises:
        ValueError: If the leading field of a non-blank line is not an integer.
    """
    ids = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if not line.strip():
                continue
            # Keep only the first field: a record with empty text ("<id>TAB")
            # must still yield its ID instead of failing tuple unpacking.
            pid = line.split("\t", 1)[0]
            ids.append(int(pid))
    return ids

train_ids = load_ids(TRAIN_CORPUS_PATH)
test_ids  = load_ids(TEST_CORPUS_PATH)
n_train = len(train_ids)
n_test = len(test_ids)

print(f"Train IDs: {n_train} | Test IDs: {n_test}")

# ==========================================================
# LOAD SILVER LABELS
# ==========================================================
# The JSON object is keyed by product id (as a string); each value is a
# record whose "labels" entry is kept. Keys are converted to int so they
# match the ids read from the corpus files.
with open("Silver/silver_train_modify.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

silver_labels = {int(pid): data["labels"] for pid, data in raw.items()}

# ==========================================================
# LOAD X_all -> split into X_train + X_test
# ==========================================================
print("\nðŸ§  Loading X_all.pt ...")

# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects; only point X_ALL_PATH at files from a trusted source.
data = torch.load(X_ALL_PATH, weights_only=False)

# ensure tensor
if isinstance(data, np.ndarray):
    data = torch.from_numpy(data)
elif isinstance(data, list):
    data = torch.stack(data)

X_all = data.float().to(device)
assert X_all.shape[0] == n_train + n_test, "Bad size"

# The first n_train rows are the train embeddings, the remainder the test ones.
X_train = X_all[:n_train]
X_test  = X_all[n_train:]
print(f"âœ“ X_train: {X_train.shape}, X_test: {X_test.shape}")

# ==========================================================
# LOAD CLASS NAMES
# ==========================================================
classes = {}
with open(CLASS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline) instead of crashing on unpack.
        if not line:
            continue
        # Split on the first TAB only so a class name may itself contain TABs.
        cid, cname = line.split("\t", 1)
        classes[int(cid)] = cname

# NOTE(review): this equals the largest class index + 1 only if the ids in the
# class file are exactly 0..n-1 -- TODO confirm; label indices >= n_classes are
# silently ignored when targets are built.
n_classes = len(classes)

pid2idx = {pid: i for i, pid in enumerate(train_ids)}

# Fail fast: every silver-labelled product needs a train embedding, otherwise
# the pid2idx lookup in MultiLabelDataset raises KeyError in the middle of an epoch.
unknown_pids = [pid for pid in silver_labels if pid not in pid2idx]
if unknown_pids:
    raise KeyError(
        f"{len(unknown_pids)} silver-labelled ids are missing from the train corpus, "
        f"e.g. {unknown_pids[:5]}"
    )


ðŸ”§ Device: cuda
Train IDs: 29487 | Test IDs: 19658

ðŸ§  Loading X_all.pt ...
âœ“ X_train: torch.Size([29487, 768]), X_test: torch.Size([19658, 768])


In [41]:
class MultiLabelDataset(Dataset):
    """Map-style dataset yielding an embedding and its multi-hot target.

    Embeddings are looked up in the notebook-level ``X_train`` through
    ``pid2idx``; the target width is the notebook-level ``n_classes``.
    """

    def __init__(self, pids, labels_dict):
        # pids: sequence of product ids; labels_dict: product id -> iterable of class indices
        self.pids, self.labels = pids, labels_dict

    def __len__(self):
        return len(self.pids)

    def __getitem__(self, idx):
        product_id = self.pids[idx]
        features = X_train[pid2idx[product_id]]

        target = torch.zeros(n_classes)
        for class_idx in self.labels[product_id]:
            # Indices outside [0, n_classes) are skipped without error.
            if class_idx < 0 or class_idx >= n_classes:
                continue
            target[class_idx] = 1.0

        return dict(X=features, y=target)

# TRAIN / VAL splits: hold out 20% of the silver-labelled ids, with a fixed seed.
train_p, val_p = train_test_split(list(silver_labels), test_size=0.2, random_state=42)

# Both datasets share the same label dictionary; only the id lists differ.
train_dataset, val_dataset = (
    MultiLabelDataset(split_pids, silver_labels) for split_pids in (train_p, val_p)
)

# Only the training loader reshuffles between epochs.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)


In [42]:
class Classifier(nn.Module):
    """One-hidden-layer MLP that outputs one raw logit per class.

    Layers: Linear(dim, 512) -> BatchNorm1d -> ReLU -> Dropout(0.3)
    -> Linear(512, n_classes). No sigmoid is applied inside the module.
    """

    def __init__(self, dim, n_classes):
        super().__init__()
        hidden = 512
        # Attribute names double as state_dict key prefixes; keep them stable.
        self.fc1 = nn.Linear(dim, hidden)
        self.bn1 = nn.BatchNorm1d(hidden)
        self.drop = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(hidden, n_classes)

    def forward(self, x):
        hidden_act = self.fc1(x)
        hidden_act = self.bn1(hidden_act)
        hidden_act = F.relu(hidden_act)
        return self.fc2(self.drop(hidden_act))

# Input width is taken from the embedding matrix; one output logit per class.
model = Classifier(dim=X_train.shape[1], n_classes=n_classes)
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-4)

In [43]:
def evaluate(model, loader, thr=0.25):
    """Score ``model`` on ``loader`` using thresholded sigmoid predictions.

    Args:
        model: Module mapping a feature batch to per-class logits.
        loader: Iterable of dict batches with keys ``"X"`` (features) and
            ``"y"`` (multi-hot targets).
        thr: A class is predicted when its sigmoid probability is strictly
            greater than this value.

    Returns:
        tuple: ``(samples-averaged F1, macro-averaged F1)``.

    Raises:
        ValueError: If ``loader`` yields no batches.

    Note:
        The model is left in eval mode on return.
    """
    # NOTE(review): the submission cell of this notebook thresholds at 0.5
    # (THR), so F1 measured at the 0.25 default uses a different decision rule.
    model.eval()
    # Run on whatever device the model lives on, instead of assuming the
    # loader already yields tensors there.
    first_param = next(model.parameters(), None)
    preds, labels = [], []

    with torch.no_grad():
        for batch in loader:
            X = batch["X"]
            if first_param is not None:
                X = X.to(first_param.device)
            y = batch["y"].cpu().numpy()

            prob = torch.sigmoid(model(X)).cpu().numpy()
            preds.append((prob > thr).astype(int))
            labels.append(y)

    # One (n_samples, n_classes) array each, rather than a list of rows.
    y_pred = np.concatenate(preds, axis=0)
    y_true = np.concatenate(labels, axis=0)

    # zero_division=0 returns the same scores as the default setting but
    # without an UndefinedMetricWarning on every call.
    f1s = f1_score(y_true, y_pred, average="samples", zero_division=0)
    f1m = f1_score(y_true, y_pred, average="macro", zero_division=0)
    return f1s, f1m


In [44]:
# Train for a fixed number of epochs, validating after each one and keeping
# the checkpoint with the highest samples-averaged validation F1.
print("\nðŸš€ Training...")
best = 0
epochs = 12

for epoch in range(1, epochs+1):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # (dropout / batch-norm statistics) at the start of every epoch.
    model.train()
    total = 0  # running sum of per-batch losses for this epoch

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        # Features are used as yielded by the loader; only the targets are
        # moved to `device` here.
        X = batch["X"]
        y = batch["y"].to(device)

        logits = model(X)
        # Multi-label objective: independent sigmoid + BCE per class.
        loss = F.binary_cross_entropy_with_logits(logits, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total += loss.item()

    # Uses evaluate()'s default threshold; the macro score (f1m) is returned
    # but not reported or used for model selection.
    f1s, f1m = evaluate(model, val_loader)

    # Reported loss is the mean of the per-batch losses.
    print(f"[Epoch {epoch}] loss={total/len(train_loader):.4f} | F1={f1s:.4f}")

    # Checkpoint only on a strict improvement.
    # NOTE(review): if validation F1 never exceeds 0, nothing is written to
    # MODEL_PATH, yet the final message below still prints the path.
    if f1s > best:
        best = f1s
        torch.save(model.state_dict(), MODEL_PATH)
        print(f"ðŸ”¥ New best model saved ({best:.4f})")

print(f"\nðŸŽ‰ Best validation F1 = {best:.4f}")
print(f"ðŸ“¦ Model saved at: {MODEL_PATH}")



ðŸš€ Training...


Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 124.24it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 1] loss=0.0429 | F1=0.7086
ðŸ”¥ New best model saved (0.7086)


Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 125.29it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 2] loss=0.0092 | F1=0.7507
ðŸ”¥ New best model saved (0.7507)


Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 129.48it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 3] loss=0.0075 | F1=0.7660
ðŸ”¥ New best model saved (0.7660)


Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 135.93it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 4] loss=0.0068 | F1=0.7760
ðŸ”¥ New best model saved (0.7760)


Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 133.58it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 5] loss=0.0062 | F1=0.7816
ðŸ”¥ New best model saved (0.7816)


Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 132.72it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 6] loss=0.0059 | F1=0.7837
ðŸ”¥ New best model saved (0.7837)


Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 133.49it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 7] loss=0.0056 | F1=0.7902
ðŸ”¥ New best model saved (0.7902)


Epoch 8: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 130.45it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 8] loss=0.0054 | F1=0.7901


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 134.59it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 9] loss=0.0052 | F1=0.7913
ðŸ”¥ New best model saved (0.7913)


Epoch 10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 129.30it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 10] loss=0.0050 | F1=0.7921
ðŸ”¥ New best model saved (0.7921)


Epoch 11: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 134.87it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Epoch 11] loss=0.0048 | F1=0.7944
ðŸ”¥ New best model saved (0.7944)


Epoch 12: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 369/369 [00:02<00:00, 126.86it/s]


[Epoch 12] loss=0.0047 | F1=0.7950
ðŸ”¥ New best model saved (0.7950)

ðŸŽ‰ Best validation F1 = 0.7950
ðŸ“¦ Model saved at: Models\silver_classifier.pt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [45]:
import csv

MODEL_PATH = Path("Models") / "silver_classifier.pt"
OUT_DIR    = Path("Submission")
OUT_DIR.mkdir(exist_ok=True)
OUT_PATH   = OUT_DIR / "submission.csv"

# ==========================================================
# PREDICTION SETTINGS
# ==========================================================
# NOTE(review): validation in evaluate() defaults to thr=0.25 while the
# submission uses 0.5 plus the min/max post-processing below, so the reported
# validation F1 does not describe exactly this decision rule.
THR = 0.5    # a class is kept when its sigmoid probability is > THR
MIN_L = 2    # every product gets at least this many labels
MAX_L = 3    # ... and at most this many


def _select_labels(p, thr, min_l, max_l):
    """Turn one row of class probabilities into a 0/1 vector with min_l..max_l positives."""
    pred = (p > thr).astype(int)
    n_pos = int(pred.sum())
    if n_pos < min_l:
        # Too few confident classes: back-fill with the highest-probability
        # ones. Existing positives already rank highest, so they are kept.
        pred[np.argsort(p)[-min_l:]] = 1
    elif n_pos > max_l:
        # Too many: keep only the max_l most probable classes.
        pred = np.zeros_like(pred)
        pred[np.argsort(p)[-max_l:]] = 1
    return pred


# Same parsing as for the training corpus, via the shared helper.
test_ids = load_ids(TEST_CORPUS_PATH)

print(f"Loaded test IDs: {len(test_ids)}")

# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects; only point X_ALL_PATH at files from a trusted source.
X_all = torch.load(X_ALL_PATH, weights_only=False)
if isinstance(X_all, np.ndarray):
    X_all = torch.from_numpy(X_all)
elif isinstance(X_all, list):
    X_all = torch.stack(X_all)

X_all = X_all.float()

n_test = len(test_ids)
if n_test > len(X_all):
    raise ValueError(f"{n_test} test ids but only {len(X_all)} embedding rows")
# Test rows are the last n_test rows; an explicit start index also gives an
# empty slice when n_test == 0 (X_all[-0:] would return every row).
X_test = X_all[len(X_all) - n_test:]

# The checkpoint is a plain state_dict, so the restricted (weights-only)
# loader is sufficient and avoids unpickling arbitrary objects.
model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu", weights_only=True))
model.eval()

# ==========================================================
# PREDICTION
# ==========================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
X_test = X_test.to(device)

preds = []

with torch.no_grad():
    for start in tqdm(range(0, len(X_test), 64)):
        batch = X_test[start:start+64]
        probs = torch.sigmoid(model(batch)).cpu().numpy()

        for p in probs:
            pred = _select_labels(p, THR, MIN_L, MAX_L)
            # Class indices of the selected labels, ascending, as strings.
            labels = [str(i) for i in np.flatnonzero(pred)]
            preds.append(labels)

# zip() below would silently truncate on a length mismatch.
assert len(preds) == len(test_ids), "prediction count does not match test ids"

# ==========================================================
# SAVE CSV
# ==========================================================
with open(OUT_PATH, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["id", "label"])
    for pid, labels in zip(test_ids, preds):
        # Labels go into a single comma-joined field (quoted by csv.writer).
        w.writerow([pid, ",".join(labels)])

print(f"ðŸŽ‰ Submission saved â†’ {OUT_PATH}")


Loaded test IDs: 19658


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 308/308 [00:02<00:00, 145.96it/s]

ðŸŽ‰ Submission saved â†’ Submission\submission.csv



