In [16]:
import numpy as np
import torch
import json
from tqdm import tqdm
from pathlib import Path
from utils import * 
import copy
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
import os
import csv
from tqdm import tqdm
import random

# Single source of truth for every RNG seeded below; change it here only.
SEED = 42

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

Device: cuda


In [17]:
import site, sys

# site.USER_SITE may still be None if getusersitepackages() has not been called,
# so resolve the directory through the function instead of the raw attribute.
user_site = site.getusersitepackages()

# Only append when missing so re-running this cell does not pile up duplicates.
if user_site not in sys.path:
    sys.path.append(user_site)
print(user_site)


C:\Users\noamc\AppData\Roaming\Python\Python312\site-packages


In [18]:
import torch

# Environment report, one value per line: the PyTorch version, the CUDA version
# reported by torch.version.cuda, and whether CUDA is usable from this kernel.
print(torch.__version__, torch.version.cuda, torch.cuda.is_available(), sep="\n")

# Name the first CUDA device only when one is actually available.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

2.6.0+cu124
12.4
True
NVIDIA GeForce RTX 3060 Laptop GPU


In [None]:
# Load Data
# Default paths (all relative to the notebook's working directory)
ROOT = Path("Amazon_products") # Root Amazon_products directory
TRAIN_DIR = ROOT / "train"  # sub-directory holding the training corpus
TEST_DIR = ROOT / "test"  # sub-directory holding the test corpus

# NOTE: the two corpus paths are plain strings (os.path.join), while the class
# files below are pathlib.Path objects; both forms are accepted by open().
TEST_CORPUS_PATH = os.path.join(TEST_DIR, "test_corpus.txt")  # product_id \t text
TRAIN_CORPUS_PATH = os.path.join(TRAIN_DIR, "train_corpus.txt")  # read with the same loader as the test corpus

CLASS_HIERARCHY_PATH = ROOT / "class_hierarchy.txt"  # read by load_multilabel: two tab-separated integers per line
CLASS_RELATED_PATH = ROOT / "class_related_keywords.txt"  # read by load_class_keywords: "name: kw1, kw2, ..."
CLASS_PATH = ROOT / "classes.txt"  # read by load_corpus: class_id \t class_name

SUBMISSION_PATH = "Submission/submission.csv"  # output file

# --- Constants ---
NUM_CLASSES = 531  # total number of classes (0–530)
MIN_LABELS = 2     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample

# Load corpus
def load_corpus(path):
    """Read a ``key<TAB>value`` text file into a ``{key: value}`` dict of strings.

    Each line is stripped of surrounding whitespace and split on its first tab;
    lines without a tab after stripping are ignored. When a key occurs more than
    once, the last occurrence wins.

    Args:
        path: Location of the UTF-8 encoded file.

    Returns:
        dict mapping the text before the first tab to the text after it.
    """
    mapping = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            key, tab, value = raw_line.strip().partition("\t")
            # An empty separator means the line had no tab: nothing to store.
            if tab:
                mapping[key] = value
    return mapping

def load_multilabel(path):
    """Read ``int<TAB>int`` pairs and group the second column by the first.

    Lines that do not contain exactly two tab-separated fields (after stripping
    surrounding whitespace) are skipped. Fields that are not valid integers make
    ``int()`` raise ``ValueError``.

    Args:
        path: Location of the UTF-8 encoded file.

    Returns:
        dict mapping each first-column integer to the list of second-column
        integers seen with it, in file order.
    """
    grouped = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if len(fields) != 2:
                continue
            key, value = (int(field) for field in fields)
            grouped.setdefault(key, []).append(value)
    return grouped

def load_class_keywords(path):
    """Read ``name:kw1,kw2,...`` lines into a ``{name: [keywords]}`` dict.

    Lines without a colon are skipped. The line is split on its first colon
    only; the right-hand side is split on commas, each keyword is stripped and
    empty keywords are dropped. The name is kept as found to the left of the
    colon (only the line as a whole is stripped).

    Args:
        path: Location of the UTF-8 encoded file.

    Returns:
        dict mapping each name to its list of cleaned keywords.
    """
    keywords_by_class = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            name, colon, remainder = raw_line.strip().partition(":")
            if not colon:
                continue
            cleaned = (piece.strip() for piece in remainder.split(","))
            keywords_by_class[name] = [piece for piece in cleaned if piece]
    return keywords_by_class

# Product corpora: {product_id (str): text}, plus the ids in file order.
id2text_test = load_corpus(TEST_CORPUS_PATH)
id_list_test = list(id2text_test)

id2text_train = load_corpus(TRAIN_CORPUS_PATH)
id_list_train = list(id2text_train)

# Class metadata: id -> name, integer key -> list of integers from the
# hierarchy file, and class name -> related keywords.
id2class = load_corpus(CLASS_PATH)
class2hierarchy = load_multilabel(CLASS_HIERARCHY_PATH)
class2related = load_class_keywords(CLASS_RELATED_PATH)

# Quick size summary of what was loaded.
print(f"Train: {len(id2text_train)} samples")
print(f"Test: {len(id2text_test)} samples")
print(f"Classes: {len(id2class)}")
print("Silver labels loaded")

Train: 29487 samples
Test: 19658 samples
Classes: 531
Silver labels loaded


In [20]:
from transformers import AutoTokenizer, AutoModel
import torch, numpy as np, os
from tqdm import tqdm

# Target device for the encoder; get_embeddings moves every tokenized batch to it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained checkpoint shared by the tokenizer and the encoder.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# .to() and .eval() both return the module itself, so the calls can be chained;
# eval mode switches off dropout for inference.
model = AutoModel.from_pretrained(model_name).to(device).eval()

def get_embeddings(texts, batch_size=16):
    """Encode texts into one vector each using the encoder's first-position state.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of strings; it is sliced into consecutive batches.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        np.ndarray with one row per input text, holding
        ``last_hidden_state[:, 0, :]`` for that text. For empty ``texts`` an
        array of shape ``(0, model.config.hidden_size)`` is returned.
    """
    if len(texts) == 0:
        # np.vstack raises ValueError on an empty list, so return an empty
        # matrix with the right width instead of crashing.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
        batch = texts[i:i+batch_size]
        # Pad to the longest text in the batch and truncate over-long inputs.
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():  # inference only: no autograd graph needed
            outputs = model(**inputs)
            emb = outputs.last_hidden_state[:, 0, :]  # first (CLS-style) token
        # Move each batch to host memory right away to keep GPU usage flat.
        embs.append(emb.cpu().numpy())
    return np.vstack(embs)

# --- Compute product embeddings ---
# Row k of X_train corresponds to train_ids[k].
train_ids = list(id2text_train)
train_texts = [id2text_train[pid] for pid in train_ids]
X_train = get_embeddings(train_texts)
print(f"X_train shape: {X_train.shape}")

# --- Compute class embeddings ---
# Class names are looked up by the string form of 0..len(id2class)-1, so row k
# of the resulting tensor belongs to class id k.
label_texts = [id2class[str(class_idx)] for class_idx in range(len(id2class))]
label_embs = torch.tensor(get_embeddings(label_texts))

# --- Propagate hierarchy info ---
def propagate_hierarchy_to_embeddings(label_emb, hierarchy, alpha=0.20):
    """Blend each parent's embedding with the mean embedding of its children.

    For every ``parent -> children`` entry with at least one child, the parent
    row becomes ``(1 - alpha) * parent + alpha * mean(children)``. All values are
    read from the original ``label_emb``, so the result does not depend on the
    iteration order of ``hierarchy``. Rows that are not a parent with children
    are copied unchanged.

    Args:
        label_emb: Tensor indexed by class id along its first dimension.
        hierarchy: Mapping from a parent index to a list of child indices.
        alpha: Weight given to the children's mean.

    Returns:
        A new tensor of the same shape; ``label_emb`` itself is not modified.
    """
    blended = label_emb.clone()
    parent_weight = 1 - alpha
    for parent_id, child_ids in hierarchy.items():
        if len(child_ids) == 0:
            continue
        child_mean = label_emb[child_ids].mean(0)
        blended[parent_id] = parent_weight * label_emb[parent_id] + alpha * child_mean
    return blended

# Mix each parent class embedding with the mean of its children's embeddings.
label_embs_enriched = propagate_hierarchy_to_embeddings(
    label_embs,
    class2hierarchy,
    alpha=0.20,
)

# --- Persist both matrices as float32 tensors ---
os.makedirs("Embeddings", exist_ok=True)

torch.save(torch.from_numpy(X_train).float(), "Embeddings/X_train.pt")
torch.save(label_embs_enriched.float(), "Embeddings/label_emb.pt")

# One print call, one item per line, for the list of written files.
print(
    "\nSaved:",
    "Embeddings/X_train.pt",
    "Embeddings/label_emb.pt (hierarchy-aware)",
    sep="\n",
)


Encoding: 100%|██████████| 1843/1843 [15:50<00:00,  1.94it/s]


X_train shape: (29487, 768)


Encoding: 100%|██████████| 34/34 [00:00<00:00, 38.50it/s]



Saved:
Embeddings/X_train.pt
Embeddings/label_emb.pt (hierarchy-aware)
