In [10]:
import random
import numpy as np
import torch
import json
from tqdm import tqdm
from pathlib import Path
from utils import * 
import copy
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
import os
import csv
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [11]:
# Default paths
ROOT = Path("Amazon_products") # Root Amazon_products directory
TRAIN_DIR = ROOT / "train"
TEST_DIR = ROOT / "test"

TEST_CORPUS_PATH = os.path.join(TEST_DIR, "test_corpus.txt")  # product_id \t text
TRAIN_CORPUS_PATH = os.path.join(TRAIN_DIR, "train_corpus.txt")

CLASS_HIERARCHY_PATH = ROOT / "class_hierarchy.txt" 
CLASS_RELATED_PATH = ROOT / "class_related_keywords.txt" 
CLASS_PATH = ROOT / "classes.txt" 

SUBMISSION_PATH = "Submission/submission.csv"  # output file

# --- Constants ---
NUM_CLASSES = 531  # total number of classes (0â€“530)
MIN_LABELS = 2     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample


In [12]:
# --- Load ---

""" 
1. Training corpus: 29,487 product reviews without class labels.
2. Classes: 531 product categories.
3. Class hierarchy: A taxonomy file that defines parentâ€“child relationships among classes (each line represents one relation).
4. Class-related keywords: A list of keywords associated with each product class.
5. Test corpus: 19,658 product reviews for evaluation.
"""

def load_corpus(path):
    """Load test corpus into {id: text} dictionary."""
    id2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                id, text = parts
                id2text[id] = text
    return id2text

def load_multilabel(path):
    """Load multi-label data into {id: [labels]} dictionary."""
    id2labels = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 2:
                pid, label = parts
                pid = int(pid)
                label = int(label)

                if pid not in id2labels:
                    id2labels[pid] = []

                id2labels[pid].append(label)
    return id2labels

def load_class_keywords(path):
    """Load class keywords into {class_name: [keywords]} dictionary."""
    class2keywords = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if ":" not in line:
                continue
            classname, keywords = line.strip().split(":", 1)
            keyword_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
            class2keywords[classname] = keyword_list
    return class2keywords

id2text_test = load_corpus(TEST_CORPUS_PATH)
id_list_test = list(id2text_test.keys())

id2text_train = load_corpus(TRAIN_CORPUS_PATH)
id_list_train = list(id2text_train.keys())

id2class = load_corpus(CLASS_PATH)
class2hierarchy = load_multilabel(CLASS_HIERARCHY_PATH)
class2related = load_class_keywords(CLASS_RELATED_PATH)

# ======== Print ===========

print(len(id2class)) 
for i in range(10):
    print(i, ":", id2class[str(i)])

print()

print(len(id2text_test)) 
for i, (id, text) in enumerate(id2text_test.items()):
    if i >= 10: 
        break
    print(id, ":", text)

print()

print(len(id2text_train)) 
for i, (id, text) in enumerate(id2text_train.items()):
    if i >= 10: 
        break
    print(id, ":", text)

print()
print(len(class2hierarchy)) 
for i, (id, node) in enumerate(class2hierarchy.items()):
    if i >= 10: 
        break
    print(id, ":", node)


print()
print(len(class2related)) 
for i, (classp, text) in enumerate(class2related.items()):
    if i >= 10: 
        break
    print(classp, ":", text)


531
0 : grocery_gourmet_food
1 : meat_poultry
2 : jerky
3 : toys_games
4 : games
5 : puzzles
6 : jigsaw_puzzles
7 : board_games
8 : beverages
9 : juices

19658
0 : conair cs15tcs professional straight styles straightening iron woah ! sure this straightener looks like all the other crappy straightners in the world , but there 's a twist to this one ! it is my first straightner and i 've had it for about 7 months . i bought it only because i was desperate for a cheap straightener because my hair is very thick , long , wavy ! i 'm looking for a new straighner right now ... but until then this one is doing just fine . if it works for me , it will work for you !
1 : barbie ballet shoes icon doll i was looking round the toysrus website and found this cheap doll ! " wow " i said . so i got it and her body is painted on ! which is really cute ! , parents would n't you like to get a toy where you save yourself from picking up another barbie item from the floor ! well , make a cardboard danceflo

In [13]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def preprocess_text(text):
    text = re.sub(r"[>&]", " ", text)
    text = re.sub(r"[^a-zA-Z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()


label_texts = []
for cid in range(NUM_CLASSES):
    cid_str = str(cid)
    class_name = id2class[cid_str]              
    class_name_clean = preprocess_text(class_name)
    
    keywords = class2related.get(class_name, [])  
    keywords_clean = " ".join(preprocess_text(kw) for kw in keywords)
    
    full_label_text = (class_name_clean + " " + keywords_clean).strip()
    label_texts.append(full_label_text)


train_ids = list(id2text_train.keys())
train_texts = [preprocess_text(id2text_train[pid]) for pid in train_ids]

test_ids = list(id2text_test.keys())
test_texts = [preprocess_text(id2text_test[pid]) for pid in test_ids]

print("Nb train:", len(train_texts))
print("Nb test :", len(test_texts))
print("Nb classes:", len(label_texts))


Nb train: 29487
Nb test : 19658
Nb classes: 531


In [14]:
vectorizer = TfidfVectorizer()
all_texts = train_texts + label_texts
tfidf_matrix = vectorizer.fit_transform(all_texts)

n_train = len(train_texts)
n_labels = len(label_texts)

train_tfidf = tfidf_matrix[:n_train]
label_tfidf = tfidf_matrix[n_train:]

print("TF-IDF shapes:")
print("  train:", train_tfidf.shape)
print("  label:", label_tfidf.shape)

lex_sim_train = cosine_similarity(train_tfidf, label_tfidf)
print("Lexical similarity (train):", lex_sim_train.shape)

TF-IDF shapes:
  train: (29487, 46599)
  label: (531, 46599)
Lexical similarity (train): (29487, 531)


In [None]:
"""#  SEMANTIC EMBEDDINGS PART

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name, device=device)

# --- Encode class texts (531 classes, enriched with keywords)
label_emb = model.encode(label_texts, convert_to_tensor=True, show_progress_bar=True, batch_size=64)

# --- Encode training product texts
train_emb = model.encode(train_texts, convert_to_tensor=True, show_progress_bar=True, batch_size=64)

print("\nShapes:")
print(f"  Train embeddings: {train_emb.shape}")
print(f"  Label embeddings: {label_emb.shape}")

# --- Compute cosine similarities (semantic similarity)
train_np = train_emb.cpu().numpy()
label_np = label_emb.cpu().numpy()

bert_sim_train = cosine_similarity(train_np, label_np)

print(f"BERT similarity (train): {bert_sim_train.shape}")

# --- Save for reuse
torch.save({
    "train_emb": train_emb,
    "label_emb": label_emb,
    "bert_sim_train": torch.tensor(bert_sim_train),
}, "Amazon_products/semantic_embeddings.pt")

print("\nSemantic embeddings and similarities saved to Amazon_products/semantic_embeddings.pt")"""


'#  SEMANTIC EMBEDDINGS PART\n\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport torch\nimport numpy as np\nfrom tqdm import tqdm\n\ndevice = "cuda" if torch.cuda.is_available() else "cpu"\n\nmodel_name = "all-mpnet-base-v2"\nmodel = SentenceTransformer(model_name, device=device)\n\n# --- Encode class texts (531 classes, enriched with keywords)\nprint("\nðŸ”¹ Encoding class embeddings...")\nlabel_emb = model.encode(label_texts, convert_to_tensor=True, show_progress_bar=True, batch_size=64)\n\n# --- Encode training product texts\nprint("ðŸ”¹ Encoding train product embeddings...")\ntrain_emb = model.encode(train_texts, convert_to_tensor=True, show_progress_bar=True, batch_size=64)\n\nprint("\nShapes:")\nprint(f"  Train embeddings: {train_emb.shape}")\nprint(f"  Label embeddings: {label_emb.shape}")\n\n# --- Compute cosine similarities (semantic similarity)\nprint("\nðŸ”¹ Computing cosine similarities...")\ntrain_np = tr

In [16]:
import os
import json
import numpy as np
import torch
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

def hierarchy_consistency(silver, hierarchy):
    ok = total = 0
    for labels in silver.values():
        L = set(labels)
        for parent, children in hierarchy.items():
            parent = int(parent)
            for child in children:
                child = int(child)
                if child in L:
                    total += 1
                    if parent in L:
                        ok += 1
    return ok / total if total > 0 else 0


def analyze_coverage(silver, name, n_classes=531):
    all_labels = [c for labels in silver.values() for c in labels]
    unique = len(set(all_labels))
    counter = Counter(all_labels)
    top5 = counter.most_common(5)

    print(f"\n{name}:")
    print(f"  Coverage: {unique}/{n_classes} ({unique/n_classes*100:.1f}%)")
    print(f"  Top-5 most frequent classes:")
    for cls, count in top5:
        print(f"    Class {cls}: {count} times ({count/len(silver)*100:.1f}%)")

In [17]:
data_sem = torch.load("Embeddings/semantic_embeddings.pt")
bert_sim_train = data_sem["bert_sim_train"].numpy()

alpha = 0.4
final_sim = alpha * lex_sim_train + (1 - alpha) * bert_sim_train
print(f"Final similarity matrix: {final_sim.shape}")

TOP_K = 3
silver_baseline = {
    pid: np.argsort(-final_sim[i])[:TOP_K].tolist()
    for i, pid in enumerate(id_list_train)
}

train_consistency_no = hierarchy_consistency(silver_baseline, class2hierarchy)
print(f"\nWithout hierarchy : {train_consistency_no:.4f}")
analyze_coverage(silver_baseline, "Baseline Train (no hierarchy)")

id2class_name = {int(cid): cname.strip() for cid, cname in id2class.items()}
for i, pid in enumerate(list(id_list_train)[:5]):
    print(f"\n[Doc {pid}]")
    print("Text:", id2text_train[pid][:200].replace("\n", " ") + "...")
    for cls_id in silver_baseline[pid]:
        print(f"   {id2class_name.get(cls_id, f'Class_{cls_id}'):<35} score={final_sim[i, cls_id]:.4f}")

Final similarity matrix: (29487, 531)

Without hierarchy : 0.1062

Baseline Train (no hierarchy):
  Coverage: 530/531 (99.8%)
  Top-5 most frequent classes:
    Class 148: 1495 times (5.1%)
    Class 43: 1256 times (4.3%)
    Class 220: 1170 times (4.0%)
    Class 164: 1087 times (3.7%)
    Class 25: 984 times (3.3%)

[Doc 0]
Text: omron hem 790it automatic blood pressure monitor with advanced omron health management software so far this machine has worked well and is very simple to use . it is nice to have immediate feedback on...
   health_monitors                     score=0.3620
   automatic_feeders                   score=0.2359
   health_personal_care                score=0.2321

[Doc 1]
Text: natural factors whey factors chocolate works well , but there is a lot of dead space in the container when you first open it up . the container comes 3 4 4 5 full and the rest in empty space ....
   chocolate                           score=0.3875
   chocolate_bars                      scor

In [26]:
def propagate_hierarchy_to_embeddings(label_emb, hierarchy, alpha_prop=0.25):
    enhanced = label_emb.clone()
    for parent, children in hierarchy.items():
        parent = int(parent)
        if children:
            child_embs = label_emb[[int(c) for c in children]]
            enhanced[parent] = (1 - alpha_prop) * label_emb[parent] + alpha_prop * child_embs.mean(0)
    return enhanced


def enforce_hierarchy_constraints(scores, hierarchy, top_k=3, parent_boost=0.15):
    scores = scores.copy()
    for parent, children in hierarchy.items():
        parent = int(parent)
        if children:
            max_child_score = max(scores[int(c)] for c in children)
            scores[parent] = max(scores[parent], max_child_score * (1 - parent_boost))
    candidates = np.argsort(-scores)[:top_k * 3]
    selected, added = [], set()
    for c in candidates:
        if len(selected) >= top_k: break
        for parent, children in hierarchy.items():
            parent = int(parent)
            if c in [int(x) for x in children]:
                added.add(parent)
        while added and len(selected) < top_k:
            p = max(added, key=lambda x: scores[x])
            added.remove(p)
            if p not in selected: selected.append(p)
        if len(selected) < top_k and c not in selected:
            selected.append(c)
    return selected[:top_k]


print("\n=== Generating hierarchical silver labels ===")
label_emb_enriched = propagate_hierarchy_to_embeddings(
    data_sem["label_emb"], class2hierarchy, alpha_prop=0.25
)

bert_sim_enriched = cosine_similarity(
    data_sem["train_emb"].cpu().numpy(),
    label_emb_enriched.cpu().numpy()
)

final_sim_enriched = alpha * lex_sim_train + (1 - alpha) * bert_sim_enriched

silver_hierarchy = {
    pid: enforce_hierarchy_constraints(final_sim_enriched[i], class2hierarchy, TOP_K, 0.15)
    for i, pid in enumerate(id_list_train)
}

train_consistency_hier = hierarchy_consistency(silver_hierarchy, class2hierarchy)

print(f"  Baseline (no hierarchy):  {train_consistency_no:.4f}")
print(f"  With hierarchy:           {train_consistency_hier:.4f}")


analyze_coverage(silver_hierarchy, "Baseline Train (no hierarchy)")

id2class_name = {int(cid): cname.strip() for cid, cname in id2class.items()}
for i, pid in enumerate(list(id_list_train)[:5]):
    print(f"\n[Doc {pid}]")
    print("Text:", id2text_train[pid][:200].replace("\n", " ") + "...")
    for cls_id in silver_hierarchy[pid]:
        print(f"   {id2class_name.get(cls_id, f'Class_{cls_id}'):<35} score={final_sim[i, cls_id]:.4f}")


=== Generating hierarchical silver labels ===
  Baseline (no hierarchy):  0.1062
  With hierarchy:           0.5820

Baseline Train (no hierarchy):
  Coverage: 518/531 (97.6%)
  Top-5 most frequent classes:
    Class 3: 3543 times (12.0%)
    Class 21: 2994 times (10.2%)
    Class 23: 2941 times (10.0%)
    Class 10: 2862 times (9.7%)
    Class 24: 2850 times (9.7%)

[Doc 0]
Text: omron hem 790it automatic blood pressure monitor with advanced omron health management software so far this machine has worked well and is very simple to use . it is nice to have immediate feedback on...
   medical_supplies_equipment          score=0.1758
   health_monitors                     score=0.3620
   health_personal_care                score=0.2321

[Doc 1]
Text: natural factors whey factors chocolate works well , but there is a lot of dead space in the container when you first open it up . the container comes 3 4 4 5 full and the rest in empty space ....
   cooking_baking_supplies             score

In [19]:
import json, os

def to_json_serializable(d):
    """Convertit les types NumPy â†’ int/float natifs pour JSON."""
    return {str(k): [int(x) for x in v] for k, v in d.items()}

os.makedirs("Silver", exist_ok=True)

# --- Save hierarchical labels ---
output_path = "Silver/hier.json"
results_hier = {"silver_hierarchy": to_json_serializable(silver_hierarchy)}

with open(output_path, "w") as f:
    json.dump(results_hier, f, indent=2)

print(f"\nSilver labels saved to: {output_path}")

# --- Save baseline labels ---
output_path = "Silver/base.json"
results_base = {"silver_baseline": to_json_serializable(silver_baseline)}

with open(output_path, "w") as f:
    json.dump(results_base, f, indent=2)

print(f"\nSilver labels saved to: {output_path}")



Silver labels saved to: Silver/hier.json

Silver labels saved to: Silver/base.json
