In [1]:
import random
import numpy as np
import torch
import json
from tqdm import tqdm
from pathlib import Path
from utils import * 
import copy
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
import os
import csv
from tqdm import tqdm
from transformers import RobertaModel, RobertaTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from collections import Counter

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every RNG this notebook touches (Python, NumPy, PyTorch CPU and CUDA)
# with the same fixed value so repeated runs start from the same state.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [2]:
# Default paths (all relative to the notebook's working directory)
ROOT = Path("Amazon_products") # Root Amazon_products directory
TRAIN_DIR = ROOT / "train"
TEST_DIR = ROOT / "test"

TEST_CORPUS_PATH = os.path.join(TEST_DIR, "test_corpus.txt")  # product_id \t text
TRAIN_CORPUS_PATH = os.path.join(TRAIN_DIR, "train_corpus.txt")  # read with the same loader as the test corpus

CLASS_HIERARCHY_PATH = ROOT / "class_hierarchy.txt"  # parsed as two tab-separated integers per line (used as parent \t child)
CLASS_RELATED_PATH = ROOT / "class_related_keywords.txt"  # parsed as class_name:kw1,kw2,...
CLASS_PATH = ROOT / "classes.txt"  # parsed as class_id \t class_name

SUBMISSION_PATH = "Submission/submission.csv"  # output file

# --- Constants ---
NUM_CLASSES = 531  # total number of classes (0-530)
MIN_LABELS = 1     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample

In [3]:
# --- Load ---
def load_corpus(path):
    """Read a two-column, tab-separated file into a ``{id: text}`` dictionary.

    Each line is stripped of surrounding whitespace and split on its first
    tab. Lines without a tab after stripping are ignored. Keys and values are
    kept as strings; when an id occurs more than once the last text wins.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        dict mapping the text before the first tab to the text after it.
    """
    with open(path, "r", encoding="utf-8") as handle:
        # partition() yields an empty separator when the line has no tab,
        # which is exactly the "not two fields" case to skip.
        rows = (raw.strip().partition("\t") for raw in handle)
        return {key: value for key, sep, value in rows if sep}

def load_multilabel(path):
    """Group a two-column integer file into a ``{first: [second, ...]}`` dict.

    Every line is stripped and split on tabs; only lines with exactly two
    fields are used. Both fields are converted with ``int`` (a non-numeric
    field raises ``ValueError``). Values are appended in file order.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        dict mapping each first-column integer to the list of second-column
        integers seen with it.
    """
    grouped = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            if len(fields) != 2:
                continue
            key, value = (int(field) for field in fields)
            grouped.setdefault(key, []).append(value)
    return grouped

def load_class_keywords(path):
    """Parse ``class_name:kw1,kw2,...`` lines into ``{class_name: [keywords]}``.

    Lines without a colon are skipped. The line is stripped, then split on the
    first colon; the part before it is used as the key unchanged, and the part
    after it is split on commas with each keyword stripped and empty entries
    dropped.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        dict mapping class name to its list of keywords (possibly empty).
    """
    mapping = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            name, sep, tail = raw.strip().partition(":")
            if not sep:
                continue
            # filter(None, ...) removes the empty strings left by ",," or a
            # trailing comma once each piece has been stripped.
            mapping[name] = list(filter(None, map(str.strip, tail.split(","))))
    return mapping

# Test corpus as {id: text} (ids stay strings), plus the ids in file order.
id2text_test = load_corpus(TEST_CORPUS_PATH)
id_list_test = list(id2text_test.keys())

# Training corpus, loaded the same way.
id2text_train = load_corpus(TRAIN_CORPUS_PATH)
id_list_train = list(id2text_train.keys())

# classes.txt is read with the corpus loader, so id2class is keyed by the
# class id as a *string* (lookups below use str(class_id)).
id2class = load_corpus(CLASS_PATH)
# {int: [int, ...]}; the helpers below treat keys as parents and values as children.
class2hierarchy = load_multilabel(CLASS_HIERARCHY_PATH)
# {class_name: [keyword, ...]}
class2related = load_class_keywords(CLASS_RELATED_PATH)


In [4]:
def label_stats(name, silver):
    """Print how many labels each document carries (count, mean, min, max).

    Args:
        name: Heading printed above the statistics.
        silver: Mapping of document id to its labels. A value may be either a
            sequence of labels or a record dict holding the labels under the
            ``"labels"`` key (the shape produced by the silver-label
            generator); for such records the length of ``"labels"`` is used
            rather than the number of keys in the record.

    Returns:
        None. Output goes to stdout.
    """
    counts = [
        len(entry["labels"]) if isinstance(entry, dict) and "labels" in entry else len(entry)
        for entry in silver.values()
    ]
    print(f"\n{name}")
    print(f"  Documents: {len(counts)}")
    if not counts:
        # np.min / np.max raise ValueError on an empty sequence, and np.mean
        # returns NaN with a warning; report the empty case explicitly.
        print("  No documents to summarise.")
        return
    print(f"  Avg labels/doc: {np.mean(counts):.2f}")
    print(f"  Min labels: {np.min(counts)}")
    print(f"  Max labels: {np.max(counts)}")

def hierarchy_consistency(silver, hierarchy):
    """Measure how often an assigned child label comes with its parent.

    For every document and every (parent, child) pair in ``hierarchy``, a pair
    is counted when the child is among the document's labels, and counted as
    satisfied when the parent is among them too.

    Args:
        silver: Mapping of document id to an iterable of labels.
        hierarchy: Mapping of parent label to an iterable of child labels.

    Returns:
        satisfied / counted as a float, or ``0`` when no pair was counted.
    """
    # Flatten the hierarchy once; each entry is one parent->child edge.
    edges = [(parent, child) for parent, kids in hierarchy.items() for child in kids]

    satisfied = 0
    checked = 0
    for doc_labels in silver.values():
        present = set(doc_labels)
        hits = [parent in present for parent, child in edges if child in present]
        checked += len(hits)
        satisfied += sum(hits)

    if checked > 0:
        return satisfied / checked
    return 0

def count_present_classes(silver, total_classes=531):
    """Count and report the distinct labels that occur anywhere in ``silver``.

    Args:
        silver: Mapping of document id to an iterable of labels.
        total_classes: Size of the label space, used only for the printed ratio.

    Returns:
        Number of distinct labels found.
    """
    seen = set()
    for doc_labels in silver.values():
        seen.update(doc_labels)

    n_present = len(seen)
    pct = n_present / total_classes * 100

    print(f"Present classes: {n_present}/{total_classes} ({pct:.2f}%)")
    return n_present

def analyze_coverage(silver, name, total_classes=531):
    """Print class coverage and the five most frequent labels of ``silver``.

    Args:
        silver: Mapping of document id to an iterable of labels.
        name: Heading printed above the report.
        total_classes: Size of the label space used as the coverage
            denominator. Defaults to 531, the value that was previously
            hard-coded, so existing calls print the same report.

    Returns:
        None. Output goes to stdout.
    """
    counter = Counter(label for labels in silver.values() for label in labels)
    unique = len(counter)

    print(f"\n{name}:")
    print(f"  Coverage: {unique}/{total_classes} ({unique/total_classes*100:.1f}%)")
    print(f"  Top-5 most frequent:")
    for cls, count in counter.most_common(5):
        # Percentage is relative to the number of documents, not of labels.
        print(f"    Class {cls}: {count} times ({count/len(silver)*100:.1f}%)")

In [5]:
def expand_with_hierarchy(core_labels, hierarchy):
    """Add every ancestor of ``core_labels`` and keep the three lowest ids.

    Args:
        core_labels: Iterable of hashable class ids to start from.
        hierarchy: Mapping of parent id to a collection of child ids.

    Returns:
        The sorted union of ``core_labels`` and all their ancestors, truncated
        to its first three entries.
    """
    # Invert {parent: [children]} once so an ancestor lookup is a dict access
    # instead of a scan over the whole hierarchy for every label.
    parents_of = {}
    for parent, children in hierarchy.items():
        for child in children:
            parents_of.setdefault(child, []).append(parent)

    # Walk upwards iteratively; `expanded` doubles as the visited set, so a
    # cyclic hierarchy terminates instead of recursing without bound.
    expanded = set(core_labels)
    pending = list(expanded)
    while pending:
        label = pending.pop()
        for parent in parents_of.get(label, ()):
            if parent not in expanded:
                expanded.add(parent)
                pending.append(parent)

    # NOTE(review): the original comment here read "keep 3 more specific", but
    # the slice keeps the three *lowest ids*. If parents carry lower ids than
    # their children this favours ancestors over the core labels - confirm
    # which behaviour is intended before changing it.
    return sorted(expanded)[:3]

In [6]:
# Sentence-embedding checkpoint; the same model encodes both the review texts
# and the category descriptions further down.
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
print(f"üß† Loading model: {model_name}")
# Instantiated once and placed on `device` (chosen in the first cell).
model = SentenceTransformer(model_name, device=device)

def get_embeddings(texts, model, batch_size=64, save_path=None, force_recompute=False):
    """Encode ``texts`` with ``model``, optionally caching the result on disk.

    Args:
        texts: Sequence of strings to encode.
        model: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=..., convert_to_numpy=...,
            normalize_embeddings=...)``.
        batch_size: Batch size forwarded to ``model.encode``.
        save_path: Optional cache file. When it exists and
            ``force_recompute`` is false, it is loaded instead of encoding;
            otherwise the fresh embeddings are written to it.
        force_recompute: Ignore an existing cache file and encode again.

    Returns:
        Whatever ``model.encode`` returned, or the object stored in the cache
        file when the cache is used.
    """
    if save_path and os.path.exists(save_path) and not force_recompute:
        print(f"üì¶ Loading precomputed embeddings from {save_path}")
        # NOTE(review): the cache is written below with torch.save on the raw
        # encode() result, i.e. a pickle of a non-tensor object. Recent
        # PyTorch releases default torch.load to weights_only=True, which may
        # reject such files - confirm against the installed version. Only
        # load cache files you produced yourself.
        return torch.load(save_path, map_location="cpu")

    print(f"‚öôÔ∏è Encoding {len(texts)} texts...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    if save_path:
        # dirname() is "" for a bare filename and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        cache_dir = os.path.dirname(save_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        torch.save(embeddings, save_path)
        print(f"üíæ Saved embeddings to {save_path}")

    return embeddings


def get_enriched_category_text(class_id, id2class, class2related, max_keywords=10):
    """Build the text used to embed a class: its name plus related keywords.

    Args:
        class_id: Class identifier; it is converted with ``str`` before the
            lookup in ``id2class``.
        id2class: Mapping of string class id to class name.
        class2related: Mapping of class name to a list of keywords.
        max_keywords: Maximum number of keywords appended.

    Returns:
        The class name with underscores turned into spaces. When the class has
        an entry in ``class2related``, a space and the space-joined keywords
        are appended (so an empty keyword list leaves a trailing space).
    """
    raw_name = id2class[str(class_id)]

    # Underscores separate words in the raw class names.
    readable = raw_name.replace('_', ' ')

    if raw_name not in class2related:
        return readable

    top_keywords = class2related[raw_name][:max_keywords]
    return readable + " " + " ".join(top_keywords)


def generate_silver_labels_FAST(
    review_texts,
    review_ids,
    id2class,
    class2related,
    tokenizer,
    model,
    class_hierarchy,
    output_path="Silver/silver_train_true.json",
    num_classes=531,
):
    """Assign up to three silver labels per review by embedding similarity.

    Category descriptions (class name + keywords) and review texts are
    embedded with ``model``; each review's similarity to every category is the
    dot product of the two embeddings. The three most similar classes are
    expanded with their ancestors via ``expand_with_hierarchy`` and the
    resulting (at most three) labels are stored together with their
    similarity scores.

    Side effects: embeddings are always recomputed and written to
    ``Embeddings/labels_true.pt`` and ``Embeddings/X_train_true.pt``; the
    labels are written as JSON to ``output_path``.

    Args:
        review_texts: Sequence of review strings.
        review_ids: Sequence of ids, aligned with ``review_texts``.
        id2class: Mapping of string class id to class name.
        class2related: Mapping of class name to a list of keywords.
        tokenizer: Unused; kept so existing positional calls keep working.
        model: Encoder passed through to ``get_embeddings``.
        class_hierarchy: Mapping of parent class id to its child class ids.
        output_path: Destination of the JSON dump.
        num_classes: Number of classes; ids ``0 .. num_classes - 1`` are
            looked up in ``id2class``. Defaults to the previously hard-coded 531.

    Returns:
        dict mapping each review id to
        ``{"labels": [...], "scores": [...], "probs": [...]}`` where
        ``scores[i]`` is the similarity of ``labels[i]`` and ``probs[i]`` is
        that score rescaled with ``(score + 1) / 2``.

    Raises:
        ValueError: If ``review_texts`` and ``review_ids`` differ in length.
    """
    if len(review_texts) != len(review_ids):
        raise ValueError(
            f"review_texts ({len(review_texts)}) and review_ids "
            f"({len(review_ids)}) must have the same length"
        )

    # Create enriched category descriptions
    enriched_categories = [
        get_enriched_category_text(i, id2class, class2related)
        for i in tqdm(range(num_classes), desc="Enriching")
    ]

    category_embeddings = get_embeddings(
        enriched_categories,
        model = model,
        batch_size=64,
        save_path="Embeddings/labels_true.pt",
        force_recompute=True
    )

    review_embeddings = get_embeddings(
        review_texts,
        model = model,
        batch_size=64,
        save_path="Embeddings/X_train_true.pt",
        force_recompute=True
    )

    def to_numpy(x):
        """Coerce a tensor, ndarray or list of embeddings to an ndarray."""
        if isinstance(x, torch.Tensor):
            return x.detach().cpu().numpy()
        elif isinstance(x, np.ndarray):
            return x
        elif isinstance(x, list):
            return np.array(x)
        else:
            raise TypeError(f"Unsupported type for embeddings: {type(x)}")

    category_embeddings_np = to_numpy(category_embeddings)
    review_embeddings_np = to_numpy(review_embeddings)

    # (n_reviews, n_classes) matrix of dot products.
    all_similarities = torch.matmul(
        torch.tensor(review_embeddings_np),
        torch.tensor(category_embeddings_np).T
    ).cpu().numpy()

    # Assign labels
    silver_labels = {}

    for idx, review_id in enumerate(tqdm(review_ids, desc="Labels")):
        similarities = all_similarities[idx]

        # Top-3 classes by similarity, best first
        topk_idx = np.argsort(similarities)[-3:][::-1]

        # Expansion with hierarchy (returns at most three ids, sorted by id)
        labels = expand_with_hierarchy(topk_idx.tolist(), class_hierarchy)[:3]

        # Read the scores for the labels that are actually stored. Taking
        # them from topk_idx instead would pair each stored label with the
        # score of a different class whenever the expansion swaps or
        # reorders labels.
        label_scores = similarities[np.asarray(labels, dtype=int)]

        # Normalized pseudo-probabilities (0-1)
        pseudo_probs = ((label_scores + 1) / 2).tolist()

        # Save everything
        silver_labels[review_id] = {
            "labels": labels,
            "scores": label_scores.tolist(),
            "probs": pseudo_probs
        }

    # Save as JSON; dirname() is "" for a bare filename and os.makedirs("")
    # raises, so only create the directory when one is given.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(silver_labels, f, indent=2, ensure_ascii=False)

    print(f"Silver labels saved to: {output_path}")
    return silver_labels


üß† Loading model: paraphrase-multilingual-MiniLM-L12-v2


In [None]:
# Exec: build silver labels for the training corpus, cache the test
# embeddings, then summarise the labels.

print("\n" + "="*50)
print("GENERATING TRAIN SILVER LABELS (FAST)")
print("="*50)

silver_train_safe = generate_silver_labels_FAST(
    list(id2text_train.values()),
    list(id2text_train.keys()),
    id2class,
    class2related,
    None,  # tokenizer: not used by the generator
    model,
    class2hierarchy,
    output_path = "Silver/silver_train_true.json"
)

# Called only for its side effect of writing the test-set embedding cache;
# the returned array is not kept.
get_embeddings(list(id2text_test.values()), model=model, batch_size=64, save_path="Embeddings/X_test_true.pt", force_recompute=True)

# Stats
print()
# Each value of silver_train_safe is a {"labels", "scores", "probs"} record,
# so its len() is always 3 (the number of keys). Pass the label lists so the
# statistics count labels.
label_stats(
    "Safe Train",
    {pid: info["labels"] for pid, info in silver_train_safe.items()},
)


GENERATING TRAIN SILVER LABELS (FAST)


Enriching: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:00<00:00, 1299402.23it/s]


‚öôÔ∏è Encoding 531 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:00<00:00, 10.87it/s]


üíæ Saved embeddings to Embeddings/labels_true.pt
‚öôÔ∏è Encoding 29487 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461/461 [01:15<00:00,  6.07it/s]


üíæ Saved embeddings to Embeddings/X_train_true.pt


Labels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29487/29487 [00:02<00:00, 13364.99it/s]


Silver labels saved to: Silver/silver_train_true.json
‚öôÔ∏è Encoding 19658 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 308/308 [00:55<00:00,  5.53it/s]


üíæ Saved embeddings to Embeddings/X_test_noam.pt


Safe Train
  Documents: 29487
  Avg labels/doc: 3.00
  Min labels: 3
  Max labels: 3


In [8]:
# Reduce each per-review record to its label list, giving the
# {review_id: [labels]} shape that the hierarchy/coverage helpers iterate over.
silver_train_labels_only = {
    pid: info["labels"]
    for pid, info in silver_train_safe.items()
}

# Fraction of (parent, child) pairs where the parent is also assigned
# whenever the child is.
consistency = hierarchy_consistency(silver_train_labels_only, class2hierarchy)
print(f"\nHierarchy Consistency: {consistency:.2%}")


Hierarchy Consistency: 98.96%


In [9]:
def label_coverage(silver_labels, num_classes=531):
    """Compute which classes in ``[0, num_classes)`` are used at least once.

    Args:
        silver_labels: ``{review_id: [label1, label2, ...]}``.
        num_classes: Size of the label space; labels outside
            ``0 <= label < num_classes`` are ignored.

    Returns:
        ``(coverage_ratio, covered_classes)`` where ``coverage_ratio`` is the
        number of covered classes divided by ``num_classes`` and
        ``covered_classes`` is the sorted list of covered class ids.
    """
    in_range = {
        label
        for doc_labels in silver_labels.values()
        for label in doc_labels
        if 0 <= label < num_classes
    }

    ratio = len(in_range) / num_classes
    ordered = sorted(in_range)
    return ratio, ordered

# Coverage of the silver training labels over the label space
# (label_coverage's default of 531 classes).
coverage, classes = label_coverage(silver_train_labels_only)
print(f"Coverage: {coverage:.2%}")
# The denominator is written out literally and must match the default above.
print(f"Covered classes: {len(classes)}/{531}")

Coverage: 50.47%
Covered classes: 268/531


# TODO: run this on test + train, and build the label embeddings using the hierarchy