In [None]:
import os
import json
from pathlib import Path
from tqdm import tqdm
import torch
from transformers import RobertaModel, RobertaTokenizer
import random
import numpy as np

# --- Default paths ---
ROOT = Path("Amazon_products")                 # Root directory
TRAIN_DIR = ROOT / "train"
TEST_DIR = ROOT / "test"
CLASS_PATH = ROOT / "classes.txt"

SILVER_DIR = Path("Silver")
MODEL_PATH = "Embeddings/roberta_student.pt"             # Fine-tuned model path
OUTPUT_PATH = SILVER_DIR / "pseudo_labels_roberta.json"

# --- Constants ---
NUM_CLASSES = 531
MAX_LABELS = 3
CONFIDENCE_THRESHOLD = 0.65
CONFIDENCE_MIN = 0.55


random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [4]:
def load_corpus(path):
    """Load corpus into {id: text} dictionary."""
    id2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                id2text[int(pid)] = text
    return id2text

# Load training data
TRAIN_CORPUS_PATH = TRAIN_DIR / "train_corpus.txt"
id2text_train = load_corpus(TRAIN_CORPUS_PATH)

# Load label names
id2label = {}
with open(CLASS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) == 2:
            cid, cname = parts
            id2label[int(cid)] = cname.strip()

print(f"Train samples: {len(id2text_train)}")
print(f"Number of classes: {len(id2label)}")


Train samples: 29487
Number of classes: 531


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

class SimpleRobertaClassifier(torch.nn.Module):
    """A small RoBERTa classifier for multi-label prediction."""
    def __init__(self, num_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.dropout = torch.nn.Dropout(0.2)
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls = outputs.last_hidden_state[:, 0, :]   # CLS token representation
        x = self.dropout(cls)
        return self.fc(x)

# Load tokenizer + model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = SimpleRobertaClassifier(NUM_CLASSES).to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SimpleRobertaClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [6]:
pseudo_labels = {}
pseudo_scores = {}

with torch.no_grad():
    for pid, text in tqdm(id2text_train.items(), desc="Predicting"):
        enc = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=256,
            return_tensors="pt"
        )
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits[0]).cpu().numpy()

        # sort by descending confidence
        top_idx = probs.argsort()[::-1]
        selected_labels, selected_scores = [], []

        for idx in top_idx:
            if len(selected_labels) >= MAX_LABELS:
                break
            if probs[idx] > CONFIDENCE_THRESHOLD:
                selected_labels.append(int(idx))
                selected_scores.append(float(probs[idx]))

        if len(selected_labels) > 0:
            pseudo_labels[pid] = selected_labels
            pseudo_scores[pid] = selected_scores

# Filter by minimum top-1 confidence
pid2labelids_pseudo_filtered = {
    pid: labels
    for pid, labels in pseudo_labels.items()
    if pseudo_scores[pid][0] > CONFIDENCE_MIN
}

Predicting: 100%|██████████| 29487/29487 [15:29<00:00, 31.71it/s]


In [8]:
print("\nStatistics")
print(f"Total products: {len(id2text_train)}")
print(f"With pseudo-labels (conf > {CONFIDENCE_THRESHOLD}): {len(pseudo_labels)}")
print(f"High-confidence (top-1 > {CONFIDENCE_MIN}): {len(pid2labelids_pseudo_filtered)}")
print(f"Unlabeled: {len(id2text_train) - len(pid2labelids_pseudo_filtered)}")

print("\nSample predictions:")
for pid in list(pid2labelids_pseudo_filtered.keys())[:5]:
    labels = pid2labelids_pseudo_filtered[pid]
    scores = pseudo_scores[pid]
    print(f"\n{pid}: {id2text_train[pid][:100]}...")
    for lab_id, score in zip(labels, scores):
        print(f"   → {lab_id} | {id2label[lab_id]} (score={score:.3f})")


os.makedirs(SILVER_DIR, exist_ok=True)
output = {
    "pseudo_labels_all": pseudo_labels,
    "pseudo_scores": pseudo_scores,
    "pseudo_labels_filtered": pid2labelids_pseudo_filtered,
    "stats": {
        "total": len(id2text_train),
        "with_pseudo": len(pseudo_labels),
        "high_conf": len(pid2labelids_pseudo_filtered),
        "unlabeled": len(id2text_train) - len(pid2labelids_pseudo_filtered)
    }
}

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print(f"\nSaved to {OUTPUT_PATH}")


Statistics
Total products: 29487
With pseudo-labels (conf > 0.65): 22547
High-confidence (top-1 > 0.55): 22547
Unlabeled: 6940

Sample predictions:

0: omron hem 790it automatic blood pressure monitor with advanced omron health management software so f...
   → 74 | medical_supplies_equipment (score=0.823)
   → 23 | health_personal_care (score=0.815)
   → 118 | health_monitors (score=0.811)

1: natural factors whey factors chocolate works well , but there is a lot of dead space in the containe...
   → 265 | candy_chocolate (score=0.748)

2: clif bar builder 's bar , 2 . 4 ounce bars i love the peanut butter builder 's bars . while amazon i...
   → 271 | snack_food (score=0.752)
   → 265 | candy_chocolate (score=0.700)

3: andis 1875 watt professional ceramic ionic hair dryer i was a little hesitant to purchase since it w...
   → 64 | hair_care (score=0.961)
   → 10 | beauty (score=0.812)

4: clif bar energy bars these were cheaper than what i had bought at sam 's and worked very well .

In [11]:
import json
import random

# --- Load silver labels ---
with open("Silver/hier.json", "r", encoding="utf-8") as f:
    silver_data = json.load(f)
silver_labels = silver_data["silver_hierarchy"]
print(f" Silver labels loaded: {len(silver_labels)} items")

# --- Load pseudo labels (only confident ones) ---
with open("Silver/pseudo_labels_roberta.json", "r", encoding="utf-8") as f:
    pseudo_data = json.load(f)
pseudo_labels = pseudo_data["pseudo_labels_filtered"]
print(f"  → Pseudo labels (confident) loaded: {len(pseudo_labels)} items")
pseudo_labels = {int(k): v for k, v in pseudo_labels.items()}


# --- Parameters ---
MIN_LABELS = 2
MAX_LABELS = 3

merged = {}
count_common, count_union, count_added = 0, 0, 0

for pid_str, silver_labs in silver_labels.items():
    pid = int(pid_str)

    # Only merge if this product has confident pseudo-labels
    if pid not in pseudo_labels:
        continue  # skip unlabeled products

    silver_set = set(silver_labs)
    pseudo_set = set(pseudo_labels[pid])

    # Intersection (most reliable overlap)
    common = list(silver_set & pseudo_set)

    if len(common) >= MIN_LABELS:
        merged[pid] = common[:MAX_LABELS]
        count_common += 1
    else:
        # Otherwise combine both lists (union) and limit
        combined = list(silver_set | pseudo_set)
        random.shuffle(combined)
        merged[pid] = combined[:MAX_LABELS]
        count_union += 1

# Also add confident pseudo-only products (not in silver)
for pid, labels in pseudo_labels.items():
    if str(pid) not in silver_labels:
        if len(labels) >= MIN_LABELS:
            merged[pid] = labels[:MAX_LABELS]
            count_added += 1

print("\nMerge complete!")
print(f"Docs merged via common labels : {count_common}")
print(f"Docs merged via union         : {count_union}")
print(f"Docs added only from pseudo   : {count_added}")
print(f"Total merged docs             : {len(merged)}")

# --- Stats ---
lens = [len(v) for v in merged.values()]
print(f"\nStats on merged dataset:")
print(f"  Min labels per doc : {min(lens)}")
print(f"  Max labels per doc : {max(lens)}")
print(f"  Avg labels per doc : {sum(lens)/len(lens):.2f}")

# --- Show examples ---
print("\nSample merged entries:")
for i, (pid, labs) in enumerate(list(merged.items())[:5]):
    print(f"  Product {pid} → labels: {labs}")
print("  ...")

# --- Save ---
output_path = "Silver/train_merged_for_selftraining.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, indent=2)

print(f"\nSaved merged dataset to: {output_path}")

# --- Unlabeled count ---
unlabeled_count = len(silver_labels) - len(merged)
print(f"\nUnlabeled products kept aside: {unlabeled_count}")


 Silver labels loaded: 29487 items
  → Pseudo labels (confident) loaded: 22547 items

Merge complete!
Docs merged via common labels : 8750
Docs merged via union         : 13797
Docs added only from pseudo   : 0
Total merged docs             : 22547

Stats on merged dataset:
  Min labels per doc : 2
  Max labels per doc : 3
  Avg labels per doc : 2.74

Sample merged entries:
  Product 0 → labels: [74, 118, 23]
  Product 1 → labels: [218, 269, 265]
  Product 2 → labels: [265, 271]
  Product 3 → labels: [64, 10]
  Product 4 → labels: [429, 271, 26]
  ...

Saved merged dataset to: Silver/train_merged_for_selftraining.json

Unlabeled products kept aside: 6940
