# Dataset

In [16]:
# Requirements:
#   pip install datasets transformers accelerate evaluate scikit-learn peft

from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import numpy as np
import evaluate
import os
from datetime import datetime

# Load dataset (HuffPost / News Category); the repo id is named so it is easy to swap.
DATASET_ID = "heegyu/news-category-dataset"
ds = load_dataset(DATASET_ID)

def join_cols(batch):
    """Add a "text" field made of the headline followed by the short description.

    Missing or falsy values for either field are treated as empty strings. The
    two parts are separated by a single space and the result is stripped of
    leading/trailing whitespace. The mapping is modified in place and returned.
    """
    parts = [batch.get(col, "") or "" for col in ("headline", "short_description")]
    batch["text"] = " ".join(parts).strip()
    return batch

# Add the combined "text" column.
ds = ds.map(join_cols)

# Label mapping: distinct category strings from the train split, sorted, are
# numbered 0..N-1; id2label is the inverse lookup.
label_names = sorted(set(ds["train"]["category"]))
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

def encode_label(batch):
    """Store the integer id of the example's category under "labels".

    Looks the category string up in the module-level ``label2id`` mapping
    (raises ``KeyError`` for an unknown category), writes the id into the
    mapping in place, and returns it.
    """
    category = batch["category"]
    batch["labels"] = label2id[category]
    return batch

# Attach the integer class id to every example.
ds = ds.map(encode_label)

# Cast AFTER the "labels" column exists so the stratified split below can use it.
# (ClassLabel is already imported at the top of this cell; no re-import needed.)
labels_cls = ClassLabel(names=label_names)
ds = ds.cast_column("labels", labels_cls)

print(ds["train"].features)  # sanity check: 'labels' should be ClassLabel

# Stratified train/val/test split (80/10/10) by 'labels'.
SPLIT_SEED = 42
tmp = ds["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED, stratify_by_column="labels")
valtest = tmp["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED, stratify_by_column="labels")

# Work on fixed-size subsets to keep training time short.
MAX_TRAIN_ROWS = 20000
MAX_EVAL_ROWS = 2000


def _first_rows(split, limit):
    """Return the first ``limit`` rows of ``split``, or all rows if it is smaller."""
    # min() keeps select() from raising when a split has fewer rows than requested.
    return split.select(range(min(limit, len(split))))


train_subset = _first_rows(tmp["train"], MAX_TRAIN_ROWS)
val_subset = _first_rows(valtest["train"], MAX_EVAL_ROWS)
test_subset = _first_rows(valtest["test"], MAX_EVAL_ROWS)

ds_splits = {"train": train_subset, "validation": val_subset, "test": test_subset}

print(ds_splits)

{'link': Value('string'), 'headline': Value('string'), 'category': Value('string'), 'short_description': Value('string'), 'authors': Value('string'), 'date': Value('timestamp[s]'), 'text': Value('string'), 'labels': ClassLabel(names=['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION', 'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'THE WORLDPOST', 'TRAVEL', 'U.S. NEWS', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST'])}
{'train': Dataset({
    features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'text', 'labels'],
    num_rows: 20000
}), 'validation': Dataset({
    features: ['link', 'headline', 'category', 'short_descript

# Preprocessing

In [None]:
# Tokenizer (DistilBERT base; generic)
checkpoint = "distilbert-base-uncased"
tok = AutoTokenizer.from_pretrained(checkpoint)


def tok_fn(batch):
    """Tokenize the "text" field of a batch, truncating to at most 256 tokens."""
    texts = batch["text"]
    return tok(texts, truncation=True, max_length=256)


# Drop every original column except the raw text and the label ids, then
# tokenize each split in batched mode.
keep_cols = ("text", "labels")
cols_to_remove = [col for col in ds_splits["train"].column_names if col not in keep_cols]
ds_tok = {
    split_name: split.map(tok_fn, batched=True, remove_columns=cols_to_remove)
    for split_name, split in ds_splits.items()
}

# Metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy plus micro- and macro-averaged F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of the logits. Returns a dict with the keys
    "accuracy", "f1_micro" and "f1_macro".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    scores = {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
    }
    for avg in ("micro", "macro"):
        scores[f"f1_{avg}"] = f1.compute(
            predictions=preds, references=labels, average=avg
        )["f1"]
    return scores

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

# Train Full Fine-tuning

In [27]:
# Model (generic, no prior news fine-tuning): a sequence-classification model
# loaded from the base checkpoint and sized to the label set.
num_labels = len(label_names)
head_config = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, **head_config)

# Training config: evaluate and checkpoint once per epoch, log every 200 steps,
# and reload the checkpoint with the best validation accuracy when training ends.
run_name = "huffpost-generic-distilbert"
args = TrainingArguments(
    output_dir=run_name,
    run_name=run_name,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)

# Pads each batch to the length of its longest example.
collator = DataCollatorWithPadding(tok)

trainer = Trainer(
    model=model,
    args=args,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers the FutureWarning
    # captured in this cell's output); `processing_class=` is its replacement.
    processing_class=tok,
    data_collator=collator,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,1.801,1.737012,0.565,0.565,0.247974


TrainOutput(global_step=1250, training_loss=2.173415771484375, metrics={'train_runtime': 149.6471, 'train_samples_per_second': 133.648, 'train_steps_per_second': 8.353, 'total_flos': 391794960991104.0, 'train_loss': 2.173415771484375, 'epoch': 1.0})

In [28]:
# Score the trained model on the held-out test split.
test_split = ds_tok["test"]
test_metrics = trainer.evaluate(eval_dataset=test_split)
print("Test metrics:", test_metrics)

Test metrics: {'eval_loss': 1.7709825038909912, 'eval_accuracy': 0.555, 'eval_f1_micro': 0.555, 'eval_f1_macro': 0.2527760201367594, 'eval_runtime': 4.7029, 'eval_samples_per_second': 425.266, 'eval_steps_per_second': 13.396, 'epoch': 1.0}


In [29]:
# Persist the fine-tuned model and tokenizer, plus a plain-text metrics file,
# all under the run directory.
os.makedirs(run_name, exist_ok=True)
trainer.save_model(run_name)
tok.save_pretrained(run_name)

metrics_path = os.path.join(run_name, "metrics.txt")
with open(metrics_path, "w") as f:
    f.write("\nTest:\n")
    f.writelines(f"{k}: {v}\n" for k, v in test_metrics.items())


from transformers import pipeline

# Smoke test: reload the saved model through a text-classification pipeline
# and print its prediction for two example headlines.
clf = pipeline("text-classification", model=run_name, tokenizer=tok, truncation=True)
for sample_text in (
    "NASA announces new Artemis mission milestone",
    "The stock market crashed today due to economic uncertainty.",
):
    print(clf(sample_text))

Device set to use cuda:0


[{'label': 'SCIENCE', 'score': 0.10476445406675339}]
[{'label': 'BUSINESS', 'score': 0.2333836406469345}]


# Train LoRA

In [44]:
from peft import LoraConfig, get_peft_model

# Invariant across ranks: label count and the dynamic-padding collator.
num_labels = len(label_names)
collator = DataCollatorWithPadding(tok)

for rank in [2, 8, 16]:
    print(f"\n=== Training LoRA rank {rank} ===")

    # Reload the base checkpoint on every iteration so runs do not share weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    # LoRA config: adapters on the modules named q_lin / k_lin / v_lin / out_lin.
    # lora_alpha stays at 16 while r varies between runs.
    lora_cfg = LoraConfig(
        r=rank,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],
        task_type="SEQ_CLS",
    )

    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()

    # Training config.
    # NOTE(review): this reuses the full fine-tuning learning rate (2e-5); adapter
    # methods are often trained with larger rates -- consider tuning before
    # comparing results across methods.
    run_name = f"huffpost-generic-distilbert-lora-r{rank}"
    args = TrainingArguments(
        output_dir=run_name,
        run_name=run_name,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=200,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=`
        # is its replacement.
        processing_class=tok,
        data_collator=collator,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate
    test_metrics = trainer.evaluate(ds_tok["test"])
    print(f"rank {rank}:", test_metrics)

    # Save model and tokenizer. The tokenizer variable in this notebook is `tok`;
    # the name `tokenizer` is never defined here, so the previous call only worked
    # through leftover kernel state and raises NameError on a fresh kernel.
    model.save_pretrained(run_name)
    tok.save_pretrained(run_name)



=== Training LoRA rank 2 ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 696,618 || all params: 67,682,388 || trainable%: 1.0292


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,2.5943,2.559246,0.373,0.373,0.078764


rank 2: {'eval_loss': 2.562145233154297, 'eval_accuracy': 0.382, 'eval_f1_micro': 0.382, 'eval_f1_macro': 0.08178060379841959, 'eval_runtime': 5.1587, 'eval_samples_per_second': 387.698, 'eval_steps_per_second': 12.212, 'epoch': 1.0}

=== Training LoRA rank 8 ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 917,802 || all params: 67,903,572 || trainable%: 1.3516


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,2.5738,2.538581,0.375,0.375,0.076739


rank 8: {'eval_loss': 2.5357024669647217, 'eval_accuracy': 0.3795, 'eval_f1_micro': 0.3795, 'eval_f1_macro': 0.0790130870054391, 'eval_runtime': 5.1401, 'eval_samples_per_second': 389.099, 'eval_steps_per_second': 12.257, 'epoch': 1.0}

=== Training LoRA rank 16 ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 1,212,714 || all params: 68,198,484 || trainable%: 1.7782


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,2.5766,2.542004,0.3735,0.3735,0.075045


rank 16: {'eval_loss': 2.53794002532959, 'eval_accuracy': 0.3805, 'eval_f1_micro': 0.3805, 'eval_f1_macro': 0.07921883093632665, 'eval_runtime': 5.0554, 'eval_samples_per_second': 395.617, 'eval_steps_per_second': 12.462, 'epoch': 1.0}


## Train IA3

In [46]:
from peft import IA3Config, get_peft_model

# Three IA3 variants that differ only in which modules receive adapters.
ia3_variants = {
    "small":  ["q_lin", "v_lin"],
    "medium": ["q_lin", "v_lin", "out_lin"],
    "full":   ["q_lin", "v_lin", "out_lin", "lin1", "lin2"],
}

# Invariant across variants: label count and the dynamic-padding collator.
num_labels = len(label_names)
collator = DataCollatorWithPadding(tok)

for name, target_mods in ia3_variants.items():
    print(f"\n=== Training IA3 ({name}) ===")

    # Reload the base checkpoint on every iteration so runs do not share weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    # Only list lin1 / lin2 as feed-forward modules when this variant actually
    # targets them, so feedforward_modules is always a subset of target_modules.
    feedforward_mods = [m for m in ["lin1", "lin2"] if m in target_mods]

    ia3_cfg = IA3Config(
        task_type="SEQ_CLS",
        target_modules=target_mods,
        feedforward_modules=feedforward_mods,
    )

    model = get_peft_model(model, ia3_cfg)
    model.print_trainable_parameters()

    # Training config.
    # NOTE(review): this reuses the full fine-tuning learning rate (2e-5); adapter
    # methods are often trained with larger rates -- consider tuning before
    # comparing results across methods.
    run_name = f"huffpost-generic-distilbert-ia3-{name}"
    args = TrainingArguments(
        output_dir=run_name,
        run_name=run_name,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=200,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=`
        # is its replacement.
        processing_class=tok,
        data_collator=collator,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    test_metrics = trainer.evaluate(ds_tok["test"])
    print(f"IA3-{name}:", test_metrics)

    # Save the adapter weights and the tokenizer under the run directory.
    model.save_pretrained(run_name)
    tok.save_pretrained(run_name)



=== Training IA3 (small) ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 632,106 || all params: 67,617,876 || trainable%: 0.9348


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,3.0424,2.994007,0.265,0.265,0.026468


IA3-small: {'eval_loss': 3.012336015701294, 'eval_accuracy': 0.2635, 'eval_f1_micro': 0.2635, 'eval_f1_macro': 0.027034833403383144, 'eval_runtime': 4.5488, 'eval_samples_per_second': 439.676, 'eval_steps_per_second': 13.85, 'epoch': 1.0}

=== Training IA3 (medium) ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 636,714 || all params: 67,622,484 || trainable%: 0.9416


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,3.0266,2.974749,0.2805,0.2805,0.029411


IA3-medium: {'eval_loss': 2.9899487495422363, 'eval_accuracy': 0.276, 'eval_f1_micro': 0.276, 'eval_f1_macro': 0.028922916254079084, 'eval_runtime': 4.6643, 'eval_samples_per_second': 428.785, 'eval_steps_per_second': 13.507, 'epoch': 1.0}

=== Training IA3 (full) ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 659,754 || all params: 67,645,524 || trainable%: 0.9753


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,2.9944,2.941963,0.295,0.295,0.030917


IA3-full: {'eval_loss': 2.9561524391174316, 'eval_accuracy': 0.283, 'eval_f1_micro': 0.283, 'eval_f1_macro': 0.02897736088359369, 'eval_runtime': 7.2178, 'eval_samples_per_second': 277.092, 'eval_steps_per_second': 8.728, 'epoch': 1.0}


## Train Prompt Tuning

In [49]:
from peft import PromptTuningConfig, get_peft_model

# DistilBERT specifics, passed explicitly to the prompt-tuning config below.
num_layers = 6             # DistilBERT has 6 transformer layers
token_dim = 768            # hidden size
num_attention_heads = 12   # DistilBERT has 12 attention heads

# Invariant across prompt lengths: label count and the dynamic-padding collator.
num_labels = len(label_names)
collator = DataCollatorWithPadding(tok)

for num_virtual_tokens in [10, 50, 100]:
    print(f"\n=== Training Prompt Tuning ({num_virtual_tokens} tokens) ===")

    # Reload the base checkpoint on every iteration so runs do not share weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    # Prompt tuning config.
    # NOTE(review): the model dimensions are given explicitly, presumably because
    # PEFT cannot infer them from DistilBERT's config field names -- confirm.
    prompt_cfg = PromptTuningConfig(
        task_type="SEQ_CLS",
        num_virtual_tokens=num_virtual_tokens,
        tokenizer_name_or_path=checkpoint,
        num_layers=num_layers,
        token_dim=token_dim,
        num_attention_heads=num_attention_heads,
    )

    # Wrap with PEFT
    model = get_peft_model(model, prompt_cfg)
    model.print_trainable_parameters()

    # Training config
    run_name = f"huffpost-generic-distilbert-prompt-{num_virtual_tokens}"
    args = TrainingArguments(
        output_dir=run_name,
        run_name=run_name,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=200,
        learning_rate=5e-5,             # slightly higher than LoRA/IA3
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=`
        # is its replacement.
        processing_class=tok,
        data_collator=collator,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    test_metrics = trainer.evaluate(ds_tok["test"])
    print(f"Prompt Tuning ({num_virtual_tokens} tokens):", test_metrics)

    # Save the learned prompt weights and the tokenizer under the run directory.
    model.save_pretrained(run_name)
    tok.save_pretrained(run_name)



=== Training Prompt Tuning (10 tokens) ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 630,570 || all params: 67,616,340 || trainable%: 0.9326


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,2.9033,2.856576,0.326,0.326,0.04831


Prompt Tuning (10 tokens): {'eval_loss': 2.8726015090942383, 'eval_accuracy': 0.327, 'eval_f1_micro': 0.327, 'eval_f1_macro': 0.052532646030350225, 'eval_runtime': 4.9885, 'eval_samples_per_second': 400.923, 'eval_steps_per_second': 12.629, 'epoch': 1.0}

=== Training Prompt Tuning (50 tokens) ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 661,290 || all params: 67,647,060 || trainable%: 0.9776


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,3.0817,3.038867,0.2435,0.2435,0.023913


Prompt Tuning (50 tokens): {'eval_loss': 3.056018590927124, 'eval_accuracy': 0.2485, 'eval_f1_micro': 0.2485, 'eval_f1_macro': 0.02583327723837385, 'eval_runtime': 6.843, 'eval_samples_per_second': 292.271, 'eval_steps_per_second': 9.207, 'epoch': 1.0}

=== Training Prompt Tuning (100 tokens) ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 699,690 || all params: 67,685,460 || trainable%: 1.0337


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,3.2652,3.234683,0.179,0.179,0.00723


Prompt Tuning (100 tokens): {'eval_loss': 3.261652946472168, 'eval_accuracy': 0.1745, 'eval_f1_micro': 0.1745, 'eval_f1_macro': 0.00707494577226378, 'eval_runtime': 11.614, 'eval_samples_per_second': 172.206, 'eval_steps_per_second': 5.424, 'epoch': 1.0}
