# Domain adaptation exp1 – distill to 9 emotions

## Environment setup

In [1]:
!pip install -q -U transformers datasets accelerate evaluate scikit-learn


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/511.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m154.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25h

## Mount Google Drive

## Imports

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Seeds and target labels

In [3]:
import os, json, random
from pathlib import Path
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import f1_score, classification_report


## Load distillation JSONL data

In [4]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

TARGET_EMOTIONS = [
    "anger", "anticipation", "caring", "disgust", "fear",
    "joy", "neutral", "sadness", "surprise"
]
label2id = {l:i for i,l in enumerate(TARGET_EMOTIONS)}
id2label = {i:l for l,i in label2id.items()}
NUM_LABELS = len(TARGET_EMOTIONS)

# ---- Paths (edit if needed) ----
BASE_DIR = Path("/content/drive/MyDrive/VibeQ-EIE/llmdata")
JOURNAL_JSONL = BASE_DIR / "teacher_journals_distillation.jsonl"
ISEAR_JSONL   = BASE_DIR / "teacher_isear_distillation.jsonl"

# Your already-trained 9-class softmax student (from Notebook 1)
STUDENT_INIT_PATH = "/content/drive/MyDrive/VibeQ-EIE/models/student_singlelabel_9emotions_v1"

OUT_DIR = Path("/content/drive/MyDrive/VibeQ-EIE/models/student_distilled_9_v1")
OUT_DIR.mkdir(parents=True, exist_ok=True)

MAX_LENGTH = 192


## Original label vocabularies

In [5]:
def read_jsonl(path: Path):
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

journals = read_jsonl(JOURNAL_JSONL)
isear    = read_jsonl(ISEAR_JSONL)

print("journals:", len(journals))
print("isear:", len(isear))
print("example keys:", journals[0].keys())


def row_to_example(row, source_name: str):
    text = row["text"]
    pe = row["primary_emotion"].strip().lower()
    if pe not in label2id:
        return None

    probs = row["teacher_emotion_probs"]
    # enforce order
    teacher_probs = [float(probs[e]) for e in TARGET_EMOTIONS]
    teacher_probs = np.array(teacher_probs, dtype=np.float32)
    # normalize safety
    s = teacher_probs.sum()
    if s <= 0:
        teacher_probs[:] = 1.0 / NUM_LABELS
    else:
        teacher_probs /= s

    conf = float(row.get("teacher_confidence", 1.0))
    conf = max(0.0, min(1.0, conf))

    return {
        "text": text,
        "hard_label": int(label2id[pe]),          # teacher primary (optional CE)
        "teacher_probs": teacher_probs.tolist(), # KL target
        "teacher_conf": conf,
        "source": source_name,
        "is_anchor": 0,
    }

teach_rows = []
for r in journals:
    ex = row_to_example(r, "teacher_journal")
    if ex: teach_rows.append(ex)

for r in isear:
    ex = row_to_example(r, "teacher_isear")
    if ex: teach_rows.append(ex)

teacher_ds = Dataset.from_list(teach_rows)
teacher_ds



journals: 1800
isear: 1963
example keys: dict_keys(['primary_emotion', 'secondary_emotions', 'teacher_emotion_probs', 'vad', 'teacher_confidence', 'text'])


Dataset({
    features: ['text', 'hard_label', 'teacher_probs', 'teacher_conf', 'source', 'is_anchor'],
    num_rows: 3763
})

## Merge and split datasets

In [6]:
# GoEmotions original label names (order)
orig_names = [
    "admiration","amusement","anger","annoyance","approval","caring",
    "confusion","curiosity","desire","disappointment","disapproval",
    "disgust","embarrassment","excitement","fear","gratitude","grief",
    "joy","love","nervousness","optimism","pride","realization",
    "relief","remorse","sadness","surprise","neutral"
]
orig_id2label = {i:n for i,n in enumerate(orig_names)}

EMOTION_MAP = {
    "anger": "anger", "annoyance": "anger", "disapproval": "anger",
    "optimism": "anticipation", "curiosity": "anticipation", "desire": "anticipation",
    "caring": "caring", "love": "caring", "admiration": "caring", "gratitude": "caring", "approval": "caring",
    "disgust": "disgust",
    "fear": "fear", "nervousness": "fear",
    "joy": "joy", "excitement": "joy", "amusement": "joy", "pride": "joy", "relief": "joy",
    "neutral": "neutral",
    "sadness": "sadness", "disappointment": "sadness", "grief": "sadness", "remorse": "sadness", "embarrassment": "sadness",
    "surprise": "surprise", "confusion": "surprise", "realization": "surprise"
}

# choose anchor size
ANCHOR_N = 12000  # adjust (A100 can do more)
go = load_dataset("go_emotions", "simplified")

# remove unclear if present
for split in ["train", "validation", "test"]:
    if "example_very_unclear" in go[split].column_names:
        go[split] = go[split].filter(lambda x: x["example_very_unclear"] == 0)

go_train = go["train"].shuffle(seed=SEED).select(range(min(ANCHOR_N, len(go["train"]))))

def map_go_to_9(example):
    # multiple labels -> pick ONE anchor label (primary) by mapping all and choosing first (or heuristic)
    # heuristic: map all old labels -> mapped labels; choose the first mapped label
    mapped = []
    for old_idx in example["labels"]:
        old_name = orig_id2label[int(old_idx)]
        if old_name in EMOTION_MAP:
            mapped.append(EMOTION_MAP[old_name])
    if not mapped:
        return {"keep": 0}
    # choose one mapped label (could also choose most frequent mapped in list)
    hard = label2id[mapped[0]]
    return {
        "text": example["text"],
        "hard_label": int(hard),
        "teacher_probs": [0.0]*NUM_LABELS,  # unused for anchor
        "teacher_conf": 0.0,
        "source": "anchor_go",
        "is_anchor": 1,
        "keep": 1
    }

anchor_tmp = go_train.map(map_go_to_9)
anchor_ds = anchor_tmp.filter(lambda x: x["keep"] == 1).remove_columns(["keep"])
anchor_ds


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'id', 'hard_label', 'teacher_probs', 'teacher_conf', 'source', 'is_anchor'],
    num_rows: 12000
})

## Tokenize text

In [7]:
full = concatenate_datasets([teacher_ds, anchor_ds]).shuffle(seed=SEED)

# Split
splits = full.train_test_split(test_size=0.1, seed=SEED)
train_ds = splits["train"]
val_ds   = splits["test"]

print("train:", len(train_ds), "val:", len(val_ds))
print(train_ds[0])


train: 14186 val: 1577
{'text': 'Thanks!', 'hard_label': 2, 'teacher_probs': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'teacher_conf': 0.0, 'source': 'anchor_go', 'is_anchor': 1, 'labels': [15], 'id': 'edr6vs7'}


## Custom distillation trainer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(STUDENT_INIT_PATH)

def tok(batch):
    enc = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )
    enc["labels"] = batch["hard_label"]                 # for CE
    enc["teacher_probs"] = batch["teacher_probs"]       # for KL
    enc["teacher_conf"] = batch["teacher_conf"]         # weight
    enc["is_anchor"] = batch["is_anchor"]               # mask
    return enc

train_tok = train_ds.map(tok, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(tok, batched=True, remove_columns=val_ds.column_names)

train_tok.set_format("torch")
val_tok.set_format("torch")

train_tok, val_tok


Map:   0%|          | 0/14186 [00:00<?, ? examples/s]

Map:   0%|          | 0/1577 [00:00<?, ? examples/s]

(Dataset({
     features: ['teacher_probs', 'teacher_conf', 'is_anchor', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 14186
 }),
 Dataset({
     features: ['teacher_probs', 'teacher_conf', 'is_anchor', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1577
 }))

## Evaluation metrics

In [9]:
class DistillTrainer(Trainer):
    def __init__(self, *args, T=2.0, alpha=0.7, lambda_hard_teacher=0.2, **kwargs):
        super().__init__(*args, **kwargs)
        self.T = float(T)
        self.alpha = float(alpha)
        self.lambda_hard_teacher = float(lambda_hard_teacher)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        labels = inputs.pop("labels")
        teacher_probs = inputs.pop("teacher_probs")
        teacher_conf  = inputs.pop("teacher_conf")
        is_anchor     = inputs.pop("is_anchor")

        outputs = model(**inputs)
        logits = outputs.logits

        # CE
        ce = F.cross_entropy(logits, labels)

        # KL (teacher rows only)
        teacher_mask = (is_anchor == 0).float()

        log_p_s = F.log_softmax(logits / self.T, dim=-1)
        p_t = teacher_probs.to(log_p_s.dtype)

        kl_per = F.kl_div(log_p_s, p_t, reduction="none").sum(dim=-1)

        w = teacher_conf.float().clamp(0, 1)
        kl_per = kl_per * w

        denom = teacher_mask.sum().clamp(min=1.0)
        kl = (kl_per * teacher_mask).sum() / denom
        kl = kl * (self.T ** 2)

        # optional CE on teacher primary (teacher rows only)
        ce_teacher = F.cross_entropy(logits, labels, reduction="none")
        ce_teacher = (ce_teacher * teacher_mask).sum() / denom

        loss = (1 - self.alpha) * ce + self.alpha * kl + self.lambda_hard_teacher * ce_teacher

        return (loss, outputs) if return_outputs else loss


## Initialize student model

In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    f1_macro = f1_score(labels, preds, average="macro", zero_division=0)
    f1_micro = f1_score(labels, preds, average="micro", zero_division=0)
    acc = (preds == labels).mean()

    return {"acc": acc, "f1_macro": f1_macro, "f1_micro": f1_micro}


## Trainer configuration

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(
    STUDENT_INIT_PATH,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir=str(OUT_DIR),
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,

    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    logging_steps=100,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    report_to="none",
    remove_unused_columns=False
)


## Validate model

In [16]:
from transformers import default_data_collator
trainer = DistillTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,

    # Distillation knobs
    T=2.0,
    alpha=0.7,
    lambda_hard_teacher=0.2
)

trainer.train()





Epoch,Training Loss,Validation Loss,Acc,F1 Macro,F1 Micro
1,0.8443,0.80068,0.797717,0.776704,0.797717
2,0.4992,0.671193,0.831959,0.818923,0.831959
3,0.4097,0.660803,0.830691,0.814973,0.830691
4,0.2806,0.626728,0.837666,0.823921,0.837666


TrainOutput(global_step=3548, training_loss=0.5790663746594039, metrics={'train_runtime': 1760.6105, 'train_samples_per_second': 32.23, 'train_steps_per_second': 2.015, 'total_flos': 1.9831170379355136e+16, 'train_loss': 0.5790663746594039, 'epoch': 4.0})

## Save distilled checkpoint

In [17]:
pred = trainer.predict(val_tok)
logits = pred.predictions
labels = pred.label_ids

preds = np.argmax(logits, axis=-1)
print("F1 macro:", f1_score(labels, preds, average="macro", zero_division=0))
print("F1 micro:", f1_score(labels, preds, average="micro", zero_division=0))
print("\nReport:\n", classification_report(labels, preds, target_names=TARGET_EMOTIONS, zero_division=0))


F1 macro: 0.8239208274410966
F1 micro: 0.8376664552948636

Report:
               precision    recall  f1-score   support

       anger       0.81      0.82      0.81       192
anticipation       0.78      0.81      0.79       113
      caring       0.84      0.89      0.86       338
     disgust       0.79      0.81      0.80        54
        fear       0.77      0.87      0.81        68
         joy       0.84      0.89      0.86       150
     neutral       0.90      0.81      0.85       416
     sadness       0.84      0.84      0.84       152
    surprise       0.81      0.74      0.78        94

    accuracy                           0.84      1577
   macro avg       0.82      0.83      0.82      1577
weighted avg       0.84      0.84      0.84      1577



In [18]:
trainer.save_model(str(OUT_DIR))
tokenizer.save_pretrained(str(OUT_DIR))
print("Saved to:", OUT_DIR)


Saved to: /content/drive/MyDrive/VibeQ-EIE/models/student_distilled_9_v1
