In [1]:
# ─── 0) Imports ────────────────────────────────────────────────────────────
import sys
!{sys.executable} -m pip install transformers datasets evaluate --quiet

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from transformers import (AutoTokenizer, RobertaModel,
                          TrainingArguments, Trainer)
from sklearn.metrics import f1_score, accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())


[2025-04-24 14:24:03,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to xpu (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
2025-04-24 14:24:11,617 - datasets - INFO - PyTorch version 2.6.0+xpu available.


In [2]:
# ✅ Use Intel GPU if available
device = torch.device("xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu")
print("Using device:", device)

Using device: xpu


In [3]:
# ─── 1) FocalLoss Definition ─────────────────────────────────────────────
class FocalLoss(nn.Module):
    def __init__(self, alpha=1.0, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        BCE = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        p_t = torch.exp(-BCE)
        loss = self.alpha * (1 - p_t) ** self.gamma * BCE
        return loss.mean() if self.reduction=="mean" else loss.sum()

In [4]:
# ─── 2) Load & preprocess dataset ────────────────────────────────────────
dataset = load_dataset("go_emotions", "simplified")

# 📝 Grab and print the human-readable label names before we overwrite them
emotion_labels = dataset["train"].features["labels"].feature.names
print(f"GoEmotions has {len(emotion_labels)} labels: {emotion_labels}")

# Now convert each example’s label-list into a 28-dim multi-hot vector
import numpy as np

def make_multihot(example):
    mh = np.zeros(len(emotion_labels), dtype=np.int64)
    for lbl in example["labels"]:
        mh[lbl] = 1
    example["labels"] = mh
    return example

# map example-by-example so we can read 'labels' as a list of ints
dataset = dataset.map(make_multihot, batched=False)

GoEmotions has 28 labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [5]:
# ─── 3) Compute per-label pos_weight & move to device ────────────────────
all_labels = np.stack(dataset["train"]["labels"])
pos_weight = (all_labels.shape[0] - all_labels.sum(axis=0)) / (all_labels.sum(axis=0) + 1e-12)
pos_weight = torch.tensor(pos_weight, dtype=torch.float, device=device)

In [6]:
# ─── 4) Tokenize ─────────────────────────────────────────────────────────
checkpoint = "roberta-base"
tokenizer  = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(ex):
    return tokenizer(ex["text"],
                     truncation=True,
                     padding="max_length",
                     max_length=128)

tokenized = dataset.map(tokenize_fn, batched=True)
tokenized.set_format(type="torch",
                     columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

In [7]:
# ─── 5) Threshold‐finding & compute_metrics ───────────────────────────────
def find_best_threshold(logits, labels):
    best, best_thr = 0, 0.5
    probs = torch.sigmoid(torch.tensor(logits))
    for thr in np.linspace(0.1, 0.9, 17):
        preds = (probs > thr).cpu().numpy().astype(int)
        f1 = f1_score(labels, preds, average="micro", zero_division=0)
        if f1 > best:
            best, best_thr = f1, thr
    return best_thr

optimal_threshold = None

def compute_metrics(eval_pred):
    global optimal_threshold
    logits, labels = eval_pred

    if optimal_threshold is None:
        optimal_threshold = find_best_threshold(logits, labels)

    probs = 1 / (1 + np.exp(-logits))
    preds = (probs > optimal_threshold).astype(int)

    micro = f1_score(labels, preds, average="micro", zero_division=0)
    hamming = accuracy_score(labels.flatten(), preds.flatten())
    return {"micro_f1": micro, "hamming_acc": hamming, "threshold": optimal_threshold}

In [8]:
# ─── 6) Model Definition ─────────────────────────────────────────────────
class RobertaForMultiLabel(nn.Module):
    def __init__(self, num_labels=28):
        super().__init__()
        self.roberta    = RobertaModel.from_pretrained(checkpoint).to(device)
        self.dropout    = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)
        # switch to FocalLoss or BCEWithLogitsLoss(pos_weight=pos_weight)
        self.loss_fct   = FocalLoss(alpha=1.0, gamma=2.0)
        # self.loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)[1]
        logits  = self.classifier(self.dropout(outputs))
        loss    = self.loss_fct(logits, labels.float().to(device)) if labels is not None else None
        return {"loss": loss, "logits": logits}

In [11]:
# ─── 7) Training Arguments & Trainer ─────────────────────────────────────
training_args = TrainingArguments(
    output_dir="goemotions_multilabel_model",
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    logging_dir="./logs",
    remove_unused_columns=False,
)

trainer = Trainer(
    model=RobertaForMultiLabel(num_labels=28),
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    compute_metrics=compute_metrics,
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# ─── 8) Train ────────────────────────────────────────────────────────────
trainer.train()

2025-04-24 14:58:06,297 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd


Epoch,Training Loss,Validation Loss,Micro F1,Hamming Acc,Threshold
1,0.0245,0.022553,0.600081,0.96751,0.45
2,0.0215,0.021763,0.6148,0.968622,0.45


TrainOutput(global_step=10854, training_loss=0.024539222413276208, metrics={'train_runtime': 804.9374, 'train_samples_per_second': 107.859, 'train_steps_per_second': 13.484, 'total_flos': 0.0, 'train_loss': 0.024539222413276208, 'epoch': 2.0})

In [13]:
best_ckpt = trainer.state.best_model_checkpoint
print("Best ckpt:", best_ckpt)

Best ckpt: goemotions_multilabel_model/checkpoint-10854
