Compared DeBERTa and RoBERTa then tried an ensemble method to see if the both combined would be the best but it turns out DeBERTa performs the best

In [2]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed,
)
from datasets import load_dataset
from sklearn.metrics import f1_score

from sklearn.utils.class_weight import compute_class_weight

# Seed and reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Shared config
MAX_LEN = 512
BATCH_SIZE = 4
GRAD_ACCUMULATION = 4
LR = 1e-5
EPOCHS = 15

print("Loading QEvasion dataset...")
raw_ds = load_dataset("ailsntua/QEvasion")

def build_text(example):
    clarity = example.get("clarity_label") or "Unknown"
    text = (
        f"Context: {clarity} | "
        f"Question: {example['question']} "
        f"Answer: {example['interview_answer']}"
    )
    return {
        "text": text,
        "evasion_label": example["evasion_label"],
    }

print("Formatting training data...")
proc_train = raw_ds["train"].map(build_text)
proc_train = proc_train.class_encode_column("evasion_label")

# One stratified split used for both models
outer_split = proc_train.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label",
)
train_dev = outer_split["train"]
holdout = outer_split["test"]

inner_split = train_dev.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label",
)
base_train = inner_split["train"]
base_val = inner_split["test"]

label_names = base_train.features["evasion_label"].names
label2id = {label: i for i, label in enumerate(label_names)}
id2label = {i: label for label, i in label2id.items()}


def make_weighted_trainer(model_name, train_base, val_base, holdout_base, output_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(batch):
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
        )

    train_ds = train_base.map(tokenize, batched=True)
    val_ds = val_base.map(tokenize, batched=True)
    holdout_ds = holdout_base.map(tokenize, batched=True)

    train_ds = train_ds.map(lambda x: {"labels": x["evasion_label"]})
    val_ds = val_ds.map(lambda x: {"labels": x["evasion_label"]})
    holdout_ds = holdout_ds.map(lambda x: {"labels": x["evasion_label"]})

    y = train_ds["evasion_label"]
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.unique(y),
        y=y,
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    class_weights_tensor = torch.tensor(
        class_weights,
        dtype=torch.float32,
        device=device,
    )

    class WeightedTrainer(Trainer):
        def __init__(self, *args, class_weights=None, **kwargs):
            super().__init__(*args, **kwargs)
            self.class_weights = class_weights

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits", None)
            if logits is None:
                logits = outputs[0]

            loss_fn = nn.CrossEntropyLoss(
                weight=self.class_weights.to(logits.device),
                label_smoothing=0.1,
            )
            loss = loss_fn(
                logits.view(-1, model.config.num_labels),
                labels.view(-1),
            )
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = (preds == labels).mean()
        macro_f1 = f1_score(labels, preds, average="macro")
        return {
            "accuracy": acc,
            "macro_f1": macro_f1,
        }

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )

    args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE * 2,
        gradient_accumulation_steps=GRAD_ACCUMULATION,
        num_train_epochs=EPOCHS,
        weight_decay=0.05,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to="none",
        dataloader_num_workers=2,
        seed=SEED,
    )

    trainer = WeightedTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=6)],
        class_weights=class_weights_tensor,
    )

    return trainer, holdout_ds


def run_model(model_name, short_name):
    output_dir = f"./results_{short_name}"
    print(f"\nTraining model: {short_name}")
    trainer, holdout_ds = make_weighted_trainer(
        model_name=model_name,
        train_base=base_train,
        val_base=base_val,
        holdout_base=holdout,
        output_dir=output_dir,
    )
    trainer.train()

    eval_metrics = trainer.evaluate(holdout_ds)
    print(
        f"{short_name} holdout macro F1: "
        f"{eval_metrics['eval_macro_f1']:.4f}"
    )

    preds = trainer.predict(holdout_ds)
    logits = preds.predictions
    labels = preds.label_ids

    return eval_metrics, logits, labels


roberta_metrics, roberta_logits, labels = run_model(
    "roberta-large",
    "roberta_large",
)

deberta_metrics, deberta_logits, _ = run_model(
    "microsoft/deberta-v3-large",
    "deberta_v3_large",
)

# Ensemble: average probabilities from both models
def softmax_np(x):
    x = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

roberta_probs = softmax_np(roberta_logits)
deberta_probs = softmax_np(deberta_logits)

ensemble_probs = (roberta_probs + deberta_probs) / 2.0
ensemble_preds = np.argmax(ensemble_probs, axis=-1)

ensemble_macro_f1 = f1_score(labels, ensemble_preds, average="macro")
ensemble_acc = (ensemble_preds == labels).mean()

print("\nSummary on the same holdout set")
print(f"RoBERTa large macro F1:   {roberta_metrics['eval_macro_f1']:.4f}")
print(f"DeBERTa v3 large macro F1:{deberta_metrics['eval_macro_f1']:.4f}")
print(f"Ensemble macro F1:        {ensemble_macro_f1:.4f}")
print(f"Ensemble accuracy:        {ensemble_acc:.4f}")

Loading QEvasion dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Formatting training data...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]


Training model: roberta_large


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.973068,0.495177,0.296483
2,No log,1.92418,0.553055,0.27928
3,2.120700,1.773336,0.569132,0.470437
4,2.120700,1.764205,0.562701,0.517985
5,2.120700,1.717235,0.594855,0.573023
6,1.674200,1.728956,0.639871,0.616583
7,1.674200,1.768509,0.649518,0.621502
8,1.674200,1.846902,0.630225,0.603409
9,1.336200,1.911313,0.62701,0.606773
10,1.336200,1.99866,0.639871,0.61819


roberta_large holdout macro F1: 0.5698

Training model: deberta_v3_large


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.208538,0.340836,0.152734
2,No log,1.882195,0.517685,0.31333
3,2.135600,1.807287,0.575563,0.445569
4,2.135600,1.736839,0.578778,0.534436
5,2.135600,1.711958,0.585209,0.545147
6,1.749200,1.777269,0.614148,0.534727
7,1.749200,1.817059,0.598071,0.523237
8,1.749200,1.881907,0.643087,0.577411
9,1.509200,1.950593,0.652733,0.583942
10,1.509200,2.142204,0.620579,0.56802


deberta_v3_large holdout macro F1: 0.6055

Summary on the same holdout set
RoBERTa large macro F1:   0.5698
DeBERTa v3 large macro F1:0.6055
Ensemble macro F1:        0.5827
Ensemble accuracy:        0.6348
