In [None]:
pip install -U datasets transformers scikit-learn pandas accelerate

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m193.4 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
pip install torch torchvision torchaudio



In [None]:
from datasets import load_dataset, Dataset
from datasets import ClassLabel
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np
import torch

from transformers import AutoTokenizer

# --- Run configuration -------------------------------------------------------
SEED = 42                                    # shared by NumPy, PyTorch and the dev split
MAX_LENGTH = 256                             # max_length handed to the tokenizer
MODEL_NAME = "answerdotai/ModernBERT-base"   # checkpoint for tokenizer and model

# Seed both random number generators up front (the two calls are independent).
torch.manual_seed(SEED)
np.random.seed(SEED)

# Load the QEvasion dataset and show which splits / columns it provides.
ds = load_dataset("ailsntua/QEvasion")
print(ds)

# Convert to pandas and use clarity_label / evasion_label
def split_to_df(split_name):
    """Return one split of the global ``ds`` as a DataFrame with helper columns.

    Added columns, in this order:
      * ``evasion_str`` / ``clarity_str`` - the two label columns cast to ``str``
      * ``text_q`` / ``text_a``           - question and answer, missing values -> ""
      * ``text``                          - ``"Q: <question>\\nA: <answer>"``

    Args:
        split_name: key into ``ds`` (the notebook uses "train" and "test").
    """
    frame = ds[split_name].to_pandas()

    # Labels as plain strings, question/answer with missing values blanked out.
    frame = frame.assign(
        evasion_str=frame["evasion_label"].astype(str),
        clarity_str=frame["clarity_label"].astype(str),
        text_q=frame["question"].fillna(""),
        text_a=frame["interview_answer"].fillna(""),
    )

    # Single model input: question followed by the answer on a new line.
    frame["text"] = "Q: " + frame["text_q"] + "\nA: " + frame["text_a"]
    return frame

# Materialise both official splits as DataFrames.
train_df, test_df = (split_to_df(name) for name in ("train", "test"))

print(f"Train size: {train_df.shape}")
print(f"Test size: {test_df.shape}")
for col in ("evasion_str", "clarity_str"):
    print(f"Unique {col}:", sorted(train_df[col].unique()))

# Carve a 15% dev set out of train, stratified on the evasion label.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=SEED)
base = train_df.reset_index(drop=True)
[(tr_idx, dv_idx)] = sss.split(base, base["evasion_str"])  # n_splits=1 -> exactly one pair

# Copies, so the label-id columns added later do not write into `base`.
tr_df, dv_df = (base.iloc[idx].copy() for idx in (tr_idx, dv_idx))

# Build the label vocabularies (9-way evasion, 3-way clarity) from the training
# portion only, then attach integer ids to every split.
EV_LABELS = sorted(tr_df["evasion_str"].unique())    # 9 labels
CL_LABELS = sorted(tr_df["clarity_str"].unique())    # 3 labels

ev2id = {lbl: i for i, lbl in enumerate(EV_LABELS)}
cl2id = {lbl: i for i, lbl in enumerate(CL_LABELS)}

for split_name, df in (("train", tr_df), ("dev", dv_df), ("test", test_df)):
    df["evasion"] = df["evasion_str"].map(ev2id)
    df["clarity"] = df["clarity_str"].map(cl2id)

    # Series.map yields NaN for any label that is not in the vocabulary, which
    # would otherwise flow silently into the encoded datasets as float labels.
    missing_clarity = int(df["clarity"].isna().sum())
    if missing_clarity:
        raise ValueError(
            f"{missing_clarity} rows in the {split_name} split have a clarity label "
            f"outside CL_LABELS={CL_LABELS}"
        )

    # Evasion ids are not used by the clarity classifiers below, so only warn.
    missing_evasion = int(df["evasion"].isna().sum())
    if missing_evasion:
        print(
            f"WARNING: {missing_evasion} rows in the {split_name} split have an evasion "
            f"label outside EV_LABELS; their 'evasion' id is NaN."
        )

print("EV_LABELS:", EV_LABELS)
print("CL_LABELS:", CL_LABELS)

# Evasion -> clarity index mapping (for your KL term).
# Written as clarity label -> member evasion labels, then flattened so that E2C
# is keyed by evasion label.
_CLARITY_GROUPS = {
    "Clear Reply": ("Explicit",),
    "Ambivalent": (
        "Implicit",
        "General",
        "Partial/half-answer",
        "Dodging",
        "Deflection",
    ),
    "Clear Non-Reply": (
        "Declining to answer",
        "Claims ignorance",
        "Clarification",
    ),
}
E2C = {ev: cl for cl, members in _CLARITY_GROUPS.items() for ev in members}

# Position i holds the clarity id of evasion label EV_LABELS[i];
# a label missing from E2C or cl2id raises KeyError here.
ev_to_cl_list = [cl2id[E2C[ev]] for ev in EV_LABELS]

ev_to_cl_idx = torch.tensor(ev_to_cl_list, dtype=torch.long)
print("ev_to_cl_idx:", ev_to_cl_idx.tolist())


# Tokenize with ModernBERT: load the tokenizer that belongs to MODEL_NAME.
# use_fast=True asks for the fast tokenizer implementation.
# The resulting global `tokenizer` is read by df_to_hf() below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def df_to_hf(df):
    """Tokenize ``df["text"]`` and wrap the result, plus both label columns, in a Dataset.

    Uses the global ``tokenizer`` and ``MAX_LENGTH``; every example is truncated
    and padded to ``MAX_LENGTH``. The ``evasion`` / ``clarity`` columns of ``df``
    are stored as ``labels_evasion`` / ``labels_clarity``.
    """
    encoded = tokenizer(
        df["text"].tolist(),
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # Attach the integer label columns next to the tokenizer output.
    for target, source in (("labels_evasion", "evasion"), ("labels_clarity", "clarity")):
        encoded[target] = df[source].tolist()

    return Dataset.from_dict(encoded)

# Encode train / dev / test in that order, then switch every dataset to the
# "torch" output format.
hf_train, hf_dev, hf_test = (df_to_hf(frame) for frame in (tr_df, dv_df, test_df))

for hf_split in (hf_train, hf_dev, hf_test):
    hf_split.set_format("torch")


DatasetDict({
    train: Dataset({
        features: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label'],
        num_rows: 3448
    })
    test: Dataset({
        features: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label'],
        num_rows: 308
    })
})
Train size: (3448, 25)
Test size: (308, 25)
Unique evasion_str: ['Claims ignorance', 'Clarification', 'Declining to answer', 'Deflection', 'Dodging', 'Explicit', 'General', 'Implicit', 'Partial/half-answer']
Unique clarity_st

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import os
# Keep Weights & Biases logging switched off for this run.
os.environ["WANDB_DISABLED"] = "true"


def _clarity_only(split):
    """Drop the evasion labels and expose the clarity labels under the name ``labels``."""
    return split.remove_columns(["labels_evasion"]).rename_column("labels_clarity", "labels")


# Single-task (clarity) views of the three encoded splits.
clarity_train = _clarity_only(hf_train)
clarity_dev = _clarity_only(hf_dev)
clarity_test = _clarity_only(hf_test)

# One output unit per clarity label.
num_clarity_labels = len(CL_LABELS)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_clarity_labels)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy plus macro / weighted F1.

    Predictions are the argmax over the last logits axis. Returns a dict with
    the keys ``accuracy``, ``f1_macro`` and ``f1_weighted``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    for averaging in ("macro", "weighted"):
        metrics[f"f1_{averaging}"] = f1_score(gold, predicted, average=averaging)
    return metrics

# --- Training configuration (core options only) ---
training_args = TrainingArguments(
    output_dir="./modernbert-clarity",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,          # make the Trainer's seed explicit instead of relying on its default
    report_to="none",   # supported way to disable logging integrations (WANDB_DISABLED is deprecated)
    # no evaluation_strategy, save_strategy, load_best_model_at_end, etc.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=clarity_train,
    eval_dataset=clarity_dev,  # used when we call trainer.evaluate()
    # `tokenizer=` is deprecated in Trainer.__init__ (removal announced for v5);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


trainer.train()

# Accuracy & F1 on the held-out dev split
dev_metrics = trainer.evaluate(clarity_dev)
print("Dev metrics:", dev_metrics)

# Accuracy & F1 on the official test split
test_metrics = trainer.evaluate(clarity_test)
print("Test metrics:", test_metrics)


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Step,Training Loss
50,0.9365
100,0.8767
150,0.818
200,0.7779
250,0.7139
300,0.7144
350,0.7286
400,0.6222
450,0.5281
500,0.561


Dev metrics: {'eval_loss': 0.7654221057891846, 'eval_accuracy': 0.6833976833976834, 'eval_f1_macro': 0.6113102223995619, 'eval_f1_weighted': 0.6668477678767564, 'eval_runtime': 5.2892, 'eval_samples_per_second': 97.936, 'eval_steps_per_second': 3.214, 'epoch': 3.0}
Test metrics: {'eval_loss': 0.8186081647872925, 'eval_accuracy': 0.6266233766233766, 'eval_f1_macro': 0.45046298765225185, 'eval_f1_weighted': 0.6080816116965468, 'eval_runtime': 1.3364, 'eval_samples_per_second': 230.463, 'eval_steps_per_second': 7.483, 'epoch': 3.0}


In [None]:
import numpy as np
import torch
import torch.nn as nn

from datasets import load_dataset

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support
)
from sklearn.utils.class_weight import compute_class_weight

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)


# Assumptions / globals
# CL_LABELS comes from the earlier preprocessing cell (sorted clarity label names).
# The evasion -> clarity mapping built there looks up "Ambivalent", "Clear Non-Reply"
# and "Clear Reply", so those are the names expected here (not "Ambivalent Reply").
cl2id = {name: i for i, name in enumerate(CL_LABELS)}
num_clarity_labels = len(CL_LABELS)

# NOTE: MODEL_NAME, tokenizer and ds are rebound from here on; they replace the
# ModernBERT objects created in the earlier cells for the rest of the kernel session.
MODEL_NAME = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load raw QEvasion dataset and make train/dev/test splits
ds = load_dataset("ailsntua/QEvasion")

# ds has only "train" and "test", so we carve a dev set from train.
# Reuse the notebook-wide SEED rather than a second hard-coded 42.
# NOTE(review): this is a plain random split, whereas the ModernBERT run used a
# StratifiedShuffleSplit on the evasion label, so the two dev sets differ.
train_valid = ds["train"].train_test_split(test_size=0.15, seed=SEED)
raw_train = train_valid["train"]
raw_dev   = train_valid["test"]
raw_test  = ds["test"]   # keep their test as your test

# Preprocess: build Q/A text and map clarity_label -> ids
def preprocess_function(examples):
    """Tokenize a batch of QEvasion rows and attach integer clarity labels.

    Args:
        examples: batch mapping with the list-valued keys ``question``,
            ``interview_answer`` and ``clarity_label``.

    Returns:
        The tokenizer output for ``"[QUESTION]: <q>\\n[ANSWER]: <a>"`` with an
        extra ``labels`` entry holding ``cl2id[clarity_label]`` per row.

    Raises:
        KeyError: if a clarity label is not a key of ``cl2id``.

    Relies on the globals ``tokenizer``, ``cl2id`` and ``MAX_LENGTH``.
    """
    # A missing question/answer becomes an empty string (same convention as
    # split_to_df) instead of being formatted as the literal text "None".
    questions = ["" if q is None else q for q in examples["question"]]
    answers = ["" if a is None else a for a in examples["interview_answer"]]

    # Use sub-question + full interview answer
    texts = [
        f"[QUESTION]: {q}\n[ANSWER]: {a}"
        for q, a in zip(questions, answers)
    ]

    enc = tokenizer(
        texts,
        truncation=True,
        padding="max_length",   # or False for dynamic padding
        max_length=MAX_LENGTH,  # shared notebook constant instead of a second literal 256
    )

    # Map string clarity_label -> integer id via CL_LABELS
    enc["labels"] = [cl2id[label] for label in examples["clarity_label"]]
    return enc

# Tokenize train / dev / test (in that order) with the same preprocessing and
# drop every raw column, so only the tokenizer output and `labels` remain.
clarity_train, clarity_dev, clarity_test = (
    raw_split.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_split.column_names,
    )
    for raw_split in (raw_train, raw_dev, raw_test)
)

# Balanced class weights, computed from the labels of the new train split.
y_train = np.array(clarity_train["labels"])
classes = np.unique(y_train)
print("Classes in train:", classes)

# Convert straight to a float tensor so it can be passed to the loss function.
class_weights = torch.tensor(
    compute_class_weight(class_weight="balanced", classes=classes, y=y_train),
    dtype=torch.float,
)
print("Class weights:", class_weights)


# Base model: DeBERTa-v3-large with one output unit per clarity label
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_clarity_labels)

# Custom Trainer using (optionally) class-weighted cross-entropy
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: 1-D float tensor with one weight per class, or ``None``
            for plain (unweighted) cross-entropy.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.

    Note:
        ``class_weights`` is the first positional parameter, so the regular
        Trainer arguments should be passed by keyword.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    # add num_items_in_batch to match newer Trainer signature
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and compute the cross-entropy loss here.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``. ``num_items_in_batch`` is accepted but not used.
        """
        labels = inputs["labels"]
        # Withhold the labels so the model does not compute its own loss.
        inputs_no_labels = {k: v for k, v in inputs.items() if k != "labels"}

        outputs = model(**inputs_no_labels)
        logits = outputs.logits

        # The constructor default is None; calling .to() on it would raise, so
        # fall back to an unweighted loss in that case.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)

        # Read the class count from the logits: `model` may be a wrapper
        # (e.g. nn.DataParallel) that does not expose `num_labels`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        if return_outputs:
            return loss, outputs
        return loss

# Metrics: accuracy plus precision / recall / F1, macro- and weighted-averaged
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair.

    Predictions are the argmax over the last logits axis. The returned dict
    holds ``accuracy`` followed by ``precision_*``, ``recall_*`` and ``f1_*``
    for the ``macro`` and then the ``weighted`` average; undefined scores are
    reported as 0 (``zero_division=0``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {"accuracy": accuracy_score(gold, predicted)}

    for averaging in ("macro", "weighted"):
        precision, recall, f1, _support = precision_recall_fscore_support(
            gold, predicted, average=averaging, zero_division=0
        )
        metrics[f"precision_{averaging}"] = precision
        metrics[f"recall_{averaging}"] = recall
        metrics[f"f1_{averaging}"] = f1

    return metrics

# -------------------------------------------------
# Training args (core options only)
# -------------------------------------------------
training_args = TrainingArguments(
    output_dir="./deberta-v3-large-clarity-weighted",
    num_train_epochs=8,
    per_device_train_batch_size=8,       # smaller for large model
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",
    fp16=torch.cuda.is_available(),      # mixed precision only when a CUDA GPU is present
)

# Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=clarity_train,
    eval_dataset=clarity_dev,
    # `tokenizer=` is deprecated in Trainer.__init__ (removal announced for v5);
    # `processing_class=` is its replacement and is forwarded via **kwargs.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

# Train
trainer.train()

# Evaluate on dev and test
dev_metrics = trainer.evaluate(clarity_dev)
print("Dev metrics:", dev_metrics)

test_metrics = trainer.evaluate(clarity_test)
print("Test metrics:", test_metrics)




Map:   0%|          | 0/2930 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Classes in train: [0 1 2]
Class weights: tensor([0.5695, 3.1813, 1.0756])


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss
50,1.0913
100,1.1438
150,1.0825
200,1.0602
250,1.1028
300,1.0361
350,1.0792
400,0.9624
450,1.0082
500,0.8965


Dev metrics: {'eval_loss': 2.0621776580810547, 'eval_accuracy': 0.7104247104247104, 'eval_precision_macro': 0.6498554544912161, 'eval_recall_macro': 0.6173585673585673, 'eval_f1_macro': 0.6302422869270171, 'eval_precision_weighted': 0.7078808374226309, 'eval_recall_weighted': 0.7104247104247104, 'eval_f1_weighted': 0.7081092988903198, 'eval_runtime': 2.3923, 'eval_samples_per_second': 216.524, 'eval_steps_per_second': 13.794, 'epoch': 8.0}
Test metrics: {'eval_loss': 2.0148298740386963, 'eval_accuracy': 0.6688311688311688, 'eval_precision_macro': 0.6033169807679611, 'eval_recall_macro': 0.5613239220379979, 'eval_f1_macro': 0.5662454510680824, 'eval_precision_weighted': 0.6882706444872221, 'eval_recall_weighted': 0.6688311688311688, 'eval_f1_weighted': 0.6713757054564092, 'eval_runtime': 1.4096, 'eval_samples_per_second': 218.499, 'eval_steps_per_second': 14.188, 'epoch': 8.0}
