In [2]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed
)
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files

# Setup reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Config
MODEL_NAME = "roberta-large"
OUTPUT_DIR = "./roberta_large_final"
MAX_LEN = 512
BATCH_SIZE = 4
GRAD_ACCUMULATION = 8  # Effective batch size of 32
LR = 1e-5              # RoBERTa tolerates slightly higher LR than DeBERTa
EPOCHS = 15

print("Loading dataset...")
dataset = load_dataset("ailsntua/QEvasion")

# Text formatting
def format_inputs(example):
    # Using the pipe separator style which worked well for your DeBERTa run
    clarity = example.get('clarity_label') or 'Unknown'
    text = (
        f"Context: {clarity} | "
        f"Question: {example['question']} "
        f"Answer: {example['interview_answer']}"
    )
    return {"text": text, "evasion_label": example["evasion_label"]}

print("Processing data...")
processed_ds = dataset["train"].map(format_inputs)
if "test" in dataset:
    comp_test_ds = dataset["test"].map(format_inputs)

processed_ds = processed_ds.class_encode_column("evasion_label")

# Splits
# Split off a clean holdout set (10%)
main_split = processed_ds.train_test_split(
    test_size=0.1, seed=SEED, stratify_by_column="evasion_label"
)
train_dev = main_split["train"]
holdout_test = main_split["test"]

# Split Train into Train/Val (10% val)
inner_split = train_dev.train_test_split(
    test_size=0.1, seed=SEED, stratify_by_column="evasion_label"
)
train_set = inner_split["train"]
val_set = inner_split["test"]

# Mappings
labels = train_set.features["evasion_label"].names
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )

train_set = train_set.map(tokenize_fn, batched=True)
val_set = val_set.map(tokenize_fn, batched=True)
holdout_test = holdout_test.map(tokenize_fn, batched=True)

# Rename label column for trainer
train_set = train_set.map(lambda x: {"labels": x["evasion_label"]})
val_set = val_set.map(lambda x: {"labels": x["evasion_label"]})
holdout_test = holdout_test.map(lambda x: {"labels": x["evasion_label"]})

# Class Weights
y_vals = train_set["evasion_label"]
weights = compute_class_weight("balanced", classes=np.unique(y_vals), y=y_vals)
weights_tensor = torch.tensor(weights, dtype=torch.float32).to("cuda" if torch.cuda.is_available() else "cpu")

# Custom Trainer
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Weighted CE with smoothing
        loss_fct = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=0.1)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": (preds == labels).mean(),
        "macro_f1": f1_score(labels, preds, average="macro")
    }

# Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# Training Args
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,

    fp16=True,
    report_to="none",
    dataloader_num_workers=2,
    seed=SEED
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

print("Starting training...")
trainer.train()

# Evaluation
print("\nEvaluating on Holdout Test Set...")
res = trainer.evaluate(holdout_test)
print(f"Holdout Macro F1: {res['eval_macro_f1']:.4f}")

# Submission file generation
if "test" in dataset:
    print("\nGenerating submission file...")
    comp_test_ds = comp_test_ds.map(tokenize_fn, batched=True)

    # Ensure index column exists
    if "index" not in comp_test_ds.column_names:
        comp_test_ds = comp_test_ds.add_column("index", range(len(comp_test_ds)))

    preds_output = trainer.predict(comp_test_ds)
    pred_ids = np.argmax(preds_output.predictions, axis=-1)
    pred_labels = [id2label[i] for i in pred_ids]

    df = pd.DataFrame({
        "index": comp_test_ds["index"],
        "evasion_label": pred_labels
    })

    csv_name = "roberta_submission.csv"
    df.to_csv(csv_name, index=False)
    print(f"Saved {csv_name}")
    files.download(csv_name)

Loading dataset...
Processing data...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.334232,0.453376,0.211084
2,No log,1.898406,0.553055,0.337304
3,No log,1.827412,0.546624,0.404571
4,No log,1.710673,0.614148,0.578972
5,No log,1.760718,0.581994,0.512345
6,1.972800,1.737058,0.62701,0.577149
7,1.972800,1.753983,0.610932,0.584343
8,1.972800,1.757748,0.614148,0.583255
9,1.972800,1.816134,0.636656,0.60792
10,1.972800,1.838868,0.630225,0.603288



Evaluating on Holdout Test Set...


Holdout Macro F1: 0.5757

Generating submission file...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Saved roberta_submission.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Roberta did not do as well as Deberta by about .04 for macro f1 but preformed significantly better than llama. Although this could be do to random factors like seed. So we will go further to compare the two models.