In [2]:
import os
import random
import shutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed
)
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight


MODEL = "microsoft/deberta-v3-large"
SAVE_DIR = "./deberta_best_config"
MAXLEN = 512
BATCH = 4
GRAD_ACC = 4
LR = 8e-6
EPOCHS = 20

USE_ENSEMBLE = True
SEEDS = [42, 777, 999, 2024, 1234]
NUM_MODELS = 5

ds = load_dataset("ailsntua/QEvasion")

def prep(x):
    c = x.get('clarity_label', 'Unknown') or 'Unknown'
    x_text = f"Context: {c} | Question: {x['question']} Answer: {x['interview_answer']}"
    return {"text": x_text, "evasion_label": x["evasion_label"]}

ds["train"] = ds["train"].map(prep)
if "test" in ds:
    test_ds = ds["test"].map(prep)

ds["train"] = ds["train"].class_encode_column("evasion_label")

split = ds["train"].train_test_split(test_size=0.1, seed=42, stratify_by_column="evasion_label")
train_full, holdout = split["train"], split["test"]
subsplit = train_full.train_test_split(test_size=0.1, seed=42, stratify_by_column="evasion_label")
train_ds, eval_ds = subsplit["train"], subsplit["test"]

labels = train_ds.features["evasion_label"].names
lbl2id = {l: i for i, l in enumerate(labels)}
id2lbl = {i: l for l, i in lbl2id.items()}

tok = AutoTokenizer.from_pretrained(MODEL)

def tokenize(x):
    return tok(x["text"], padding="max_length", truncation=True, max_length=MAXLEN)

train_ds = train_ds.map(tokenize, batched=True).map(lambda x: {"labels": x["evasion_label"]})
eval_ds = eval_ds.map(tokenize, batched=True).map(lambda x: {"labels": x["evasion_label"]})
holdout = holdout.map(tokenize, batched=True).map(lambda x: {"labels": x["evasion_label"]})

def train_model(seed, ensemble_idx=None):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    set_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL,
        num_labels=len(labels),
        id2label=id2lbl,
        label2id=lbl2id
    )

    y = train_ds["evasion_label"]
    weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
    w_tensor = torch.tensor(weights, dtype=torch.float)
    if torch.cuda.is_available():
        w_tensor = w_tensor.cuda()

    class MyTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            lbls = inputs.get("labels")
            out = model(**inputs)
            logits = out.get("logits")
            loss_fn = nn.CrossEntropyLoss(weight=w_tensor, label_smoothing=0.1)
            loss = loss_fn(logits.view(-1, model.config.num_labels), lbls.view(-1))
            return (loss, out) if return_outputs else loss

    save_path = f"{SAVE_DIR}_seed{seed}" if ensemble_idx is not None else SAVE_DIR

    args = TrainingArguments(
        output_dir=save_path,
        learning_rate=LR,
        per_device_train_batch_size=BATCH,
        per_device_eval_batch_size=BATCH * 2,
        gradient_accumulation_steps=GRAD_ACC,
        num_train_epochs=EPOCHS,
        weight_decay=0.05,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,  #only keep the best checkpoint
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        fp16=True,
        report_to="none",
        dataloader_num_workers=2,
        seed=seed
    )

    def metrics(p):
        logits, y_true = p
        y_pred = np.argmax(logits, axis=-1)
        return {
            "accuracy": (y_pred == y_true).mean(),
            "macro_f1": f1_score(y_true, y_pred, average="macro")
        }

    trainer = MyTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=metrics,
        callbacks=[]
    )

    trainer.train()
    result = trainer.evaluate(holdout)
    print(f"Seed {seed} F1: {result['eval_macro_f1']:.4f}")
    return trainer, result

def cleanup_old_checkpoints(base_dir="./deberta_best_config", keep_only_best=True):
    """Clean up old checkpoint directories to save disk space"""
    if not os.path.exists(base_dir):
        return

    for item in os.listdir(base_dir):
        item_path = os.path.join(base_dir, item)
        if os.path.isdir(item_path) and "checkpoint" in item:
            if keep_only_best:
                #keeps the best checkpoint
                checkpoints = [d for d in os.listdir(item_path) if "checkpoint" in d]
                if checkpoints:
                    checkpoints.sort(key=lambda x: int(x.split("-")[1]) if "-" in x else 0, reverse=True)
                    for cp in checkpoints[1:]:
                        cp_path = os.path.join(item_path, cp)
                        if os.path.exists(cp_path):
                            shutil.rmtree(cp_path)
                            print(cp_path)

if USE_ENSEMBLE and NUM_MODELS > 1:
    all_preds = []
    all_results = []
    all_trainers = []
    for i, seed in enumerate(SEEDS[:NUM_MODELS]):
        tr, res = train_model(seed=seed, ensemble_idx=i)
        all_trainers.append(tr)
        all_results.append(res)
        preds = tr.predict(holdout).predictions
        all_preds.append(preds)


    individual_f1s = [r['eval_macro_f1'] for r in all_results]
    weights = np.array(individual_f1s)
    weights = weights / weights.sum()

    print(f"\nIndividual Model F1s: {[f'{f:.4f}' for f in individual_f1s]}")
    print(f"Ensemble Weights: {[f'{w:.3f}' for w in weights]}")

    weighted_probs = np.average(all_preds, axis=0, weights=weights)
    y_pred = np.argmax(weighted_probs, axis=-1)
    y_true = holdout["labels"]

    f1_ens = f1_score(y_true, y_pred, average="macro")
    acc_ens = (y_pred == y_true).mean()

    avg_probs = np.mean(all_preds, axis=0)
    y_pred_avg = np.argmax(avg_probs, axis=-1)
    f1_ens_avg = f1_score(y_true, y_pred_avg, average="macro")

    print(f"\nEnsemble Results:")
    print(f"  Weighted Ensemble F1: {f1_ens:.4f} | Acc: {acc_ens:.4f}")
    print(f"  Simple Average F1: {f1_ens_avg:.4f}")
    print(f"  Best Individual F1: {max(individual_f1s):.4f}")
    print(f"  Improvement over best: {f1_ens - max(individual_f1s):.4f}")

    if "test" in ds:
        test_ds = test_ds.map(tokenize, batched=True)
        if "index" not in test_ds.column_names:
            test_ds = test_ds.add_column("index", range(len(test_ds)))

        all_test_preds = [t.predict(test_ds).predictions for t in all_trainers]
        weighted_test = np.average(all_test_preds, axis=0, weights=weights)
        pred_ids = np.argmax(weighted_test, axis=-1)
        preds_lbl = [id2lbl[i] for i in pred_ids]

        pd.DataFrame({
            "index": test_ds["index"],
            "evasion_label": preds_lbl
        }).to_csv("ensemble_submission.csv", index=False)

    # for saving disk space
    cleanup_old_checkpoints()

else:
    seed = SEEDS[0]
    trainer, result = train_model(seed)

    if "test" in ds:
        test_ds = test_ds.map(tokenize, batched=True)
        if "index" not in test_ds.column_names:
            test_ds = test_ds.add_column("index", range(len(test_ds)))

        test_out = trainer.predict(test_ds).predictions
        test_pred_ids = np.argmax(test_out, axis=-1)
        final_labels = [id2lbl[i] for i in test_pred_ids]

        pd.DataFrame({
            "index": test_ds["index"],
            "evasion_label": final_labels
        }).to_csv("best_config_submission.csv", index=False)


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]



Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.034497,0.517685,0.282873
2,No log,1.907372,0.549839,0.273221
3,2.132000,1.841862,0.575563,0.425551
4,2.132000,1.767888,0.594855,0.526254
5,2.132000,1.752607,0.598071,0.543375
6,1.754700,1.778494,0.649518,0.58978
7,1.754700,1.923069,0.636656,0.569459
8,1.754700,1.905835,0.636656,0.588083
9,1.472100,2.034865,0.617363,0.564591
10,1.472100,2.082523,0.646302,0.595771


Seed 42 F1: 0.5685


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.030439,0.508039,0.297377
2,No log,1.898445,0.553055,0.396686
3,2.115900,1.814815,0.604502,0.481339
4,2.115900,1.728893,0.607717,0.541438
5,2.115900,1.689038,0.630225,0.589067
6,1.714000,1.748508,0.614148,0.566442
7,1.714000,1.869324,0.620579,0.566812
8,1.714000,1.940056,0.636656,0.581212
9,1.376100,2.12887,0.62701,0.592479
10,1.376100,2.237271,0.610932,0.581313


Seed 777 F1: 0.6268


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.232351,0.350482,0.171857
2,No log,1.910518,0.549839,0.373788
3,2.148800,1.845743,0.585209,0.459024
4,2.148800,1.705118,0.598071,0.534482
5,2.148800,1.708964,0.607717,0.509258
6,1.760200,1.786461,0.617363,0.540759
7,1.760200,1.749328,0.62701,0.565626
8,1.760200,1.843158,0.581994,0.548271
9,1.515400,1.936963,0.614148,0.563723
10,1.515400,2.07769,0.598071,0.555171


Seed 999 F1: 0.5769


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.219633,0.405145,0.297808
2,No log,1.895993,0.530547,0.32947
3,2.157100,1.860107,0.553055,0.399496
4,2.157100,1.807572,0.562701,0.451227
5,2.157100,1.756429,0.581994,0.483364
6,1.785800,1.713442,0.633441,0.560035
7,1.785800,1.784536,0.610932,0.559656
8,1.785800,1.815251,0.639871,0.572412
9,1.563700,1.930297,0.620579,0.56805
10,1.563700,2.005022,0.594855,0.552349


Seed 2024 F1: 0.5677


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.002906,0.549839,0.274194
2,No log,1.89611,0.585209,0.37531
3,2.127200,1.864424,0.527331,0.38417
4,2.127200,1.834897,0.553055,0.449133
5,2.127200,1.72562,0.585209,0.545712
6,1.788400,1.686954,0.652733,0.616034
7,1.788400,1.791248,0.607717,0.564545
8,1.788400,1.837995,0.662379,0.587794
9,1.518200,1.941965,0.639871,0.575385
10,1.518200,2.004208,0.643087,0.598545


Seed 1234 F1: 0.5523

Individual Model F1s: ['0.5685', '0.6268', '0.5769', '0.5677', '0.5523']
Ensemble Weights: ['0.197', '0.217', '0.199', '0.196', '0.191']

Ensemble Results:
  Weighted Ensemble F1: 0.5694 | Acc: 0.6087
  Simple Average F1: 0.5671
  Best Individual F1: 0.6268
  Improvement over best: -0.0574


Map:   0%|          | 0/308 [00:00<?, ? examples/s]