In [None]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files

# Single seed reused for every RNG below, for both dataset splits and for the Trainer.
SEED = 999  # trying a new seed

# Seed Python, NumPy and PyTorch (CPU, plus every CUDA device when one is present).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Also seed through the transformers helper; an ImportError is ignored on purpose.
try:
    from transformers import set_seed
    set_seed(SEED)
except ImportError:
    pass

# Request deterministic cuDNN kernels and turn off cuDNN benchmarking.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Checkpoint to fine-tune, directory handed to TrainingArguments, and the
# token length every example is padded/truncated to in tokenize_fn.
MODEL_NAME = "microsoft/deberta-v3-large"
OUTPUT_DIR = "./deberta_v3_pro_results"
MAX_LEN = 512

# Optimisation hyper-parameters for this run.
BATCH_SIZE = 4
GRAD_ACCUMULATION = 8  # Effective batch per device = BATCH_SIZE * GRAD_ACCUMULATION = 32
LR = 1.5e-5            # Higher learning rate than the other runs in this notebook
EPOCHS = 10

print("Loading Data...")
# Loads the dataset from the Hugging Face Hub; the code below uses its "train" and (if present) "test" splits.
dataset = load_dataset("ailsntua/QEvasion")

def preprocess_text(example):
    """Build the single model-input string for one dataset row.

    Args:
        example: Mapping with ``question``, ``interview_answer`` and
            ``evasion_label`` keys; ``clarity_label`` is optional.

    Returns:
        Dict with the formatted ``text`` and the untouched ``evasion_label``.
        A missing or ``None`` clarity label is rendered as ``Unknown``.
    """
    raw_clarity = example.get('clarity_label', 'Unknown')
    context = "Unknown" if raw_clarity is None else raw_clarity
    question = example['question']
    answer = example['interview_answer']
    return {
        "text": f"Context: {context} | Question: {question} Answer: {answer}",
        "evasion_label": example["evasion_label"],
    }

print("Preprocessing...")
# Add the formatted "text" column to the labelled data and, if present, the competition test split.
full_data = dataset["train"].map(preprocess_text)
if "test" in dataset:
    comp_test_ds = dataset["test"].map(preprocess_text)

# Convert the string labels to a ClassLabel feature so the splits can be stratified on it.
full_data = full_data.class_encode_column("evasion_label")

# Double split: first carve off a 10% held-out test set ...
split1 = full_data.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_dev_ds = split1["train"]
held_out_test_ds = split1["test"]

# ... then take 10% of the remainder as the dev set used for per-epoch evaluation.
split2 = train_dev_ds.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_ds = split2["train"]
eval_ds = split2["test"]

# Label-name <-> integer-id maps, in the order of the ClassLabel feature.
labels = train_ds.features["evasion_label"].names
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(examples):
    """Tokenize the ``text`` column of a batch.

    Every sequence is truncated and padded to exactly ``MAX_LEN`` tokens, so
    all encoded examples have the same length.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize the three labelled splits in batches.
train_ds = train_ds.map(tokenize_fn, batched=True)
eval_ds = eval_ds.map(tokenize_fn, batched=True)
held_out_test_ds = held_out_test_ds.map(tokenize_fn, batched=True)

# Copy the encoded label into a "labels" column, the key ProTrainer.compute_loss reads.
train_ds = train_ds.map(lambda x: {"labels": x["evasion_label"]})
eval_ds = eval_ds.map(lambda x: {"labels": x["evasion_label"]})
held_out_test_ds = held_out_test_ds.map(lambda x: {"labels": x["evasion_label"]})

# 'balanced' class weights computed from the training split only.
# NOTE(review): classes come from np.unique(y_train), so the weight vector only
# matches num_labels if every class occurs in the training split.
y_train = train_ds["evasion_label"]
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
# Float tensor placed on the GPU when one is available; used as the loss weight in ProTrainer.
class_weights_tensor = torch.tensor(
    class_weights,
    dtype=torch.float
).to("cuda" if torch.cuda.is_available() else "cpu")

class ProTrainer(Trainer):
    """Trainer whose loss is class-weighted, label-smoothed cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and compute the weighted loss.

        Args:
            model: Model to call with ``**inputs``.
            inputs: Batch dict; its ``labels`` entry supplies the targets.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        scores = outputs.get("logits")
        num_classes = self.model.config.num_labels

        # Weighted by the module-level class weights; slightly reduced smoothing (0.05).
        loss = nn.functional.cross_entropy(
            scores.view(-1, num_classes),
            targets.view(-1),
            weight=class_weights_tensor,
            label_smoothing=0.05,
        )

        if return_outputs:
            return loss, outputs
        return loss

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with ``accuracy`` (share of argmax predictions equal to the
        labels) and ``macro_f1`` (``f1_score`` with ``average="macro"``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == gold
    report = {}
    report["accuracy"] = hits.mean()
    report["macro_f1"] = f1_score(gold, predicted, average="macro")
    return report

# Pretrained backbone with a new classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,

    weight_decay=0.01,             # Low decay
    warmup_ratio=0.1,
    lr_scheduler_type="linear",    # Linear scheduler

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint per epoch of a large model can exhaust the Colab disk; keep
    # only the most recent ones (the best checkpoint is still retained because
    # load_best_model_at_end=True).
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # fp16 mixed precision needs a GPU; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    report_to="none",
    dataloader_num_workers=2,
    seed=SEED
)

trainer = ProTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    # Stop once dev macro-F1 has not improved for 5 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

print("Starting DeBERTa Training (Aggressive Mode)...")
trainer.train()

print("\n" + "="*40)
print("COMPUTING FINAL HELD-OUT TEST SCORE")
print("="*40)

# Score the restored best checkpoint on the split that was never used for model selection.
test_results = trainer.evaluate(held_out_test_ds)
print(f"\n FINAL TEST SET MACRO F1: {test_results['eval_macro_f1']:.4f}")

if "test" in dataset:
    print("\nGenerating submission.csv...")
    comp_test_ds = comp_test_ds.map(tokenize_fn, batched=True)
    if "index" not in comp_test_ds.column_names:
        # add_column is documented to take a list or array, so materialise the range.
        comp_test_ds = comp_test_ds.add_column("index", list(range(len(comp_test_ds))))

    comp_preds = trainer.predict(comp_test_ds)
    pred_ids = np.argmax(comp_preds.predictions, axis=-1)
    # Map predicted class ids back to label names (plain ints as dict keys).
    pred_labels = [id2label[int(p)] for p in pred_ids]

    out_df = pd.DataFrame({
        "index": comp_test_ds["index"],
        "evasion_label": pred_labels
    })
    # Export is intentionally disabled for this run; uncomment to write and download the file.
    # out_df.to_csv("submission_pro.csv", index=False)
    # files.download("submission_pro.csv")

Loading Data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Preprocessing...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting DeBERTa Training (Aggressive Mode)...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.804013,0.514469,0.327759
2,No log,1.620513,0.572347,0.360308
3,No log,1.433151,0.607717,0.549278
4,No log,1.421762,0.614148,0.546757
5,No log,1.339869,0.646302,0.597048
6,1.587500,1.457686,0.662379,0.593007
7,1.587500,1.496824,0.633441,0.579195
8,1.587500,1.602072,0.662379,0.606085
9,1.587500,1.64807,0.623794,0.57593
10,1.587500,1.668981,0.639871,0.589294



COMPUTING FINAL HELD-OUT TEST SCORE



 FINAL TEST SET MACRO F1: 0.5699

Generating submission.csv...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

In [None]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files

# Single seed reused for every RNG below, for both dataset splits and for the Trainer.
SEED = 2024
# Seed Python, NumPy and PyTorch (CPU, plus every CUDA device when one is present).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Also seed through the transformers helper; an ImportError is ignored on purpose.
try:
    from transformers import set_seed
    set_seed(SEED)
except ImportError:
    pass

# Request deterministic cuDNN kernels and turn off cuDNN benchmarking.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Checkpoint to fine-tune, directory handed to TrainingArguments, and the
# token length every example is padded/truncated to in tokenize_fn.
MODEL_NAME = "microsoft/deberta-v3-large"
OUTPUT_DIR = "./deberta_v3_pro_results"
MAX_LEN = 512

# Optimisation hyper-parameters for this run (effective batch per device = 4 * 8 = 32).
BATCH_SIZE = 4
GRAD_ACCUMULATION = 8
LR = 9e-6
EPOCHS = 12

print("Loading Data...")
# Loads the dataset from the Hugging Face Hub; the code below uses its "train" and (if present) "test" splits.
dataset = load_dataset("ailsntua/QEvasion")

def preprocess_text(example):
    """Format one dataset row as a single prompt string.

    Args:
        example: Mapping with ``question``, ``interview_answer`` and
            ``evasion_label`` keys; ``clarity_label`` is optional.

    Returns:
        Dict holding the formatted ``text`` and the original ``evasion_label``.
        A missing or ``None`` clarity label becomes ``Unknown``.
    """
    context = example.get('clarity_label', 'Unknown')
    context = "Unknown" if context is None else context
    prompt = f"Context: {context} | Question: {example['question']} Answer: {example['interview_answer']}"
    result = {"text": prompt}
    result["evasion_label"] = example["evasion_label"]
    return result

print("Preprocessing...")
# Add the formatted "text" column to the labelled data and, if present, the competition test split.
full_data = dataset["train"].map(preprocess_text)
if "test" in dataset:
    comp_test_ds = dataset["test"].map(preprocess_text)

# Convert the string labels to a ClassLabel feature so the splits can be stratified on it.
full_data = full_data.class_encode_column("evasion_label")

# Double split: 10% held-out test first, then 10% of the remainder as the dev set.
split1 = full_data.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_dev_ds = split1["train"]
held_out_test_ds = split1["test"]

split2 = train_dev_ds.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_ds = split2["train"]
eval_ds = split2["test"]

# Label-name <-> integer-id maps, in the order of the ClassLabel feature.
labels = train_ds.features["evasion_label"].names
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(examples):
    """Encode a batch's ``text`` column at a fixed length of ``MAX_LEN`` tokens.

    Longer inputs are truncated and shorter ones padded, so every encoded
    example has the same length.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=MAX_LEN)

# Tokenize the three labelled splits in batches.
train_ds = train_ds.map(tokenize_fn, batched=True)
eval_ds = eval_ds.map(tokenize_fn, batched=True)
held_out_test_ds = held_out_test_ds.map(tokenize_fn, batched=True)

# Copy the encoded label into a "labels" column, the key ProTrainer.compute_loss reads.
train_ds = train_ds.map(lambda x: {"labels": x["evasion_label"]})
eval_ds = eval_ds.map(lambda x: {"labels": x["evasion_label"]})
held_out_test_ds = held_out_test_ds.map(lambda x: {"labels": x["evasion_label"]})

# 'balanced' class weights computed from the training split only.
# NOTE(review): classes come from np.unique(y_train), so the weight vector only
# matches num_labels if every class occurs in the training split.
y_train = train_ds["evasion_label"]
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
# Float tensor placed on the GPU when one is available; used as the loss weight in ProTrainer.
class_weights_tensor = torch.tensor(
    class_weights,
    dtype=torch.float
).to("cuda" if torch.cuda.is_available() else "cpu")

class ProTrainer(Trainer):
    """Trainer whose loss is class-weighted, label-smoothed cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and compute the weighted loss.

        Args:
            model: Model to call with ``**inputs``.
            inputs: Batch dict; its ``labels`` entry supplies the targets.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        scores = outputs.get("logits")

        # Flatten to (N, num_labels) / (N,) before applying the weighted loss.
        flat_scores = scores.view(-1, self.model.config.num_labels)
        flat_targets = targets.view(-1)
        loss = nn.functional.cross_entropy(
            flat_scores,
            flat_targets,
            weight=class_weights_tensor,
            label_smoothing=0.05,
        )

        if not return_outputs:
            return loss
        return (loss, outputs)

def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with keys ``accuracy`` and ``macro_f1``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = (predicted == gold).mean()
    macro_f1 = f1_score(gold, predicted, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Pretrained backbone with a new classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,

    weight_decay=0.01,             # Flexible decay
    warmup_ratio=0.2,
    lr_scheduler_type="cosine",

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint per epoch of a large model can exhaust the Colab disk; keep
    # only the most recent ones (the best checkpoint is still retained because
    # load_best_model_at_end=True).
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # fp16 mixed precision needs a GPU; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    report_to="none",
    dataloader_num_workers=2,
    seed=SEED
)

trainer = ProTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    # Stop once dev macro-F1 has not improved for 6 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
)

print(f"Starting Training with LR={LR}, Warmup=0.2, Cosine...")
trainer.train()

print("\n" + "="*40)
print("COMPUTING FINAL HELD-OUT TEST SCORE")
print("="*40)

# Score the restored best checkpoint on the split that was never used for model selection.
test_results = trainer.evaluate(held_out_test_ds)
print(f"\n FINAL TEST SET MACRO F1: {test_results['eval_macro_f1']:.4f}")

if "test" in dataset:
    print("\nGenerating submission.csv...")
    comp_test_ds = comp_test_ds.map(tokenize_fn, batched=True)
    if "index" not in comp_test_ds.column_names:
        # add_column is documented to take a list or array, so materialise the range.
        comp_test_ds = comp_test_ds.add_column("index", list(range(len(comp_test_ds))))

    comp_preds = trainer.predict(comp_test_ds)
    pred_ids = np.argmax(comp_preds.predictions, axis=-1)
    # Map predicted class ids back to label names (plain ints as dict keys).
    pred_labels = [id2label[int(p)] for p in pred_ids]

    # Predictions are assembled in memory only; this run does not write a file.
    out_df = pd.DataFrame({
        "index": comp_test_ds["index"],
        "evasion_label": pred_labels
    })

Loading Data...
Preprocessing...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]



Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Training with LR=9e-06, Warmup=0.2, Cosine...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.225367,0.199357,0.117574
2,No log,1.598595,0.581994,0.373642
3,No log,1.539939,0.581994,0.466785
4,No log,1.483558,0.553055,0.478754
5,No log,1.490345,0.598071,0.496041
6,1.717800,1.5415,0.630225,0.513451
7,1.717800,1.575139,0.601286,0.516627
8,1.717800,1.646242,0.636656,0.568506
9,1.717800,1.702231,0.598071,0.559208
10,1.717800,1.724672,0.617363,0.579802



COMPUTING FINAL HELD-OUT TEST SCORE



 FINAL TEST SET MACRO F1: 0.5379

Generating submission.csv...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

In [None]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files
# Single seed reused for every RNG below, for both dataset splits and for the Trainer.
SEED = 42

# Seed Python, NumPy and PyTorch (CPU, plus every CUDA device when one is present).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Also seed through the transformers helper; an ImportError is ignored on purpose.
try:
    from transformers import set_seed
    set_seed(SEED)
except ImportError:
    pass

# Request deterministic cuDNN kernels and turn off cuDNN benchmarking.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Checkpoint to fine-tune, directory handed to TrainingArguments, and the
# token length every example is padded/truncated to in tokenize_fn.
MODEL_NAME = "microsoft/deberta-v3-large"
OUTPUT_DIR = "./deberta_v3_pro_results"
MAX_LEN = 512
# Effective batch per device = BATCH_SIZE * GRAD_ACCUMULATION = 16.
BATCH_SIZE = 4
GRAD_ACCUMULATION = 4
# Layer-wise LR decay: the head trains at LR_HEAD and each group further from
# the head is scaled by another factor of LR_DECAY.
LR_HEAD = 1e-5
LR_DECAY = 0.9
EPOCHS = 10

print("Loading Data...")
# Loads the dataset from the Hugging Face Hub; the code below uses its "train" and (if present) "test" splits.
dataset = load_dataset("ailsntua/QEvasion")

def preprocess_text(example):
    """Turn one dataset row into the prompt string fed to the tokenizer.

    Args:
        example: Mapping with ``question``, ``interview_answer`` and
            ``evasion_label`` keys; ``clarity_label`` is optional.

    Returns:
        Dict with the formatted ``text`` and the unchanged ``evasion_label``.
        A missing or ``None`` clarity label is written as ``Unknown``.
    """
    raw_clarity = example.get('clarity_label', 'Unknown')
    context = raw_clarity if raw_clarity is not None else "Unknown"
    question, answer = example['question'], example['interview_answer']
    prompt = f"Context: {context} | Question: {question} Answer: {answer}"
    return {"text": prompt, "evasion_label": example["evasion_label"]}

print("Preprocessing...")
# Add the formatted "text" column to the labelled data and, if present, the competition test split.
full_data = dataset["train"].map(preprocess_text)
if "test" in dataset:
    comp_test_ds = dataset["test"].map(preprocess_text)

# Convert the string labels to a ClassLabel feature so the splits can be stratified on it.
full_data = full_data.class_encode_column("evasion_label")

# First split: hold out 10% as a test set that is never used for model selection.
split1 = full_data.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_dev_ds = split1["train"]
held_out_test_ds = split1["test"]

# Second split: 10% of the remainder becomes the dev set evaluated every epoch.
split2 = train_dev_ds.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_ds = split2["train"]
eval_ds = split2["test"]

# Label-name <-> integer-id maps, in the order of the ClassLabel feature.
labels = train_ds.features["evasion_label"].names
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(examples):
    """Tokenize a batch's ``text`` column, padded/truncated to ``MAX_LEN`` tokens."""
    fixed_length = dict(padding="max_length", truncation=True, max_length=MAX_LEN)
    encoded = tokenizer(examples["text"], **fixed_length)
    return encoded

# Tokenize the three labelled splits in batches.
train_ds = train_ds.map(tokenize_fn, batched=True)
eval_ds = eval_ds.map(tokenize_fn, batched=True)
held_out_test_ds = held_out_test_ds.map(tokenize_fn, batched=True)

# Copy the encoded label into a "labels" column, the key ProTrainer.compute_loss reads.
train_ds = train_ds.map(lambda x: {"labels": x["evasion_label"]})
eval_ds = eval_ds.map(lambda x: {"labels": x["evasion_label"]})
held_out_test_ds = held_out_test_ds.map(lambda x: {"labels": x["evasion_label"]})
# The model is loaded here, before the optimizer, because the layer-wise
# learning-rate-decay (LLRD) optimizer is built from its named parameters.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# Implements layer-wise learning-rate decay (LLRD)
def get_optimizer_grouped_parameters(model, lr_head, lr_decay):
    """Build optimizer parameter groups with layer-wise learning-rate decay.

    Every parameter of ``model`` is placed in exactly one group:

    * head (name contains ``classifier`` or ``pooler``): ``lr_head``
    * encoder layer ``i`` (name contains ``encoder.layer.{i}.``):
      ``lr_head * lr_decay ** (num_layers - i)``
    * everything else (embeddings and any backbone parameter that sits
      outside the numbered layers, e.g. an encoder-level LayerNorm):
      ``lr_head * lr_decay ** (num_layers + 1)``

    Args:
        model: Object exposing ``named_parameters()`` and
            ``config.num_hidden_layers``.
        lr_head: Learning rate of the classification head.
        lr_decay: Multiplicative decay applied per layer, moving away from
            the head.

    Returns:
        List of ``{"params": [...], "lr": float}`` dicts ordered head first,
        then encoder layers from top to bottom, then the embedding group.
    """
    head_markers = ('classifier', 'pooler')
    num_layers = model.config.num_hidden_layers

    head_params = []
    params_by_layer = {layer_i: [] for layer_i in range(num_layers)}
    embedding_params = []

    # One pass with mutually exclusive buckets: no parameter can be dropped
    # from the optimizer (and silently stay frozen) or land in two groups
    # (which torch optimizers reject).
    for name, param in model.named_parameters():
        if any(marker in name for marker in head_markers):
            head_params.append(param)
            continue
        layer_i = next(
            (i for i in range(num_layers) if f"encoder.layer.{i}." in name),
            None,
        )
        if layer_i is not None:
            params_by_layer[layer_i].append(param)
        else:
            # Embeddings plus any parameter that matches no pattern above get
            # the lowest learning rate.
            embedding_params.append(param)

    opt_parameters = [{"params": head_params, "lr": lr_head}]

    # Walk from the layer nearest the head down to layer 0, decaying as we go.
    for layer_i in range(num_layers - 1, -1, -1):
        lr_layer = lr_head * (lr_decay ** (num_layers - layer_i))
        opt_parameters.append({"params": params_by_layer[layer_i], "lr": lr_layer})

    lr_embed = lr_head * (lr_decay ** (num_layers + 1))
    opt_parameters.append({"params": embedding_params, "lr": lr_embed})

    return opt_parameters

# Build AdamW by hand so each parameter group keeps its own learning rate;
# lr=LR_HEAD is only the default for groups that do not set "lr" themselves.
optimizer_grouped_parameters = get_optimizer_grouped_parameters(model, LR_HEAD, LR_DECAY)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LR_HEAD, weight_decay=0.01)

# 'balanced' class weights computed from the training split only.
# NOTE(review): classes come from np.unique(y_train), so the weight vector only
# matches num_labels if every class occurs in the training split.
y_train = train_ds["evasion_label"]
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
# Float tensor placed on the GPU when one is available; used as the loss weight in ProTrainer.
class_weights_tensor = torch.tensor(
    class_weights,
    dtype=torch.float
).to("cuda" if torch.cuda.is_available() else "cpu")

class ProTrainer(Trainer):
    """Trainer whose loss is class-weighted, label-smoothed cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and compute the weighted loss.

        Args:
            model: Model to call with ``**inputs``.
            inputs: Batch dict; its ``labels`` entry supplies the targets.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        scores = outputs.get("logits")
        num_classes = self.model.config.num_labels
        # Weighted by the module-level class weights, with label smoothing of 0.1.
        loss = nn.functional.cross_entropy(
            scores.view(-1, num_classes),
            targets.view(-1),
            weight=class_weights_tensor,
            label_smoothing=0.1,
        )
        if return_outputs:
            return loss, outputs
        return loss

def compute_metrics(eval_pred):
    """Compute the dev/test metrics reported after each evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with ``accuracy`` and ``macro_f1``; the latter is the metric the
        training arguments in this file use to pick the best checkpoint.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    matches = predicted == gold
    return dict(
        accuracy=matches.mean(),
        macro_f1=f1_score(gold, predicted, average="macro"),
    )

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Placeholder only: the learning rates come from the custom optimizer's
    # parameter groups, which is passed to the Trainer below.
    learning_rate=0.0,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint per epoch of a large model can exhaust the Colab disk; keep
    # only the most recent ones (the best checkpoint is still retained because
    # load_best_model_at_end=True).
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # fp16 mixed precision needs a GPU; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    report_to="none",
    dataloader_num_workers=2,
    seed=SEED
)

trainer = ProTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    # Custom LLRD optimizer; None lets the Trainer create the scheduler itself.
    optimizers=(optimizer, None),
    # Stop once dev macro-F1 has not improved for 5 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

print(f"Starting Training with LLRD (Head LR={LR_HEAD}, Decay={LR_DECAY})...")
trainer.train()
print("\n" + "="*40)
print("COMPUTING FINAL HELD-OUT TEST SCORE")
print("="*40)

# Score the restored best checkpoint on the split that was never used for model selection.
test_results = trainer.evaluate(held_out_test_ds)
print(f"\n FINAL TEST SET MACRO F1: {test_results['eval_macro_f1']:.4f}")

if "test" in dataset:
    print("\nGenerating submission.csv...")
    comp_test_ds = comp_test_ds.map(tokenize_fn, batched=True)
    if "index" not in comp_test_ds.column_names:
        # add_column is documented to take a list or array, so materialise the range.
        comp_test_ds = comp_test_ds.add_column("index", list(range(len(comp_test_ds))))

    comp_preds = trainer.predict(comp_test_ds)
    pred_ids = np.argmax(comp_preds.predictions, axis=-1)
    # Map predicted class ids back to label names (plain ints as dict keys).
    pred_labels = [id2label[int(p)] for p in pred_ids]

    out_df = pd.DataFrame({
        "index": comp_test_ds["index"],
        "evasion_label": pred_labels
    })
    # Write the submission and trigger a browser download from Colab.
    out_df.to_csv("submission_pro.csv", index=False)
    files.download("submission_pro.csv")

Loading Data...
Preprocessing...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]



Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Training with LLRD (Head LR=1e-05, Decay=0.9)...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.330458,0.453376,0.291736
2,No log,1.906703,0.553055,0.330486
3,2.170600,1.867629,0.553055,0.406208
4,2.170600,1.823107,0.55627,0.442891
5,2.170600,1.80436,0.569132,0.480734
6,1.818100,1.77742,0.604502,0.518958
7,1.818100,1.755295,0.588424,0.529833
8,1.818100,1.759881,0.601286,0.543755
9,1.643700,1.761659,0.607717,0.549634
10,1.643700,1.76338,0.614148,0.555846



COMPUTING FINAL HELD-OUT TEST SCORE



 FINAL TEST SET MACRO F1: 0.5395

Generating submission.csv...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

This is the best run I had for seeing how to fine-tune the model.

In [None]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files

# Set random seed for reproducibility: one seed reused for every RNG below,
# for both dataset splits and for the Trainer.
SEED = 777
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Also seed through the transformers helper; an ImportError is ignored on purpose.
try:
    from transformers import set_seed
    set_seed(SEED)
except ImportError:
    pass

# Request deterministic cuDNN kernels and turn off cuDNN benchmarking.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Configuration: checkpoint to fine-tune, Trainer output directory, fixed token
# length, and optimisation hyper-parameters (effective batch per device = 4 * 4 = 16).
MODEL_NAME = "microsoft/deberta-v3-large"
OUTPUT_DIR = "./deberta_v3_pro_results"
MAX_LEN = 512
BATCH_SIZE = 4
GRAD_ACCUMULATION = 4
LR = 8e-6
EPOCHS = 15

print("Fetching and prepping dataset...")
# Loads the dataset from the Hugging Face Hub; the code below uses its "train" and (if present) "test" splits.
dataset = load_dataset("ailsntua/QEvasion")

# Formatting each example
def build_text(example):
    """Format one dataset row as the prompt string fed to the tokenizer.

    Args:
        example: Mapping with ``question``, ``interview_answer`` and
            ``evasion_label`` keys; ``clarity_label`` is optional.

    Returns:
        Dict with the formatted ``text`` and the unchanged ``evasion_label``.
        A missing, ``None`` or otherwise falsy clarity label is written as
        ``Unknown``.
    """
    context = example.get('clarity_label')
    if not context:
        context = 'Unknown'
    question = example['question']
    answer = example['interview_answer']
    prompt = f"Context: {context} | Question: {question} Answer: {answer}"
    return {"text": prompt, "evasion_label": example["evasion_label"]}

# Add the formatted "text" column to the labelled data and, if present, the competition test split.
processed_data = dataset["train"].map(build_text)
if "test" in dataset:
    test_dataset = dataset["test"].map(build_text)

# Convert the string labels to a ClassLabel feature so the splits can be stratified on it.
processed_data = processed_data.class_encode_column("evasion_label")

# Splits: 10% held-out test first, then 10% of the remainder as the validation set.
split_1 = processed_data.train_test_split(test_size=0.1, seed=SEED, stratify_by_column="evasion_label")
train_dev = split_1["train"]
test_holdout = split_1["test"]

split_2 = train_dev.train_test_split(test_size=0.1, seed=SEED, stratify_by_column="evasion_label")
train_set = split_2["train"]
val_set = split_2["test"]

# Label-name <-> integer-id maps, in the order of the ClassLabel feature.
label_names = train_set.features["evasion_label"].names
label2id = {label: idx for idx, label in enumerate(label_names)}
id2label = {idx: label for label, idx in label2id.items()}

# Tokenizer matching the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(example):
    """Tokenize the ``text`` field, padded/truncated to exactly ``MAX_LEN`` tokens."""
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize the three labelled splits in batches.
train_set = train_set.map(tokenize, batched=True)
val_set = val_set.map(tokenize, batched=True)
test_holdout = test_holdout.map(tokenize, batched=True)

# Copy the encoded label into a "labels" column, the key WeightedTrainer.compute_loss reads.
train_set = train_set.map(lambda x: {"labels": x["evasion_label"]})
val_set = val_set.map(lambda x: {"labels": x["evasion_label"]})
test_holdout = test_holdout.map(lambda x: {"labels": x["evasion_label"]})

# 'balanced' class weights computed from the training split only, stored as a
# float32 tensor on the GPU when one is available.
# NOTE(review): classes come from np.unique(y), so the weight vector only
# matches num_labels if every class occurs in the training split.
y = train_set["evasion_label"]
weights = compute_class_weight("balanced", classes=np.unique(y), y=y)
weight_tensor = torch.tensor(weights, dtype=torch.float32).to("cuda" if torch.cuda.is_available() else "cpu")

# Custom Trainer
class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted, label-smoothed cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and compute the weighted loss.

        Args:
            model: Model to call with ``**inputs``.
            inputs: Batch dict; its ``labels`` entry supplies the targets.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)

        # Prefer the ``logits`` attribute; otherwise take the element at
        # index 1 of the returned sequence.
        scores = outputs.logits if hasattr(outputs, "logits") else outputs[1]

        loss = nn.functional.cross_entropy(
            scores.view(-1, self.model.config.num_labels),
            targets.view(-1),
            weight=weight_tensor,
            label_smoothing=0.1,
        )

        if return_outputs:
            return loss, outputs
        return loss

# Evaluation metrics
def metrics(pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Pair ``(logits, labels)``.

    Returns:
        Dict with keys ``accuracy`` and ``macro_f1``.
    """
    scores, gold = pred
    predicted = np.argmax(scores, axis=-1)
    matches = predicted == gold
    results = {"accuracy": matches.mean()}
    results["macro_f1"] = f1_score(gold, predicted, average="macro")
    return results

# Load model: pretrained backbone with a new classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)

# Training config
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,
    weight_decay=0.05,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",

    save_strategy="epoch",
    # A checkpoint per epoch of a large model can exhaust the Colab disk; keep
    # only the most recent ones (the best checkpoint is still retained because
    # load_best_model_at_end=True).
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # fp16 mixed precision needs a GPU; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    report_to="none",
    dataloader_num_workers=2,
    seed=SEED
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=metrics,
    # Stop once validation macro-F1 has not improved for 6 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
)

trainer.train()

# Final test evaluation on the split that was never used for model selection.
print("\n======================")
print("Evaluating on test set")
print("======================")
final_scores = trainer.evaluate(test_holdout)
print(f"Test Set Macro F1: {final_scores['eval_macro_f1']:.4f}")

# Optional: prepare submission
if "test" in dataset:
    print("Generating predictions for competition test set...")
    test_dataset = test_dataset.map(tokenize, batched=True)

    # Fall back to positional ids when the split has no "index" column.
    if "index" not in test_dataset.column_names:
        test_dataset = test_dataset.add_column("index", list(range(len(test_dataset))))

    pred_output = trainer.predict(test_dataset)
    pred_ids = np.argmax(pred_output.predictions, axis=-1)
    # Map predicted class ids back to label names (plain ints as dict keys).
    pred_labels = [id2label[int(idx)] for idx in pred_ids]

    submission_df = pd.DataFrame({
        "index": test_dataset["index"],
        "evasion_label": pred_labels
    })

    # Write the submission and trigger a browser download from Colab.
    submission_df.to_csv("submission_pro.csv", index=False)
    files.download("submission_pro.csv")

Fetching and prepping dataset...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]



Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training begins... fingers crossed 


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.932442,0.569132,0.310894
2,No log,1.820891,0.581994,0.368565
3,2.079100,1.75085,0.594855,0.513794
4,2.079100,1.747446,0.572347,0.514343
5,2.079100,1.706125,0.672026,0.57971
6,1.689300,1.786313,0.62701,0.570905
7,1.689300,1.794326,0.66881,0.630692
8,1.689300,1.814512,0.675241,0.64182
9,1.378800,1.941438,0.630225,0.601879
10,1.378800,1.979787,0.662379,0.630296



Evaluating on test set


Test Set Macro F1: 0.6166
Generating predictions for competition test set...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>