In [1]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)
from datasets import load_dataset
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files


# --- Model / ensemble selection -------------------------------------------
MODEL_NAME = "microsoft/deberta-v3-large"
SEEDS = [42, 77, 777]  # one fine-tuning run per seed; test logits are averaged
OUTPUT_DIR_BASE = "./deberta_v3_ensemble"  # suffixed with "_seed<seed>" per run

# --- Tokenization -----------------------------------------------------------
MAX_LEN = 512  # max_length handed to the tokenizer (pad/truncate target)

# --- Optimization -----------------------------------------------------------
LR = 8e-6
EPOCHS = 13
BATCH_SIZE = 4  # per-device train batch size
GRAD_ACCUMULATION = 4  # gradient accumulation steps per optimizer update


def set_all_seeds(seed: int):
    """Seed the Python, NumPy, PyTorch and transformers RNGs with ``seed``.

    Also sets the cuDNN ``deterministic`` flag and turns cuDNN ``benchmark``
    off.

    Args:
        seed: Integer seed applied to every generator.
    """
    # The cuDNN flags do not depend on the seeding calls, so set them up front.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # CPU-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators (current device, then all devices) when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # transformers' own seeding helper.
    set_seed(seed)


# Seed once up front so everything that runs before the per-seed training loop
# starts from a fixed RNG state.
set_all_seeds(42)

# Load the QEvasion dataset by its Hub identifier; the code below reads its
# "train" and "test" splits.
dataset = load_dataset("ailsntua/QEvasion")

def preprocess_text(example):
    """Build the model input text for one dataset row.

    Args:
        example: Mapping with ``question`` and ``interview_answer`` keys and,
            optionally, ``clarity_label`` and ``evasion_label``.

    Returns:
        A dict with ``text`` (clarity label, question and answer joined into a
        single string) and ``evasion_label`` (the row's label, or ``-1`` when
        the key is absent).

    Raises:
        KeyError: If ``question`` or ``interview_answer`` is missing.
    """
    # A missing key and an explicit None are both rendered as "Unknown".
    clarity = example.get("clarity_label")
    if clarity is None:
        clarity = "Unknown"

    question = example["question"]
    answer = example["interview_answer"]

    return {
        "text": f"Context: {clarity} | Question: {question} Answer: {answer}",
        "evasion_label": example.get("evasion_label", -1),
    }

# Build the "text" / "evasion_label" columns for the training split.
train_ds = dataset["train"].map(preprocess_text)

# The ensemble predicts on the test split, so fail fast if it is absent.
if "test" not in dataset:
    raise ValueError("Test set not found in QEvasion dataset!")
test_ds = dataset["test"].map(preprocess_text)


# Encode the training labels as class ids and keep both directions of the
# name <-> id mapping for the model config and for decoding predictions.
train_ds = train_ds.class_encode_column("evasion_label")
labels = train_ds.features["evasion_label"].names
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    """Tokenize the ``text`` column of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to ``MAX_LEN`` tokens.

    Args:
        batch: Mapping whose ``text`` entry holds the strings to tokenize.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize both splits in batches.
train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)


def _copy_label(example):
    """Expose the encoded ``evasion_label`` under the ``labels`` key."""
    return {"labels": example["evasion_label"]}


# set labels
train_ds = train_ds.map(_copy_label)

# class weights: "balanced" weighting over the classes present in training
y_train = train_ds["evasion_label"]
class_weights = compute_class_weight(
    y=y_train,
    classes=np.unique(y_train),
    class_weight="balanced",
)

# Create the weight tensor directly on the GPU when one exists, else on CPU.
class_weights_tensor = torch.tensor(
    class_weights,
    dtype=torch.float,
    device="cuda" if torch.cuda.is_available() else "cpu",
)


class ProTrainer(Trainer):
    """Trainer whose loss is class-weighted, label-smoothed cross-entropy.

    Per-class weights are read from the module-level ``class_weights_tensor``;
    label smoothing is fixed at 0.1.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Mapping of model inputs; must contain ``labels``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            num_items_in_batch: Accepted in the signature but not used by this
                implementation.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is
            true.

        Raises:
            ValueError: If ``inputs`` has no ``labels`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("ProTrainer.compute_loss requires 'labels' in inputs.")

        # Run the forward pass without labels: the model's built-in (unweighted)
        # loss is not needed, and without it the first element of a tuple
        # output is the logits. `inputs` itself is left unmodified.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)

        # Dict-like outputs expose "logits"; plain tuples are indexed instead.
        logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]

        loss_fct = nn.CrossEntropyLoss(
            # Follow the logits' device rather than the one picked at import time.
            weight=class_weights_tensor.to(logits.device),
            label_smoothing=0.1,
        )
        # Take the class count from the logits so this also works when `model`
        # is a wrapper that does not expose `.config`.
        loss = loss_fct(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss


import gc  # local to this script section: used to reclaim memory between seeds

# Test-set logits collected from each seed's model, in SEEDS order.
all_test_logits = []

# fp16 mixed precision needs a CUDA device; fall back to full precision on
# CPU, matching the CPU fallback used for the class-weight tensor.
use_fp16 = torch.cuda.is_available()

for seed in SEEDS:
    # Drop the previous seed's model/trainer (and the optimizer state the
    # trainer holds) before loading a fresh model, so two large models are not
    # resident at once. The last seed's objects stay bound after the loop.
    model = trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"\n training for seed: {seed}")
    # Seed before model creation so the newly initialized classifier head
    # differs between seeds and is reproducible per seed.
    set_all_seeds(seed)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )

    training_args = TrainingArguments(
        output_dir=f"{OUTPUT_DIR_BASE}_seed{seed}",
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUMULATION,
        num_train_epochs=EPOCHS,

        weight_decay=0.05,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        eval_strategy="no",  # training on all data
        save_strategy="no",
        load_best_model_at_end=False,

        fp16=use_fp16,
        report_to="none",
        seed=seed,
        overwrite_output_dir=True,
    )

    trainer = ProTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
    )

    trainer.train()

    # Keep only the raw logits; they are averaged across seeds afterwards.
    test_preds = trainer.predict(test_ds)
    all_test_logits.append(test_preds.predictions)


# Ensemble: stack the per-seed logits along a new leading axis and average
# over it, then pick the highest-scoring class per test row.
all_test_logits = np.stack(all_test_logits, axis=0)
mean_logits = all_test_logits.mean(axis=0)

pred_ids = np.argmax(mean_logits, axis=-1)
# Convert NumPy integer ids to plain ints before the dict lookup.
pred_labels = [id2label[int(p)] for p in pred_ids]

# Fall back to positional row numbers when the split has no "index" column.
# add_column is documented to take a list or array, so materialize the range.
if "index" not in test_ds.column_names:
    test_ds = test_ds.add_column("index", list(range(len(test_ds))))

out_df = pd.DataFrame(
    {
        "index": test_ds["index"],
        "evasion_label": pred_labels,
    }
)

# Write the submission file and trigger a browser download from Colab.
out_df.to_csv("submission_ensemble.csv", index=False)
files.download("submission_ensemble.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/3448 [00:00<?, ? examples/s]


 training for seed: 42


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Step,Training Loss
500,2.1408
1000,1.7484
1500,1.5667
2000,1.3075
2500,1.153



 training for seed: 77


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,2.1028
1000,1.7535
1500,1.5023
2000,1.2654
2500,1.1218



 training for seed: 777


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,2.0984
1000,1.7179
1500,1.4683
2000,1.2242
2500,1.1123


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>