In [None]:
!pip install datasets scikit-learn accelerate torch pandas setfit sentence-transformers

Traceback (most recent call last):
object address  : 0x7f0b3d827640
object refcount : 3
object type     : 0xa2a4e0
object type name: KeyboardInterrupt
object repr     : KeyboardInterrupt()
lost sys.stderr
^C


In [None]:
!pip uninstall -y transformers
!pip install --upgrade --force-reinstall --no-cache-dir transformers
!pip install transformers[torch]

Found existing installation: transformers 4.57.3
Uninstalling transformers-4.57.3:
  Successfully uninstalled transformers-4.57.3
Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m171.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging>=20.0 (from transformers)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from tr



In [None]:
#!/usr/bin/env python3

import os
import sys

# Import everything needed from transformers up front; when that fails, print
# step-by-step recovery instructions before re-raising the original error.
try:
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        EarlyStoppingCallback,
        Trainer,
        TrainingArguments,
        set_seed,
    )
except ImportError as e:
    _rule = "=" * 80
    _help_lines = [
        _rule,
        "ERROR: Transformers import failed!",
        _rule,
        "QUICK FIX - Run these commands in Colab cells (in order):",
        "",
        "Cell 1:",
        "  !pip uninstall -y transformers",
        "  !pip install --upgrade --force-reinstall --no-cache-dir transformers",
        "  !pip install transformers[torch]",
        "",
        "Cell 2:",
        "  !pip install datasets scikit-learn accelerate torch pandas setfit sentence-transformers",
        "",
        "THEN: Runtime -> Restart runtime",
        "THEN: Run this script again",
        _rule,
    ]
    print("\n".join(_help_lines))
    raise e

import random
import numpy as np
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    classification_report
)
from datasets import load_dataset
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

# Switch Weights & Biases logging off through both environment variables.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_MODE": "disabled"})

# Tokenizer length cap and per-device batch size shared by the transformer sections.
MAX_LEN = 512
BATCH_SIZE = 4

# Seed Python, NumPy and PyTorch (CPU and, when present, CUDA) with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Section banner, then load the `ailsntua/QEvasion` dataset via `datasets.load_dataset`.
print("=" * 80, "LOADING AND PREPROCESSING DATA", "=" * 80, sep="\n")

dataset = load_dataset("ailsntua/QEvasion")

def preprocess_text(example):
    """Build the single text field fed to every model from one dataset row.

    Args:
        example: Mapping with ``question``, ``interview_answer`` and
            ``evasion_label`` keys, and optionally ``clarity_label``.

    Returns:
        dict: ``{"text": ..., "evasion_label": ...}`` where ``text`` is
        ``"Context: <clarity> | Question: <question> Answer: <answer>"`` and a
        missing or ``None`` clarity label is rendered as ``"Unknown"``.
    """
    # NOTE(review): the clarity label is embedded in the model input. If it is an
    # annotation related to the evasion label, this leaks target information into
    # the features - confirm this is intended.
    context = example.get("clarity_label")
    if context is None:
        context = "Unknown"
    combined = "Context: {} | Question: {} Answer: {}".format(
        context,
        example["question"],
        example["interview_answer"],
    )
    return {"text": combined, "evasion_label": example["evasion_label"]}

# Build the model input text, then encode the string labels as a ClassLabel
# feature so the splits below can be stratified on it.
full_data = dataset["train"].map(preprocess_text)
full_data = full_data.class_encode_column("evasion_label")

# First split: hold out 10% as the final test set.
split1 = full_data.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_dev_ds = split1["train"]
held_out_test_ds = split1["test"]

# Second split: carve 10% of the remainder off as the validation set.
split2 = train_dev_ds.train_test_split(
    test_size=0.1,
    seed=SEED,
    stratify_by_column="evasion_label"
)
train_ds = split2["train"]
eval_ds = split2["test"]

# Every row must land in exactly one of the three splits.
assert len(train_ds) + len(eval_ds) + len(held_out_test_ds) == len(full_data)

# Class names in id order, plus the two lookup tables handed to the models.
labels = train_ds.features["evasion_label"].names
label2id = {name: i for i, name in enumerate(labels)}
id2label = {i: name for name, i in label2id.items()}

print(f"Train size: {len(train_ds)}")
print(f"Eval size: {len(eval_ds)}")
print(f"Held-out test size: {len(held_out_test_ds)}")
print(f"Number of classes: {len(labels)}")
print(f"Labels: {labels}")

# Read the two needed columns directly instead of materialising every row.
test_texts = list(held_out_test_ds["text"])
test_labels = list(held_out_test_ds["evasion_label"])

def compute_all_metrics(y_true, y_pred, labels):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Class names. Not used by the computation; kept so existing
            call sites keep working.

    Returns:
        dict: keys ``macro_f1``, ``accuracy``, ``macro_precision`` and
        ``macro_recall``.
    """
    # Same settings for all three macro metrics: a class with no predictions or
    # no true samples scores 0 without emitting a warning.
    macro_kwargs = {"average": "macro", "zero_division": 0}
    return {
        "macro_f1": f1_score(y_true, y_pred, **macro_kwargs),
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_precision": precision_score(y_true, y_pred, **macro_kwargs),
        "macro_recall": recall_score(y_true, y_pred, **macro_kwargs),
    }

def print_results(model_name, metrics):
    """Print a framed four-line metric report for one model.

    Args:
        model_name: Name shown in the ``RESULTS:`` header line.
        metrics: Mapping with ``macro_f1``, ``accuracy``, ``macro_precision``
            and ``macro_recall`` values; each is printed with four decimals.
    """
    rule = "=" * 80
    report_rows = (
        ("Macro F1 Score:", "macro_f1"),
        ("Accuracy:", "accuracy"),
        ("Macro Precision:", "macro_precision"),
        ("Macro Recall:", "macro_recall"),
    )

    print(f"\n{rule}")
    print(f"RESULTS: {model_name}")
    print(rule)
    for caption, key in report_rows:
        # Captions are left-padded to 20 columns so the values line up.
        print(f"{caption:<20}{metrics[key]:.4f}")
    print(f"{rule}\n")

# Baseline: TF-IDF features with a class-balanced logistic regression.
print("\n" + "="*80)
print("TRAINING: Logistic Regression (TF-IDF)")
print("="*80)

# Read the two needed columns directly instead of materialising every row.
train_texts = list(train_ds["text"])
train_labels = list(train_ds["evasion_label"])

# Unigrams and bigrams, English stop words removed, vocabulary capped at 40k terms.
vectorizer = TfidfVectorizer(
    max_features=40000,
    ngram_range=(1, 2),
    stop_words="english"
)

# The vocabulary is fitted on the training texts only; test texts are only transformed.
X_train_vec = vectorizer.fit_transform(train_texts)
X_test_vec = vectorizer.transform(test_texts)

# `multi_class` is intentionally not passed: "auto" is the default, and the
# parameter is deprecated in recent scikit-learn releases.
lr_model = LogisticRegression(
    max_iter=2500,
    class_weight="balanced",
    random_state=SEED
)

print("Training Logistic Regression...")
lr_model.fit(X_train_vec, train_labels)

lr_preds = lr_model.predict(X_test_vec)
lr_metrics = compute_all_metrics(test_labels, lr_preds, labels)
print_results("Logistic Regression (TF-IDF)", lr_metrics)

print("\n" + "="*80)
print("TRAINING: DeBERTa-v3-large (Best Run)")
print("="*80)

def train_deberta():
    """Fine-tune DeBERTa-v3-large on the evasion labels and predict the held-out split.

    Reads the module-level ``train_ds``, ``eval_ds``, ``held_out_test_ds``,
    ``labels``, ``id2label``, ``label2id``, ``MAX_LEN`` and ``BATCH_SIZE``.
    Training uses a class-weighted, label-smoothed cross-entropy loss, evaluates
    and checkpoints every epoch, restores the checkpoint with the best
    validation macro-F1, and stops early after 4 epochs without improvement.

    Returns:
        tuple: ``(test_pred_ids, trainer, test_tokenized)`` - argmax class ids
        for the held-out split, the fitted trainer, and the tokenized held-out
        split.

    Raises:
        ValueError: If some class never occurs in the training split, so the
            per-class loss weights cannot cover every label.
    """
    MODEL_NAME = "microsoft/deberta-v3-large"
    OUTPUT_DIR = "./deberta_v3_best"

    DEBERTA_SEED = 777
    DEBERTA_LR = 8e-6
    DEBERTA_EPOCHS = 15
    DEBERTA_GRAD_ACCUM = 4

    # set_seed reseeds Python, NumPy and PyTorch (CPU and CUDA) in one call.
    set_seed(DEBERTA_SEED)

    # Mixed precision is only requested when a GPU is present, so the CPU
    # fallback used for the class weights below also applies to training.
    use_cuda = torch.cuda.is_available()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize_fn(examples):
        # Tokenize a batch and attach the `labels` column the Trainer expects,
        # avoiding a second row-by-row pass over each split.
        encoded = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN
        )
        encoded["labels"] = examples["evasion_label"]
        return encoded

    train_tokenized = train_ds.map(tokenize_fn, batched=True)
    eval_tokenized = eval_ds.map(tokenize_fn, batched=True)
    test_tokenized = held_out_test_ds.map(tokenize_fn, batched=True)

    # Inverse-frequency class weights computed from the training labels.
    y_train = np.asarray(list(train_tokenized["evasion_label"]))
    present_classes = np.unique(y_train)
    if len(present_classes) != len(labels):
        raise ValueError(
            f"Training split contains {len(present_classes)} of {len(labels)} classes; "
            "cannot build one loss weight per class."
        )
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=present_classes,
        y=y_train
    )
    class_weights_tensor = torch.tensor(
        class_weights,
        dtype=torch.float
    ).to("cuda" if use_cuda else "cpu")

    class WeightedTrainer(Trainer):
        """Trainer whose loss is class-weighted cross-entropy with label smoothing."""

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            targets = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fct = nn.CrossEntropyLoss(
                # Follow the logits' device rather than assuming where the model lives.
                weight=class_weights_tensor.to(logits.device),
                label_smoothing=0.1
            )
            loss = loss_fct(
                logits.view(-1, self.model.config.num_labels),
                targets.view(-1)
            )
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(eval_pred):
        # Validation metrics; `macro_f1` is the one used for model selection.
        logits, gold = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {
            "accuracy": (predictions == gold).mean(),
            "macro_f1": f1_score(gold, predictions, average="macro")
        }

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=DEBERTA_LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE * 2,
        gradient_accumulation_steps=DEBERTA_GRAD_ACCUM,
        num_train_epochs=DEBERTA_EPOCHS,
        weight_decay=0.05,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        fp16=use_cuda,
        report_to="none",
        seed=DEBERTA_SEED,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=eval_tokenized,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
    )

    print(f"Training DeBERTa-v3-large with BEST config: SEED={DEBERTA_SEED}, LR={DEBERTA_LR}...")
    trainer.train()

    test_preds = trainer.predict(test_tokenized)
    test_pred_ids = np.argmax(test_preds.predictions, axis=-1)

    return test_pred_ids, trainer, test_tokenized

# Train DeBERTa, keep its trainer and tokenized test split for later cells, and report test metrics.
deberta_preds, deberta_trainer, deberta_test_tokenized = train_deberta()
deberta_metrics = compute_all_metrics(test_labels, deberta_preds, labels)
print_results("DeBERTa-v3-large (Best Run)", deberta_metrics)

# Banner for the next section.
print(f"\n{'=' * 80}", "TRAINING: RoBERTa-large", "=" * 80, sep="\n")

def train_roberta():
    """Fine-tune RoBERTa-large on the evasion labels and predict the held-out split.

    Reads the module-level ``train_ds``, ``eval_ds``, ``held_out_test_ds``,
    ``labels``, ``id2label``, ``label2id``, ``MAX_LEN`` and ``BATCH_SIZE``.
    Training uses a class-weighted, label-smoothed cross-entropy loss, evaluates
    and checkpoints every epoch, restores the checkpoint with the best
    validation macro-F1, and stops early after 4 epochs without improvement.

    Returns:
        tuple: ``(test_pred_ids, trainer)`` - argmax class ids for the held-out
        split and the fitted trainer.

    Raises:
        ValueError: If some class never occurs in the training split, so the
            per-class loss weights cannot cover every label.
    """
    MODEL_NAME = "roberta-large"
    OUTPUT_DIR = "./roberta_large_final"

    ROBERTA_SEED = 42
    ROBERTA_LR = 1e-5
    ROBERTA_EPOCHS = 15
    ROBERTA_GRAD_ACCUM = 8

    # set_seed reseeds Python, NumPy and PyTorch (CPU and CUDA) in one call.
    set_seed(ROBERTA_SEED)

    # Mixed precision is only requested when a GPU is present, so the CPU
    # fallback used for the class weights below also applies to training.
    use_cuda = torch.cuda.is_available()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize_fn(examples):
        # Tokenize a batch and attach the `labels` column the Trainer expects,
        # avoiding a second row-by-row pass over each split.
        encoded = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN
        )
        encoded["labels"] = examples["evasion_label"]
        return encoded

    train_tokenized = train_ds.map(tokenize_fn, batched=True)
    eval_tokenized = eval_ds.map(tokenize_fn, batched=True)
    test_tokenized = held_out_test_ds.map(tokenize_fn, batched=True)

    # Inverse-frequency class weights computed from the training labels.
    y_train = np.asarray(list(train_tokenized["evasion_label"]))
    present_classes = np.unique(y_train)
    if len(present_classes) != len(labels):
        raise ValueError(
            f"Training split contains {len(present_classes)} of {len(labels)} classes; "
            "cannot build one loss weight per class."
        )
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=present_classes,
        y=y_train
    )
    class_weights_tensor = torch.tensor(
        class_weights,
        dtype=torch.float32
    ).to("cuda" if use_cuda else "cpu")

    class WeightedTrainer(Trainer):
        """Trainer whose loss is class-weighted cross-entropy with label smoothing."""

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            targets = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fct = nn.CrossEntropyLoss(
                # Follow the logits' device rather than assuming where the model lives.
                weight=class_weights_tensor.to(logits.device),
                label_smoothing=0.1
            )
            loss = loss_fct(
                logits.view(-1, self.model.config.num_labels),
                targets.view(-1)
            )
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(eval_pred):
        # Validation metrics; `macro_f1` is the one used for model selection.
        logits, gold = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": (preds == gold).mean(),
            "macro_f1": f1_score(gold, preds, average="macro")
        }

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=ROBERTA_LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE * 2,
        gradient_accumulation_steps=ROBERTA_GRAD_ACCUM,
        num_train_epochs=ROBERTA_EPOCHS,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        fp16=use_cuda,
        report_to="none",
        dataloader_num_workers=2,
        seed=ROBERTA_SEED,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=eval_tokenized,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
    )

    print(f"Training RoBERTa-large with BEST config: SEED={ROBERTA_SEED}, LR={ROBERTA_LR}, grad_accum={ROBERTA_GRAD_ACCUM}...")
    trainer.train()

    test_preds = trainer.predict(test_tokenized)
    test_pred_ids = np.argmax(test_preds.predictions, axis=-1)

    return test_pred_ids, trainer

# Train RoBERTa, keep its trainer for later cells, and report held-out test metrics.
roberta_preds, roberta_trainer = train_roberta()
roberta_metrics = compute_all_metrics(test_labels, roberta_preds, labels)
print_results("RoBERTa-large", roberta_metrics)

# Banner for the next section.
print(f"\n{'=' * 80}", "TRAINING: SetFit (Paraphrase-Mpnet-Base-V2)", "=" * 80, sep="\n")

# SetFit is optional: when the package cannot be imported the section is skipped
# and `setfit_metrics` / `setfit_preds` are set to None for the summary below.
try:
    from setfit import SetFitModel, Trainer as SetFitTrainer, TrainingArguments as SetFitTrainingArguments

    def train_setfit():
        """Train a SetFit classifier on the training split and predict the held-out split.

        Reads the module-level ``train_ds``, ``held_out_test_ds`` and ``labels``.

        Returns:
            list[int]: Predicted class ids for the held-out split, using the
            same id order as ``labels``.

        Raises:
            ValueError: If the model predicts a label name that is not in
                ``labels``.
        """
        MODEL_NAME = "sentence-transformers/paraphrase-mpnet-base-v2"

        from datasets import Dataset

        train_df = train_ds.to_pandas()
        test_df = held_out_test_ds.to_pandas()

        # Only the text and the integer label are handed to SetFit.
        train_ds_setfit = Dataset.from_pandas(train_df[["text", "evasion_label"]])

        # Pass the class names in id order so label names and ids stay aligned
        # with the rest of the notebook.
        model = SetFitModel.from_pretrained(
            MODEL_NAME,
            labels=list(labels)
        )

        args = SetFitTrainingArguments(
            num_iterations=5,
            batch_size=32,
            num_epochs=15,
            body_learning_rate=2e-5,
            head_learning_rate=1e-2,
            # Mixed precision is only requested when a GPU is present.
            use_amp=torch.cuda.is_available(),
            report_to=[],
        )

        trainer = SetFitTrainer(
            model=model,
            args=args,
            train_dataset=train_ds_setfit,
            column_mapping={"text": "text", "evasion_label": "label"},
        )

        print("Training SetFit...")
        trainer.train()

        test_texts_list = test_df["text"].tolist()
        test_preds = model.predict(test_texts_list)

        # Convert predictions to class ids. Label names are looked up; anything
        # else is taken to already be a class id. An unknown name is an error
        # rather than being silently counted as class 0.
        label2id_map = {name: idx for idx, name in enumerate(labels)}
        test_pred_ids = []
        for pred in test_preds:
            if isinstance(pred, str):
                if pred not in label2id_map:
                    raise ValueError(f"SetFit predicted an unknown label: {pred!r}")
                test_pred_ids.append(label2id_map[pred])
            else:
                test_pred_ids.append(int(pred))

        return test_pred_ids

    setfit_preds = train_setfit()
    setfit_metrics = compute_all_metrics(test_labels, setfit_preds, labels)
    print_results("SetFit (Paraphrase-Mpnet-Base-V2)", setfit_metrics)

except ImportError:
    print("SetFit not available. Install with: pip install setfit")
    setfit_metrics = None
    setfit_preds = None

# Banner for the XLNet section.
print(f"\n{'=' * 80}", "TRAINING: XLNet-large", "=" * 80, sep="\n")

def train_xlnet():
    """Fine-tune XLNet-large on the evasion labels and predict the held-out split.

    Reads the module-level ``train_ds``, ``eval_ds``, ``held_out_test_ds``,
    ``labels``, ``id2label``, ``label2id``, ``MAX_LEN`` and ``BATCH_SIZE``.
    Training uses a class-weighted, label-smoothed cross-entropy loss, evaluates
    and checkpoints every epoch, restores the checkpoint with the best
    validation macro-F1, and stops early after 3 epochs without improvement.

    Returns:
        numpy.ndarray: Argmax class ids for the held-out split.

    Raises:
        ValueError: If some class never occurs in the training split, so the
            per-class loss weights cannot cover every label.
    """
    MODEL_NAME = "xlnet-large-cased"
    OUTPUT_DIR = "./xlnet_large"

    XLNET_SEED = 42
    XLNET_LR = 1e-5
    XLNET_EPOCHS = 10
    XLNET_GRAD_ACCUM = 4

    # set_seed reseeds Python, NumPy and PyTorch (CPU and CUDA) in one call.
    set_seed(XLNET_SEED)

    # Mixed precision is only requested when a GPU is present, so the CPU
    # fallback used for the class weights below also applies to training.
    use_cuda = torch.cuda.is_available()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.padding_side = "left"

    def tokenize_fn(examples):
        # Tokenize a batch and attach the `labels` column the Trainer expects,
        # avoiding a second row-by-row pass over each split.
        encoded = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN
        )
        encoded["labels"] = examples["evasion_label"]
        return encoded

    train_tokenized = train_ds.map(tokenize_fn, batched=True)
    eval_tokenized = eval_ds.map(tokenize_fn, batched=True)
    test_tokenized = held_out_test_ds.map(tokenize_fn, batched=True)

    # Inverse-frequency class weights computed from the training labels.
    y_train = np.asarray(list(train_tokenized["evasion_label"]))
    present_classes = np.unique(y_train)
    if len(present_classes) != len(labels):
        raise ValueError(
            f"Training split contains {len(present_classes)} of {len(labels)} classes; "
            "cannot build one loss weight per class."
        )
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=present_classes,
        y=y_train
    )
    class_weights_tensor = torch.tensor(
        class_weights,
        dtype=torch.float
    ).to("cuda" if use_cuda else "cpu")

    class WeightedTrainer(Trainer):
        """Trainer whose loss is class-weighted cross-entropy with label smoothing."""

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            targets = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fct = nn.CrossEntropyLoss(
                # Follow the logits' device rather than assuming where the model lives.
                weight=class_weights_tensor.to(logits.device),
                label_smoothing=0.1
            )
            loss = loss_fct(
                logits.view(-1, self.model.config.num_labels),
                targets.view(-1)
            )
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(eval_pred):
        # Validation metrics; `macro_f1` is the one used for model selection.
        logits, gold = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": (preds == gold).mean(),
            "macro_f1": f1_score(gold, preds, average="macro")
        }

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=XLNET_LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE * 2,
        gradient_accumulation_steps=XLNET_GRAD_ACCUM,
        num_train_epochs=XLNET_EPOCHS,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=use_cuda,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        report_to="none",
        seed=XLNET_SEED,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=eval_tokenized,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # NOTE(review): the training loss previously logged for this model was several
    # times larger than its validation loss and than the other models' training
    # losses. That pattern suggests the Trainer skipped dividing the loss by
    # gradient_accumulation_steps because it treats the model's forward as
    # accepting loss kwargs. compute_loss above returns a plain per-micro-batch
    # mean, so the flag is forced off to restore that normalisation. Confirm
    # against the installed transformers version.
    trainer.model_accepts_loss_kwargs = False

    print(f"Training XLNet-large with BEST config: SEED={XLNET_SEED}, LR={XLNET_LR}, epochs={XLNET_EPOCHS}...")
    trainer.train()

    test_preds = trainer.predict(test_tokenized)
    test_pred_ids = np.argmax(test_preds.predictions, axis=-1)

    return test_pred_ids

# Train XLNet and report held-out test metrics.
xlnet_preds = train_xlnet()
xlnet_metrics = compute_all_metrics(test_labels, xlnet_preds, labels)
print_results("XLNet-large", xlnet_metrics)

# Banner for the next section.
print(f"\n{'=' * 80}", "CREATING: Ensemble (RoBERTa + DeBERTa)", "=" * 80, sep="\n")

def get_ensemble_predictions():
    """Average RoBERTa and DeBERTa class probabilities on the held-out split.

    Reads the module-level ``roberta_trainer``, ``deberta_trainer``,
    ``deberta_test_tokenized``, ``held_out_test_ds`` and ``MAX_LEN``. The
    held-out split is re-tokenized with the RoBERTa tokenizer; the DeBERTa
    inputs are reused as-is.

    Returns:
        numpy.ndarray: Argmax class ids of the equally weighted probability average.
    """
    tok = AutoTokenizer.from_pretrained("roberta-large")

    def _encode(batch):
        # Same padding / truncation settings as the training-time tokenization.
        return tok(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN
        )

    def _to_probs(scores):
        # Row-wise softmax; the row maximum is subtracted first for numerical stability.
        shifted = scores - np.max(scores, axis=-1, keepdims=True)
        weights = np.exp(shifted)
        return weights / np.sum(weights, axis=-1, keepdims=True)

    roberta_inputs = held_out_test_ds.map(_encode, batched=True)
    roberta_inputs = roberta_inputs.map(lambda row: {"labels": row["evasion_label"]})

    roberta_probs = _to_probs(roberta_trainer.predict(roberta_inputs).predictions)
    deberta_probs = _to_probs(deberta_trainer.predict(deberta_test_tokenized).predictions)

    averaged = (roberta_probs + deberta_probs) / 2.0
    return np.argmax(averaged, axis=-1)

# Score the ensemble on the held-out split.
ensemble_preds = get_ensemble_predictions()
ensemble_metrics = compute_all_metrics(test_labels, ensemble_preds, labels)
print_results("Ensemble (RoBERTa + DeBERTa)", ensemble_metrics)

print(f"\n{'=' * 80}", "FINAL SUMMARY - ALL MODELS", "=" * 80, sep="\n")

# (display name, metrics dict) per model, in report order; SetFit is only
# included when its section actually ran.
results = [
    ("Logistic Regression (TF-IDF)", lr_metrics),
    ("DeBERTa-v3-large (Best Run)", deberta_metrics),
    ("RoBERTa-large", roberta_metrics),
]
if setfit_metrics:
    results.append(("SetFit (Paraphrase-Mpnet-Base-V2)", setfit_metrics))
results += [
    ("XLNet-large", xlnet_metrics),
    ("Ensemble (RoBERTa + DeBERTa)", ensemble_metrics),
]

# Table column header -> metrics key; values are rendered with four decimals.
_summary_columns = (
    ("Macro F1", "macro_f1"),
    ("Accuracy", "accuracy"),
    ("Macro Precision", "macro_precision"),
    ("Macro Recall", "macro_recall"),
)
summary_data = [
    {"Model": name, **{header: f"{scores[key]:.4f}" for header, key in _summary_columns}}
    for name, scores in results
]

df_summary = pd.DataFrame(summary_data)
print("\n" + df_summary.to_string(index=False))
print(f"\n{'=' * 80}")

df_summary.to_csv("task2_all_models_results.csv", index=False)
print("\nResults saved to: task2_all_models_results.csv")

print(f"\n{'=' * 80}", "EVALUATION COMPLETE!", "=" * 80, sep="\n")

LOADING AND PREPROCESSING DATA


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]

Train size: 2792
Eval size: 311
Held-out test size: 345
Number of classes: 9
Labels: ['Claims ignorance', 'Clarification', 'Declining to answer', 'Deflection', 'Dodging', 'Explicit', 'General', 'Implicit', 'Partial/half-answer']

TRAINING: Logistic Regression (TF-IDF)
Training Logistic Regression...





RESULTS: Logistic Regression (TF-IDF)
Macro F1 Score:     0.4708
Accuracy:           0.5420
Macro Precision:    0.4679
Macro Recall:       0.4874


TRAINING: DeBERTa-v3-large (Best Run)




Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training DeBERTa-v3-large with BEST config: SEED=777, LR=8e-06...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.96824,0.517685,0.311372
2,No log,1.881637,0.549839,0.367892
3,2.087200,1.771602,0.575563,0.496934
4,2.087200,1.741801,0.62701,0.570789
5,2.087200,1.725636,0.62701,0.577772
6,1.684600,1.798261,0.649518,0.616613
7,1.684600,1.864015,0.630225,0.585424
8,1.684600,1.897147,0.62701,0.602864
9,1.369100,2.033954,0.620579,0.575462
10,1.369100,2.109884,0.610932,0.571622



RESULTS: DeBERTa-v3-large (Best Run)
Macro F1 Score:     0.5770
Accuracy:           0.6290
Macro Precision:    0.5911
Macro Recall:       0.5801


TRAINING: RoBERTa-large


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training RoBERTa-large with BEST config: SEED=42, LR=1e-05, grad_accum=8...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.327952,0.463023,0.241138
2,No log,1.925616,0.543408,0.305455
3,No log,1.869986,0.549839,0.406938
4,No log,1.748328,0.588424,0.545329
5,No log,1.762902,0.601286,0.547445
6,1.979000,1.747752,0.62701,0.574701
7,1.979000,1.761032,0.607717,0.558068
8,1.979000,1.732219,0.610932,0.58703
9,1.979000,1.809152,0.623794,0.585435
10,1.979000,1.84283,0.623794,0.589323



RESULTS: RoBERTa-large
Macro F1 Score:     0.5862
Accuracy:           0.6203
Macro Precision:    0.5922
Macro Recall:       0.5844


TRAINING: SetFit (Paraphrase-Mpnet-Base-V2)
SetFit not available. Install with: pip install setfit

TRAINING: XLNet-large


config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training XLNet-large with BEST config: SEED=42, LR=1e-05, epochs=10...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,2.00956,0.450161,0.268619
2,No log,1.897874,0.530547,0.360086
3,8.652900,1.853642,0.540193,0.371574
4,8.652900,1.75902,0.565916,0.48282
5,8.652900,1.721151,0.572347,0.508384
6,7.144900,1.733062,0.610932,0.556083
7,7.144900,1.735185,0.601286,0.554888
8,7.144900,1.746186,0.607717,0.568418
9,6.212600,1.749102,0.620579,0.565442
10,6.212600,1.763128,0.607717,0.569794



RESULTS: XLNet-large
Macro F1 Score:     0.5775
Accuracy:           0.6261
Macro Precision:    0.5931
Macro Recall:       0.5762


CREATING: Ensemble (RoBERTa + DeBERTa)


Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]


RESULTS: Ensemble (RoBERTa + DeBERTa)
Macro F1 Score:     0.5874
Accuracy:           0.6261
Macro Precision:    0.5959
Macro Recall:       0.5843


FINAL SUMMARY - ALL MODELS

                       Model Macro F1 Accuracy Macro Precision Macro Recall
Logistic Regression (TF-IDF)   0.4708   0.5420          0.4679       0.4874
 DeBERTa-v3-large (Best Run)   0.5770   0.6290          0.5911       0.5801
               RoBERTa-large   0.5862   0.6203          0.5922       0.5844
                 XLNet-large   0.5775   0.6261          0.5931       0.5762
Ensemble (RoBERTa + DeBERTa)   0.5874   0.6261          0.5959       0.5843


Results saved to: task2_all_models_results.csv

EVALUATION COMPLETE!
