In [None]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import load_dataset, ClassLabel
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files

# Run configuration for the baseline fine-tuning cell.
MODEL_NAME = "microsoft/deberta-v3-large"  # checkpoint loaded for both tokenizer and model
OUTPUT_DIR = "./deberta_v3_results"        # passed to TrainingArguments(output_dir=...)
MAX_LEN = 512                              # tokenizer max_length (inputs are padded/truncated to this)
BATCH_SIZE = 4                             # per-device train batch size; eval uses BATCH_SIZE*2
GRAD_ACCUMULATION = 4                      # gradient_accumulation_steps
LR = 1e-5                                  # learning_rate
EPOCHS = 25                                # num_train_epochs (an early-stopping callback is also attached)


# Fetch the QEvasion dataset; later code reads its "train" split and,
# when present, its "test" split.
print("Loading Data...")
dataset = load_dataset("ailsntua/QEvasion")

def preprocess_text(example):
    """Build the single-string model input for one dataset row.

    The row's clarity label ("Unknown" when the key is absent or its value
    is None) is placed in front of the question and the interview answer.
    The row's evasion label is passed through unchanged.

    NOTE(review): the clarity label is fed to the model as an input
    feature - confirm it is available (and legitimate to use) for the rows
    you predict on.

    Returns:
        dict with keys "text" and "evasion_label".
    """
    clarity = example.get('clarity_label')
    if clarity is None:
        clarity = "Unknown"
    question = example['question']
    answer = example['interview_answer']
    return {
        "text": f"Context: {clarity} | Question: {question} Answer: {answer}",
        "evasion_label": example["evasion_label"],
    }

# Build the "text" field for every training row; the competition split gets
# the same treatment when the dataset provides one.
print("Preprocessing and Injecting Context...")
full_data = dataset["train"].map(preprocess_text)
if "test" in dataset:
    comp_test_ds = dataset["test"].map(preprocess_text)

# String labels -> integer class ids.
full_data = full_data.class_encode_column("evasion_label")

# Both splits use the same stratified 10% carve-out with a fixed seed.
_split_kwargs = {"test_size": 0.1, "seed": 42, "stratify_by_column": "evasion_label"}

# First carve-out: the held-out test set.
split1 = full_data.train_test_split(**_split_kwargs)
train_dev_ds, held_out_test_ds = split1["train"], split1["test"]

# Second carve-out (from what is left): the per-epoch evaluation set.
split2 = train_dev_ds.train_test_split(**_split_kwargs)
train_ds, eval_ds = split2["train"], split2["test"]

print(f"Data Split -> Train: {len(train_ds)} | Eval: {len(eval_ds)} | Held-Out Test: {len(held_out_test_ds)}")

# Name <-> id lookups taken from the encoded label feature.
labels = train_ds.features["evasion_label"].names
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(examples):
    """Tokenize a batch's "text" column, padded and truncated to MAX_LEN."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(examples["text"], **tokenizer_options)

def _copy_label(example):
    """Expose the encoded evasion label under the "labels" key."""
    return {"labels": example["evasion_label"]}


print("Tokenizing...")
train_ds, eval_ds, held_out_test_ds = [
    ds.map(tokenize_fn, batched=True)
    for ds in (train_ds, eval_ds, held_out_test_ds)
]

# Add a 'labels' column mirroring 'evasion_label' on every split.
train_ds, eval_ds, held_out_test_ds = [
    ds.map(_copy_label)
    for ds in (train_ds, eval_ds, held_out_test_ds)
]

# "balanced" class weights computed from the training labels only.
y_train = train_ds["evasion_label"]
_present_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=_present_classes, y=y_train)
_weights_device = "cuda" if torch.cuda.is_available() else "cpu"
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=_weights_device)

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights are read from the module-level
    ``class_weights_tensor``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping forwarded to ``model(**inputs)``; its
                "labels" entry supplies the targets.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: accepted to match the caller's signature;
                not used here.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # The weights were put on a device when they were created, which need not
        # be where this batch lives; follow the logits so the loss never mixes
        # devices.
        weights = class_weights_tensor.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and macro-averaged F1."""
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = (predicted_ids == labels).mean()
    macro_f1 = f1_score(labels, predicted_ids, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Sequence-classification model with one output per evasion label; the
# id/name maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(labels), id2label=id2label, label2id=label2id
)

# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the highest eval macro F1 is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",               # optimize F1 directly
    greater_is_better=True,
    fp16=True,
    report_to="none"
)

# WeightedTrainer swaps in the class-weighted loss; compute_metrics supplies
# the "macro_f1" value that model selection and early stopping watch.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # early stopping
)

print("Starting DeBERTa Training (Supercharged Mode)...")
trainer.train()


print("\n" + "="*40)
print("COMPUTING FINAL HELD-OUT TEST SCORE")
print("="*40)

# Score the model on the held-out split (simulating the competition); this
# split was never used for training or checkpoint selection.
test_results = trainer.evaluate(held_out_test_ds)
print(f"\n FINAL TEST SET MACRO F1: {test_results['eval_macro_f1']:.4f}")
print(f" FINAL TEST SET ACCURACY: {test_results['eval_accuracy']:.4f}")

# Generate the prediction file for the competition split, if there is one.
if "test" in dataset:
    print("\nGenerating submission.csv for competition...")
    comp_test_ds = comp_test_ds.map(tokenize_fn, batched=True)

    if "index" not in comp_test_ds.column_names:
        # add_column is documented to take a list or ndarray, so materialize
        # the range instead of passing the lazy object.
        comp_test_ds = comp_test_ds.add_column("index", list(range(len(comp_test_ds))))

    # predict
    comp_preds = trainer.predict(comp_test_ds)
    pred_ids = np.argmax(comp_preds.predictions, axis=-1)
    # argmax yields numpy integers; convert to plain ints for the dict lookup.
    pred_labels = [id2label[int(p)] for p in pred_ids]

    # save
    out_df = pd.DataFrame({"index": comp_test_ds["index"], "evasion_label": pred_labels})
    out_df.to_csv("submission.csv", index=False)
    print(" Downloading submission.csv...")
    files.download("submission.csv")

Loading Data...
Preprocessing and Injecting Context...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]

Data Split -> Train: 2792 | Eval: 311 | Held-Out Test: 345




Tokenizing...


Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting DeBERTa Training (Supercharged Mode)...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.268523,0.530547,0.354149
2,No log,1.216714,0.575563,0.369107
3,1.313900,1.083423,0.565916,0.460134
4,1.313900,1.114185,0.55627,0.479814
5,1.313900,1.124381,0.614148,0.518609
6,0.884200,1.143923,0.630225,0.564933
7,0.884200,1.161873,0.636656,0.580201
8,0.884200,1.385697,0.662379,0.600877
9,0.494200,1.539355,0.617363,0.553185
10,0.494200,1.739431,0.639871,0.600167



COMPUTING FINAL HELD-OUT TEST SCORE



 FINAL TEST SET MACRO F1: 0.5858
 FINAL TEST SET ACCURACY: 0.6348

Generating submission.csv for competition...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

 Downloading submission.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed,
)
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files


# seeding: one seed is applied to Python's random module, NumPy and PyTorch
# (CPU and, when available, CUDA) before any model or split is created.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# transformers' own seeding helper, called with the same seed.
set_seed(SEED)

# cuDNN determinism helps reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


#  config
MODEL_NAME = "microsoft/deberta-v3-large"  # checkpoint for tokenizer and model
OUTPUT_DIR = "./deberta_v3_pro_results"    # passed to TrainingArguments(output_dir=...)
MAX_LEN = 512                              # tokenizer max_length
BATCH_SIZE = 4                             # per-device train batch size; eval uses BATCH_SIZE * 2
GRAD_ACCUMULATION = 4                      # gradient_accumulation_steps
LR = 8e-6                                  # learning_rate
EPOCHS = 15                                # num_train_epochs (early stopping is also attached)


#  data: later code reads the "train" split and, when present, "test".
print("Loading Data...")
dataset = load_dataset("ailsntua/QEvasion")

def preprocess(example):
    """Build the single-string model input for one dataset row.

    The clarity label ("Unknown" when the key is absent or its value is
    None) is prefixed to the question and the interview answer; the row's
    evasion label is passed through unchanged.

    NOTE(review): the clarity label is fed to the model as an input
    feature - confirm it is available (and legitimate to use) for the rows
    you predict on.

    Returns:
        dict with keys "text" and "evasion_label".
    """
    clarity = example.get("clarity_label")
    if clarity is None:
        clarity = "Unknown"
    question = example['question']
    answer = example['interview_answer']
    return {
        "text": f"Context: {clarity} | Question: {question} Answer: {answer}",
        "evasion_label": example["evasion_label"],
    }

print("Preprocessing...")
full_data = dataset["train"].map(preprocess)
if "test" in dataset:
    comp_test_ds = dataset["test"].map(preprocess)

# String labels -> integer class ids.
full_data = full_data.class_encode_column("evasion_label")

# train / dev / held-out split: the same stratified, seeded 10% carve-out is
# applied twice, first for the held-out set and then for the dev set.
_split_kwargs = dict(test_size=0.1, seed=SEED, stratify_by_column="evasion_label")

split1 = full_data.train_test_split(**_split_kwargs)
train_dev_ds, held_out_test_ds = split1["train"], split1["test"]

split2 = train_dev_ds.train_test_split(**_split_kwargs)
train_ds, eval_ds = split2["train"], split2["test"]

# Name <-> id lookups taken from the encoded label feature.
labels = train_ds.features["evasion_label"].names
label2id = {label_name: idx for idx, label_name in enumerate(labels)}
id2label = {idx: label_name for label_name, idx in label2id.items()}


#  tokenization
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    """Tokenize a batch's "text" column, padded and truncated to MAX_LEN."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(batch["text"], **tokenizer_options)

def _copy_label(example):
    """Expose the encoded evasion label under the "labels" key."""
    return {"labels": example["evasion_label"]}


# Tokenize every split, then mirror 'evasion_label' into a 'labels' column.
train_ds, eval_ds, held_out_test_ds = [
    ds.map(tokenize_fn, batched=True)
    for ds in (train_ds, eval_ds, held_out_test_ds)
]
train_ds, eval_ds, held_out_test_ds = [
    ds.map(_copy_label)
    for ds in (train_ds, eval_ds, held_out_test_ds)
]


#  class weights: "balanced" weights computed from the training labels only
y_train = train_ds["evasion_label"]
_present_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=_present_classes, y=y_train)
_weights_device = "cuda" if torch.cuda.is_available() else "cpu"
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=_weights_device)


#  trainer
class ProTrainer(Trainer):
    """Trainer whose loss is class-weighted, label-smoothed cross-entropy.

    The per-class weights are read from the module-level
    ``class_weights_tensor``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted, label-smoothed cross-entropy for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping forwarded to ``model(**inputs)``; its
                "labels" entry supplies the targets.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: accepted to match the caller's signature;
                not used here.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The weights were put on a device when they were created, which need not
        # be where this batch lives; follow the logits so the loss never mixes
        # devices.
        loss_fct = nn.CrossEntropyLoss(
            weight=class_weights_tensor.to(logits.device),
            label_smoothing=0.1,
        )
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss


def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and macro-averaged F1."""
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = (predicted_ids == labels).mean()
    macro_f1 = f1_score(labels, predicted_ids, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}


# Sequence-classification model with one output per evasion label; the
# id/name maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)


# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the highest eval macro F1 is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,

    # regularization and learning-rate schedule
    weight_decay=0.05,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    # evaluation, checkpointing and model selection
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    fp16=True,
    report_to="none",

    # same seed as the global seeding at the top of the cell
    seed=SEED,
)


# ProTrainer swaps in the weighted, label-smoothed loss; compute_metrics
# supplies the "macro_f1" value that model selection and early stopping watch.
trainer = ProTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

print("Starting DeBERTa Training (Pro Mode, Seeded)...")
trainer.train()


# evaluation: score the model on the held-out split, which was never used
# for training or checkpoint selection.
print("\n" + "=" * 40)
print("COMPUTING FINAL HELD-OUT TEST SCORE")
print("=" * 40)

test_results = trainer.evaluate(held_out_test_ds)
print(f"\nFINAL TEST SET MACRO F1: {test_results['eval_macro_f1']:.4f}")


#  submission: predict the competition split, if the dataset provides one.
if "test" in dataset:
    print("\nGenerating submission_pro.csv...")
    comp_test_ds = comp_test_ds.map(tokenize_fn, batched=True)
    if "index" not in comp_test_ds.column_names:
        # add_column is documented to take a list or ndarray, so materialize
        # the range instead of passing the lazy object.
        comp_test_ds = comp_test_ds.add_column("index", list(range(len(comp_test_ds))))

    comp_preds = trainer.predict(comp_test_ds)
    pred_ids = np.argmax(comp_preds.predictions, axis=-1)
    # argmax yields numpy integers; convert to plain ints for the dict lookup.
    pred_labels = [id2label[int(p)] for p in pred_ids]

    out_df = pd.DataFrame(
        {
            "index": comp_test_ds["index"],
            "evasion_label": pred_labels,
        }
    )
    out_df.to_csv("submission_pro.csv", index=False)
    files.download("submission_pro.csv")


Loading Data...
Preprocessing...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]



Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting DeBERTa Training (Pro Mode, Seeded)...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.943651,0.508039,0.319641
2,No log,1.91677,0.549839,0.272324
3,2.096900,1.821929,0.578778,0.445404
4,2.096900,1.778823,0.598071,0.515468
5,2.096900,1.744286,0.610932,0.538181
6,1.694000,1.82727,0.659164,0.574685
7,1.694000,1.835448,0.639871,0.594995
8,1.694000,1.923713,0.643087,0.611393
9,1.403000,2.009198,0.655949,0.616455
10,1.403000,2.14336,0.636656,0.597195



COMPUTING FINAL HELD-OUT TEST SCORE



 FINAL TEST SET MACRO F1: 0.6259

Generating submission.csv...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Set seed run below:

In [None]:
!pip install -q transformers datasets scikit-learn accelerate torch pandas

import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import load_dataset
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from google.colab import files
# Implements global seeding: one seed is applied to Python's random module,
# NumPy and PyTorch (CPU and, when available, CUDA).
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Best-effort call to transformers' own seeding helper; an ImportError for
# the helper is deliberately ignored.
try:
    from transformers import set_seed
    set_seed(SEED)
except ImportError:
    pass

# cuDNN determinism
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

MODEL_NAME = "microsoft/deberta-v3-large"  # checkpoint for tokenizer and model
# NOTE(review): same output directory as the previous run in this notebook -
# confirm that sharing checkpoints between the two runs is intended.
OUTPUT_DIR = "./deberta_v3_pro_results"
MAX_LEN = 512                              # tokenizer max_length
BATCH_SIZE = 4                             # per-device train batch size; eval uses BATCH_SIZE*2
GRAD_ACCUMULATION = 4                      # gradient_accumulation_steps

# Slightly higher LR to help convergence / minority classes
LR = 1e-5
EPOCHS = 15                                # num_train_epochs (early stopping is also attached)

# Later code reads the "train" split and, when present, "test".
print("Loading Data...")
dataset = load_dataset("ailsntua/QEvasion")

def preprocess_text(example):
    """Build the four-line prompt used as model input for one dataset row.

    The lines are: clarity label ("Unknown" when the key is absent or its
    value is None), question, interview answer, and a fixed task sentence.
    The row's evasion label is passed through unchanged.

    NOTE(review): the task sentence sits at the very end of the text, so it
    is presumably the first thing dropped when long inputs are truncated
    during tokenization - confirm that is acceptable.

    Returns:
        dict with keys "text" and "evasion_label".
    """
    clarity = example.get('clarity_label')
    if clarity is None:
        clarity = "Unknown"
    question = example['question']
    answer = example['interview_answer']

    prompt_lines = [
        f"Clarity label: {clarity}.",
        f"Question: {question}",
        f"Answer: {answer}",
        "Task: classify the evasion strategy used in the answer.",
    ]
    return {
        "text": "\n".join(prompt_lines),
        "evasion_label": example["evasion_label"],
    }

print("Preprocessing...")
full_data = dataset["train"].map(preprocess_text)
if "test" in dataset:
    comp_test_ds = dataset["test"].map(preprocess_text)

# String labels -> integer class ids.
full_data = full_data.class_encode_column("evasion_label")

# Double split: the same stratified, seeded 10% carve-out is applied twice,
# first for the held-out test set and then for the dev set.
_split_kwargs = dict(test_size=0.1, seed=SEED, stratify_by_column="evasion_label")

split1 = full_data.train_test_split(**_split_kwargs)
train_dev_ds, held_out_test_ds = split1["train"], split1["test"]

split2 = train_dev_ds.train_test_split(**_split_kwargs)
train_ds, eval_ds = split2["train"], split2["test"]

print("Train size:", len(train_ds))
print("Dev size:", len(eval_ds))
print("Held-out test size:", len(held_out_test_ds))

# Name <-> id lookups taken from the encoded label feature.
labels = train_ds.features["evasion_label"].names
label2id = {label_name: idx for idx, label_name in enumerate(labels)}
id2label = {idx: label_name for label_name, idx in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(examples):
    """Tokenize a batch's "text" column, padded and truncated to MAX_LEN."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(examples["text"], **tokenizer_options)

def _copy_label(example):
    """Expose the encoded evasion label under the "labels" key."""
    return {"labels": example["evasion_label"]}


# Tokenize every split, then mirror 'evasion_label' into a 'labels' column.
train_ds, eval_ds, held_out_test_ds = [
    ds.map(tokenize_fn, batched=True)
    for ds in (train_ds, eval_ds, held_out_test_ds)
]
train_ds, eval_ds, held_out_test_ds = [
    ds.map(_copy_label)
    for ds in (train_ds, eval_ds, held_out_test_ds)
]

# "balanced" class weights computed from the training labels only.
y_train = train_ds["evasion_label"]
_present_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=_present_classes, y=y_train)
_weights_device = "cuda" if torch.cuda.is_available() else "cpu"
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=_weights_device)

class ProTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights are read from the module-level
    ``class_weights_tensor``; label smoothing is explicitly set to 0.0.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping forwarded to ``model(**inputs)``; its
                "labels" entry supplies the targets.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: accepted to match the caller's signature;
                not used here.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # The weights were put on a device when they were created, which need not
        # be where this batch lives; follow the logits so the loss never mixes
        # devices.
        loss_fct = nn.CrossEntropyLoss(
            weight=class_weights_tensor.to(logits.device),
            label_smoothing=0.0
        )

        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and macro-averaged F1."""
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = (predicted_ids == labels).mean()
    macro_f1 = f1_score(labels, predicted_ids, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}


# Sequence-classification model with one output per evasion label; the
# id/name maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the highest eval macro F1 is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=EPOCHS,

    weight_decay=0.05,             # Higher decay = less overfitting
    warmup_ratio=0.1,              # Warmup prevents early confusion
    lr_scheduler_type="cosine",    # Smooth curve = better convergence
    max_grad_norm=1.0,             # Gradient clipping for stability
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    fp16=True,
    report_to="none",

    # same seed as the global seeding at the top of the cell
    seed=SEED
)

# ProTrainer swaps in the class-weighted loss; compute_metrics supplies the
# "macro_f1" value that model selection and early stopping watch.
trainer = ProTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)] # increased patience
)

print("Starting DeBERTa Training (Pro Mode, Seeded)...")
trainer.train()

print("\n" + "="*40)
print("COMPUTING FINAL HELD-OUT TEST SCORE")
print("="*40)

# Score the model on the held-out split, which was never used for training
# or checkpoint selection.
test_results = trainer.evaluate(held_out_test_ds)
print(f"\n FINAL TEST SET MACRO F1: {test_results['eval_macro_f1']:.4f}")
print(f"   FINAL TEST SET ACCURACY: {test_results['eval_accuracy']:.4f}")

print("\nPer-class performance on dev set:")
dev_preds = trainer.predict(eval_ds)
y_true = dev_preds.label_ids
y_pred = np.argmax(dev_preds.predictions, axis=-1)
# Pin the class ids explicitly so target_names stays aligned even when a
# class is absent from both the dev labels and the predictions, and report
# 0 (without a warning) for classes that were never predicted.
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(len(labels))),
    target_names=labels,
    zero_division=0,
))


# Predict the competition split, if the dataset provides one.
if "test" in dataset:
    print("\nGenerating submission_pro.csv...")
    comp_test_ds = comp_test_ds.map(tokenize_fn, batched=True)
    if "index" not in comp_test_ds.column_names:
        # add_column is documented to take a list or ndarray, so materialize
        # the range instead of passing the lazy object.
        comp_test_ds = comp_test_ds.add_column("index", list(range(len(comp_test_ds))))

    comp_preds = trainer.predict(comp_test_ds)
    pred_ids = np.argmax(comp_preds.predictions, axis=-1)
    # argmax yields numpy integers; convert to plain ints for the dict lookup.
    pred_labels = [id2label[int(p)] for p in pred_ids]

    out_df = pd.DataFrame({
        "index": comp_test_ds["index"],
        "evasion_label": pred_labels
    })
    out_df.to_csv("submission_pro.csv", index=False)
    files.download("submission_pro.csv")


Loading Data...
Preprocessing...


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3448 [00:00<?, ? examples/s]

Train size: 2792
Dev size: 311
Held-out test size: 345




Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting DeBERTa Training (Pro Mode, Seeded)...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.34849,0.549839,0.383592
2,No log,1.276027,0.553055,0.285075
3,1.472500,1.196895,0.55627,0.377871
4,1.472500,1.149883,0.588424,0.48556
5,1.472500,1.028996,0.565916,0.495564
6,1.004000,1.023135,0.620579,0.547306
7,1.004000,1.13101,0.655949,0.578387
8,1.004000,1.182748,0.62701,0.571967
9,0.607200,1.308887,0.630225,0.557543
10,0.607200,1.494811,0.639871,0.568193



COMPUTING FINAL HELD-OUT TEST SCORE



 FINAL TEST SET MACRO F1: 0.5505
   FINAL TEST SET ACCURACY: 0.6261

Per-class performance on dev set:
                     precision    recall  f1-score   support

   Claims ignorance       0.82      0.82      0.82        11
      Clarification       0.78      0.88      0.82         8
Declining to answer       0.83      0.77      0.80        13
         Deflection       0.55      0.32      0.41        34
            Dodging       0.64      0.56      0.60        64
           Explicit       1.00      1.00      1.00        95
            General       0.32      0.69      0.44        35
           Implicit       0.39      0.27      0.32        44
Partial/half-answer       0.00      0.00      0.00         7

           accuracy                           0.66       311
          macro avg       0.59      0.59      0.58       311
       weighted avg       0.67      0.66      0.65       311


Generating submission_pro.csv...


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>