# Stage 03: ALBERT & DeBERTa (Part 2)

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

import torch
import torch.nn as nn

import optuna
from optuna.pruners import MedianPruner

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
)

# One seed shared by the data split, the Trainer arguments and `set_seed`.
SEED = 42
set_seed(SEED)

# Every path is resolved against the current working directory,
# so the notebook has to be run from the project root.
ROOT = Path(".").resolve()
TRAIN_PATH = ROOT.joinpath("data", "processed", "pcl_task1_train.csv")
DEV_PATH = ROOT.joinpath("data", "processed", "pcl_task1_dev.csv")

# Per-trial folders, the final model and the study database go under here.
OUTPUT_DIR = ROOT.joinpath("runs", "optuna_task1")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

STUDY_NAME = "pcl_task1_binary"
STUDY_DB = str(OUTPUT_DIR.joinpath("optuna_pcl_task1.db"))  # sqlite db file

In [None]:
# Columns used by the rest of the notebook; anything else in the CSVs is dropped.
keep_cols = ["par_id", "text", "label_bin"]


def _read_split(csv_path):
    """Load one processed CSV, keep `keep_cols`, and cast the label to int."""
    frame = pd.read_csv(csv_path)[keep_cols].copy()
    frame["label_bin"] = frame["label_bin"].astype(int)
    return frame


train_df = _read_split(TRAIN_PATH)
dev_df = _read_split(DEV_PATH)

# Quick sanity check: dataset sizes and the label balance of the training set.
print(train_df.shape, dev_df.shape)
print(train_df["label_bin"].value_counts())

In [None]:
# Hold out 15% of the training data for validation, keeping the label
# ratio identical in both parts (stratified on `label_bin`).
train_split, val_split = train_test_split(
    train_df,
    stratify=train_df["label_bin"],
    test_size=0.15,
    random_state=SEED,
)

for split_tag, split_frame in (("train:", train_split), ("val  :", val_split)):
    print(split_tag, split_frame["label_bin"].value_counts().to_dict())

# Untokenized datasets; tokenization happens later, per model and max_length.
ds_train_raw, ds_val_raw, ds_dev_raw = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_split, val_split, dev_df)
)

In [None]:
def compute_metrics_from_logits(logits, labels, threshold=0.5):
    """Convert single-logit binary predictions into classification metrics.

    Args:
        logits: Raw model scores, one per example, shaped ``(N,)`` or ``(N, 1)``.
        labels: Gold binary labels (0/1), one per example.
        threshold: Sigmoid probability at or above which an example is
            predicted positive.

    Returns:
        dict with ``f1``, ``precision``, ``recall`` and ``accuracy``, the
        per-class accuracies ``acc_nonpcl`` (class 0) and ``acc_pcl``
        (class 1), and the confusion counts ``tp``, ``tn``, ``fp``, ``fn``
        as plain ints.
    """
    # Flatten instead of squeeze: squeezing a single-example batch yields a
    # 0-d array, which the sklearn metrics below reject.
    scores = np.asarray(logits, dtype=np.float64).reshape(-1)
    y = np.asarray(labels).reshape(-1).astype(int)

    # exp() overflows to inf for very negative logits; the resulting
    # probability (0.0) is still correct, so only the warning is silenced.
    with np.errstate(over="ignore"):
        probs = 1.0 / (1.0 + np.exp(-scores))
    preds = (probs >= threshold).astype(int)

    f1  = f1_score(y, preds, zero_division=0)
    p   = precision_score(y, preds, zero_division=0)
    r   = recall_score(y, preds, zero_division=0)
    acc = accuracy_score(y, preds)

    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent.
    cm = confusion_matrix(y, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    # The epsilon keeps the ratio defined when a class has no examples.
    acc0 = tn / (tn + fp + 1e-12)
    acc1 = tp / (tp + fn + 1e-12)

    return {
        "f1": f1,
        "precision": p,
        "recall": r,
        "accuracy": acc,
        "acc_nonpcl": acc0,
        "acc_pcl": acc1,
        "tp": int(tp), "tn": int(tn), "fp": int(fp), "fn": int(fn),
    }

def compute_metrics(eval_pred):
    """Trainer metric hook: score an (logits, labels) pair at threshold 0.5."""
    raw_scores, gold = eval_pred
    return compute_metrics_from_logits(raw_scores, gold, threshold=0.5)

In [None]:
def make_tokenized_datasets(model_name: str, max_length: int):
    """Tokenize the raw train/val/dev datasets for one model.

    Each dataset is tokenized with truncation and padding to ``max_length``,
    its ``label_bin`` column is renamed to ``labels``, and every column other
    than ``input_ids``, ``attention_mask`` and ``labels`` is dropped.

    Args:
        model_name: Hub identifier used to load the tokenizer.
        max_length: Fixed sequence length for truncation and padding.

    Returns:
        Tuple ``(tokenizer, ds_train, ds_val, ds_dev)``.
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model_columns = ("input_ids", "attention_mask", "labels")

    def tok_fn(batch):
        return tok(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    def prepare(raw_ds):
        ds = raw_ds.map(tok_fn, batched=True).rename_column("label_bin", "labels")
        # Build the drop list from the *renamed* dataset: listing the old
        # "label_bin" name would make remove_columns fail on a missing column.
        extra = [c for c in ds.column_names if c not in model_columns]
        return ds.remove_columns(extra)

    return tok, prepare(ds_train_raw), prepare(ds_val_raw), prepare(ds_dev_raw)

In [None]:
class WeightedBCETrainer(Trainer):
    """Trainer that fits a single-logit model with positively-weighted BCE.

    Args:
        pos_weight: Weight applied to the positive class in
            ``BCEWithLogitsLoss`` (tensor or float), or ``None`` for an
            unweighted loss. All other arguments go to ``Trainer`` unchanged.
    """

    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted BCE loss for one batch.

        ``**kwargs`` absorbs extra keyword arguments that the base Trainer
        may pass (e.g. ``num_items_in_batch`` in newer transformers
        releases); they are not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        # (B, 1) -> (B,); computed in float32 so the loss is not evaluated
        # in half precision under mixed-precision training.
        logits = outputs.logits.squeeze(-1).float()

        # The weight is created on CPU by the caller; it must sit on the same
        # device as the logits or the loss raises a device-mismatch error.
        pos_weight = self.pos_weight
        if pos_weight is not None:
            pos_weight = torch.as_tensor(
                pos_weight, dtype=logits.dtype, device=logits.device
            )

        # BCEWithLogits with pos_weight
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
def objective(trial: optuna.Trial):
    """Optuna objective: fine-tune one sampled configuration, return val F1.

    Samples the model name and training hyperparameters from ``trial``,
    trains a single-logit classifier with ``WeightedBCETrainer`` on the
    train split, and evaluates it on the validation split.

    After every evaluation the validation F1 is reported to Optuna so the
    study's pruner can stop unpromising trials early.

    Args:
        trial: The Optuna trial supplying the hyperparameters.

    Returns:
        ``eval_f1`` on the validation split for the best checkpoint.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial.
    """
    # Function-scope import: the callback base class is only needed here.
    from transformers import TrainerCallback

    class _PruningCallback(TrainerCallback):
        """Report each evaluation's F1 to Optuna and prune when asked to."""

        def __init__(self, optuna_trial):
            self._trial = optuna_trial
            self._n_reports = 0

        def on_evaluate(self, args, state, control, metrics=None, **kwargs):
            score = (metrics or {}).get("eval_f1")
            if score is None:
                return
            # Own counter as the step: one report per evaluation pass.
            self._n_reports += 1
            self._trial.report(float(score), step=self._n_reports)
            if self._trial.should_prune():
                raise optuna.TrialPruned()

    # NOTE: keep the suggest_* calls in this order and with these names;
    # the persisted study stores parameters under exactly these keys.
    model_name = trial.suggest_categorical(
        "model_name",
        [
            "microsoft/deberta-v3-base",
            "microsoft/deberta-v3-small",
            "albert-base-v2",
            "albert-large-v2",
        ],
    )

    lr          = trial.suggest_float("lr", 5e-6, 5e-5, log=True)
    batch_size  = trial.suggest_categorical("batch_size", [8, 16, 32])
    weight_decay= trial.suggest_float("weight_decay", 0.0, 0.1)
    warmup_ratio= trial.suggest_float("warmup_ratio", 0.0, 0.15)
    max_length  = trial.suggest_categorical("max_length", [96, 128, 192, 256])
    epochs      = trial.suggest_categorical("epochs", [2, 3, 4])
    grad_accum  = trial.suggest_categorical("grad_accum", [1, 2, 4])
    pos_scale   = trial.suggest_categorical("pos_weight_scale", [0.75, 1.0, 1.25, 1.5, 2.0])

    tok, ds_train, ds_val, _ = make_tokenized_datasets(model_name, max_length)

    # base pos_weight = neg/pos on the train split, then scaled by the
    # sampled factor; max(..., 1) guards against a split with no positives.
    y = np.array(train_split["label_bin"].values, dtype=int)
    n_pos = (y == 1).sum()
    n_neg = (y == 0).sum()
    base_pos_weight = (n_neg / max(n_pos, 1))
    pos_weight = torch.tensor(base_pos_weight * pos_scale, dtype=torch.float)

    cfg = AutoConfig.from_pretrained(model_name)
    cfg.num_labels = 1  # single logit
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=cfg)

    # One output folder per trial, named after the trial number.
    trial_dir = OUTPUT_DIR / f"trial_{trial.number:04d}"
    trial_dir.mkdir(parents=True, exist_ok=True)

    args = TrainingArguments(
        output_dir=str(trial_dir),
        seed=SEED,
        data_seed=SEED,

        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum,

        num_train_epochs=epochs,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,

        # NOTE(review): newer transformers releases spell this
        # `eval_strategy` -- confirm against the installed version.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",

        # Keep the checkpoint with the highest validation F1.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,

        save_total_limit=1,

        fp16=torch.cuda.is_available(),   # safe; ignored on MPS/CPU
        report_to="none",
    )

    pruning_cb = _PruningCallback(trial)

    trainer = WeightedBCETrainer(
        model=model,
        args=args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tok,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1), pruning_cb],
        pos_weight=pos_weight,
    )

    trainer.train()

    # The final scoring pass must not be reported or pruned: training is
    # already finished, so detach the pruning callback first.
    trainer.remove_callback(pruning_cb)

    metrics = trainer.evaluate(ds_val)
    # Optuna optimizes this:
    return metrics["eval_f1"]

In [None]:
# tweakable:
N_TRIALS = 20

# The study is persisted in a SQLite file so it can be resumed:
# with load_if_exists=True an existing study of the same name is reused.
storage_url = "sqlite:///" + STUDY_DB
median_pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=0)

study = optuna.create_study(
    direction="maximize",
    study_name=STUDY_NAME,
    storage=storage_url,
    pruner=median_pruner,
    load_if_exists=True,
)

print("Existing trials:", len(study.trials))

# Run N_TRIALS more trials on top of whatever the study already holds.
study.optimize(objective, n_trials=N_TRIALS, gc_after_trial=True)

print("Best f1:", study.best_value)
print("Best params:", study.best_params)

In [None]:
# Retrain the best configuration found by the study on train+val and
# score it on the dev set.
best = study.best_params
best_model = best["model_name"]
best_maxlen = best["max_length"]

# rebuild datasets: train on (train+val), evaluate on dev
full_train = pd.concat([train_split, val_split], ignore_index=True)
ds_full_train_raw = Dataset.from_pandas(full_train.reset_index(drop=True))

tok = AutoTokenizer.from_pretrained(best_model, use_fast=True)
def tok_fn(batch):
    return tok(batch["text"], truncation=True, padding="max_length", max_length=best_maxlen)

def _keep_model_columns(ds):
    """Rename `label_bin` to `labels` and drop every non-model column."""
    ds = ds.rename_column("label_bin", "labels")
    # The drop list is built from the renamed dataset; using the pre-rename
    # columns would include "label_bin", which remove_columns cannot find.
    return ds.remove_columns(
        [c for c in ds.column_names if c not in ["input_ids", "attention_mask", "labels"]]
    )

ds_full_train = _keep_model_columns(ds_full_train_raw.map(tok_fn, batched=True))
ds_dev = _keep_model_columns(ds_dev_raw.map(tok_fn, batched=True))

# Same neg/pos weighting as in the search, recomputed on train+val.
y_full = np.array(full_train["label_bin"].values, dtype=int)
n_pos = (y_full == 1).sum()
n_neg = (y_full == 0).sum()
base_pos_weight = (n_neg / max(n_pos, 1))
pos_weight = torch.tensor(base_pos_weight * best["pos_weight_scale"], dtype=torch.float)

cfg = AutoConfig.from_pretrained(best_model)
cfg.num_labels = 1  # single logit
model = AutoModelForSequenceClassification.from_pretrained(best_model, config=cfg)

final_dir = OUTPUT_DIR / "best_final_model"
final_dir.mkdir(parents=True, exist_ok=True)

args = TrainingArguments(
    output_dir=str(final_dir),
    seed=SEED,
    data_seed=SEED,

    learning_rate=best["lr"],
    per_device_train_batch_size=best["batch_size"],
    per_device_eval_batch_size=best["batch_size"],
    gradient_accumulation_steps=best["grad_accum"],

    num_train_epochs=best["epochs"],
    weight_decay=best["weight_decay"],
    warmup_ratio=best["warmup_ratio"],

    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",

    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# NOTE(review): dev is used here both for early stopping / best-checkpoint
# selection and for the reported metrics, so the dev scores are likely
# optimistic -- confirm this is intended.
trainer = WeightedBCETrainer(
    model=model,
    args=args,
    train_dataset=ds_full_train,
    eval_dataset=ds_dev,
    tokenizer=tok,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    pos_weight=pos_weight,
)

trainer.train()
dev_metrics = trainer.evaluate(ds_dev)
dev_metrics