# Stage 03: ALBERT & DeBERTa (Part 2)

## Imports & Load Dataset

In [1]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import optuna
from optuna.pruners import MedianPruner

from src.training.metrics import compute_metrics, compute_metrics_from_logits
from src.training.tokenization_utils import make_tokenized_datasets
from src.training.loss import WeightedBCETrainer

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
)

# Global seed, applied through transformers.set_seed.
SEED = 42
set_seed(SEED)

# Project root = parent of the current working directory, i.e. the notebook
# is expected to be run from a sub-folder one level below the root.
ROOT = Path.cwd().resolve().parent

# Processed task-1 CSVs.
PROCESSED_DIR = ROOT / "data" / "processed"
TRAIN_PATH = PROCESSED_DIR / "pcl_task1_train.csv"
DEV_PATH = PROCESSED_DIR / "pcl_task1_dev.csv"

# Trial checkpoints and the Optuna study database are written here.
OUTPUT_DIR = ROOT / "runs" / "optuna_task1"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

STUDY_NAME = "pcl_task1_binary"
STUDY_DB = str(OUTPUT_DIR / "optuna_pcl_task1.db")  # sqlite db file

train_df = pd.read_csv(TRAIN_PATH)
dev_df = pd.read_csv(DEV_PATH)

# Keep only the columns used downstream and store the binary label as int.
keep_cols = ["par_id", "text", "label_bin"]
train_df = train_df.loc[:, keep_cols].astype({"label_bin": int})
dev_df = dev_df.loc[:, keep_cols].astype({"label_bin": int})

print(train_df.shape, dev_df.shape)
print(train_df["label_bin"].value_counts())

  from .autonotebook import tqdm as notebook_tqdm
2026-02-20 05:01:52.812380: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-20 05:01:52.870497: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-20 05:01:54.406154: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


(8375, 3) (2094, 3)
label_bin
0    7581
1     794
Name: count, dtype: int64


## Validation split

In [2]:
# Hold out 15% of the training data for validation; stratifying on the label
# keeps the class balance the same in both parts.
train_split, val_split = train_test_split(
    train_df,
    test_size=0.15,
    stratify=train_df["label_bin"],
    random_state=SEED,
)

for split_tag, split_df in (("train:", train_split), ("val  :", val_split)):
    print(split_tag, split_df["label_bin"].value_counts().to_dict())

# Convert each frame to a Hugging Face Dataset with a fresh 0..n-1 index.
ds_train_raw, ds_val_raw, ds_dev_raw = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_split, val_split, dev_df)
)

train: {0: 6443, 1: 675}
val  : {0: 1138, 1: 119}


## Define the hyperparameter search space

In [3]:
# Hyperparameters held constant for every trial (not part of the search).
fixed_epochs = 12
fixed_batch_size = 16
# 75% of the texts fit under 67 tokens. If the length were searched instead,
# the candidate values would be [96, 128, 192, 256].
fixed_maxlen = 128

In [4]:
def _make_pruning_callback(trial, metric_name="eval_f1"):
    """Build a Trainer callback that forwards evaluation scores to Optuna.

    After every evaluation the callback reports ``metrics[metric_name]`` to
    ``trial`` and raises ``optuna.TrialPruned`` when the trial's pruner asks
    for it to stop. Without these intermediate reports a pruner has nothing
    to compare against and can never prune a trial.

    Args:
        trial: The Optuna trial currently being evaluated.
        metric_name: Key read from the metrics dict passed to ``on_evaluate``.

    Returns:
        A ``transformers.TrainerCallback`` instance bound to ``trial``.
    """
    # Imported locally: the base class is only needed to define this callback.
    from transformers import TrainerCallback

    class _TrialPruningCallback(TrainerCallback):
        def __init__(self):
            # Number of evaluations reported so far; used as the Optuna step.
            self.n_reports = 0

        def on_evaluate(self, args, state, control, metrics=None, **kwargs):
            score = (metrics or {}).get(metric_name)
            if score is None:
                return
            self.n_reports += 1
            trial.report(float(score), step=self.n_reports)
            if trial.should_prune():
                raise optuna.TrialPruned(
                    f"Trial {trial.number} pruned after evaluation {self.n_reports}."
                )

    return _TrialPruningCallback()


def objective(trial: optuna.Trial):
    """Train one sampled configuration and return its validation F1.

    Samples the model name, learning rate, weight decay, warmup ratio,
    gradient accumulation steps and a positive-class weight scale; batch size,
    sequence length and epoch count are the fixed notebook-level values.
    The validation F1 of each epoch is reported to Optuna so the study's
    pruner can stop unpromising trials early.

    Args:
        trial: Optuna trial used to sample hyperparameters and report scores.

    Returns:
        ``metrics["eval_f1"]`` from a final evaluation on the validation set.

    Raises:
        optuna.TrialPruned: If the pruner decides to stop the trial.
    """
    model_name = trial.suggest_categorical(
        "model_name",
        [
            "microsoft/deberta-v3-base",
            "albert-large-v2",
        ],
    )

    lr          = trial.suggest_float("lr", 5e-6, 5e-5, log=True)
    batch_size  = fixed_batch_size
    weight_decay= trial.suggest_float("weight_decay", 0.0, 0.1)
    warmup_ratio= trial.suggest_float("warmup_ratio", 0.0, 0.15)
    max_length  = fixed_maxlen
    epochs      = fixed_epochs
    grad_accum  = trial.suggest_categorical("grad_accum", [1, 2]) # do [2, 4] if 16 doesnt fit
    pos_scale   = trial.suggest_float("pos_weight_scale", 0.75, 1.5)

    # The helper returns four values; the last one is not needed during the search.
    tok, ds_train, ds_val, _ = make_tokenized_datasets(model_name, max_length, ds_train_raw, ds_val_raw, ds_dev_raw)

    # base pos_weight = neg/pos (multiplier required to balance the classes GD update significance)
    y = np.array(train_split["label_bin"].values, dtype=int)
    n_pos = (y == 1).sum()
    n_neg = (y == 0).sum()
    base_pos_weight = (n_neg / max(n_pos, 1))  # max(..., 1) guards against division by zero
    pos_weight = torch.tensor(base_pos_weight * pos_scale, dtype=torch.float)

    cfg = AutoConfig.from_pretrained(model_name)
    cfg.num_labels = 1  # single logit
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=cfg)

    trial_dir = OUTPUT_DIR / f"trial_{trial.number:04d}"
    trial_dir.mkdir(parents=True, exist_ok=True)

    args = TrainingArguments(
        output_dir=str(trial_dir),
        seed=SEED,
        data_seed=SEED,

        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum,

        num_train_epochs=epochs,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,

        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",

        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,

        save_total_limit=1,

        fp16=torch.cuda.is_available(),   # evaluates to False when no CUDA device is present
        report_to="none",
    )

    pruning_callback = _make_pruning_callback(trial)

    # NOTE(review): pos_weight is handed to the project's WeightedBCETrainer;
    # presumably it weights the positive class in a BCE loss - confirm in src.training.loss.
    trainer = WeightedBCETrainer(
        model=model,
        args=args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tok,
        compute_metrics=compute_metrics,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.002),
            pruning_callback,
        ],
        pos_weight=pos_weight,
    )

    trainer.train()

    # Detach the pruning hook so the final evaluation below is neither reported
    # as an extra step nor able to prune a trial that has already finished.
    trainer.remove_callback(pruning_callback)

    metrics = trainer.evaluate(ds_val)
    print(metrics)
    # Optuna optimizes this:
    return metrics["eval_f1"]

In [6]:
# The study is persisted in a SQLite file so the search can be resumed.
storage_url = f"sqlite:///{STUDY_DB}"

study = optuna.create_study(
    study_name=STUDY_NAME,
    direction="maximize",
    storage=storage_url,
    load_if_exists=True,
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=4),
)

n_existing = len(study.trials)
print("Existing trials:", n_existing)

# Seed the sampler so its suggestions are reproducible for a given trial
# history. The seed is offset by the number of stored trials: reusing the very
# same seed when resuming would replay the random startup suggestions that an
# earlier run already tried.
study.sampler = optuna.samplers.TPESampler(seed=SEED + n_existing)

# Number of trials to run in THIS execution (added on top of any stored trials).
N_TRIALS = 50

study.optimize(objective, n_trials=N_TRIALS, gc_after_trial=True)

print("Best f1:", study.best_value)
print("Best params:", study.best_params)

[32m[I 2026-02-20 05:02:21,886][0m Using an existing study with name 'pcl_task1_binary' instead of creating a new one.[0m


Existing trials: 7


[33m[W 2026-02-20 05:02:22,286][0m Trial 7 failed with parameters: {'model_name': 'microsoft/deberta-v3-base', 'lr': 1.1745444475785038e-05, 'weight_decay': 0.015448649348069077, 'warmup_ratio': 0.1077098423164384, 'grad_accum': 1, 'pos_weight_scale': 1.2189317932988835} because of the following error: OSError("We couldn't connect to 'https://huggingface.co' to load this model, couldn't find it in the cached files and it looks like microsoft/deberta-v3-base is not the path to a directory containing a config.json file.\nCheckout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.").[0m
Traceback (most recent call last):
  File "/home/joshua_killa/.pyenv/versions/pcl-env/lib/python3.10/site-packages/transformers/configuration_utils.py", line 616, in _get_config_dict
    resolved_config_file = cached_path(
  File "/home/joshua_killa/.pyenv/versions/pcl-env/lib/python3.10/site-packages/transforme

OSError: We couldn't connect to 'https://huggingface.co' to load this model, couldn't find it in the cached files and it looks like microsoft/deberta-v3-base is not the path to a directory containing a config.json file.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

## Train best configuration

In [None]:
best = study.best_params
best_model = best["model_name"]

# Rebuild datasets: train on (train + val), evaluate on dev.
# ignore_index=True already yields a fresh 0..n-1 index.
full_train = pd.concat([train_split, val_split], ignore_index=True)
ds_full_train_raw = Dataset.from_pandas(full_train)

tok = AutoTokenizer.from_pretrained(best_model, use_fast=True)

# Only these columns are kept in the datasets handed to the trainer.
MODEL_INPUT_COLUMNS = ["input_ids", "attention_mask", "labels"]


def tok_fn(batch):
    """Tokenize a batch of texts, truncating/padding each one to `fixed_maxlen` tokens."""
    return tok(batch["text"], truncation=True, padding="max_length", max_length=fixed_maxlen)


def to_model_inputs(raw_ds):
    """Tokenize `raw_ds`, rename `label_bin` to `labels`, and drop all other extra columns.

    The columns to drop are computed from the dataset AFTER the rename. Taking
    them from the pre-rename dataset would put "label_bin" in the drop list,
    and `remove_columns` raises for a column that no longer exists.
    """
    renamed = raw_ds.map(tok_fn, batched=True).rename_column("label_bin", "labels")
    extra_columns = [c for c in renamed.column_names if c not in MODEL_INPUT_COLUMNS]
    return renamed.remove_columns(extra_columns)


# NOTE(review): the search tokenizes through make_tokenized_datasets; confirm this
# manual pipeline produces the same padding and label format.
ds_full_train = to_model_inputs(ds_full_train_raw)
ds_dev = to_model_inputs(ds_dev_raw)

# Positive-class weight: neg/pos ratio of the full training data, scaled by the tuned factor.
y_full = np.array(full_train["label_bin"].values, dtype=int)
n_pos = (y_full == 1).sum()
n_neg = (y_full == 0).sum()
base_pos_weight = (n_neg / max(n_pos, 1))  # max(..., 1) guards against division by zero
pos_weight = torch.tensor(base_pos_weight * best["pos_weight_scale"], dtype=torch.float)

cfg = AutoConfig.from_pretrained(best_model)
cfg.num_labels = 1  # single logit
model = AutoModelForSequenceClassification.from_pretrained(best_model, config=cfg)

final_dir = OUTPUT_DIR / "best_final_model"
final_dir.mkdir(parents=True, exist_ok=True)

args = TrainingArguments(
    output_dir=str(final_dir),
    seed=SEED,
    data_seed=SEED,

    learning_rate=best["lr"],
    per_device_train_batch_size=fixed_batch_size,
    per_device_eval_batch_size=fixed_batch_size,
    gradient_accumulation_steps=best["grad_accum"],

    num_train_epochs=fixed_epochs,
    weight_decay=best["weight_decay"],
    warmup_ratio=best["warmup_ratio"],

    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",

    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# NOTE(review): the dev set drives both early stopping / best-checkpoint selection
# and the metrics reported below, so the reported dev F1 is optimistically biased.
# NOTE(review): the search used early_stopping_patience=2 with threshold 0.002,
# whereas this run uses patience=1 - confirm the difference is intended.
trainer = WeightedBCETrainer(
    model=model,
    args=args,
    train_dataset=ds_full_train,
    eval_dataset=ds_dev,
    tokenizer=tok,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    pos_weight=pos_weight,
)

trainer.train()
dev_metrics = trainer.evaluate(ds_dev)
dev_metrics