# Stage 03: ALBERT & DeBERTa Grid Search (Part 2)

## Imports, Load & Clean Dataset

In [1]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import optuna
from optuna.pruners import MedianPruner

from src.training.metrics import compute_metrics, compute_metrics_from_logits
from src.training.tokenization_utils import make_tokenized_datasets, clean_and_prune_by_tokens
from src.training.loss import WeightedBCETrainer
from src.training.search_utils import (
    best_so_far_df, 
    progress_df, 
    reset_study_completely, 
    clean_trial_folders, 
    mark_stale_running_trials_as_fail, 
    remaining_trials_to_run, 
    done_counts,
    OptunaMedianPruningCallback,
    pretty_print_dict,
)

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    EarlyStoppingCallback,
    set_seed,
)

# Seed once, up front, via transformers' `set_seed`; SEED is reused for the split and the trainer.
SEED = 42
set_seed(SEED)

# ROOT is the parent of the current working directory, i.e. the notebook is
# expected to be launched from a direct sub-folder of the project.
ROOT = Path('.').resolve().parent
TRAIN_PATH = ROOT.joinpath("data", "processed", "pcl_task1_train.csv")
DEV_PATH = ROOT.joinpath("data", "processed", "pcl_task1_dev.csv")

# Everything the search writes (SQLite storage, per-trial folders) goes under OUTPUT_DIR.
OUTPUT_DIR = ROOT.joinpath("runs", "optuna_task1")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

STUDY_DB = str(OUTPUT_DIR / "optuna_pcl_task1.db")  # sqlite db file
STUDY_NAME = "pcl_task1_binary"

# Read both CSVs, keep only the columns used downstream and force integer labels.
keep_cols = ["par_id", "text", "label_bin"]
train_df = pd.read_csv(TRAIN_PATH)[keep_cols].astype({"label_bin": int})
dev_df = pd.read_csv(DEV_PATH)[keep_cols].astype({"label_bin": int})

# Quick sanity check: frame shapes and class balance of the training labels.
print(train_df.shape, dev_df.shape)
print(train_df["label_bin"].value_counts())

  from .autonotebook import tqdm as notebook_tqdm


(8375, 3) (2094, 3)
label_bin
0    7581
1     794
Name: count, dtype: int64


In [2]:
# Report which accelerator (if any) PyTorch can see before starting the search.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if not cuda_ok:
    print("No CUDA GPU detected. Using CPU or MPS.")
else:
    print("CUDA device count:", torch.cuda.device_count())
    active_device = torch.cuda.current_device()
    print("Current device:", active_device)
    print("Device name:", torch.cuda.get_device_name(active_device))

CUDA available: True
CUDA device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [3]:
# Tokenizer used for the pruning pass; it is one of the two models in the search.
# NOTE(review): the other candidate (ALBERT) uses a different tokenizer, so token
# counts measured here are only approximate for it — confirm this is acceptable.
EXAMPLE_MODEL_NAME = "microsoft/deberta-v3-base"
CONTEXT_WINDOW = 128  # max tokens to keep (after tokenization)

# Threshold handed to the helper: 25% above the context window (a float, 160.0).
# NOTE(review): exact semantics of `max_pos_tokens` live in tokenization_utils — verify there.
train_df = clean_and_prune_by_tokens(
    train_df,
    EXAMPLE_MODEL_NAME,
    max_pos_tokens=1.25 * CONTEXT_WINDOW,
)



## Validation split

In [4]:
# Hold out 15% of the (pruned) training frame for validation.
# Stratifying on the label keeps the class ratio the same in both parts.
train_split, val_split = train_test_split(
    train_df,
    stratify=train_df["label_bin"],
    test_size=0.15,
    random_state=SEED,
)

# Class counts per split, as a sanity check on the stratification.
print("train:", train_split["label_bin"].value_counts().to_dict())
print("val  :", val_split["label_bin"].value_counts().to_dict())

# Wrap each frame as a Hugging Face Dataset; indices are reset so no stale
# pandas index is carried over from the shuffled split.
ds_train_raw, ds_val_raw, ds_dev_raw = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_split, val_split, dev_df)
)

train: {0: 6444, 1: 671}
val  : {0: 1137, 1: 119}


## Define search space for configurations 

In [5]:
# Fixed parameters: held constant for every trial and reused for the final training run.
# Sequence length is pinned to the context window. Author's note: 75% of texts fit
# under 67 tokens. A commented-out alternative searched max_length over [96, 128, 192, 256].
fixed_maxlen = CONTEXT_WINDOW
# Per-device batch size for both training and evaluation.
fixed_batch_size = 16
# Upper bound on epochs; early stopping in the trainer callbacks may end a run sooner.
fixed_epochs = 12


In [None]:
from optuna import trial


def objective(trial: optuna.Trial):
    """Train one sampled configuration and return its validation F1.

    Sampled parameters (names are part of the stored study and must stay stable):
    ``model_name``, ``lr``, ``weight_decay``, ``warmup_ratio``, ``grad_accum`` and
    ``pos_weight``. Batch size, max sequence length and epoch budget come from the
    notebook-level ``fixed_*`` constants.

    Args:
        trial: The Optuna trial used to sample parameters and record user attributes.

    Returns:
        The ``eval_f1`` value reported by ``trainer.evaluate`` on the validation split.

    Raises:
        optuna.TrialPruned: When CUDA runs out of memory (the trial is pruned rather
            than failed). The pruning callback may presumably raise it as well.
        KeyboardInterrupt: Re-raised after tagging the trial as ``interrupted`` so
            that ``study.optimize`` stops.
        RuntimeError: Any runtime error that is not an out-of-memory condition.
    """
    import gc  # local import: only needed for the per-trial clean-up in `finally`

    # Pre-bind so the `finally` clean-up is valid even when we fail before creation.
    model = None
    trainer = None
    try:
        model_name = trial.suggest_categorical(
            "model_name",
            [
                "microsoft/deberta-v3-base",
                "albert-large-v2",
            ],
        )

        is_deberta = "deberta" in model_name.lower()
        # DeBERTa can be unstable in fp16, so fp16 is only enabled for the other model.
        use_fp16 = torch.cuda.is_available() and (not is_deberta)
        # bf16 is used for DeBERTa when the GPU supports it; otherwise it runs in fp32.
        use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported() and is_deberta

        trial.set_user_attr("fp16", bool(use_fp16))
        trial.set_user_attr("bf16", bool(use_bf16))

        lr = trial.suggest_float("lr", 5e-6, 5e-5, log=True)
        batch_size = fixed_batch_size
        weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
        warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.15)
        max_length = fixed_maxlen
        epochs = fixed_epochs
        grad_accum = trial.suggest_categorical("grad_accum", [1, 2])
        pos_weight_scale = 1.5  # search up to 1.5x the class-ratio based weight

        # The tokenizer and the tokenized dev set returned by the helper are not needed here.
        _, ds_train, ds_val, _ = make_tokenized_datasets(
            model_name, max_length, ds_train_raw, ds_val_raw, ds_dev_raw
        )

        # Base positive-class weight = negatives / positives on the training split.
        y = np.array(train_split["label_bin"].values, dtype=int)
        n_pos = (y == 1).sum()
        n_neg = (y == 0).sum()
        base_pos_weight = float(n_neg / max(n_pos, 1))
        # Clamp the upper bound so `low <= high` still holds on (near-)balanced data.
        pos_weight_high = max(base_pos_weight * pos_weight_scale, 1.0)
        pos_weight = trial.suggest_float("pos_weight", 1, pos_weight_high)
        pos_weight_tensor = torch.tensor(pos_weight, dtype=torch.float)

        cfg = AutoConfig.from_pretrained(model_name)
        cfg.num_labels = 1  # single logit
        model = AutoModelForSequenceClassification.from_pretrained(model_name, config=cfg)

        trial_dir = OUTPUT_DIR / f"trial_{trial.number:04d}"
        trial_dir.mkdir(parents=True, exist_ok=True)

        args = TrainingArguments(
            output_dir=str(trial_dir),
            seed=SEED,
            data_seed=SEED,
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=grad_accum,
            num_train_epochs=epochs,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=1,
            fp16=use_fp16,
            bf16=use_bf16,
            report_to="none",
        )

        trainer = WeightedBCETrainer(
            model=model,
            args=args,
            train_dataset=ds_train,
            eval_dataset=ds_val,
            compute_metrics=compute_metrics,
            callbacks=[
                OptunaMedianPruningCallback(trial, monitor="eval_f1"),
                EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.003),
            ],
            pos_weight=pos_weight_tensor,
        )

        cfg_to_print = {
            "model_name": model_name,
            "lr": lr,
            "weight_decay": weight_decay,
            "warmup_ratio": warmup_ratio,
            "max_length": max_length,
            "epochs": epochs,
            "grad_accum": grad_accum,
            "pos_weight": pos_weight,
            "pos_weight_rel": pos_weight / base_pos_weight,
            "fp16": use_fp16,
            "bf16": use_bf16,
        }
        pretty_print_dict(f"Trial {trial.number} config", cfg_to_print)

        trainer.train()
        metrics = trainer.evaluate(ds_val)
        pretty_print_dict(f"Trial {trial.number} metrics", metrics, sort_keys=False)
        return metrics["eval_f1"]

    except torch.cuda.OutOfMemoryError:
        print("OOM: pruning this trial")
        torch.cuda.empty_cache()
        raise optuna.TrialPruned()
    except KeyboardInterrupt:
        # Best effort: tagging must never mask the interrupt itself.
        try:
            trial.set_user_attr("interrupted", True)
        except Exception:
            pass
        raise  # stops the optimize call
    except RuntimeError as e:
        # Some OOM conditions surface as a plain RuntimeError; treat them the same way.
        if "out of memory" in str(e).lower():
            print("OOM (RuntimeError): pruning this trial")
            torch.cuda.empty_cache()
            raise optuna.TrialPruned()
        else:
            raise
    finally:
        # The trainer keeps its own reference to the model, so both names must be
        # released (and garbage collected) before the CUDA cache can shrink.
        model = None
        trainer = None
        gc.collect()
        try:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as cleanup_err:
            # Clean-up is best effort and must not replace an in-flight exception.
            print(f"Warning: CUDA cache clean-up failed: {cleanup_err!r}")

In [7]:
# Trials are stored in a local SQLite file, so re-running this cell resumes the same study.
storage_url = "sqlite:///" + STUDY_DB

# Median pruner settings for the study (values as chosen by the author).
median_pruner = MedianPruner(
    n_startup_trials=13,
    n_warmup_steps=4,
    n_min_trials=7,
    interval_steps=1,
)

study = optuna.create_study(
    storage=storage_url,
    study_name=STUDY_NAME,
    load_if_exists=True,
    direction="maximize",
    pruner=median_pruner,
)

# Trials left in RUNNING state by an earlier session are marked FAIL before continuing.
stale = mark_stale_running_trials_as_fail(study)
if stale:
    print("Marked stale RUNNING trials as FAIL:", stale)

TARGET_DONE = 50  # total COMPLETE+PRUNED (across sessions)
to_run = remaining_trials_to_run(study, TARGET_DONE)

[32m[I 2026-02-21 02:15:18,587][0m Using an existing study with name 'pcl_task1_binary' instead of creating a new one.[0m


In [8]:
# Run (or resume) the search until TARGET_DONE trials are COMPLETE or PRUNED.
print("Trial state counts:", done_counts(study))
print(f"Will run {to_run} new trials to reach {TARGET_DONE} done (COMPLETE+PRUNED).")

try:
    if to_run > 0:
        study.optimize(objective, n_trials=to_run, gc_after_trial=True)
except KeyboardInterrupt:
    print("Interrupted. Rerun this cell to continue toward the target.")

print("Trial state counts:", done_counts(study))

# `best_value` / `best_params` raise ValueError while the study has no COMPLETE
# trial (e.g. the very first run was interrupted), so report that case explicitly.
try:
    print("Best f1:", study.best_value)
    print("Best params:", study.best_params)
except ValueError:
    print("No COMPLETE trials yet - no best value to report.")

Trial state counts: {'RUNNING': 0, 'COMPLETE': 7, 'PRUNED': 0, 'FAIL': 15, 'WAITING': 0}
Will run 43 new trials to reach 50 done (COMPLETE+PRUNED).


Map: 100%|██████████| 7115/7115 [00:00<00:00, 18155.28 examples/s]
Map: 100%|██████████| 1256/1256 [00:00<00:00, 19106.78 examples/s]
Map: 100%|██████████| 2094/2094 [00:00<00:00, 6312.44 examples/s]
Loading weights: 100%|██████████| 25/25 [00:00<00:00, 401.47it/s, Materializing param=albert.pooler.weight]                                                             
[1mAlbertForSequenceClassification LOAD REPORT[0m from: albert-large-v2
Key                          | Status     | 
-----------------------------+------------+-
predictions.dense.weight     | UNEXPECTED | 
predictions.LayerNorm.weight | UNEXPECTED | 
predictions.bias             | UNEXPECTED | 
predictions.decoder.bias     | UNEXPECTED | 
predictions.dense.bias       | UNEXPECTED | 
predictions.LayerNorm.bias   | UNEXPECTED | 
classifier.weight            | MISSING    | 
classifier.bias              | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you e

key,value
bf16,False
epochs,12
fp16,True
grad_accum,2
lr,0.000027
max_length,128
model_name,albert-large-v2
pos_weight,4.008505
pos_weight_scale,0.417397
warmup_ratio,0.005122


Epoch,Training Loss,Validation Loss


[33m[W 2026-02-21 02:16:57,947][0m Trial 22 failed with parameters: {'model_name': 'albert-large-v2', 'lr': 2.6803508238399554e-05, 'weight_decay': 0.01209275925113551, 'warmup_ratio': 0.005121893102694125, 'grad_accum': 2, 'pos_weight': 4.008505329143361} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/home/joshua_killa/.pyenv/versions/pcl-env/lib/python3.10/site-packages/optuna/study/_optimize.py", line 206, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_296226/357774104.py", line 96, in objective
    trainer.train()
  File "/home/joshua_killa/.pyenv/versions/pcl-env/lib/python3.10/site-packages/transformers/trainer.py", line 1412, in train
    return inner_training_loop(
  File "/home/joshua_killa/.pyenv/versions/pcl-env/lib/python3.10/site-packages/transformers/trainer.py", line 1747, in _inner_training_loop
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
KeyboardInterrupt
[33m[W 2026-0

Interrupted. Rerun this cell to continue toward the target.
Trial state counts: {'RUNNING': 0, 'COMPLETE': 7, 'PRUNED': 0, 'FAIL': 16, 'WAITING': 0}
Best f1: 0.5566037735849056
Best params: {'model_name': 'albert-large-v2', 'lr': 7.295763634857713e-06, 'weight_decay': 0.07317734691943754, 'warmup_ratio': 0.10647176648296738, 'grad_accum': 2, 'pos_weight': 3.2832558029026764}


### Grid search database operations

In [9]:
# Display a summary table of the best trials stored in the study so far.
# NOTE(review): the second argument is presumably the maximum number of rows — confirm in search_utils.
best_so_far_df(study, 10)

Unnamed: 0,number,state,value,params_grad_accum,params_lr,params_model_name,params_pos_weight,params_pos_weight_scale,params_warmup_ratio,params_weight_decay,user_attrs_bf16,user_attrs_fp16,user_attrs_interrupted
10,10,COMPLETE,0.556604,2.0,7e-06,albert-large-v2,3.283256,,0.106472,0.073177,,,
5,5,COMPLETE,0.470588,1.0,5e-06,albert-large-v2,,0.937763,0.018057,0.089044,,,
0,0,COMPLETE,0.4,1.0,1.3e-05,albert-large-v2,,1.335515,0.141454,0.054436,,,
9,9,COMPLETE,0.263415,1.0,2.4e-05,albert-large-v2,11.248359,,0.110859,0.041321,,,
1,1,COMPLETE,0.261574,1.0,4.3e-05,albert-large-v2,,1.195678,0.144863,0.090752,,,
2,2,COMPLETE,0.173343,2.0,4.5e-05,albert-large-v2,,1.078698,0.101572,0.093909,,,
4,4,COMPLETE,0.172965,2.0,3.1e-05,albert-large-v2,,0.852154,0.085964,0.020242,,,


## Train best configuration

In [None]:
# Retrain the best configuration found by the study on train+val and score it on dev.
best = study.best_params
best_model = best["model_name"]

# Same precision policy as during the search.
is_deberta = "deberta" in best_model.lower()
use_fp16 = torch.cuda.is_available() and (not is_deberta)  # DeBERTa can be unstable in fp16, so we disable it for that model
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported() and is_deberta

# Resolve the positive-class weight. Older trials in this study stored a relative
# `pos_weight_scale` instead of an absolute `pos_weight`, so a plain
# best["pos_weight"] raises KeyError when one of those trials is the best.
if "pos_weight" in best:
    best_pos_weight = float(best["pos_weight"])
elif "pos_weight_scale" in best:
    # NOTE(review): assumes the legacy parameter multiplied the class-ratio weight
    # (negatives / positives on the training split) — confirm against the old notebook.
    train_labels = train_split["label_bin"]
    legacy_base_weight = float((train_labels == 0).sum() / max(int((train_labels == 1).sum()), 1))
    best_pos_weight = legacy_base_weight * float(best["pos_weight_scale"])
    print(f"Best trial uses legacy 'pos_weight_scale'; derived pos_weight={best_pos_weight:.4f}")
else:
    raise KeyError("Best trial has neither 'pos_weight' nor 'pos_weight_scale' in its params.")

# rebuild datasets: train on (train+val), evaluate on dev
full_train = pd.concat([train_split, val_split], ignore_index=True)
ds_full_train_raw = Dataset.from_pandas(full_train)

tok = AutoTokenizer.from_pretrained(best_model, use_fast=True)

def tok_fn(batch):
    """Tokenize a batch of texts to a fixed length and attach float labels.

    Args:
        batch: A batched mapping with a "text" column and, optionally, "label_bin".

    Returns:
        A dict with the tokenizer outputs plus "labels" (floats) when "label_bin" is present.
    """
    texts = [str(x) if x is not None else "" for x in batch["text"]]
    tokenized = tok(
        texts,
        truncation=True,
        padding="max_length",
        max_length=fixed_maxlen,
    )
    if "label_bin" in batch:
        # Labels are cast to float here, matching the single-logit (num_labels=1) model.
        tokenized["labels"] = [float(x) for x in batch["label_bin"]]
    return dict(tokenized)

# Tokenize datasets & truncate/pad them to max length.
# Both raw datasets carry "label_bin", so tok_fn always produces the "labels" column.
ds_full_train = ds_full_train_raw.map(tok_fn, batched=True)
ds_dev = ds_dev_raw.map(tok_fn, batched=True)

# Keep only the columns passed to the trainer.
model_columns = ("input_ids", "attention_mask", "labels")
ds_full_train = ds_full_train.remove_columns([c for c in ds_full_train.column_names if c not in model_columns])
ds_dev = ds_dev.remove_columns([c for c in ds_dev.column_names if c not in model_columns])

pos_weight_tensor = torch.tensor(best_pos_weight, dtype=torch.float)

cfg = AutoConfig.from_pretrained(best_model)
cfg.num_labels = 1  # single logit
model = AutoModelForSequenceClassification.from_pretrained(best_model, config=cfg)

final_dir = OUTPUT_DIR / "best_final_model"
final_dir.mkdir(parents=True, exist_ok=True)

args = TrainingArguments(
    output_dir=str(final_dir),
    seed=SEED,
    data_seed=SEED,
    learning_rate=best["lr"],
    per_device_train_batch_size=fixed_batch_size,
    per_device_eval_batch_size=fixed_batch_size,
    gradient_accumulation_steps=best["grad_accum"],
    num_train_epochs=fixed_epochs,
    weight_decay=best["weight_decay"],
    warmup_ratio=best["warmup_ratio"],
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=1,
    fp16=use_fp16,
    bf16=use_bf16,
    report_to="none",
)

# NOTE(review): early stopping and best-checkpoint selection both use the dev set,
# which is also the set reported below, so the dev score is optimistically biased.
# The search used patience=3 / threshold=0.003; this run uses patience=1.
trainer = WeightedBCETrainer(
    model=model,
    args=args,
    train_dataset=ds_full_train,
    eval_dataset=ds_dev,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    pos_weight=pos_weight_tensor,
)

trainer.train()
dev_metrics = trainer.evaluate(ds_dev)
dev_metrics