# Exercise 4: PCL Detection with RoBERTa-large

This notebook implements the proposed approach from Exercise 3:
1. **Stage A** — Data Preparation: load dataset, binarise labels, apply the official SemEval train/dev split, augment the minority (PCL) class in the training set
2. **Stage B** — Model Training: fine-tune RoBERTa-large with focal loss, LLRD, and mixed-precision (AMP)
3. **Stage C** — Threshold Optimisation: grid search for optimal classification threshold

**Output:** `dev.txt` and `test.txt` prediction files, saved model checkpoint.

In [None]:
# import importlib.util
# import sys

# # Dictionary mapping import names to pip install names
# packages = {
#     'pandas': 'pandas',
#     'numpy': 'numpy',
#     'matplotlib': 'matplotlib',
#     'nltk': 'nltk',
#     'torch': 'torch',
#     'transformers': 'transformers',
#     'accelerate': 'accelerate',
#     'sklearn': 'scikit-learn',
#     'sentencepiece': 'sentencepiece',
#     'datasets': 'datasets',
#     'tiktoken': 'tiktoken',
#     'google.protobuf': 'protobuf'
# }

# # Check which packages are missing
# def is_installed(package):
#     try:
#         importlib.util.find_spec(package)
#         return True
#     except ModuleNotFoundError:
#         return False

# missing_packages = []
# for package, pip_name in packages.items():
#     if not is_installed(package):
#         missing_packages.append(pip_name)

# # Install all missing packages in a single pip call
# if len(missing_packages) > 0:
#     print(f"Installing missing packages: {missing_packages}")
#     !{sys.executable} -m pip install {' '.join(missing_packages)} --break-system-packages
# else:
#     print("All packages already installed!")

# # Verify installation
# print("\nVerifying installations:")
# for package, pip_name in packages.items():
#     status = "✓" if is_installed(package) else "✗"
#     print(f"  {status} {pip_name}")

# import sys
# import os
# os.chdir("/home/azureuser/NLP_jw1123/BestModel")
# print("Working directory set to:", os.getcwd())

# packages = [
#     'pandas',
#     'numpy',
#     'matplotlib',
#     'nltk',
#     'torch==2.6.0',
#     'transformers',
#     'accelerate',
#     'scikit-learn',
#     'sentencepiece',
#     'datasets',
#     'tiktoken',
#     'protobuf'
# ]

# !{sys.executable} -m pip install {' '.join(packages)} --break-system-packages

Working directory set to: /home/azureuser/NLP_jw1123/BestModel


Collecting torch==2.5.0
  Using cached torch-2.5.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.0)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.0)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.0)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.0)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.0)
  Using cached nvidia_cufft_cu12-11.2.

In [None]:
import os
import random
import json
import warnings

import numpy as np
import pandas as pd
import torch
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)
import matplotlib.pyplot as plt

# Silence library warnings so training logs stay readable.
warnings.filterwarnings("ignore")

# Reproducibility seed: seed every RNG the notebook touches (Python, NumPy,
# PyTorch CPU and all CUDA devices).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# Pin the cuDNN autotuner off explicitly: with benchmarking enabled cuDNN may
# pick different kernels from run to run, which defeats the flag above.
torch.backends.cudnn.benchmark = False

# Run on the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

In [None]:
# ---- All hyperparameters in one place ----
# Later cells read their settings from this dict, and the whole dict is written
# to training_config.json next to the saved model.
CONFIG = {
    "model_name": "roberta-large",    # checkpoint name passed to from_pretrained()
    "max_length": 256,                # tokeniser pads/truncates every text to this length
    "batch_size": 8,                  # per-step (micro) batch size used by all DataLoaders
    "gradient_accumulation_steps": 4,  # effective batch size = 32
    "learning_rate": 1e-5,            # lr of the classifier head; lower layers are decayed (LLRD)
    "weight_decay": 0.01,             # passed to AdamW
    "num_epochs": 10,                 # upper bound; early stopping may end training sooner
    "warmup_ratio": 0.1,              # fraction of total optimiser steps used for linear warmup
    "patience": 5,                    # epochs without dev-F1 improvement before early stopping
    "seed": SEED,
    "focal_alpha": 0.83,              # optimal for post-aug ratio 1588:7581 = 1:4.8
    "focal_gamma": 2.0,               # focusing exponent of the focal loss
    # 1 random-swap copy per PCL sample: 794 → 1588 PCL vs 7581 non-PCL (ratio ~1:4.8)
    "aug_copies": 1,
    # Inclusive range and step of the decision-threshold grid search (Stage C)
    "threshold_lo": 0.30,
    "threshold_hi": 0.70,
    "threshold_step": 0.01,
    # LLRD: each lower layer's lr = base_lr * llrd_decay^k (k=1 for top layer, k=n_layers+1 for embeddings)
    "llrd_decay": 0.9,
}

# Echo the configuration so it is recorded in the notebook output.
for k, v in CONFIG.items():
    print(f"  {k}: {v}")

## Stage A: Data Preparation

In [18]:
# Resolve data path (works from BestModel/ or repo root)
# Bare file names only; each one is looked up in the current directory and
# then in the parent directory by find_file().
DATA_FILE = "dontpatronizeme_pcl.tsv"                 # labelled paragraphs (tab-separated)
TRAIN_SPLIT_FILE = "train_semeval_parids-labels.csv"  # par_ids of the official train split
DEV_SPLIT_FILE = "dev_semeval_parids-labels.csv"      # par_ids of the official dev split
TEST_FILE = "task4_test.tsv"                          # unlabelled test paragraphs (tab-separated)




def find_file(name):
    """Locate ``name`` in the working directory or its parent.

    Returns the first existing candidate path: ``name`` itself, then
    ``../name``. Raises FileNotFoundError when neither exists.
    """
    for candidate in (name, os.path.join("..", name)):
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(f"{name} not found in current or parent directory.")

# Resolve every input file up front so a missing file fails before any parsing.
DATA_PATH, TRAIN_SPLIT_PATH, DEV_SPLIT_PATH, TEST_PATH = (
    find_file(fname)
    for fname in (DATA_FILE, TRAIN_SPLIT_FILE, DEV_SPLIT_FILE, TEST_FILE)
)

# ---- Load main dataset ----
# The TSV has no header row; its first four lines are skipped.
df = pd.read_csv(
    DATA_PATH,
    sep="\t",
    skiprows=4,
    header=None,
    names=["par_id", "art_id", "keyword", "country_code", "text", "label"],
)

# Binarise: 0-1 -> Not PCL (0), 2-4 -> PCL (1)
df["binary_label"] = df["label"].ge(2).astype(int)
# Missing texts become empty strings; surrounding whitespace is dropped.
df["text"] = df["text"].fillna("").astype(str).str.strip()

# ---- Load official SemEval train/dev splits ----
train_split = pd.read_csv(TRAIN_SPLIT_PATH)
dev_split = pd.read_csv(DEV_SPLIT_PATH)
print(f"Official split par_ids — Train: {len(train_split)}, Dev: {len(dev_split)}")

# ---- Load test set ----
# Same layout as the labelled file minus the label column, no header row.
test_df = pd.read_csv(
    TEST_PATH,
    sep="\t",
    header=None,
    names=["test_id", "art_id", "keyword", "country_code", "text"],
)
test_df["text"] = test_df["text"].fillna("").astype(str).str.strip()

# Class balance of the full labelled dataset (before the train/dev split).
n_total = len(df)
n_pcl = df["binary_label"].sum()
print(f"\nTotal labelled samples: {n_total}")
print(f"PCL:     {n_pcl} ({n_pcl/n_total*100:.1f}%)")
print(f"Not PCL: {n_total - n_pcl} ({(n_total - n_pcl)/n_total*100:.1f}%)")
print(f"Test samples: {len(test_df)}")

Working directory set to: /home/azureuser/NLP_jw1123/BestModel
Official split par_ids — Train: 8375, Dev: 2094

Total labelled samples: 10469
PCL:     993 (9.5%)
Not PCL: 9476 (90.5%)
Test samples: 3832


In [19]:
# Use official SemEval train/dev splits (join on par_id)
train_par_ids = set(train_split["par_id"].values)
dev_par_ids = set(dev_split["par_id"].values)

# Training rows keep dataset order; they are shuffled later anyway.
train_df = df[df["par_id"].isin(train_par_ids)].reset_index(drop=True)

# Dev rows must follow the order of the split file, not the order of the
# dataset file: dev predictions are written one per line without IDs, so a
# row-order mismatch would silently misalign every prediction.
# An inner merge keeps the left (split file) key order and, like the previous
# isin() filter, simply drops par_ids that are absent from the dataset.
dev_df = (
    dev_split[["par_id"]]
    .drop_duplicates()
    .merge(df, on="par_id", how="inner")
    .reset_index(drop=True)
)

# Sanity check: ensure all par_ids were matched
n_train_matched = len(train_df)
n_dev_matched = len(dev_df)
print(f"Train: {n_train_matched} matched out of {len(train_split)} split IDs  (PCL {train_df['binary_label'].sum()})")
print(f"Dev:   {n_dev_matched} matched out of {len(dev_split)} split IDs  (PCL {dev_df['binary_label'].sum()})")

if n_train_matched != len(train_split) or n_dev_matched != len(dev_split):
    print("WARNING: Some par_ids from the split files were not found in the dataset!")

Train: 8375 matched out of 8375 split IDs  (PCL 794)
Dev:   2094 matched out of 2094 split IDs  (PCL 199)


In [None]:
# ---- Data Augmentation: random word swap only ----
# Synonym replacement was removed: it targets long content words that are
# exactly the PCL-distinctive vocabulary (destitute, underprivileged, needy …),
# replacing them with generic synonyms that dilute the classification signal.
# A single random swap preserves all original words while adding surface-form
# variation at zero network/API cost.

def augment_text(text, rng=None):
    """Return ``text`` with two randomly chosen words exchanged.

    The text is split on whitespace and re-joined with single spaces. Texts
    with fewer than two words are returned without a swap. ``rng`` is a
    ``random.Random`` instance; a fresh unseeded one is used when omitted.
    """
    rng = random.Random() if rng is None else rng
    tokens = text.split()
    if len(tokens) < 2:
        return " ".join(tokens)
    first, second = rng.sample(range(len(tokens)), 2)
    tokens[first], tokens[second] = tokens[second], tokens[first]
    return " ".join(tokens)


# ---- Augment minority (PCL) class in training set only ----
_pcl_rows = train_df[train_df["binary_label"] == 1]
_aug_rng = random.Random(CONFIG["seed"])  # one shared RNG so swaps are reproducible

# One record per (PCL row, copy): identical to the source row except for the
# word-swapped text. Built as a list of dicts and turned into a frame once.
aug_rows = [
    {**row.to_dict(), "text": augment_text(row["text"], rng=_aug_rng)}
    for _, row in _pcl_rows.iterrows()
    for _ in range(CONFIG["aug_copies"])
]

# Append the augmented rows, then shuffle with the configured seed.
_combined = pd.concat([train_df, pd.DataFrame(aug_rows)], ignore_index=True)
train_df = _combined.sample(frac=1, random_state=CONFIG["seed"]).reset_index(drop=True)

# Class balance of the training set after augmentation.
n_pcl   = train_df["binary_label"].sum()
n_total = len(train_df)
print(f"After augmentation — total: {n_total}")
print(f"  PCL:     {n_pcl} ({n_pcl/n_total*100:.1f}%)")
print(f"  Not PCL: {n_total - n_pcl} ({(n_total - n_pcl)/n_total*100:.1f}%)")

# Show one before/after pair as a visual check of the augmentation.
_sample = _pcl_rows["text"].iloc[0]
print(f"\nOriginal:  {_sample}")
print(f"Swapped:   {augment_text(_sample, rng=random.Random(CONFIG['seed']))}")

## Stage B: Model Training

In [None]:
# ---- Tokenisation ----
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

# Check token length distribution BEFORE choosing max_length
# (lengths include the special tokens the tokeniser adds).
_raw_lengths = []
for _text in df["text"].tolist():
    _raw_lengths.append(len(tokenizer.encode(_text, add_special_tokens=True)))
_lengths = np.array(_raw_lengths)

print("Token length percentiles (whole dataset):")
_percentiles = [50, 75, 90, 95, 99, 100]
for _pct, _value in zip(_percentiles, np.percentile(_lengths, _percentiles)):
    print(f"  p{_pct:3d}: {_value:.0f} tokens")

# How many texts each candidate max_length would cut short.
for _limit in (128, 256):
    _too_long = _lengths > _limit
    print(f"  Truncated at {_limit}: {_too_long.sum()} / {len(_lengths)} "
          f"({_too_long.mean()*100:.1f}%)")


def tokenize(texts, max_length):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    ``texts`` must expose ``.tolist()`` (the callers pass a pandas Series).
    Every sequence is truncated and padded to exactly ``max_length`` tokens
    and the result is returned as PyTorch tensors.
    """
    encode_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts.tolist(), **encode_kwargs)


# Encode both splits once up front; the datasets below index into these tensors.
print("\nTokenising train...")
train_enc = tokenize(texts=train_df["text"], max_length=CONFIG["max_length"])
print("Tokenising dev...")
dev_enc = tokenize(texts=dev_df["text"], max_length=CONFIG["max_length"])
print("Done.")

In [30]:
class PCLDataset(Dataset):
    """Map-style dataset over pre-tokenised texts and their labels.

    ``encodings`` is indexed by "input_ids" and "attention_mask"; ``labels``
    is an indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict with input_ids, attention_mask and labels."""
        item = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Wrap the pre-tokenised tensors together with their binary labels.
train_dataset = PCLDataset(encodings=train_enc, labels=train_df["binary_label"].values)
dev_dataset = PCLDataset(encodings=dev_enc, labels=dev_df["binary_label"].values)

# Only the training loader shuffles; the dev loader keeps row order so that
# predictions line up with dev_df.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=CONFIG["batch_size"])
dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=CONFIG["batch_size"])

print(f"Train batches: {len(train_loader)}, Dev batches: {len(dev_loader)}")

Train batches: 524, Dev batches: 131


In [None]:
# ---- Focal Loss ----
class FocalLoss(torch.nn.Module):
    """Focal loss for imbalanced binary classification.

    FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)
    alpha=0.83 upweights the rare PCL class; gamma=2 focuses on hard examples.
    alpha is near-optimal: n_neg/(n_pos+n_neg) = 7581/9169 ≈ 0.83.

    Args:
        alpha: weight applied to examples whose target is 1; examples whose
            target is 0 are weighted by ``1 - alpha``.
        gamma: focusing exponent applied to ``(1 - p_t)``.
    """
    def __init__(self, alpha=0.83, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        # Buffers (not parameters) so the weights follow the module across devices.
        self.register_buffer("alpha_pos", torch.tensor(alpha))
        self.register_buffer("alpha_neg", torch.tensor(1.0 - alpha))

    def forward(self, logits, targets):
        """Return the mean focal loss.

        Args:
            logits: (batch, 2) class scores; cast to fp32 internally.
            targets: (batch,) integer class indices (0 or 1).
        """
        # Work in log space: log(softmax(x) + eps) caps the loss and yields a
        # zero gradient once p_t underflows, so confidently wrong examples
        # would stop contributing. log_softmax stays exact and differentiable.
        log_probs = torch.log_softmax(logits.float(), dim=-1)   # fp32 for numerical stability
        log_p_t = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        p_t = log_p_t.exp()
        alpha_t = torch.where(targets == 1, self.alpha_pos, self.alpha_neg)
        focal_weight = alpha_t * (1 - p_t) ** self.gamma
        loss = -focal_weight * log_p_t
        return loss.mean()


# ---- Model (all layers trainable) ----
# Two-class sequence-classification head on top of the pretrained encoder,
# moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG["model_name"], num_labels=2,
).to(device)

# The loss module holds its class weights as buffers, so it is moved as well.
loss_fn = FocalLoss(alpha=CONFIG["focal_alpha"], gamma=CONFIG["focal_gamma"]).to(device)
print(f"Using Focal Loss: alpha={CONFIG['focal_alpha']}, gamma={CONFIG['focal_gamma']}")

_n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {_n_trainable:,}")

# ---- LLRD Optimiser ----
# Layer-wise learning-rate decay: the classifier head trains at the base lr,
# the top encoder layer at lr * decay, the next one down at lr * decay^2, and
# so on; the embeddings sit one step below the lowest encoder layer.
# The layer count is read from the model, so this works for roberta-base
# (12 layers) and roberta-large (24 layers) alike.
_decay    = CONFIG["llrd_decay"]
_lr       = CONFIG["learning_rate"]
_n_layers = len(model.roberta.encoder.layer)  # 12 for base, 24 for large

# Group order: classifier, encoder layers from top to bottom, embeddings.
param_groups = [{"params": model.classifier.parameters(), "lr": _lr}]
param_groups += [
    {"params": layer.parameters(), "lr": _lr * (_decay ** depth)}
    for depth, layer in enumerate(reversed(model.roberta.encoder.layer), start=1)
]
param_groups += [
    # exponent n_layers + 1 keeps the embeddings one step below layer 0
    {"params": model.roberta.embeddings.parameters(), "lr": _lr * (_decay ** (_n_layers + 1))}
]

# Print base LRs BEFORE creating the scheduler.
print(f"\nLLRD base learning rates (model has {_n_layers} transformer layers):")
group_names = ["classifier"]
group_names += [f"layer {_n_layers - 1 - i:>2d}" for i in range(_n_layers)]
group_names += ["embeddings"]
for name, group in zip(group_names, param_groups):
    print(f"  {name}: lr={group['lr']:.2e}")

optimizer = torch.optim.AdamW(param_groups, weight_decay=CONFIG["weight_decay"])

# ---- Scheduler ----
# The training loop takes an optimiser step every `gradient_accumulation_steps`
# batches AND on the last batch of an epoch, so the number of optimiser steps
# per epoch is the ceiling of the division. Floor division under-counts by one
# whenever the batch count is not a multiple of the accumulation factor, which
# makes the linear schedule reach lr=0 before training has finished.
steps_per_epoch = -(-len(train_loader) // CONFIG["gradient_accumulation_steps"])  # ceil division
total_steps = steps_per_epoch * CONFIG["num_epochs"]
warmup_steps = int(CONFIG["warmup_ratio"] * total_steps)
# Linear warmup from 0 to each group's base lr, then linear decay to 0.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

print(f"\nTotal optimiser steps: {total_steps}, Warmup steps: {warmup_steps}")
print(f"Effective batch size: {CONFIG['batch_size'] * CONFIG['gradient_accumulation_steps']}")

In [None]:
@torch.no_grad()
def evaluate(model, loader):
    """Run ``model`` over ``loader`` without gradients.

    Returns a tuple ``(avg_loss, f1, probs_array, labels_array)``:
    the mean of the per-batch losses, the positive-class F1 at a fixed 0.5
    threshold, the positive-class probability of every example, and the
    matching labels (both in loader order).
    """
    model.eval()
    prob_list, label_list = [], []
    loss_total = 0.0

    for batch in loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )

        # Forward pass under autocast; the loss casts the logits to fp32 itself.
        with autocast():
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss_total += loss_fn(logits, labels).item()

        # Probability of class 1, computed in fp32.
        positive_probs = torch.softmax(logits.float(), dim=-1)[:, 1]
        prob_list.extend(positive_probs.cpu().numpy())
        label_list.extend(labels.cpu().numpy())

    probs_array = np.array(prob_list)
    labels_array = np.array(label_list)
    hard_preds = (probs_array >= 0.5).astype(int)
    f1 = f1_score(labels_array, hard_preds, pos_label=1)
    return loss_total / len(loader), f1, probs_array, labels_array

In [None]:
# ---- Training loop with early stopping, gradient accumulation, LLRD & AMP ----
scaler = GradScaler()
# Start below any achievable F1 so the first epoch always writes a checkpoint;
# Stage C reloads best_model.pt and would fail if no epoch ever scored F1 > 0.
best_f1 = float("-inf")
patience_ctr = 0
history = {"train_loss": [], "dev_loss": [], "dev_f1": []}
accum_steps = CONFIG["gradient_accumulation_steps"]

for epoch in range(1, CONFIG["num_epochs"] + 1):
    model.train()
    epoch_loss = 0.0
    optimizer.zero_grad()

    for step, batch in enumerate(train_loader, 1):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labs = batch["labels"].to(device)

        # --- Forward + backward (fp16 via AMP) ---
        # The loss is divided by accum_steps so the accumulated gradient is an
        # average over the micro-batches of one optimiser step.
        with autocast():
            logits = model(input_ids=ids, attention_mask=mask).logits
            loss = loss_fn(logits, labs) / accum_steps
        scaler.scale(loss).backward()
        epoch_loss += loss.item() * accum_steps  # undo the scaling for logging

        # Optimiser step every accum_steps batches and on the epoch's last batch.
        if step % accum_steps == 0 or step == len(train_loader):
            # Unscale first so clipping sees the true gradient norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # GradScaler skips optimizer.step() when it finds inf/NaN gradients
            # and then lowers its scale. Only advance the LR schedule when the
            # optimiser actually stepped, so skipped steps do not consume
            # warmup/decay budget.
            if scaler.get_scale() >= scale_before:
                scheduler.step()
            optimizer.zero_grad()

        if step == 1 or step % 10 == 0 or step == len(train_loader):
            print(f"  [ep {epoch}] step {step:>4}/{len(train_loader)}  loss={loss.item() * accum_steps:.4f}")

    avg_train = epoch_loss / len(train_loader)
    dev_loss, dev_f1, _, _ = evaluate(model, dev_loader)

    history["train_loss"].append(avg_train)
    history["dev_loss"].append(dev_loss)
    history["dev_f1"].append(dev_f1)

    # Keep the checkpoint with the best dev F1 (at threshold 0.5); count
    # epochs without improvement for early stopping.
    tag = ""
    if dev_f1 > best_f1:
        best_f1 = dev_f1
        patience_ctr = 0
        torch.save(model.state_dict(), "best_model.pt")
        tag = " *saved*"
    else:
        patience_ctr += 1

    print(
        f"Epoch {epoch}: train_loss={avg_train:.4f}  "
        f"dev_loss={dev_loss:.4f}  dev_f1={dev_f1:.4f}  "
        f"patience={patience_ctr}/{CONFIG['patience']}{tag}"
    )

    if patience_ctr >= CONFIG["patience"]:
        print("Early stopping.")
        break

print(f"\nBest dev F1 (t=0.5): {best_f1:.4f}")

In [None]:
# ---- Training curves ----
# Left panel: train vs dev loss per epoch. Right panel: dev F1 per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

epochs_range = range(1, len(history["train_loss"]) + 1)

for _series, _marker, _label in (("train_loss", "o-", "Train"), ("dev_loss", "s-", "Dev")):
    ax1.plot(epochs_range, history[_series], _marker, label=_label)
ax1.set(xlabel="Epoch", ylabel="Loss", title="Loss")
ax1.legend()
ax1.grid(True, alpha=0.3)

ax2.plot(epochs_range, history["dev_f1"], "o-", color="green")
ax2.set(xlabel="Epoch", ylabel="F1", title="Dev F1 (Positive Class)")
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("training_curves.png", dpi=150, bbox_inches="tight")
plt.show()

## Stage C: Threshold Optimisation

In [None]:
# Reload best checkpoint
model.load_state_dict(torch.load("best_model.pt", map_location=device))
_, _, dev_probs, dev_labels = evaluate(model, dev_loader)

# Grid search
# Build the grid from an integer point count rather than a float-step
# np.arange: with non-integer steps the arange length (and so whether the
# endpoint is included or overshot) depends on floating-point rounding.
# Rounding also removes representation noise such as 0.30000000000000004.
_t_lo, _t_hi, _t_step = CONFIG["threshold_lo"], CONFIG["threshold_hi"], CONFIG["threshold_step"]
_n_thresholds = int(np.floor((_t_hi - _t_lo) / _t_step + 1e-9)) + 1
thresholds = np.round(_t_lo + _t_step * np.arange(_n_thresholds), 10)

# Keep the first (lowest) threshold that attains the maximum positive-class F1.
best_t, best_t_f1 = 0.5, 0.0
records = []
for t in thresholds:
    preds = (dev_probs >= t).astype(int)
    # zero_division=0 on all three metrics: a threshold that yields no
    # positive predictions scores 0 instead of raising a warning.
    f1 = f1_score(dev_labels, preds, pos_label=1, zero_division=0)
    prec = precision_score(dev_labels, preds, pos_label=1, zero_division=0)
    rec = recall_score(dev_labels, preds, pos_label=1, zero_division=0)
    records.append({"t": t, "f1": f1, "precision": prec, "recall": rec})
    if f1 > best_t_f1:
        best_t_f1 = f1
        best_t = t

res = pd.DataFrame(records)
print(f"Optimal threshold: {best_t:.2f}")
print(f"F1 at optimal threshold: {best_t_f1:.4f}")

# Plot F1 / precision / recall against the threshold and mark the optimum.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(res["t"], res["f1"], label="F1", linewidth=2)
ax.plot(res["t"], res["precision"], "--", label="Precision", linewidth=2)
ax.plot(res["t"], res["recall"], "--", label="Recall", linewidth=2)
ax.axvline(best_t, color="red", linestyle=":", linewidth=2, label=f"t*={best_t:.2f}")
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.set_title("Threshold Optimisation on Dev Set")
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("threshold_optimisation.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# ---- Final evaluation with optimal threshold ----
# Hard dev predictions at the tuned threshold, plus positive-class metrics.
dev_preds = (dev_probs >= best_t).astype(int)
_report = classification_report(dev_labels, dev_preds, target_names=["Not PCL", "PCL"])

f1_final = f1_score(dev_labels, dev_preds, pos_label=1)
prec_final = precision_score(dev_labels, dev_preds, pos_label=1)
rec_final = recall_score(dev_labels, dev_preds, pos_label=1)

print(f"Threshold = {best_t:.2f}")
print("=" * 55)
print(_report)
print(f"PCL Precision: {prec_final:.4f}")
print(f"PCL Recall:    {rec_final:.4f}")
print(f"PCL F1:        {f1_final:.4f}")

## Save Predictions & Model

In [None]:
# ---- dev.txt: predictions on official dev set ----
# One 0/1 prediction per line, in dev_df row order, written to the parent directory.
dev_pred_path = os.path.join("..", "dev.txt")
with open(dev_pred_path, "w") as f:
    for p in dev_preds:
        f.write(f"{p}\n")
print(f"Saved {len(dev_preds)} dev predictions -> {dev_pred_path}")

# ---- test.txt: predictions on actual unlabelled test set (task4_test.tsv) ----
print(f"\nGenerating predictions on test set ({len(test_df)} samples)...")
test_enc = tokenize(test_df["text"], CONFIG["max_length"])
test_ds = PCLDataset(test_enc, np.zeros(len(test_df), dtype=int))  # dummy labels
test_loader = DataLoader(test_ds, batch_size=CONFIG["batch_size"], shuffle=False)

model.eval()
all_test_probs = []
with torch.no_grad():
    for batch in test_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        # Use the same numerics as evaluate(): forward under autocast, softmax
        # in fp32. best_t was tuned on probabilities computed that way, so a
        # different inference path could flip predictions near the threshold.
        with autocast():
            logits = model(input_ids=ids, attention_mask=mask).logits
        probs = torch.softmax(logits.float(), dim=-1)[:, 1]
        all_test_probs.extend(probs.cpu().numpy())

# Apply the dev-tuned threshold and write one prediction per line.
test_preds = (np.array(all_test_probs) >= best_t).astype(int)
test_pred_path = os.path.join("..", "test.txt")
with open(test_pred_path, "w") as f:
    for p in test_preds:
        f.write(f"{p}\n")
print(f"Saved {len(test_preds)} test predictions -> {test_pred_path}")
print(f"Test PCL rate: {test_preds.sum()}/{len(test_preds)} ({test_preds.mean()*100:.1f}%)")

In [None]:
# ---- Save model & tokenizer for reproducibility ----
save_dir = "saved_model"
os.makedirs(save_dir, exist_ok=True)
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(save_dir)

# Save training config + threshold
# All hyperparameters plus the tuned threshold and its dev metrics, cast to
# plain floats so they serialise to JSON.
meta = dict(
    CONFIG,
    best_threshold=float(best_t),
    best_f1=float(best_t_f1),
    dev_precision=float(prec_final),
    dev_recall=float(rec_final),
)
with open(os.path.join(save_dir, "training_config.json"), "w") as f:
    json.dump(meta, f, indent=2)

print(f"Model saved to {save_dir}/")
print(f"Best threshold: {best_t:.2f}")
print(f"Dev F1: {best_t_f1:.4f}")
print("\nDone!")