# Exercise 4: PCL Detection with DeBERTa-v3-base

This notebook implements the proposed approach from Exercise 3:
1. **Stage A** — Data Preparation: load the dataset, binarise labels, and apply the official SemEval train/dev split
2. **Stage B** — Model Training: fine-tune DeBERTa-v3-base with class-weighted cross-entropy
3. **Stage C** — Threshold Optimisation: grid search for optimal classification threshold

**Output:** `dev.txt` and `test.txt` prediction files, saved model checkpoint.

In [25]:
import os
import random
import json
import warnings

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)
import matplotlib.pyplot as plt

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Reproducibility: one seed, applied to every RNG the notebook touches
# (Python, NumPy, PyTorch CPU, PyTorch CUDA), in that order.
SEED = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)
torch.backends.cudnn.deterministic = True

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cuda


In [26]:
# ---- Every tunable setting lives in this one dictionary ----
CONFIG = dict(
    model_name="microsoft/deberta-v3-base",  # checkpoint for tokenizer and model
    max_length=128,          # token length every text is padded/truncated to
    batch_size=16,           # batch size for all DataLoaders
    learning_rate=2e-5,      # AdamW learning rate
    weight_decay=0.01,       # AdamW weight decay
    num_epochs=10,           # upper bound on epochs (early stopping may end sooner)
    warmup_ratio=0.1,        # fraction of total steps used for LR warm-up
    patience=3,              # epochs without dev-F1 improvement before stopping
    seed=SEED,               # recorded alongside the other settings
    threshold_lo=0.30,       # lower end of the threshold grid
    threshold_hi=0.70,       # upper end of the threshold grid
    threshold_step=0.01,     # spacing of the threshold grid
)

# Echo the settings, one "  key: value" line each.
print(*(f"  {name}: {value}" for name, value in CONFIG.items()), sep="\n")

  model_name: microsoft/deberta-v3-base
  max_length: 128
  batch_size: 16
  learning_rate: 2e-05
  weight_decay: 0.01
  num_epochs: 10
  warmup_ratio: 0.1
  patience: 3
  seed: 42
  threshold_lo: 0.3
  threshold_hi: 0.7
  threshold_step: 0.01


## Stage A: Data Preparation

In [27]:
# Names of the input files. Each one is resolved to an actual path by
# find_file(), which checks the current directory and then its parent
# (so the notebook works from BestModel/ or the repo root).
# Main labelled dataset (read as tab-separated).
DATA_FILE = "dontpatronizeme_pcl.tsv"
# Official train split: CSV with a par_id column.
TRAIN_SPLIT_FILE = "train_semeval_parids-labels.csv"
# Official dev split: CSV with a par_id column.
DEV_SPLIT_FILE = "dev_semeval_parids-labels.csv"
# Unlabelled test set (read as tab-separated).
TEST_FILE = "task4_test.tsv"

def find_file(name):
    """Locate `name` in the current directory or, failing that, its parent.

    Args:
        name: File name (or relative path) to look for.

    Returns:
        `name` itself if it exists relative to the working directory,
        otherwise `os.path.join("..", name)` if that exists.

    Raises:
        FileNotFoundError: If the file is in neither location.
    """
    # Candidates are tried in priority order: working directory first.
    for candidate in (name, os.path.join("..", name)):
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(f"{name} not found in current or parent directory.")

# Resolve all four input files before reading any of them.
DATA_PATH, TRAIN_SPLIT_PATH, DEV_SPLIT_PATH, TEST_PATH = (
    find_file(file_name)
    for file_name in (DATA_FILE, TRAIN_SPLIT_FILE, DEV_SPLIT_FILE, TEST_FILE)
)


def _clean_text(column):
    """Replace missing values with "", cast to str, and strip outer whitespace."""
    return column.fillna("").astype(str).str.strip()


# ---- Load main dataset ----
# The first four lines of the file are skipped and column names are supplied here.
labelled_columns = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
df = pd.read_csv(
    DATA_PATH, sep="\t", skiprows=4, header=None, names=labelled_columns
)

# Binarise: 0-1 -> Not PCL (0), 2-4 -> PCL (1)
df["binary_label"] = df["label"].ge(2).astype(int)
df["text"] = _clean_text(df["text"])

# ---- Load official SemEval train/dev splits ----
train_split = pd.read_csv(TRAIN_SPLIT_PATH)
dev_split = pd.read_csv(DEV_SPLIT_PATH)
print(f"Official split par_ids — Train: {len(train_split)}, Dev: {len(dev_split)}")

# ---- Load test set ----
# Same layout as the labelled file but without a label column or skipped lines.
test_columns = ["test_id", "art_id", "keyword", "country_code", "text"]
test_df = pd.read_csv(TEST_PATH, sep="\t", header=None, names=test_columns)
test_df["text"] = _clean_text(test_df["text"])

# ---- Class balance summary ----
n_total = len(df)
n_pcl = df["binary_label"].sum()
n_not_pcl = n_total - n_pcl
pcl_pct = n_pcl / n_total * 100
not_pcl_pct = n_not_pcl / n_total * 100
print(f"\nTotal labelled samples: {n_total}")
print(f"PCL:     {n_pcl} ({pcl_pct:.1f}%)")
print(f"Not PCL: {n_not_pcl} ({not_pcl_pct:.1f}%)")
print(f"Test samples: {len(test_df)}")

Official split par_ids — Train: 8375, Dev: 2094

Total labelled samples: 10469
PCL:     993 (9.5%)
Not PCL: 9476 (90.5%)
Test samples: 3832


In [28]:
# Use official SemEval train/dev splits (join on par_id).
#
# Rows are selected in the ORDER OF THE SPLIT FILE, not the order of the main
# dataset. dev.txt is later written line by line from dev_df, so dev_df's row
# order decides which paragraph each prediction line refers to. The previous
# `df[df["par_id"].isin(...)]` selection kept main-dataset order instead.
# NOTE(review): assumes the scorer reads dev.txt in the order of
# dev_semeval_parids-labels.csv — confirm against the task's evaluation script.


def _rows_in_split_order(frame, split):
    """Return rows of `frame` whose par_id is listed in `split`, in `split`'s order.

    Args:
        frame: DataFrame with a "par_id" column (the full labelled dataset).
        split: DataFrame with a "par_id" column (an official split file).

    Returns:
        A new DataFrame with `frame`'s columns and a fresh 0..n-1 index.
        par_ids missing from `frame` are dropped; repeated par_ids in `split`
        are used once, at their first position.
    """
    ordered_ids = split[["par_id"]].drop_duplicates()
    # An inner merge keeps the left (split-file) key order.
    return ordered_ids.merge(frame, on="par_id", how="inner")


train_par_ids = set(train_split["par_id"].values)
dev_par_ids = set(dev_split["par_id"].values)

train_df = _rows_in_split_order(df, train_split)
dev_df = _rows_in_split_order(df, dev_split)

# Sanity check: ensure all par_ids were matched
n_train_matched = len(train_df)
n_dev_matched = len(dev_df)
print(f"Train: {n_train_matched} matched out of {len(train_split)} split IDs  (PCL {train_df['binary_label'].sum()})")
print(f"Dev:   {n_dev_matched} matched out of {len(dev_split)} split IDs  (PCL {dev_df['binary_label'].sum()})")

if n_train_matched != len(train_split) or n_dev_matched != len(dev_split):
    print("WARNING: Some par_ids from the split files were not found in the dataset!")

# A paragraph in both splits would leak training data into the dev metrics.
shared_par_ids = train_par_ids & dev_par_ids
if shared_par_ids:
    print(f"WARNING: {len(shared_par_ids)} par_ids appear in both the train and dev splits!")

Train: 8375 matched out of 8375 split IDs  (PCL 794)
Dev:   2094 matched out of 2094 split IDs  (PCL 199)


## Stage B: Model Training

In [29]:
# ---- Tokenisation ----
# Tokenizer matching the checkpoint named in CONFIG.
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])


def tokenize(texts, max_length):
    """Tokenise a pandas Series of strings into fixed-length PyTorch tensors.

    Args:
        texts: Series of raw strings; converted to a list via `.tolist()`.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer's output for the whole series, with tensors returned
        as PyTorch tensors (`return_tensors="pt"`).
    """
    encode_options = {
        "padding": "max_length",   # pad every sequence up to max_length
        "truncation": True,        # cut longer sequences down to max_length
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts.tolist(), **encode_options)


# Encode both labelled splits once, up front, with the configured length.
seq_len = CONFIG["max_length"]
print("Tokenising train...")
train_enc = tokenize(train_df["text"], seq_len)
print("Tokenising dev...")
dev_enc = tokenize(dev_df["text"], seq_len)
print("Done.")



Tokenising train...
Tokenising dev...
Done.


In [30]:
class PCLDataset(Dataset):
    """Map-style dataset pairing pre-tokenised encodings with integer labels.

    Args:
        encodings: Mapping with "input_ids" and "attention_mask" entries,
            each indexable by sample position.
        labels: Sequence of labels, one per sample.
    """

    # Encoding fields copied into every sample.
    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict of input_ids, attention_mask, labels."""
        sample = {field: self.encodings[field][idx] for field in self._FIELDS}
        # Labels are stored as a long tensor.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


# Wrap each tokenised split together with its binary labels.
train_dataset = PCLDataset(train_enc, train_df["binary_label"].values)
dev_dataset = PCLDataset(dev_enc, dev_df["binary_label"].values)

# Both loaders share the configured batch size; only training is shuffled.
_loader_options = {"batch_size": CONFIG["batch_size"]}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
dev_loader = DataLoader(dev_dataset, shuffle=False, **_loader_options)

n_train_batches, n_dev_batches = len(train_loader), len(dev_loader)
print(f"Train batches: {n_train_batches}, Dev batches: {n_dev_batches}")

Train batches: 524, Dev batches: 131


In [31]:
# ---- Model ----
# Two-class sequence classifier loaded in float32 and moved to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG["model_name"],
    num_labels=2,
    torch_dtype=torch.float32,
).to(device)

# ---- Class-weighted cross-entropy (inversely proportional to frequency) ----
# weight_c = n_samples / (2 * n_samples_in_class_c)
train_labels = train_df["binary_label"]
n_train = len(train_df)
n_neg = (train_labels == 0).sum()
n_pos = (train_labels == 1).sum()
w_neg, w_pos = (n_train / (2 * class_count) for class_count in (n_neg, n_pos))
class_weights = torch.tensor([w_neg, w_pos], dtype=torch.float32).to(device)
print(f"Class weights: Not-PCL={w_neg:.4f}, PCL={w_pos:.4f}")

loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# ---- Optimiser & scheduler ----
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
)
# The schedule is sized for the full epoch budget.
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * CONFIG["num_epochs"]
warmup_steps = int(CONFIG["warmup_ratio"] * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

n_parameters = sum(param.numel() for param in model.parameters())
first_param_dtype = next(model.parameters()).dtype
print(f"Total steps: {total_steps}, Warmup steps: {warmup_steps}")
print(f"Parameters: {n_parameters:,}")
print(f"Model dtype: {first_param_dtype}")

Loading weights: 100%|██████████| 198/198 [00:00<00:00, 362.25it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.dense.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
classifier.weight                    

Class weights: Not-PCL=0.5524, PCL=5.2739
Total steps: 5240, Warmup steps: 524
Parameters: 184,423,682
Model dtype: torch.float32


In [None]:
@torch.no_grad()
def evaluate(model, loader, threshold=0.5):
    """Run `model` over `loader` and score it on the positive class.

    Uses the notebook-level `device` and `loss_fn`. Puts the model in eval
    mode and leaves it there; callers that keep training must call
    `model.train()` again.

    Args:
        model: Sequence-classification model whose output has `.logits`.
        loader: Iterable of batches with "input_ids", "attention_mask" and
            "labels" tensors; must also support `len()`.
        threshold: Positive-class probability at or above which a sample is
            predicted positive. Defaults to 0.5.

    Returns:
        Tuple `(avg_loss, f1, probs_array, labels_array)`:
        the mean of the per-batch losses, F1 for label 1 at `threshold`,
        the positive-class probabilities, and the gold labels (NumPy arrays
        in loader order).

    Raises:
        ValueError: If `loader` yields no batches.
    """
    n_batches = len(loader)
    if n_batches == 0:
        # Fail early rather than with a ZeroDivisionError in the loss average.
        raise ValueError("evaluate() needs a loader with at least one batch.")

    model.eval()
    prob_chunks, label_chunks = [], []
    running_loss = 0.0

    for batch in loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labs = batch["labels"].to(device)

        logits = model(input_ids=ids, attention_mask=mask).logits
        running_loss += loss_fn(logits, labs).item()

        # Column 1 of the softmax is the probability of the positive class.
        prob_chunks.append(torch.softmax(logits, dim=-1)[:, 1].cpu())
        label_chunks.append(labs.cpu())

    # Concatenate once at the end instead of extending a Python list per element.
    all_probs = torch.cat(prob_chunks).numpy()
    all_labels = torch.cat(label_chunks).numpy()
    preds = (all_probs >= threshold).astype(int)
    # zero_division=0 makes the "no positive predictions" score explicit,
    # matching the metric calls in the threshold search.
    f1 = f1_score(all_labels, preds, pos_label=1, zero_division=0)
    return running_loss / n_batches, f1, all_probs, all_labels

In [None]:
# ---- Training loop with early stopping ----
# Start below any attainable F1 so the first epoch always writes a checkpoint.
# With a 0.0 start and the strict ">" test below, a run whose dev F1 never
# rose above 0 saved nothing, and the later torch.load("best_model.pt") would
# fail or pick up a stale file left by an earlier run.
best_f1 = float("-inf")
patience_ctr = 0  # consecutive epochs without a new best dev F1
history = {"train_loss": [], "dev_loss": [], "dev_f1": []}
steps_per_epoch = len(train_loader)

for epoch in range(1, CONFIG["num_epochs"] + 1):
    model.train()  # evaluate() leaves the model in eval mode
    epoch_loss = 0.0

    for step, batch in enumerate(train_loader, 1):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labs = batch["labels"].to(device)

        logits = model(input_ids=ids, attention_mask=mask).logits
        loss = loss_fn(logits, labs)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        optimizer.zero_grad()

        epoch_loss += loss.item()

        # Progress line on the first step and every tenth step after that.
        if step % 10 == 0 or step == 1:
            print(f"  [{epoch}] step {step}/{steps_per_epoch}  loss={loss.item():.4f}")

    avg_train = epoch_loss / steps_per_epoch
    dev_loss, dev_f1, _, _ = evaluate(model, dev_loader)

    history["train_loss"].append(avg_train)
    history["dev_loss"].append(dev_loss)
    history["dev_f1"].append(dev_f1)

    tag = ""
    if dev_f1 > best_f1:
        # New best: checkpoint the weights and reset the patience counter.
        best_f1 = dev_f1
        patience_ctr = 0
        torch.save(model.state_dict(), "best_model.pt")
        tag = " *saved*"
    else:
        patience_ctr += 1

    print(
        f"Epoch {epoch}: train_loss={avg_train:.4f}  "
        f"dev_loss={dev_loss:.4f}  dev_f1={dev_f1:.4f}  "
        f"patience={patience_ctr}/{CONFIG['patience']}{tag}"
    )

    if patience_ctr >= CONFIG["patience"]:
        print("Early stopping.")
        break

print(f"\nBest dev F1 (t=0.5): {best_f1:.4f}")

In [None]:
# ---- Training curves ----
# Left panel: train vs dev loss per epoch. Right panel: dev F1 per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

epochs_range = range(1, len(history["train_loss"]) + 1)

for history_key, line_style, legend_label in (
    ("train_loss", "o-", "Train"),
    ("dev_loss", "s-", "Dev"),
):
    ax1.plot(epochs_range, history[history_key], line_style, label=legend_label)
ax1.set(xlabel="Epoch", ylabel="Loss", title="Loss")
ax1.legend()
ax1.grid(True, alpha=0.3)

ax2.plot(epochs_range, history["dev_f1"], "o-", color="green")
ax2.set(xlabel="Epoch", ylabel="F1", title="Dev F1 (Positive Class)")
ax2.grid(True, alpha=0.3)

# Save the figure next to the notebook, then display it.
plt.tight_layout()
plt.savefig("training_curves.png", dpi=150, bbox_inches="tight")
plt.show()

## Stage C: Threshold Optimisation

In [None]:
# Reload best checkpoint
model.load_state_dict(torch.load("best_model.pt", map_location=device))
_, _, dev_probs, dev_labels = evaluate(model, dev_loader)

# Grid search
# Build the grid with linspace and an explicit point count. np.arange with a
# float step and a `hi + step` stop has an unreliable length: rounding can add
# a point past threshold_hi or drop the endpoint. If (hi - lo) is not a whole
# multiple of the step, the spacing is adjusted slightly so that both
# endpoints are still included.
n_thresholds = max(
    int(round((CONFIG["threshold_hi"] - CONFIG["threshold_lo"]) / CONFIG["threshold_step"])) + 1,
    1,
)
thresholds = np.linspace(CONFIG["threshold_lo"], CONFIG["threshold_hi"], n_thresholds)

# Fallback of 0.5 is kept if no threshold scores an F1 above 0.
best_t, best_t_f1 = 0.5, 0.0
records = []
for t in thresholds:
    preds = (dev_probs >= t).astype(int)
    # zero_division=0 on all three metrics, so a threshold that yields no
    # positive predictions is scored consistently.
    f1 = f1_score(dev_labels, preds, pos_label=1, zero_division=0)
    prec = precision_score(dev_labels, preds, pos_label=1, zero_division=0)
    rec = recall_score(dev_labels, preds, pos_label=1, zero_division=0)
    records.append({"t": t, "f1": f1, "precision": prec, "recall": rec})
    # Strict ">" keeps the lowest threshold when several tie on F1.
    if f1 > best_t_f1:
        best_t_f1 = f1
        best_t = t

res = pd.DataFrame(records)
# The threshold is chosen on the same dev set it is scored on, so this F1 is
# an optimistic estimate of performance on unseen data.
print(f"Optimal threshold: {best_t:.2f}")
print(f"F1 at optimal threshold: {best_t_f1:.4f}")

# Plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(res["t"], res["f1"], label="F1", linewidth=2)
ax.plot(res["t"], res["precision"], "--", label="Precision", linewidth=2)
ax.plot(res["t"], res["recall"], "--", label="Recall", linewidth=2)
ax.axvline(best_t, color="red", linestyle=":", linewidth=2, label=f"t*={best_t:.2f}")
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.set_title("Threshold Optimisation on Dev Set")
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("threshold_optimisation.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# ---- Final evaluation with optimal threshold ----
# Binarise the dev probabilities at the threshold picked by the grid search.
dev_preds = (dev_probs >= best_t).astype(int)

# Positive-class metrics, computed before any printing.
prec_final, rec_final, f1_final = (
    metric(dev_labels, dev_preds, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

separator = "=" * 55
print(f"Threshold = {best_t:.2f}")
print(separator)
print(classification_report(dev_labels, dev_preds, target_names=["Not PCL", "PCL"]))

print(f"PCL Precision: {prec_final:.4f}")
print(f"PCL Recall:    {rec_final:.4f}")
print(f"PCL F1:        {f1_final:.4f}")

## Save Predictions & Model

In [None]:
def _write_predictions(path, predictions):
    """Write `predictions` to `path`, one value per line."""
    with open(path, "w") as handle:
        handle.writelines(f"{label}\n" for label in predictions)


# ---- dev.txt: predictions on official dev set ----
# Both prediction files are written to the parent of the working directory.
dev_pred_path = os.path.join("..", "dev.txt")
_write_predictions(dev_pred_path, dev_preds)
print(f"Saved {len(dev_preds)} dev predictions -> {dev_pred_path}")

# ---- test.txt: predictions on actual unlabelled test set (task4_test.tsv) ----
print(f"\nGenerating predictions on test set ({len(test_df)} samples)...")
test_enc = tokenize(test_df["text"], CONFIG["max_length"])
# PCLDataset needs labels; the test set has none, so zeros stand in and are never read.
placeholder_labels = np.zeros(len(test_df), dtype=int)
test_ds = PCLDataset(test_enc, placeholder_labels)
test_loader = DataLoader(test_ds, batch_size=CONFIG["batch_size"], shuffle=False)

# Collect positive-class probabilities in file order (the loader is not shuffled).
model.eval()
all_test_probs = []
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        positive_probs = torch.softmax(logits, dim=-1)[:, 1]
        all_test_probs.extend(positive_probs.cpu().numpy())

# Apply the dev-tuned threshold to the test probabilities.
test_preds = (np.array(all_test_probs) >= best_t).astype(int)
test_pred_path = os.path.join("..", "test.txt")
_write_predictions(test_pred_path, test_preds)
print(f"Saved {len(test_preds)} test predictions -> {test_pred_path}")
print(f"Test PCL rate: {test_preds.sum()}/{len(test_preds)} ({test_preds.mean()*100:.1f}%)")

In [None]:
# ---- Save model & tokenizer for reproducibility ----
save_dir = "saved_model"
os.makedirs(save_dir, exist_ok=True)
# Model first, then tokenizer, both into the same directory.
for artefact in (model, tokenizer):
    artefact.save_pretrained(save_dir)

# Save training config + threshold
# Start from a copy of CONFIG and append the dev-set results; values are cast
# to plain floats so json can serialise them.
meta = dict(CONFIG)
meta.update(
    best_threshold=float(best_t),
    best_f1=float(best_t_f1),
    dev_precision=float(prec_final),
    dev_recall=float(rec_final),
)
config_path = os.path.join(save_dir, "training_config.json")
with open(config_path, "w") as f:
    json.dump(meta, f, indent=2)

print(f"Model saved to {save_dir}/")
print(f"Best threshold: {best_t:.2f}")
print(f"Dev F1: {best_t_f1:.4f}")
print("\nDone!")