# Experiment E: Top-K Z-score Indicator Features

Instead of 6 aggregate z-score statistics, encode the *identity* of which specific
PCL-discriminative n-grams appear in each document as a k-dim binary indicator vector.

The top-k n-grams are selected by |z-score| from the training subset only.
k ∈ {50, 100, 200} is searched as a hyperparameter.
Feature combination method (CONCAT / GMF) is also searched.

Fixed corrections: VAL_FRACTION=0.15, BATCH_SIZE=32, NUM_EPOCHS=12, PATIENCE=4.

In [None]:
import os
import sys
import random
import logging
import gc
import json

import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
import spacy
import optuna
from optuna.visualization.matplotlib import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
)
import matplotlib.pyplot as plt

sys.path.insert(0, "..")
from utils.data import load_data
from utils.split import split_train_val
from utils.dataloaders import make_dataloaders
from utils.pcl_deberta import PCLDeBERTa, PoolingStrategy
from utils.feature_comb import FeatureComb
from utils.fightin_words import compute_fightin_words_zscores, build_topk_ngrams, extract_topk_zscore_features
from utils.optim import compute_pos_weight
from utils.training_loop import train_model
from utils.eval import evaluate

# --- Experiment configuration ------------------------------------------------
SEED = 42

# Filesystem locations and the pretrained checkpoint identifier.
DATA_DIR = "../data"
OUT_DIR = "out"
MODEL_NAME = "microsoft/deberta-v3-base"

# Tokenisation length, validation split size and batch size.
MAX_LENGTH = 256
VAL_FRACTION = 0.15
BATCH_SIZE = 32

# Search budget and training schedule.
N_TRIALS = 20
NUM_EPOCHS = 12
PATIENCE = 4
N_EVAL_STEPS = 35

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# --- Reproducibility ----------------------------------------------------------
# Seed every random number generator this notebook touches with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True

# --- Logging and output directory ---------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s:\t%(message)s",
)
LOG = logging.getLogger(__name__)
LOG.info(f"Device: {DEVICE}")

# All artefacts (checkpoints, params, plots) are written below OUT_DIR.
os.makedirs(OUT_DIR, exist_ok=True)

## 1. Data Loading and spaCy Processing

In [None]:
# Load the train and dev frames from DATA_DIR, then split the train frame into a
# training subset and a validation subset (val_frac=VAL_FRACTION, fixed seed).
train_df, dev_df = load_data(DATA_DIR)
train_sub_df, val_sub_df = split_train_val(train_df, val_frac=VAL_FRACTION, seed=SEED)
# Tokeniser for the pretrained checkpoint; handed to make_dataloaders in later cells.
tokeniser = AutoTokenizer.from_pretrained(MODEL_NAME)
LOG.info(f"Train: {len(train_sub_df)}, Val: {len(val_sub_df)}, Dev: {len(dev_df)}")

In [None]:
# Request GPU execution for spaCy before the pipeline is loaded; `gpu` records
# whether the request succeeded and is only used for the log line below.
gpu = spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
LOG.info(f"spaCy using {'GPU' if gpu else 'CPU'}")

# Process training subset once; spaCy docs stored for feature extraction
# The docs are materialised into a list because later cells iterate over them
# several times (z-score computation and one pass per candidate k).
train_texts = train_sub_df["text"].tolist()
train_docs = list(nlp.pipe(train_texts, batch_size=256))
LOG.info(f"spaCy processed {len(train_docs)} train documents")

## 2. Compute Z-score Dictionary

The z-score dictionary is computed from `train_sub_df` only (no data leakage).
For each k ∈ {50, 100, 200} we pre-build the top-k n-gram list and fit its scaler here,
so that neither has to be recomputed inside the Optuna objective.

In [None]:
# compute_fightin_words_zscores returns four values; only the first (the
# n-gram -> z-score mapping) is used in this notebook.
Z_SCORES, _, _, _ = compute_fightin_words_zscores(
    train_docs,
    train_sub_df["binary_label"].tolist(),
)
LOG.info(f"Z-score dictionary: {len(Z_SCORES)} n-grams")

# Candidate feature widths, plus one n-gram list and one fitted scaler per width.
K_VALUES = [50, 100, 200]
topk_ngrams_cache: dict[int, list[str]] = {}
scaler_cache: dict[int, StandardScaler] = {}

for k in K_VALUES:
    # Select the n-grams for this k and remember them under the same key.
    topk = topk_ngrams_cache[k] = build_topk_ngrams(Z_SCORES, k=k)

    # The scaler statistics come from the training-subset features only.
    train_feats = np.array(
        [extract_topk_zscore_features(doc, topk) for doc in train_docs]
    )
    scaler = scaler_cache[k] = StandardScaler().fit(train_feats)
    LOG.info(f"k={k}: top ngrams built, scaler fitted on {train_feats.shape}")

# The parsed docs are no longer needed once every scaler is fitted; drop them
# and collect immediately so the memory is available to the model.
del train_docs
gc.collect()

## 3. Feature Factory

The factory runs spaCy on any text list and returns scaled k-dim indicator features.

In [None]:
# Scaled feature matrices that have already been computed, keyed by (k, texts).
# The cache is shared by every factory so the spaCy pass over a given split runs
# once per k instead of once per Optuna trial. Entries are only valid for the
# current topk_ngrams_cache / scaler_cache; re-run this cell if those are rebuilt.
_TOPK_FEATURE_CACHE: dict[tuple[int, tuple[str, ...]], np.ndarray] = {}


def make_topk_factory(k: int):
    """Return an extra-feature factory bound to the top-k n-gram list for ``k``.

    Args:
        k: Key into ``topk_ngrams_cache`` and ``scaler_cache``; must be one of
            the values those caches were built for, otherwise ``KeyError``.

    Returns:
        A callable ``factory(texts) -> torch.Tensor`` that runs ``nlp`` over
        ``texts``, extracts the per-document features for the cached n-gram
        list, applies the scaler fitted for ``k`` and returns the result as a
        float32 tensor on ``DEVICE``.
    """
    topk = topk_ngrams_cache[k]
    scaler = scaler_cache[k]

    def factory(texts: list[str]) -> torch.Tensor:
        # The same train/val/dev text lists are passed in on every trial, and the
        # result depends only on (k, texts), so reuse the matrix when we have it.
        cache_key = (k, tuple(texts))
        scaled = _TOPK_FEATURE_CACHE.get(cache_key)
        if scaled is None:
            feats = np.array(
                [
                    extract_topk_zscore_features(doc, topk)
                    for doc in nlp.pipe(texts, batch_size=256)
                ]
            )
            scaled = scaler.transform(feats).astype(np.float32)
            _TOPK_FEATURE_CACHE[cache_key] = scaled
        # torch.tensor copies, so callers never alias the cached array.
        return torch.tensor(scaled).to(DEVICE)

    return factory

## 4. Hyperparameter Search

In [None]:
# Hyperparameter string -> PoolingStrategy member. Each key is the lower-case
# form of the member name, so the member is looked up by upper-casing the key.
POOLING_MAP = {
    name: getattr(PoolingStrategy, name.upper())
    for name in ("cls", "mean", "max", "cls_mean")
}

# Tag used in log messages, the study name and every output file name.
EXP_NAME = "E_topk_zscore"


def objective(trial: optuna.trial.Trial) -> float:
    """Train one sampled PCLDeBERTa configuration and return its validation F1.

    Samples the optimiser, head, pooling and feature hyperparameters from
    ``trial``, builds dataloaders with the top-k feature factory for the sampled
    ``k``, trains with ``train_model`` and stores the validation/dev metrics as
    trial user attributes. When the result beats the best completed trial of the
    study so far, the model weights and the configuration are written to
    ``OUT_DIR``, overwriting the previous best.

    Args:
        trial: Optuna trial to sample from. It is also forwarded to
            ``train_model`` (presumably for intermediate reporting and pruning;
            confirm in ``utils.training_loop``).

    Returns:
        ``results["best_val_f1"]`` as reported by ``train_model``.
    """
    lr = trial.suggest_float("lr", 4e-6, 6e-5, log=True)
    warmup_fraction = trial.suggest_float("warmup_fraction", 0.03, 0.20, step=0.01)
    hidden_dim = trial.suggest_categorical("hidden_dim", [0, 128, 256, 512])
    # Dropout is only sampled when a hidden layer exists, so "dropout_rate" is
    # absent from trial.params for hidden_dim == 0.
    dropout_rate = (
        trial.suggest_float("dropout_rate", 0.0, 0.4, step=0.05) if hidden_dim > 0 else 0.0
    )
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True)
    head_lr_mult = trial.suggest_categorical("head_lr_multiplier", [1, 3, 5, 10])
    label_smoothing = trial.suggest_float("label_smoothing", 0.0, 0.15, step=0.025)
    # Draw the choices from the module-level definitions so the search space
    # cannot drift away from POOLING_MAP or from the k values with cached scalers.
    pooling_name = trial.suggest_categorical("pooling", list(POOLING_MAP))
    k = trial.suggest_categorical("k", K_VALUES)
    feat_comb_name = trial.suggest_categorical("feature_comb_method", ["CONCAT", "GMF"])

    pooling = POOLING_MAP[pooling_name]
    feat_comb = FeatureComb.CONCAT if feat_comb_name == "CONCAT" else FeatureComb.GMF

    LOG.info(f"[{EXP_NAME}] Trial {trial.number}: lr={lr:.2e}, k={k}, "
             f"feat_comb={feat_comb_name}, pool={pooling_name}")

    factory = make_topk_factory(k)
    train_loader, val_loader, dev_loader = make_dataloaders(
        train_sub_df, val_sub_df, dev_df, BATCH_SIZE, MAX_LENGTH, tokeniser, factory
    )

    model = PCLDeBERTa(
        hidden_dim=hidden_dim,
        dropout_rate=dropout_rate,
        n_extra_features=k,
        pooling=pooling,
        feature_comb_method=feat_comb,
    ).to(DEVICE)

    try:
        pos_weight = compute_pos_weight(train_sub_df, DEVICE)

        results = train_model(
            model=model, device=DEVICE,
            train_loader=train_loader, val_loader=val_loader, dev_loader=dev_loader,
            pos_weight=pos_weight, lr=lr, weight_decay=weight_decay,
            num_epochs=NUM_EPOCHS, warmup_fraction=warmup_fraction,
            patience=PATIENCE, head_lr_multiplier=head_lr_mult,
            label_smoothing=label_smoothing, eval_every_n_steps=N_EVAL_STEPS,
            trial=trial,
        )

        trial.set_user_attr("best_val_f1", results["best_val_f1"])
        trial.set_user_attr("best_threshold", results["best_threshold"])
        trial.set_user_attr("dev_f1", results["dev_metrics"]["f1"])
        trial.set_user_attr("dev_precision", results["dev_metrics"]["precision"])
        trial.set_user_attr("dev_recall", results["dev_metrics"]["recall"])

        # best_value raises ValueError while the study has no completed trial.
        try:
            prev_best = trial.study.best_value
        except ValueError:
            prev_best = -float("inf")

        if results["best_val_f1"] > prev_best:
            # Store CPU copies so the checkpoint loads on machines without a GPU.
            # The loop names deliberately differ from the hyperparameter `k`.
            torch.save(
                {name: tensor.cpu() for name, tensor in model.state_dict().items()},
                os.path.join(OUT_DIR, f"exp_{EXP_NAME}_best_model.pt"),
            )
            config = {
                **trial.params,
                "batch_size": BATCH_SIZE,
                "num_epochs": NUM_EPOCHS,
                "patience": PATIENCE,
                # float() keeps json.dump working if the threshold is a NumPy scalar.
                "best_threshold": float(results["best_threshold"]),
            }
            with open(os.path.join(OUT_DIR, f"exp_{EXP_NAME}_best_params.json"), "w") as f:
                json.dump(config, f, indent=2)
            LOG.info(f"[{EXP_NAME}] New best saved (val F1={results['best_val_f1']:.4f})")

        return results["best_val_f1"]
    finally:
        # Runs for completed, pruned and failed trials alike, so no trial leaves
        # its model or loaders referenced from this frame.
        del model, train_loader, val_loader, dev_loader
        gc.collect()
        torch.cuda.empty_cache()

## 5. Run Experiment

In [None]:
# Release whatever earlier cells left behind before the search allocates models.
gc.collect()
torch.cuda.empty_cache()

# Seeded TPE sampling; the median pruner is given 6 start-up trials and
# 300 warm-up steps.
tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
median_pruner = optuna.pruners.MedianPruner(n_startup_trials=6, n_warmup_steps=300)

study = optuna.create_study(
    study_name=f"pcl_deberta_exp_{EXP_NAME}",
    direction="maximize",
    sampler=tpe_sampler,
    pruner=median_pruner,
)
study.optimize(objective, n_trials=N_TRIALS)

# Summarise the winning trial: validation F1 is the optimised value, dev F1 is
# the metric recorded alongside it by the objective.
best = study.best_trial
LOG.info(f"Best trial: {best.number}")
LOG.info(f"Val F1: {best.user_attrs['best_val_f1']:.4f} | Dev F1: {best.user_attrs['dev_f1']:.4f}")
LOG.info(f"Best params: {best.params}")

## 6. Results

In [None]:
# Optuna diagnostics to render, keyed by the suffix used in the output file name.
optuna_plots = {
    "history": plot_optimization_history,
    "importances": plot_param_importances,
    "parallel": plot_parallel_coordinate,
}

# Draw each plot for the study, save it under OUT_DIR, then display it.
for suffix, plot_fn in optuna_plots.items():
    plot_fn(study)
    plt.tight_layout()
    plt.savefig(f"{OUT_DIR}/{EXP_NAME}_optuna_{suffix}.png", dpi=300)
    plt.show()

# Rebuild the best trial's architecture from its sampled parameters.
best = study.best_trial
best_params = best.params
best_threshold = best.user_attrs["best_threshold"]
pooling = POOLING_MAP[best_params["pooling"]]
feat_comb = FeatureComb.CONCAT if best_params["feature_comb_method"] == "CONCAT" else FeatureComb.GMF
k_best = best_params["k"]

model = PCLDeBERTa(
    hidden_dim=best_params["hidden_dim"],
    # "dropout_rate" is only sampled when hidden_dim > 0, hence the default.
    dropout_rate=best_params.get("dropout_rate", 0.0),
    n_extra_features=k_best,
    pooling=pooling,
    feature_comb_method=feat_comb,
).to(DEVICE)

# The checkpoint is a plain dict of tensors, so restrict unpickling to tensor
# data instead of allowing arbitrary pickled objects.
state_dict = torch.load(
    os.path.join(OUT_DIR, f"exp_{EXP_NAME}_best_model.pt"),
    map_location=DEVICE,
    weights_only=True,
)
model.load_state_dict(state_dict)

# Only the dev loader is needed here; it must use the same k as the checkpoint.
factory = make_topk_factory(k_best)
_, _, dev_loader = make_dataloaders(
    train_sub_df, val_sub_df, dev_df, BATCH_SIZE, MAX_LENGTH, tokeniser, factory
)
dev_metrics = evaluate(model, DEVICE, dev_loader, threshold=best_threshold)

print(f"\n{'='*60}")
print(f"{EXP_NAME.upper()} — Dev Set Results (threshold={best_threshold:.3f})")
print(f"{'='*60}")
print(classification_report(dev_metrics["labels"], dev_metrics["preds"], target_names=["Non-PCL", "PCL"]))
for param_k, param_v in best_params.items():
    print(f"  {param_k}: {param_v}")

# Show most discriminative n-grams used as features.
# The heading promises the highest signed z-scores, so sort the selected
# n-grams by z explicitly instead of relying on the order in which
# build_topk_ngrams returned them.
print("\nTop-10 PCL n-grams (highest +z):")
topk = topk_ngrams_cache[k_best]
for ng in sorted(topk, key=lambda ngram: Z_SCORES[ngram], reverse=True)[:10]:
    print(f"  {ng!r:30s}  z={Z_SCORES[ng]:+.2f}")

# Free the evaluation model before any further cells run.
del model
gc.collect()
torch.cuda.empty_cache()