# HVSM Notebook: hvsm_prod_2.ipynb

- Runs with: slurm_scripts/hvsm_job_2.sh
- Purpose: CPU TF-IDF + XGBoost/LR with binary rules and prevalence match.



In [None]:
# Parameters (papermill)
# Directory name used to locate the project root and to look up bare CSV
# filenames (see resolve_path).
DATA_DIR = "data"
# Dataset locations; relative values are resolved against the project root.
TRAIN_CSV = "data/train.csv"
VAL_CSV = "data/val.csv"
TEST_CSV = "data/test.csv"

# HVSM — TF–IDF + LR/XGB with Binary Rules, CV, and Prevalence Match

Inputs: `train.csv`, `val.csv`, and `test.csv` under `data/`. All three need a `text` column; `train.csv` and `val.csv` must also have `label`, while `test.csv` must have `id` and no `label`. Output: `outputs/submission_hvsm_prod_2.csv`.


## Imports and guardrails

In [None]:
from __future__ import annotations
import os, re, string, warnings, json, hashlib
import gc
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import numpy as np, pandas as pd
from tqdm import tqdm
from scipy import stats
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
)
from sklearn.model_selection import ParameterSampler, StratifiedKFold
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
import matplotlib.pyplot as plt

# Optional plotting dependency: left as None when unavailable so that
# violin_by_label can fall back to a pandas boxplot.
try:
    import seaborn as sns
except Exception:
    sns = None
# Optional sentiment dependency: when unavailable, _sentiment() writes zeros
# for both sentiment columns instead of failing.
try:
    from textblob import TextBlob
except Exception:
    TextBlob = None
    warnings.warn("TextBlob missing; sentiment features set to zeros.")
# Keep printed arrays and frames within a 79-column console.
np.set_printoptions(linewidth=79)
pd.set_option("display.width", 79)
pd.set_option("display.max_columns", 60)
RANDOM_SEED = 42
# NOTE(review): `rng` is not referenced by any visible cell; the search code
# passes a literal 42 as its random_state instead.
rng = np.random.default_rng(RANDOM_SEED)


## Configuration

In [None]:
@dataclass
class Config:
    """Tunable settings for one notebook run."""
    # NOTE(review): the five TF-IDF related fields below are not read by any
    # visible cell; the TfidfVectorizer call hard-codes its own values
    # (max_features=50000 there versus 40000 here) — confirm which is intended.
    tfidf_max_features: int = 40000
    tfidf_ngram_max: int = 3
    use_char_ngrams: bool = False
    char_tfidf_max_features: int = 15000
    min_df: int = 2
    # Number of stratified folds used by the checkpointed random searches.
    kfolds: int = 3
    # Number of sampled candidates for the XGBoost and LR searches.
    xgb_iter: int = 8
    lr_iter: int = 8
    # NOTE(review): not referenced by any visible cell.
    plot_level: str = "full"


# Single shared configuration instance, echoed to the notebook output.
CFG = Config()
print(CFG)

## Logging, chunked prediction, and plotting helpers

In [None]:
def _gc() -> None:
    """Force a garbage-collection pass; the collected-object count is dropped."""
    gc.collect()


# Maps a step name to the perf_counter() reading taken when the step started.
_STEP_STARTS = {}


def log_step(msg: str) -> None:
    """Print ``msg`` prefixed with the current local timestamp and flush."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)


def log_step_start(name: str) -> None:
    """Record the start time of step ``name`` and log a START line."""
    _STEP_STARTS[name] = time.perf_counter()
    log_step(f"START: {name}")


def log_step_end(name: str) -> None:
    """Log an END line for step ``name``.

    The elapsed time is appended when a matching ``log_step_start`` was
    recorded; the recorded start is consumed either way.
    """
    began = _STEP_STARTS.pop(name, None)
    if began is not None:
        took = time.perf_counter() - began
        log_step(f"END: {name} (elapsed {took:.1f}s)")
        return
    log_step(f"END: {name}")


def predict_proba_chunks(model, X, chunk_size: int = 50000) -> np.ndarray:
    """Return positive-class probabilities for ``X``, predicting in row chunks.

    Args:
        model: Object exposing ``predict_proba(rows)`` whose result has the
            positive class in column 1.
        X: Row-sliceable feature matrix with a ``shape`` attribute.
        chunk_size: Maximum number of rows passed to ``predict_proba`` at once.

    Returns:
        A 1-D ``float32`` array with one probability per row of ``X``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would skip the loop entirely and hand back the
    # uninitialised buffer from np.empty, so reject it up front.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )
    n = X.shape[0]
    out = np.empty(n, dtype=np.float32)
    for start in range(0, n, chunk_size):
        end = min(start + chunk_size, n)
        out[start:end] = model.predict_proba(X[start:end])[:, 1]
    return out


def _tight() -> None:
    """Apply pyplot's tight layout to the current figure."""
    plt.tight_layout()


def qq_plot(residuals: np.ndarray, title: str) -> None:
    """Show a normal probability (Q-Q) plot of ``residuals`` under ``title``."""
    plt.figure(figsize=(5, 4))
    # probplot draws straight onto the pyplot state machine.
    stats.probplot(residuals, plot=plt, dist="norm")
    plt.title(title)
    plt.tight_layout()
    plt.show()


def residual_plot(y_true: np.ndarray, y_prob: np.ndarray, title: str) -> None:
    """Scatter the residuals ``y_true - y_prob`` against ``y_prob``."""
    gap = y_true - y_prob
    plt.figure(figsize=(5, 4))
    plt.scatter(y_prob, gap, s=8)
    # Dashed zero line marks where prediction equals the label.
    plt.axhline(0.0, linestyle="--")
    plt.xlabel("p(y=1)")
    plt.ylabel("residual")
    plt.title(title)
    plt.tight_layout()
    plt.show()


def violin_by_label(
    df: pd.DataFrame, label_col: str, feat_col: str, title: str
) -> None:
    """Plot the distribution of ``feat_col`` per value of ``label_col``.

    Draws a seaborn violin plot when seaborn was imported, otherwise a
    pandas boxplot grouped by the label.
    """
    if sns is not None:
        plt.figure(figsize=(5, 4))
        sns.violinplot(data=df, x=label_col, y=feat_col)
        plt.title(title)
    else:
        df.boxplot(column=feat_col, by=label_col, figsize=(5, 4))
        plt.title(title)
        # Blank out the super-title so only ``title`` is shown.
        plt.suptitle("")
    _tight()
    plt.show()


def plot_roc_pr(y_true: np.ndarray, y_prob: np.ndarray, title: str) -> None:
    """Show ROC and precision-recall curves side by side.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores for the positive class.
        title: Caption for the whole figure. Previously accepted but never
            drawn; it is now rendered as the figure super-title.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    prec, rec, _ = precision_recall_curve(y_true, y_prob)
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    ax[0].plot(fpr, tpr)
    ax[0].set_title(f"ROC AUC={roc_auc_score(y_true, y_prob):.3f}")
    ax[0].set_xlabel("FPR")
    ax[0].set_ylabel("TPR")
    ax[1].plot(rec, prec)
    ax[1].set_title("Precision–Recall")
    ax[1].set_xlabel("Recall")
    ax[1].set_ylabel("Precision")
    # Each panel has its own title, so the caller's title goes on the figure.
    fig.suptitle(title)
    _tight()
    plt.show()


def plot_confusion(y_true: np.ndarray, y_hat: np.ndarray, title: str) -> None:
    """Show the confusion matrix of ``y_hat`` against ``y_true`` as a heatmap."""
    matrix = confusion_matrix(y_true, y_hat)
    plt.figure(figsize=(4, 3))
    plt.imshow(matrix, cmap="Blues")
    plt.title(title)
    plt.colorbar()
    # Annotate every cell with its count; x is the column, y is the row.
    for (row, col), count in np.ndenumerate(matrix):
        plt.text(col, row, int(count), ha="center", va="center")
    plt.xlabel("Pred")
    plt.ylabel("True")
    plt.tight_layout()
    plt.show()

## Processing and features

In [None]:
# NOTE(review): only function definitions run between this call and the
# matching log_step_end in this cell, so the logged elapsed time does not
# include any feature computation.
log_step_start("Processing and features")


def _ttr(text: str) -> float:
    """Return the type-token ratio of ``text``.

    Tokens are lower-cased whitespace-delimited runs; empty input gives 0.0.
    """
    toks = re.findall(r"\S+", text.lower())
    return float(len(set(toks)) / len(toks)) if toks else 0.0


def _sentiment(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``sentiment_polarity`` and ``sentiment_subjectivity`` columns.

    ``df`` is modified in place and also returned. When TextBlob is not
    available both columns are filled with 0.0.
    """
    if TextBlob is None:
        df["sentiment_polarity"] = 0.0
        df["sentiment_subjectivity"] = 0.0
        return df

    def _score(text):
        # Build one TextBlob per row and read both scores from it; the
        # previous version built a second TextBlob per row for subjectivity.
        sentiment = TextBlob(text).sentiment
        return float(sentiment.polarity), float(sentiment.subjectivity)

    tqdm.pandas()
    pairs = df["text"].progress_apply(_score)
    df["sentiment_polarity"] = pd.Series(
        [polarity for polarity, _ in pairs], index=df.index, dtype=float
    )
    df["sentiment_subjectivity"] = pd.Series(
        [subjectivity for _, subjectivity in pairs],
        index=df.index,
        dtype=float,
    )
    return df


def process_text_file(filename: str) -> pd.DataFrame:
    """Read a CSV with a ``text`` column and add surface-level text features.

    Added columns: text_length, word_count, sentence_count,
    avg_sentence_length, punct_count, punct_ratio, ttr, digit_ratio,
    upper_ratio, bangs, questions.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The loaded frame with the feature columns appended.

    Raises:
        AssertionError: If the file has no ``text`` column.
    """
    df = pd.read_csv(filename)
    assert "text" in df.columns, f"{filename} has no 'text' column"
    # Missing cells are read as NaN; without fillna they would become the
    # literal word "nan" and produce features for text that does not exist.
    df["text"] = df["text"].fillna("").astype(str)
    df["text_length"] = df["text"].str.len()
    # Shared denominator for the ratio features; avoids 0/0 on empty text.
    safe_length = df["text_length"].replace(0, 1)
    df["word_count"] = df["text"].str.split().str.len()
    df["sentence_count"] = df["text"].str.count(r"[.!?]+").replace(0, 1)
    df["avg_sentence_length"] = (df["word_count"] / df["sentence_count"]).clip(
        upper=100
    )
    df["punct_count"] = df["text"].str.count(r"[^\w\s]")
    df["punct_ratio"] = (df["punct_count"] / safe_length).clip(0, 0.3)
    df["ttr"] = df["text"].apply(_ttr)
    df["digit_ratio"] = df["text"].str.count(r"\d") / safe_length
    df["upper_ratio"] = df["text"].str.count(r"[A-Z]") / safe_length
    df["bangs"] = df["text"].str.count(r"!")
    df["questions"] = df["text"].str.count(r"\?")
    return df


# Closes the step opened at the top of this cell.
log_step_end("Processing and features")

## Binary features and 2^3 sweep

In [None]:
# NOTE(review): this cell only defines functions, so the elapsed time logged
# by the matching log_step_end does not cover the sweep itself.
log_step_start("Binary features and 2^3 sweep")


def ends_with_letter(text: str) -> int:
    """Return 1 if ``text``, ignoring trailing whitespace, ends in an ASCII letter."""
    trimmed = text.rstrip()
    if not trimmed:
        return 0
    return 1 if trimmed[-1] in string.ascii_letters else 0


def has_5gram_repetition(text: str) -> int:
    """Return 1 if any run of five consecutive tokens occurs twice in ``text``.

    Tokens are whitespace-delimited and case-sensitive. Texts with fewer than
    ten tokens always yield 0.
    """
    words = re.findall(r"\S+", text)
    if len(words) < 10:
        return 0
    width = 5
    observed = set()
    # Zipping the token list against its own shifted copies yields every
    # consecutive window of ``width`` tokens, left to right.
    for window in zip(*(words[offset:] for offset in range(width))):
        if window in observed:
            return 1
        observed.add(window)
    return 0


# Small stop-list of very frequent English words ignored by
# max_uncommon_binary.
COMMON_SMALL = {
    "the", "be", "to", "of", "and", "a", "in", "that",
    "have", "i", "it", "for", "not", "on", "with", "he",
    "as", "you", "do", "at", "this", "but", "his",
}


def max_uncommon_binary(
    text: str, thr_rep: int = 3, thr_count: int = 5
) -> int:
    """Flag texts in which some uncommon word is repeated often.

    Words are lower-cased ``\\w+`` runs that are not in ``COMMON_SMALL``.

    Args:
        text: Input text.
        thr_rep: Minimum occurrences of a single uncommon word.
        thr_count: Minimum total number of uncommon word occurrences.

    Returns:
        1 if the text has at least ``thr_count`` uncommon words and one of
        them occurs at least ``thr_rep`` times, otherwise 0.
    """
    rare = [
        word
        for word in (match.lower() for match in re.findall(r"\w+", text))
        if word not in COMMON_SMALL
    ]
    if len(rare) < thr_count:
        return 0
    tally = {}
    for word in rare:
        tally[word] = tally.get(word, 0) + 1
    repeated = any(count >= thr_rep for count in tally.values())
    return 1 if repeated else 0


def add_binary_feats(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the three binary rule columns appended.

    Each column is computed from ``text`` with a tqdm progress bar.
    """
    enriched = df.copy()
    tqdm.pandas()
    rule_columns = (
        ("ends_with_letter", ends_with_letter),
        ("has_5gram_repetition", has_5gram_repetition),
        ("max_uncommon_binary", max_uncommon_binary),
    )
    for column, rule in rule_columns:
        enriched[column] = enriched["text"].progress_apply(rule)
    return enriched


def sweep_binary_subsets(y_true: np.ndarray, fe_df: pd.DataFrame):
    """Find the subset of binary rules whose OR gives the best F1.

    Every non-empty subset of the three rule columns is tried; a row is
    predicted positive when any selected rule fires.

    Returns:
        ``(key, f1)`` where ``key`` joins the chosen column names with ``|``.
    """
    flags = ["ends_with_letter", "has_5gram_repetition", "max_uncommon_binary"]
    top_key = "none"
    top_f1 = -1.0
    for bits in range(1, 2 ** len(flags)):
        # Bit ``pos`` of ``bits`` decides whether flags[pos] is included.
        chosen = [name for pos, name in enumerate(flags) if bits & (1 << pos)]
        fired = fe_df[chosen].any(axis=1).astype(int).values
        score = f1_score(y_true, fired)
        if score > top_f1:
            top_f1 = score
            top_key = "|".join(chosen)
    return top_key, float(top_f1)


# Closes the step opened at the top of this cell.
log_step_end("Binary features and 2^3 sweep")

## Load data

In [None]:
log_step_start("Load data")

# NOTE(review): Path and hashlib are already imported in the imports cell;
# these two lines repeat them.
from pathlib import Path
import hashlib

# Project root: the working directory if it contains DATA_DIR, otherwise the
# nearest ancestor that does. If no ancestor has it, the working directory is
# kept.
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / DATA_DIR).exists():
    for parent in PROJECT_ROOT.parents:
        if (parent / DATA_DIR).exists():
            PROJECT_ROOT = parent
            break


def resolve_path(path_str: str) -> str:
    """Turn a configured CSV path into a concrete filesystem path.

    Absolute paths are returned unchanged. A bare filename is first looked up
    inside ``DATA_DIR`` (taken relative to ``PROJECT_ROOT`` unless absolute).
    Anything else is resolved against ``PROJECT_ROOT``.
    """
    rel = Path(path_str)
    if rel.is_absolute():
        return str(rel)
    is_bare_filename = rel.parent == Path(".")
    if is_bare_filename:
        base = Path(DATA_DIR)
        if not base.is_absolute():
            base = PROJECT_ROOT / base
        hit = base / rel.name
        if hit.exists():
            return str(hit)
    anchored = PROJECT_ROOT / rel
    return str(anchored.resolve())


# Reassemble chunked CSVs if needed
def ensure_chunked_csv(path: Path) -> None:
    """Make sure ``path`` exists, rebuilding it from ``<name>.part*`` chunks.

    Does nothing when ``path`` already exists. Otherwise the chunk files next
    to it are concatenated into a temporary file, checked against
    ``<name>.sha256`` when that file is present, and moved into place.

    Chunks are joined in natural order, so ``.part2`` comes before ``.part10``;
    zero-padded and alphabetic suffixes keep their lexicographic order.

    Args:
        path: Target file to ensure.

    Raises:
        FileNotFoundError: If neither ``path`` nor any chunk file exists.
        ValueError: If the reassembled bytes do not match the recorded SHA-256.
    """
    if path.exists():
        return
    prefix = path.name + ".part"

    def _chunk_order(part: Path):
        # re.split with a capture group alternates text and digit runs, so
        # odd positions are always numbers and can be compared as ints.
        pieces = re.split(r"([0-9]+)", part.name[len(prefix):])
        key = [int(tok) if pos % 2 else tok for pos, tok in enumerate(pieces)]
        return key, part.name

    parts = sorted(path.parent.glob(prefix + "*"), key=_chunk_order)
    if not parts:
        raise FileNotFoundError(f"Missing {path} and no chunk files found.")
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    if tmp_path.exists():
        tmp_path.unlink()
    hasher = hashlib.sha256()
    try:
        with tmp_path.open("wb") as out:
            for part in parts:
                with part.open("rb") as f:
                    while True:
                        chunk = f.read(1024 * 1024)
                        if not chunk:
                            break
                        out.write(chunk)
                        hasher.update(chunk)
        sha_path = path.with_suffix(path.suffix + ".sha256")
        if sha_path.exists():
            # First whitespace-separated token is taken as the hex digest.
            expected = sha_path.read_text().split()[0]
            actual = hasher.hexdigest()
            if expected != actual:
                raise ValueError(
                    f"SHA256 mismatch for {path}: expected {expected} got {actual}"
                )
        tmp_path.replace(path)
    except BaseException:
        # Never leave a partial or rejected temp file behind.
        tmp_path.unlink(missing_ok=True)
        raise
    log_step(f"Reassembled {path} from {len(parts)} chunks.")


# Resolve the three configured CSV locations, in train/val/test order.
train_path, val_path, test_path = (
    Path(resolve_path(csv_name))
    for csv_name in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)
# Rebuild any file that only exists as chunks before reading it.
ensure_chunked_csv(train_path)
ensure_chunked_csv(val_path)
ensure_chunked_csv(test_path)

train, val, test = (
    process_text_file(str(csv_path))
    for csv_path in (train_path, val_path, test_path)
)
# Labelled splits must carry `label`; the test split must not, and needs `id`.
assert all("label" in frame.columns for frame in (train, val))
assert not ("label" in test.columns)
assert "id" in test.columns
print("Rows:", *(len(frame) for frame in (train, val, test)))
log_step_end("Load data")


## Sentiment + binaries

In [None]:
log_step_start("Sentiment + binaries")
# _sentiment adds its two columns to the frame it is given and returns it.
train = _sentiment(train)
val = _sentiment(val)
test = _sentiment(test)
# add_binary_feats works on a copy; rebinding one split at a time means each
# previous frame is no longer referenced by these names once replaced.
train = add_binary_feats(train)
val = add_binary_feats(val)
test = add_binary_feats(test)
# Best OR-combination of the three binary rules, scored by F1 on validation.
# NOTE(review): the same sweep is run again in the ensembling cell.
rk, rf1 = sweep_binary_subsets(val["label"].astype(int).values, val)
print(f"Best binary subset (val): {rk} | F1={rf1:.4f}")
log_step_end("Sentiment + binaries")

## Numeric + TF–IDF design

In [None]:
log_step_start("Numeric + TF–IDF design")
# Hand-crafted numeric and binary columns placed in front of the TF-IDF block.
num_cols = [
    "text_length",
    "word_count",
    "ttr",
    "sentence_count",
    "avg_sentence_length",
    "punct_ratio",
    "sentiment_polarity",
    "sentiment_subjectivity",
    "digit_ratio",
    "upper_ratio",
    "bangs",
    "questions",
    "ends_with_letter",
    "has_5gram_repetition",
    "max_uncommon_binary",
]
Xtr_num = csr_matrix(train[num_cols].to_numpy(dtype=np.float32))
Xva_num = csr_matrix(val[num_cols].to_numpy(dtype=np.float32))
Xte_num = csr_matrix(test[num_cols].to_numpy(dtype=np.float32))
# NOTE(review): these literals are independent of CFG; CFG.tfidf_max_features
# is 40000 while this call uses 50000 — confirm which value is intended.
vec_word = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=50000,
    min_df=2,
    stop_words="english",
    dtype=np.float32,
)
# The vocabulary is fitted on the training split only.
Xtr_w = vec_word.fit_transform(train["text"])
Xva_w = vec_word.transform(val["text"])
Xte_w = vec_word.transform(test["text"])
# Request CSR explicitly: later cells take row subsets (X[tr_idx],
# X[start:end]), which COO matrices do not support, and hstack otherwise
# leaves the output format to SciPy.
X_train = hstack([Xtr_num, Xtr_w], format="csr")
X_val = hstack([Xva_num, Xva_w], format="csr")
X_test = hstack([Xte_num, Xte_w], format="csr")
y_train = train["label"].astype(int).values
y_val = val["label"].astype(int).values
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
_gc()
log_step_end("Numeric + TF–IDF design")

## Tuning and calibration

In [None]:
# Opens the step that is closed after calibration at the end of this cell.
log_step_start("Tuning and calibration")


def _xgb_space():
    """Return a fresh dict of candidate XGBoost hyper-parameter values."""
    return dict(
        n_estimators=[200, 300, 400, 500],
        max_depth=[4, 6, 8],
        learning_rate=[0.05, 0.1],
        min_child_weight=[1, 3],
        subsample=[0.7, 1.0],
        colsample_bytree=[0.7, 1.0],
        reg_alpha=[0.0, 0.1, 0.5],
        reg_lambda=[0.5, 1.0, 1.5],
    )


def _lr_space():
    """Return a fresh dict of candidate LogisticRegression settings."""
    return dict(
        C=[0.5, 1.0, 2.0, 4.0],
        penalty=["l2"],
        solver=["liblinear", "lbfgs"],
        class_weight=[None, "balanced"],
    )


def _search_signature(param_distributions, n_iter, kfolds, random_state, X_shape):
    """Return a 10-character hex fingerprint of a search configuration.

    The fingerprint is the truncated MD5 of the key-sorted JSON dump of all
    five arguments; values JSON cannot encode are stringified.
    """
    fingerprint = dict(
        param_distributions=param_distributions,
        n_iter=n_iter,
        kfolds=kfolds,
        random_state=random_state,
        X_shape=list(X_shape),
    )
    encoded = json.dumps(fingerprint, sort_keys=True, default=str).encode()
    digest = hashlib.md5(encoded).hexdigest()
    return digest[:10]


def _load_results(results_path: Path):
    """Read a JSON-lines results file into a list of records.

    A missing file yields an empty list. Blank lines and lines that are not
    valid JSON are skipped.
    """
    if not results_path.exists():
        return []
    parsed = []
    with results_path.open() as handle:
        for text in map(str.strip, handle):
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                continue
            parsed.append(record)
    return parsed


def _checkpointed_random_search(
    estimator_factory,
    param_distributions,
    X,
    y,
    n_iter,
    kfolds,
    random_state,
    search_name,
):
    """Randomized hyper-parameter search that checkpoints every fold to disk.

    Candidates are sampled once and stored under ``PROJECT_ROOT/checkpoints``
    together with a JSON-lines log holding one record per (candidate, fold).
    A later call with the same signature reloads both and skips every pair
    already logged as "ok" or "fail". Candidates are ranked by mean F1 over
    all ``kfolds`` folds; a candidate with any failed fold is never ranked.

    Args:
        estimator_factory: Callable mapping a parameter dict to an unfitted
            estimator with ``fit`` and ``predict``.
        param_distributions: Search space passed to ``ParameterSampler``.
        X: Row-indexable feature matrix.
        y: Label array indexable by the fold indices.
        n_iter: Number of candidates to sample.
        kfolds: Number of stratified folds.
        random_state: Seed for both candidate sampling and fold shuffling.
        search_name: Prefix for the checkpoint file names and log lines.

    Returns:
        The best parameter dict, or None when no candidate was fully scored.
    """
    ckpt_dir = Path(PROJECT_ROOT) / "checkpoints"
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    # The signature ties checkpoint files to this exact search setup.
    sig = _search_signature(
        param_distributions, n_iter, kfolds, random_state, X.shape
    )
    candidates_path = ckpt_dir / f"{search_name}_{sig}_candidates.json"
    results_path = ckpt_dir / f"{search_name}_{sig}_results.jsonl"
    meta_path = ckpt_dir / f"{search_name}_{sig}_meta.json"

    if candidates_path.exists():
        candidates = json.loads(candidates_path.read_text())
    else:
        candidates = list(
            ParameterSampler(
                param_distributions, n_iter=n_iter, random_state=random_state
            )
        )
        candidates_path.write_text(json.dumps(candidates, indent=2))

    meta_path.write_text(
        json.dumps(
            {
                "signature": sig,
                "n_iter": n_iter,
                "kfolds": kfolds,
                "random_state": random_state,
                "X_shape": list(X.shape),
                "n_candidates": len(candidates),
            },
            indent=2,
        )
    )

    # Rebuild the in-memory state from whatever an earlier run logged.
    rows = _load_results(results_path)
    scores_by_cand = {}
    done = set()
    for row in rows:
        cand = row.get("cand_idx")
        fold = row.get("fold_idx")
        status = row.get("status")
        if cand is None or fold is None:
            continue
        cand = int(cand)
        fold = int(fold)
        if status == "ok":
            scores_by_cand.setdefault(cand, {})[fold] = float(
                row.get("score", 0.0)
            )
        # Failed folds also count as done, so they are not retried on resume.
        if status in ("ok", "fail"):
            done.add((cand, fold))

    completed = sum(
        1 for scores in scores_by_cand.values() if len(scores) == kfolds
    )
    if rows:
        print(
            f"Resuming {search_name}: {completed}/{len(candidates)} "
            "candidates fully scored."
        )

    # Materialise the folds once so every candidate sees identical splits.
    splits = list(
        StratifiedKFold(
            n_splits=kfolds, shuffle=True, random_state=random_state
        ).split(X, y)
    )

    # Seed the running best from candidates completed in earlier runs.
    best_score = -1.0
    best_params = None
    for cand_idx, scores in scores_by_cand.items():
        if len(scores) == kfolds:
            mean = float(np.mean(list(scores.values())))
            if mean > best_score:
                best_score = mean
                best_params = candidates[cand_idx]

    with results_path.open("a") as f:
        for cand_idx, params in enumerate(candidates):
            cand_scores = scores_by_cand.setdefault(cand_idx, {})
            if len(cand_scores) == kfolds:
                continue
            log_step(
                f"{search_name}: candidate {cand_idx + 1}/{len(candidates)}"
            )
            for fold_idx, (tr_idx, va_idx) in enumerate(splits):
                if (cand_idx, fold_idx) in done:
                    continue
                start = time.perf_counter()
                status = "ok"
                score = None
                error = None
                try:
                    model = estimator_factory(params)
                    model.fit(X[tr_idx], y[tr_idx])
                    preds = model.predict(X[va_idx])
                    score = float(f1_score(y[va_idx], preds))
                    cand_scores[fold_idx] = score
                except Exception as exc:
                    # Best effort: one bad fold is recorded, not fatal.
                    status = "fail"
                    error = str(exc)
                elapsed = time.perf_counter() - start
                record = {
                    "cand_idx": cand_idx,
                    "fold_idx": fold_idx,
                    "status": status,
                    "score": score,
                    "elapsed_sec": round(elapsed, 2),
                    "params": params,
                    "error": error,
                }
                # One JSON object per line. The literal was split by a raw
                # line break in the source, which is a syntax error.
                f.write(json.dumps(record) + "\n")
                # Flush and fsync so the record is pushed to disk right away.
                f.flush()
                os.fsync(f.fileno())
                done.add((cand_idx, fold_idx))
                if status == "ok":
                    log_step(
                        f"{search_name} cand {cand_idx + 1} "
                        f"fold {fold_idx + 1}/{kfolds} f1={score:.4f} "
                        f"({elapsed / 60:.1f} min)"
                    )
                else:
                    log_step(
                        f"{search_name} cand {cand_idx + 1} "
                        f"fold {fold_idx + 1}/{kfolds} failed: {error}"
                    )
                _gc()
            if len(cand_scores) == kfolds:
                mean = float(np.mean(list(cand_scores.values())))
                log_step(
                    f"{search_name} candidate {cand_idx + 1} mean f1={mean:.4f}"
                )
                if mean > best_score:
                    best_score = mean
                    best_params = params

    if best_params is None:
        return None

    params_path = ckpt_dir / f"{search_name}_{sig}_best.json"
    params_path.write_text(
        json.dumps(
            {"best_score": best_score, "best_params": best_params}, indent=2
        )
    )
    return best_params


def tune_xgb(X, y):
    """Search XGBoost hyper-parameters on ``(X, y)`` and return a fitted model.

    The winning parameters are refitted on all of ``X``. If the search scores
    no candidate, a model with only the fixed base parameters is fitted.
    """
    fixed = dict(
        random_state=42,
        eval_metric="logloss",
        tree_method="hist",
        max_bin=256,
        n_jobs=1,
    )

    def build(sampled):
        return XGBClassifier(**fixed, **sampled)

    log_step_start("XGB randomized search")
    chosen = _checkpointed_random_search(
        build,
        _xgb_space(),
        X,
        y,
        n_iter=CFG.xgb_iter,
        kfolds=CFG.kfolds,
        random_state=42,
        search_name="xgb_search",
    )
    log_step_end("XGB randomized search")

    if chosen is not None:
        model = build(chosen)
        log_step_start("XGB refit on full data")
        model.fit(X, y)
        log_step_end("XGB refit on full data")
        return model

    warnings.warn("XGB search produced no valid candidates; using default.")
    model = XGBClassifier(**fixed)
    model.fit(X, y)
    return model


def tune_lr(X, y):
    """Search LogisticRegression settings on ``(X, y)`` and return a fitted model.

    The winning parameters are refitted on all of ``X``. If the search scores
    no candidate, a model with only the fixed base parameters is fitted.
    """
    fixed = dict(max_iter=2000, random_state=42)

    def build(sampled):
        return LogisticRegression(**fixed, **sampled)

    log_step_start("LR randomized search")
    chosen = _checkpointed_random_search(
        build,
        _lr_space(),
        X,
        y,
        n_iter=CFG.lr_iter,
        kfolds=CFG.kfolds,
        random_state=42,
        search_name="lr_search",
    )
    log_step_end("LR randomized search")

    if chosen is not None:
        model = build(chosen)
        log_step_start("LR refit on full data")
        model.fit(X, y)
        log_step_end("LR refit on full data")
        return model

    warnings.warn("LR search produced no valid candidates; using default.")
    model = LogisticRegression(**fixed)
    model.fit(X, y)
    return model


# Both tune_* helpers return a model already fitted on X_train.
xgb_tuned = tune_xgb(X_train, y_train)
lr_tuned = tune_lr(X_train, y_train)
# Refit both tuned models on the train and validation rows stacked together.
X_trval = vstack([X_train, X_val])
y_trval = np.concatenate([y_train, y_val])
log_step_start("Fold 1/1 (single split)")
log_step_start("XGB training epochs")
# NOTE(review): X_val is part of X_trval, so the early-stopping eval set below
# consists of rows the model is also trained on — confirm this is intended.
# NOTE(review): newer XGBoost releases may no longer accept `callbacks` as a
# fit() argument — verify against the pinned version.
xgb_tuned.fit(
    X_trval,
    y_trval,
    eval_set=[(X_val, y_val)],
    verbose=True,
    callbacks=[EarlyStopping(rounds=50)],
)
log_step_end("XGB training epochs")
log_step_start("LR fit")
lr_tuned.fit(X_trval, y_trval)
log_step_end("LR fit")
log_step_end("Fold 1/1 (single split)")

# Sigmoid calibration on top of the already-fitted models.
# NOTE(review): both models were just trained on X_val, the same rows used
# here for calibration — confirm this is intended.
# NOTE(review): cv="prefit" may be deprecated in newer scikit-learn releases —
# verify against the pinned version.
cal_xgb = CalibratedClassifierCV(xgb_tuned, method="sigmoid", cv="prefit")
cal_xgb.fit(X_val, y_val)
cal_lr = CalibratedClassifierCV(lr_tuned, method="sigmoid", cv="prefit")
cal_lr.fit(X_val, y_val)
_gc()
log_step_end("Tuning and calibration")


## Ensembling, thresholding, prevalence match

In [None]:
# Opens the step that is closed at the end of this cell.
log_step_start("Ensembling, thresholding, prevalence match")


def decode_prevalence(y_prob: np.ndarray, pos_rate: float) -> np.ndarray:
    """Label the highest-scoring rows positive to match a target prevalence.

    Args:
        y_prob: 1-D array-like of scores; higher means more likely positive.
        pos_rate: Desired fraction of positives. Values outside [0, 1] are
            treated as 0 or 1 respectively.

    Returns:
        Integer array of 0/1 labels with ``round(pos_rate * n)`` ones, clamped
        to the range ``[0, n]``.
    """
    y_prob = np.asarray(y_prob)
    n = len(y_prob)
    # Clamp k: a negative k would make idx[:k] slice from the end and mark
    # rows positive even though the requested rate is below zero.
    k = min(max(int(round(pos_rate * n)), 0), n)
    idx = np.argsort(-y_prob)
    out = np.zeros(n, dtype=int)
    out[idx[:k]] = 1
    return out


# Calibrated positive-class probabilities on the validation split.
p_xgb = cal_xgb.predict_proba(X_val)[:, 1]
p_lr = cal_lr.predict_proba(X_val)[:, 1]
# Threshold head: grid over the XGB blend weight (21 values in [0, 1]) and the
# decision threshold (steps of 0.01 starting at 0.1), keeping the first
# combination that reaches the highest validation F1.
best_w, best_f1, best_thr = 0.5, -1.0, 0.5
for w in np.linspace(0.0, 1.0, 21):
    p = w * p_xgb + (1.0 - w) * p_lr
    for thr in np.arange(0.1, 0.91, 0.01):
        f1 = f1_score(y_val, (p >= thr).astype(int))
        if f1 > best_f1:
            best_w, best_f1, best_thr = float(w), float(f1), float(thr)
print(f"Threshold head: w={best_w:.2f} thr={best_thr:.2f} F1={best_f1:.4f}")
p_ens = best_w * p_xgb + (1.0 - best_w) * p_lr
# Prevalence head: mark the top-scoring rows positive so the predicted
# positive rate equals the observed validation rate.
val_pos_rate = float(np.mean(y_val))
yhat_topk = decode_prevalence(p_ens, val_pos_rate)
f1_topk = f1_score(y_val, yhat_topk)
print(f"Prevalence head: rate={val_pos_rate:.3f} F1={f1_topk:.4f}")
# Rule head: best OR-combination of the binary rule columns.
rk, rf1 = sweep_binary_subsets(y_val, val)
print(f"Rule head (best subset {rk}) F1={rf1:.4f}")
# Rank the three heads by validation F1, best first.
heads = [("threshold", best_f1), ("prevalence", f1_topk), ("rule", rf1)]
heads.sort(key=lambda x: x[1], reverse=True)
print("Head ranking:", heads)
log_step_end("Ensembling, thresholding, prevalence match")

## Validation diagnostics

In [None]:
log_step_start("Validation diagnostics")
# The winning head is the one with the highest validation F1.
winner = heads[0][0]
if winner == "threshold":
    yhat_val = (p_ens >= best_thr).astype(int)
elif winner == "prevalence":
    yhat_val = yhat_topk
else:
    # Predict with the subset the sweep selected (`rk`), which is what the
    # rule head's F1 was measured on. The previous code OR-ed all three
    # columns regardless of the winning subset.
    rule_cols = (
        rk.split("|")
        if rk != "none"
        else ["ends_with_letter", "has_5gram_repetition", "max_uncommon_binary"]
    )
    yhat_val = val[rule_cols].any(axis=1).astype(int).values
print(classification_report(y_val, yhat_val))
# The diagnostic plots use the ensemble probabilities whichever head won.
residual_plot(y_val, p_ens, "Residuals: ensemble on val")
qq_plot(y_val - p_ens, "QQ: residuals (val)")
plot_roc_pr(y_val, p_ens, "Validation ROC/PR (ensemble)")
plot_confusion(y_val, yhat_val, "Confusion (val, winner head)")
log_step_end("Validation diagnostics")

## Predict test and save submission

In [None]:
log_step_start("Predict test and save submission")
# Score the test split in chunks and blend with the weight chosen on validation.
p_xgb_te = predict_proba_chunks(cal_xgb, X_test)
p_lr_te = predict_proba_chunks(cal_lr, X_test)
p_ens_te = best_w * p_xgb_te + (1.0 - best_w) * p_lr_te
if winner == "threshold":
    yhat_te = (p_ens_te >= best_thr).astype(int)
elif winner == "prevalence":
    # Reuse the validation positive rate as the target prevalence for test.
    yhat_te = decode_prevalence(p_ens_te, val_pos_rate)
else:
    # Apply the rule subset selected on validation (`rk`), not all three
    # columns, so test predictions match the head that was scored.
    rule_cols = (
        rk.split("|")
        if rk != "none"
        else ["ends_with_letter", "has_5gram_repetition", "max_uncommon_binary"]
    )
    yhat_te = test[rule_cols].any(axis=1).astype(int).values
submission = pd.DataFrame({"id": test["id"], "label": yhat_te})
# NOTE(review): this is relative to the working directory, whereas the search
# checkpoints are written under PROJECT_ROOT — confirm the intended location.
outputs_dir = "outputs"
os.makedirs(outputs_dir, exist_ok=True)
submission_path = os.path.join(outputs_dir, "submission_hvsm_prod_2.csv")
submission.to_csv(submission_path, index=False)
print("Saved", submission_path, "with", len(submission), "rows")
_gc()
log_step_end("Predict test and save submission")