# HVSM Notebook: hvsm_prod.ipynb

- Runs with: slurm_scripts/hvsm_job.sh
- Purpose: Baseline CPU TF-IDF + XGBoost + Logistic Regression pipeline.



In [None]:
# Parameters (papermill)
# Directory name used both to locate the project root and to look up bare
# file names (see the "Load data" cell).
DATA_DIR = "data"
# CSV locations. Relative values are resolved against the discovered
# project root by resolve_path() in the "Load data" cell.
TRAIN_CSV = "data/train.csv"
VAL_CSV = "data/val.csv"
TEST_CSV = "data/test.csv"

# Kaggle Baseline: TF–IDF + XGBoost + Logistic Regression

This notebook implements the provided baseline pipeline unchanged, adding documentation, type hints, assertions, and diagnostic plots. It expects `train.csv`, `val.csv`, and `test.csv` to reside in `data/`. The outputs include validation reports and an `outputs/submission_hvsm_prod.csv` file for Kaggle.

The workflow:
1. Load CSVs and engineer basic text features.
2. Compute n-gram TF–IDF features (the code below uses n-grams up to length 11, capped at 5,000 features).
3. Train XGBoost and Logistic Regression models.
4. Calibrate via Platt scaling.
5. Ensemble (weighted average) and threshold tune on validation.
6. Generate predictions on `data/test.csv`.

Additional diagnostics: QQ plot, residual plot, violin plot, and a brief sanity audit of the inputs.


In [None]:
from __future__ import annotations

from typing import List, Tuple
import os
import re
import inspect
import warnings
import gc
import time
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, f1_score
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
from scipy import stats

try:
    import seaborn as sns  # optional, for violin plots
except Exception:  # pragma: no cover
    sns = None

try:
    from textblob import TextBlob
except Exception as e:  # pragma: no cover
    TextBlob = None
    warnings.warn("TextBlob not available; sentiment features will be zeros.")
from datetime import datetime


## Utilities and plotting

In [None]:
def _gc() -> None:
    """Run a garbage-collection pass (wrapper over ``gc.collect()``).

    The count returned by ``gc.collect()`` is discarded.
    """
    gc.collect()


# perf_counter readings of steps that have been started but not yet ended,
# keyed by step name.
_STEP_STARTS = {}


def log_step(msg: str) -> None:
    """Print ``msg`` prefixed with the current local time and flush stdout.

    Args:
        msg: Text to print after the ``[YYYY-MM-DD HH:MM:SS]`` prefix.
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(f"[{stamp}] {msg}", flush=True)


def log_step_start(name: str) -> None:
    """Record the start time of step ``name`` and log a START line.

    Args:
        name: Step label; reusing a label overwrites its recorded start.
    """
    _STEP_STARTS[name] = time.perf_counter()
    log_step(f"START: {name}")


def log_step_end(name: str) -> None:
    """Log an END line for step ``name``, with elapsed time when known.

    The elapsed suffix is omitted if ``log_step_start`` was never called
    for ``name`` (or the step was already ended).

    Args:
        name: Step label previously passed to ``log_step_start``.
    """
    began = _STEP_STARTS.pop(name, None)
    suffix = ""
    if began is not None:
        suffix = f" (elapsed {time.perf_counter() - began:.1f}s)"
    log_step(f"END: {name}{suffix}")


def predict_proba_chunks(model, X, chunk_size: int = 50000) -> np.ndarray:
    """Predict positive-class probabilities one row chunk at a time.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its output is
            taken as the positive-class probability.
        X: Feature matrix with samples as rows. Anything exposing
            ``tocsr()`` (SciPy sparse matrices) is converted to CSR first,
            because formats such as COO do not support row slicing.
        chunk_size: Maximum number of rows passed to each
            ``predict_proba`` call.

    Returns:
        1-D ``float32`` array with one probability per row of ``X``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would skip the loop entirely and hand back the
    # uninitialised buffer allocated by np.empty below.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    to_csr = getattr(X, "tocsr", None)
    if callable(to_csr):
        X = to_csr()

    n = X.shape[0]
    out = np.empty(n, dtype=np.float32)
    for start in range(0, n, chunk_size):
        end = min(start + chunk_size, n)
        out[start:end] = model.predict_proba(X[start:end])[:, 1]
    return out


def qq_plot(residuals: np.ndarray, title: str) -> None:
    """Show a normal quantile-quantile plot of ``residuals``.

    Args:
        residuals: Values to compare against a normal distribution.
        title: Title placed on the axes.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    stats.probplot(residuals, dist="norm", plot=ax)
    # probplot sets its own default title; replace it with the caller's.
    ax.set_title(title)
    fig.tight_layout()
    plt.show()


def residual_plot(y_true: np.ndarray, y_prob: np.ndarray, title: str) -> None:
    """Show residuals (label minus probability) against the probability.

    Args:
        y_true: True binary labels.
        y_prob: Predicted probabilities for the positive class.
        title: Title placed on the axes.
    """
    residuals = y_true - y_prob
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.scatter(y_prob, residuals, s=8)
    # Dashed zero line as a visual reference for perfect predictions.
    ax.axhline(0.0, linestyle="--")
    ax.set_xlabel("p(y=1)")
    ax.set_ylabel("residual")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()


def violin_by_label(
    df: pd.DataFrame, label_col: str, feat_col: str, title: str
) -> None:
    """Plot the distribution of a numeric feature split by label.

    Draws a seaborn violin plot, or a pandas box plot when seaborn could
    not be imported. Both variants draw onto the same explicitly created
    axes, so exactly one figure is shown.

    Args:
        df: DataFrame containing labels and the feature.
        label_col: Name of the label column.
        feat_col: Name of the numeric feature column.
        title: Plot title.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    if sns is None:
        # Pass `ax` explicitly: with `by=` and no axes, pandas opens a new
        # figure of its own and the one created above would be shown blank.
        df.boxplot(column=feat_col, by=label_col, ax=ax)
        # Remove the "Boxplot grouped by ..." header pandas adds.
        fig.suptitle("")
    else:
        sns.violinplot(data=df, x=label_col, y=feat_col, ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

## Data loading and processing

In [None]:
# NOTE: this START/END pair brackets only the definition of
# process_text_file below, so the elapsed time it logs does not include
# any actual data loading.
log_step_start("Data loading and processing")


def process_text_file(filename: str) -> pd.DataFrame:
    """Load a CSV and compute simple text-derived features.

    The file is expected to contain at least a `text` column, and, for
    training/validation, a `label` column. Missing `text` cells are treated
    as empty strings rather than the literal string "nan".

    Added columns: `text_length`, `word_count`, `sentence_count`,
    `avg_sentence_length` (capped at 100), `punct_count`, `punct_ratio`
    (clipped to [0, 0.3]) and `ttr` (type-token ratio).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        DataFrame with additional feature columns.

    Raises:
        AssertionError: If required columns are missing.
    """
    df = pd.read_csv(filename)
    assert "text" in df.columns, "CSV must contain a text column."
    # Missing cells load as NaN; astype(str) alone would turn them into the
    # three-character text "nan" and produce bogus features for those rows.
    df["text"] = df["text"].fillna("").astype(str)

    df["text_length"] = df["text"].str.len()
    df["word_count"] = df["text"].str.split().str.len()
    # Treat text without sentence-ending punctuation as a single sentence.
    df["sentence_count"] = df["text"].str.count(r"[.!?]+").replace(0, 1)
    df["avg_sentence_length"] = (df["word_count"] / df["sentence_count"]).clip(
        upper=100
    )
    df["punct_count"] = df["text"].str.count(r"[^\w\s]")
    # Empty text has length 0; divide by 1 so the ratio is 0.0, not NaN.
    safe_length = df["text_length"].replace(0, 1)
    df["punct_ratio"] = (df["punct_count"] / safe_length).clip(0, 0.3)

    def ttr(text: str) -> float:
        """Return unique-token count over token count (0.0 for no tokens)."""
        words = re.findall(r"\S+", text.lower())
        return len(set(words)) / len(words) if words else 0.0

    tqdm.pandas()
    df["ttr"] = df["text"].progress_apply(ttr)
    return df


# Closes the timer opened before the process_text_file definition; only
# the time taken to define the function has elapsed at this point.
log_step_end("Data loading and processing")

## TF–IDF features

In [None]:
# NOTE: this START/END pair brackets only the definition of add_ngram_tfidf
# below; no TF-IDF computation happens inside it.
log_step_start("TF–IDF features")


def add_ngram_tfidf(
    train_texts: pd.Series,
    valid_texts: pd.Series,
    test_texts: pd.Series,
    n: int = 11,
    max_features: int = 5000,
) -> Tuple[csr_matrix, csr_matrix, csr_matrix]:
    """Vectorize three text splits with one TF–IDF vocabulary.

    The vectorizer is fitted on the training texts only and then applied
    to the validation and test texts.

    Args:
        train_texts: Training texts (used to fit the vectorizer).
        valid_texts: Validation texts.
        test_texts: Test texts.
        n: Upper bound of the n-gram range; the lower bound is 1.
        max_features: Vocabulary size cap.

    Returns:
        Tuple of sparse float32 matrices (train, valid, test).
    """
    tfidf = TfidfVectorizer(
        ngram_range=(1, n),
        max_features=max_features,
        stop_words="english",
        dtype=np.float32,
    )
    train_matrix = tfidf.fit_transform(train_texts)
    valid_matrix, test_matrix = (
        tfidf.transform(texts) for texts in (valid_texts, test_texts)
    )
    return train_matrix, valid_matrix, test_matrix


# Closes the timer opened before the add_ngram_tfidf definition.
log_step_end("TF–IDF features")

## Sentiment features

In [None]:
# NOTE: this START/END pair brackets only the definition of
# add_sentiment_features below; no sentiment scoring happens inside it.
log_step_start("Sentiment features")


def add_sentiment_features(df: pd.DataFrame) -> pd.DataFrame:
    """Attach TextBlob sentiment features.

    If TextBlob is unavailable, both features are set to 0.0. This function
    does not warn in that case; the warning is emitted once, when the
    TextBlob import fails at the top of the notebook.

    Each text is analysed once and both scores are taken from that single
    result, so only one progress bar is shown.

    Args:
        df: DataFrame with `text` column. Modified in place.

    Returns:
        The same DataFrame with `sentiment_polarity` and
        `sentiment_subjectivity` columns.
    """
    tqdm.pandas()
    if TextBlob is None:
        df["sentiment_polarity"] = 0.0
        df["sentiment_subjectivity"] = 0.0
        return df

    def _scores(text: str) -> tuple:
        """Return (polarity, subjectivity) from one TextBlob analysis."""
        sentiment = TextBlob(text).sentiment
        return float(sentiment.polarity), float(sentiment.subjectivity)

    pairs = df["text"].progress_apply(_scores)
    # reshape keeps the (n, 2) layout valid even for an empty frame.
    values = np.array(pairs.tolist(), dtype=np.float64).reshape(-1, 2)
    df["sentiment_polarity"] = values[:, 0]
    df["sentiment_subjectivity"] = values[:, 1]
    return df


# Closes the timer opened before the add_sentiment_features definition.
log_step_end("Sentiment features")

## Load data (train/val/test)

In [None]:
log_step_start("Load data (train/val/test)")

from pathlib import Path
import hashlib

# Project root: the working directory if it contains DATA_DIR, otherwise
# the nearest ancestor that does. Stays at the working directory when no
# candidate contains DATA_DIR.
PROJECT_ROOT = Path.cwd()
for _root in (PROJECT_ROOT, *PROJECT_ROOT.parents):
    if (_root / DATA_DIR).exists():
        PROJECT_ROOT = _root
        break


def resolve_path(path_str: str) -> str:
    """Turn a configured path into a concrete path string.

    Absolute paths are returned unchanged. A bare file name (no directory
    component) is looked up inside DATA_DIR first and used if it exists
    there. Everything else is resolved relative to PROJECT_ROOT.

    Args:
        path_str: Absolute path, bare file name, or project-relative path.

    Returns:
        The chosen path as a string.
    """
    target = Path(path_str)
    if target.is_absolute():
        return str(target)

    is_bare_name = target.parent == Path(".")
    if is_bare_name:
        base = Path(DATA_DIR)
        base = base if base.is_absolute() else PROJECT_ROOT / base
        in_data_dir = base / target.name
        if in_data_dir.exists():
            return str(in_data_dir)

    return str((PROJECT_ROOT / target).resolve())


# Reassemble chunked CSVs if needed
def ensure_chunked_csv(path: Path) -> None:
    """Rebuild ``path`` from ``<name>.part*`` chunk files when it is missing.

    Does nothing if ``path`` already exists. Otherwise the chunk files next
    to it are concatenated into ``<name>.tmp`` in natural order (so
    ``.part2`` comes before ``.part10`` even without zero padding), checked
    against ``<name>.sha256`` when that file exists, and then moved into
    place.

    Args:
        path: Destination file to ensure.

    Raises:
        FileNotFoundError: If ``path`` is missing and no chunk files exist.
        ValueError: If the reassembled bytes do not match the recorded
            SHA-256 digest.
    """
    if path.exists():
        return

    def _natural_key(part: Path):
        # re.split with a capture group alternates text, digits, text, ...
        # so every odd position is a digit run and compares numerically.
        tokens = re.split(r"(\d+)", part.name)
        numbered = [
            int(tok) if i % 2 else tok for i, tok in enumerate(tokens)
        ]
        return numbered, part.name

    parts = sorted(path.parent.glob(path.name + ".part*"), key=_natural_key)
    if not parts:
        raise FileNotFoundError(f"Missing {path} and no chunk files found.")

    tmp_path = path.with_suffix(path.suffix + ".tmp")
    # Drop any leftover from an earlier interrupted run.
    if tmp_path.exists():
        tmp_path.unlink()

    hasher = hashlib.sha256()
    with tmp_path.open("wb") as out:
        for part in parts:
            with part.open("rb") as f:
                while chunk := f.read(1024 * 1024):
                    out.write(chunk)
                    hasher.update(chunk)

    sha_path = path.with_suffix(path.suffix + ".sha256")
    if sha_path.exists():
        # Only the first whitespace-separated token is the digest.
        expected = sha_path.read_text().split()[0]
        actual = hasher.hexdigest()
        # hexdigest() is lower-case; some tools write upper-case digests.
        if expected.lower() != actual:
            tmp_path.unlink(missing_ok=True)
            raise ValueError(
                f"SHA256 mismatch for {path}: expected {expected} got {actual}"
            )
    tmp_path.replace(path)
    log_step(f"Reassembled {path} from {len(parts)} chunks.")


# Strict file names in the `data/` folder
train_path = Path(resolve_path(TRAIN_CSV))
val_path = Path(resolve_path(VAL_CSV))
test_path = Path(resolve_path(TEST_CSV))
# Rebuild any CSV that is only present as chunk files.
ensure_chunked_csv(train_path)
ensure_chunked_csv(val_path)
ensure_chunked_csv(test_path)

train = process_text_file(str(train_path))
validation = process_text_file(str(val_path))
test = process_text_file(str(test_path))

# Basic schema checks
for name, df in [("train", train), ("val", validation), ("test", test)]:
    assert "text" in df.columns, f"{name} missing 'text' column"
assert "label" in train.columns, "train must have label"
assert "label" in validation.columns, "val must have label"
assert "label" not in test.columns, "test must NOT have label"
# The submission cell reads test["id"]; check it here so a missing column
# fails before training instead of after it.
assert "id" in test.columns, "test must have id"

print("Rows: train", len(train), " val", len(validation), " test", len(test))
log_step_end("Load data (train/val/test)")


## Add sentiment features

In [None]:
log_step_start("Add sentiment features")
# Same order as the splits are listed: train, then validation, then test.
train, validation, test = (
    add_sentiment_features(frame) for frame in (train, validation, test)
)
log_step_end("Add sentiment features")

## Assemble features

In [None]:
log_step_start("Assemble features")
# Hand-crafted numeric columns that are placed in front of the TF-IDF block.
feature_cols: List[str] = [
    "text_length",
    "word_count",
    "ttr",
    "sentence_count",
    "avg_sentence_length",
    "punct_ratio",
    "sentiment_polarity",
    "sentiment_subjectivity",
]

X_train_basic = train[feature_cols]
X_valid_basic = validation[feature_cols]
X_test_basic = test[feature_cols]

X_train_ngram, X_valid_ngram, X_test_ngram = add_ngram_tfidf(
    train["text"], validation["text"], test["text"], n=11, max_features=5000
)

# format="csr" is requested explicitly: without it the result format of
# hstack depends on the SciPy version and may be COO, which cannot be
# row-sliced by predict_proba_chunks later on.
X_train = hstack(
    [csr_matrix(X_train_basic.to_numpy(dtype=np.float32)), X_train_ngram],
    format="csr",
)
X_valid = hstack(
    [csr_matrix(X_valid_basic.to_numpy(dtype=np.float32)), X_valid_ngram],
    format="csr",
)
X_test = hstack(
    [csr_matrix(X_test_basic.to_numpy(dtype=np.float32)), X_test_ngram],
    format="csr",
)

y_train = train["label"]
y_valid = validation["label"]

print("Shapes:")
print("  X_train:", X_train.shape)
print("  X_valid:", X_valid.shape)
print("  X_test :", X_test.shape)
_gc()
log_step_end("Assemble features")

## Class balance and scale_pos_weight

In [None]:
log_step_start("Class balance and scale_pos_weight")
counter = Counter(y_train)
# Both classes must be present, otherwise the ratio below is undefined.
assert all(lbl in counter for lbl in (0, 1)), "labels must be binary {0,1}"
# Negatives per positive, passed to XGBoost as scale_pos_weight.
scale_pos_weight = counter[0] / counter[1]
print("Class counts:", counter)
print("scale_pos_weight:", scale_pos_weight)
log_step_end("Class balance and scale_pos_weight")

## Train XGBoost and Logistic Regression

In [None]:
log_step_start("Train XGBoost and Logistic Regression")

import json
import hashlib
import joblib
from pathlib import Path

# Checkpoints live under PROJECT_ROOT when an earlier cell defined it,
# otherwise under the current working directory.
ckpt_root = Path(globals().get("PROJECT_ROOT", Path.cwd()))
ckpt_dir = ckpt_root.joinpath("checkpoints")
ckpt_dir.mkdir(parents=True, exist_ok=True)


def _model_signature(name, params, X_shape, y_shape):
    """Return a short fingerprint of a model configuration.

    The fingerprint is the first 10 hex characters of the MD5 of a
    key-sorted JSON dump of the name, the parameters and the two shapes.
    Values JSON cannot encode are stringified. Data contents are not part
    of the fingerprint, only the shapes.

    Args:
        name: Model label.
        params: Hyper-parameter mapping.
        X_shape: Shape of the feature matrix.
        y_shape: Shape of the label vector.

    Returns:
        10-character hexadecimal string.
    """
    canonical = json.dumps(
        dict(
            name=name,
            params=params,
            X_shape=list(X_shape),
            y_shape=list(y_shape),
        ),
        sort_keys=True,
        default=str,
    )
    digest = hashlib.md5(canonical.encode())
    return digest.hexdigest()[:10]


# Hyper-parameters handed to XGBClassifier; they also feed the checkpoint
# signature, so changing any of them selects a different checkpoint set.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric="logloss",
    tree_method="hist",
    max_bin=256,
    n_jobs=1,
    scale_pos_weight=scale_pos_weight,
)

chunk_rounds = 20  # boosting rounds added between two checkpoints
es_rounds = 50  # early-stopping patience, passed to fit() when supported
total_rounds = xgb_params["n_estimators"]  # overall round budget

xgb_sig = _model_signature("xgb", xgb_params, X_train.shape, y_train.shape)
# Per-chunk checkpoints are named <prefix><rounds>.joblib; the file without
# an "_iter" part is the older single-file layout.
legacy_ckpt = ckpt_dir / f"hvsm_prod_xgb_{xgb_sig}.joblib"
ckpt_prefix = f"hvsm_prod_xgb_{xgb_sig}_iter"

log_step(f"XGB checkpoints dir: {ckpt_dir}")
log_step(
    f"XGB checkpoint prefix: {ckpt_prefix} "
    f"(chunk={chunk_rounds}, total={total_rounds})"
)


def _ckpt_path(rounds: int) -> Path:
    """Return the checkpoint file path for ``rounds`` boosted rounds.

    The round count is zero-padded to at least four digits.
    """
    filename = f"{ckpt_prefix}{rounds:04d}.joblib"
    return ckpt_dir.joinpath(filename)


def _latest_ckpt():
    """Find the per-chunk checkpoint with the highest round number.

    Returns:
        ``(path, rounds)`` of the checkpoint whose file name carries the
        largest round count, or ``(None, 0)`` when no checkpoint matches
        the current prefix. A file whose round suffix cannot be parsed
        counts as 0 rounds.
    """

    def _round_count(ckpt: Path) -> int:
        # The round number is whatever follows the last "iter" in the stem.
        _, _, tail = ckpt.stem.rpartition("iter")
        try:
            return int(tail)
        except ValueError:
            return 0

    found = list(ckpt_dir.glob(f"{ckpt_prefix}*.joblib"))
    if found:
        newest = max(found, key=_round_count)
        return newest, _round_count(newest)
    return None, 0


log_step_start("Fold 1/1 (single split)")
log_step_start("XGB training epochs")

# Resume order: newest per-chunk checkpoint, then the legacy single-file
# checkpoint, then a fresh model.
# NOTE(review): joblib.load unpickles arbitrary objects; only checkpoints
# written by this notebook should ever be placed in ckpt_dir.
ckpt_path, current_round = _latest_ckpt()
if ckpt_path is not None:
    xgb = joblib.load(ckpt_path)
    log_step(f"Loaded XGB checkpoint: {ckpt_path}")
elif legacy_ckpt.exists():
    xgb = joblib.load(legacy_ckpt)
    # The legacy file name carries no round count, so ask the booster.
    current_round = xgb.get_booster().num_boosted_rounds()
    log_step(f"Loaded legacy XGB checkpoint: {legacy_ckpt}")
else:
    xgb = XGBClassifier(**xgb_params)

# The validation split is the evaluation set for every fit() call.
fit_kwargs = {"eval_set": [(X_valid, y_valid)], "verbose": True}
# Pass early_stopping_rounds only when this xgboost version's fit()
# accepts it; otherwise warn and train without it.
fit_sig = inspect.signature(xgb.fit)
if "early_stopping_rounds" in fit_sig.parameters:
    fit_kwargs["early_stopping_rounds"] = es_rounds
else:
    warnings.warn(
        "XGBClassifier.fit does not support early_stopping_rounds; "
        "running without it."
    )

# Train in chunks of at most chunk_rounds rounds, checkpointing after each.
# NOTE(review): es_rounds (50) exceeds chunk_rounds (20), so early stopping
# presumably cannot trigger within a single fit() call — confirm.
while current_round < total_rounds:
    rounds_to_add = min(chunk_rounds, total_rounds - current_round)
    # n_estimators is set to the size of this chunk, not the overall total.
    xgb.set_params(n_estimators=rounds_to_add)
    # Passing the existing booster as xgb_model presumably continues
    # training from it; the progress check below relies on that — confirm.
    xgb_model = xgb.get_booster() if current_round > 0 else None
    xgb.fit(X_train, y_train, xgb_model=xgb_model, **fit_kwargs)
    new_rounds = xgb.get_booster().num_boosted_rounds()
    # Guard against looping forever if a chunk adds no rounds.
    if new_rounds <= current_round:
        warnings.warn("XGBoost made no progress in this chunk; stopping.")
        break
    current_round = new_rounds
    ckpt_path = _ckpt_path(current_round)
    joblib.dump(xgb, ckpt_path)
    log_step(f"Saved XGB checkpoint: {ckpt_path}")

log_step_end("XGB training epochs")

lr_params = dict(max_iter=1000, random_state=42)

# One checkpoint per (params, data shape) signature.
lr_sig = _model_signature("lr", lr_params, X_train.shape, y_train.shape)
lr_ckpt = ckpt_dir / f"hvsm_prod_lr_{lr_sig}.joblib"

log_step_start("LR fit")
if not lr_ckpt.exists():
    # No checkpoint yet: fit from scratch and persist the result.
    lr = LogisticRegression(**lr_params).fit(X_train, y_train)
    joblib.dump(lr, lr_ckpt)
    log_step(f"Saved LR checkpoint: {lr_ckpt}")
else:
    lr = joblib.load(lr_ckpt)
    log_step(f"Loaded LR checkpoint: {lr_ckpt}")
log_step_end("LR fit")
log_step_end("Fold 1/1 (single split)")
print("Models trained.")
_gc()
log_step_end("Train XGBoost and Logistic Regression")


## Calibrate with Platt scaling

In [None]:
log_step_start("Calibrate with Platt scaling")
# Wrap each already-fitted model in a sigmoid calibrator and fit the
# calibrator on the validation split (XGBoost first, then LR).
calibrated_xgb, calibrated_lr = (
    CalibratedClassifierCV(base, method="sigmoid", cv="prefit").fit(
        X_valid, y_valid
    )
    for base in (xgb, lr)
)
print("Models calibrated.")
_gc()
log_step_end("Calibrate with Platt scaling")

## Ensemble and threshold tuning

In [None]:
log_step_start("Ensemble and threshold tuning")
val_pred_proba_xgb = calibrated_xgb.predict_proba(X_valid)[:, 1]
val_pred_proba_lr = calibrated_lr.predict_proba(X_valid)[:, 1]
# Fixed-weight blend: 60% XGBoost, 40% logistic regression.
val_pred_proba_ensemble = 0.6 * val_pred_proba_xgb + 0.4 * val_pred_proba_lr

# Score every candidate cut-off, then keep the first one that reaches the
# highest validation F1 (argmax returns the first maximum).
thresholds = np.arange(0.1, 0.9, 0.01)
f1_per_threshold = [
    f1_score(y_valid, (val_pred_proba_ensemble >= cut).astype(int))
    for cut in thresholds
]
best_idx = int(np.argmax(f1_per_threshold))
best_f1 = f1_per_threshold[best_idx]
best_threshold = float(thresholds[best_idx])

print(f"Best threshold: {best_threshold:.2f} with F1: {best_f1:.4f}")
log_step_end("Ensemble and threshold tuning")

## Validation report and diagnostics

In [None]:
log_step_start("Validation report and diagnostics")
val_pred_final = (val_pred_proba_ensemble >= best_threshold).astype(int)
print(classification_report(y_valid, val_pred_final))

# Diagnostics
y_valid_np = y_valid.to_numpy()
residual_plot(
    y_valid_np, val_pred_proba_ensemble, "Residuals: validation ensemble"
)
qq_plot(
    y_valid_np - val_pred_proba_ensemble, "QQ plot: residuals (validation)"
)
# The violin plot is optional: a failure is reported as a warning and the
# run carries on.
try:
    violin_by_label(
        train, "label", "text_length", "Text length by label (train)"
    )
except Exception as exc:
    warnings.warn(f"Violin plot skipped: {exc}")
log_step_end("Validation report and diagnostics")

## Predict on test and write submission

In [None]:
log_step_start("Predict on test and write submission")
# Calibrated probabilities for the test set, XGBoost first and then LR.
p_xgb_te, p_lr_te = (
    predict_proba_chunks(clf, X_test) for clf in (calibrated_xgb, calibrated_lr)
)
# Same 0.6 / 0.4 blend and cut-off as used on the validation split.
p_ens_te = 0.6 * p_xgb_te + 0.4 * p_lr_te
yhat_te = (p_ens_te >= best_threshold).astype(int)

submission = pd.DataFrame({"id": test["id"], "label": yhat_te})
outputs_dir = "outputs"
os.makedirs(outputs_dir, exist_ok=True)
submission_path = os.path.join(outputs_dir, "submission_hvsm_prod.csv")
submission.to_csv(submission_path, index=False)
print("Saved", submission_path, "with", len(submission), "rows")
_gc()
log_step_end("Predict on test and write submission")