# HVSM: TF–IDF + XGBoost + Logistic Regression (CV & Tuning)

This notebook upgrades the baseline pipeline to improve macro-F1 using
stratified k-fold cross-validation, randomized hyperparameter tuning, and expanded diagnostics, while staying memory-conscious (OOM-aware).

**Inputs (strict):** `data/train.csv`, `data/val.csv`, and `data/test.csv`. `train.csv` and `val.csv` must contain `text` and `label` columns; `test.csv` must contain `text` and `id` columns and no `label` column. The notebook writes its predictions to `outputs/submission_hvsm_prod_1.csv`.


## Imports and guardrails

In [None]:
from __future__ import annotations
import os, re, warnings
import gc
import time
from typing import List, Tuple, Dict, Optional
from dataclasses import dataclass
from datetime import datetime
from collections import Counter
import numpy as np, pandas as pd
from tqdm import tqdm
from scipy.sparse import hstack, csr_matrix
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (accuracy_score, classification_report,
    f1_score, roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix)
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
# Optional dependency: seaborn. If the import fails for any reason, `sns` is
# set to None and violin_by_label falls back to a pandas boxplot.
try:
    import seaborn as sns
except Exception:
    sns = None
# Optional dependency: TextBlob. If it is unavailable, `TextBlob` is None and
# _sentiment fills both sentiment columns with 0.0.
try:
    from textblob import TextBlob
except Exception:
    TextBlob = None
    warnings.warn('TextBlob missing; sentiment features set to zeros.')
# Keep printed arrays and frames within a 79-character display width.
np.set_printoptions(linewidth=79)
pd.set_option('display.width', 79)
pd.set_option('display.max_columns', 40)
# Seed constant for reproducibility, and a NumPy Generator seeded with it.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)


## Configuration

In [None]:
@dataclass
class Config:
    """Tunable settings for the notebook, collected in one place.

    Covers the TF-IDF vocabulary, the cross-validation fold count, the
    randomized-search budgets and the plotting verbosity.
    """
    # Cap on the TF-IDF vocabulary size.
    tfidf_max_features: int = 50000
    # Largest word n-gram order for TF-IDF (n-grams run from 1 up to this).
    tfidf_ngram_max: int = 3
    # Flag for character n-gram features.
    # NOTE(review): no code visible in this notebook builds char n-grams —
    # confirm whether this switch is still needed.
    use_char_ngrams: bool = False
    # Minimum document frequency for a term to enter the TF-IDF vocabulary.
    min_df: int = 2
    # Number of cross-validation folds.
    kfolds: int = 5
    # Number of randomized-search candidates for XGBoost.
    xgb_iter: int = 25
    # Number of randomized-search candidates for logistic regression.
    lr_iter: int = 25
    # Plotting verbosity.
    # NOTE(review): no code visible in this notebook reads this field — confirm
    # which values are meaningful.
    plot_level: str = 'full'
# Single shared configuration instance; printed so the run log records it.
CFG = Config()
print(CFG)


## Logging, prediction, and plotting helpers

In [None]:
def _gc() -> None:
    """Run a garbage-collection pass; called after memory-heavy steps."""
    gc.collect()


# Start times (time.perf_counter readings) of the steps that are currently
# open, keyed by step name.
_STEP_STARTS = {}


def log_step(msg: str) -> None:
    """Print ``msg`` prefixed with the current local timestamp, flushed."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)


def log_step_start(name: str) -> None:
    """Remember when step ``name`` started and log a START line for it."""
    _STEP_STARTS[name] = time.perf_counter()
    log_step(f"START: {name}")


def log_step_end(name: str) -> None:
    """Log an END line for ``name``.

    The elapsed time is appended when a matching log_step_start was recorded;
    the recorded start is consumed, so a second END for the same name is
    logged without a duration.
    """
    if name not in _STEP_STARTS:
        log_step(f"END: {name}")
        return
    elapsed = time.perf_counter() - _STEP_STARTS.pop(name)
    log_step(f"END: {name} (elapsed {elapsed:.1f}s)")


def predict_proba_chunks(model, X, chunk_size: int = 50000) -> np.ndarray:
    """Return positive-class probabilities for ``X``, predicted in row chunks.

    Predicting in chunks bounds the size of each ``predict_proba`` call.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is taken as the positive-class probability.
        X: Feature matrix with one row per sample (dense array or SciPy
            sparse matrix).
        chunk_size: Maximum number of rows per ``predict_proba`` call.

    Returns:
        A float32 array of length ``X.shape[0]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would skip the loop and return uninitialised memory.
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    # Sparse formats such as COO cannot be row-sliced; CSR can, and tocsr()
    # leaves an existing CSR matrix as it is.
    if hasattr(X, 'tocsr'):
        X = X.tocsr()
    n = X.shape[0]
    out = np.empty(n, dtype=np.float32)
    for start in range(0, n, chunk_size):
        end = min(start + chunk_size, n)
        out[start:end] = model.predict_proba(X[start:end])[:, 1]
    return out


def _tight() -> None:
    """Call ``plt.tight_layout()``; shared by the plotting helpers."""
    plt.tight_layout()
def qq_plot(residuals: np.ndarray, title: str) -> None:
    """Show a normal Q-Q plot of ``residuals`` under ``title``."""
    plt.figure(figsize=(5, 4))
    # Passing the pyplot module as `plot` makes probplot draw the figure.
    stats.probplot(residuals, dist='norm', plot=plt)
    plt.title(title)
    _tight()
    plt.show()
def residual_plot(y_true: np.ndarray, y_prob: np.ndarray,
                  title: str) -> None:
    """Scatter the residuals ``y_true - y_prob`` against ``y_prob``."""
    residuals = y_true - y_prob
    plt.figure(figsize=(5, 4))
    plt.scatter(y_prob, residuals, s=8)
    # Dashed reference line at zero residual.
    plt.axhline(0.0, linestyle='--')
    plt.xlabel('p(y=1)')
    plt.ylabel('residual')
    plt.title(title)
    _tight()
    plt.show()
def violin_by_label(df: pd.DataFrame, label_col: str, feat_col: str,
                    title: str) -> None:
    """Plot the distribution of ``feat_col`` for each value of ``label_col``.

    Draws a seaborn violin plot when seaborn was imported, otherwise a
    pandas boxplot grouped by label.
    """
    have_seaborn = sns is not None
    if have_seaborn:
        plt.figure(figsize=(5, 4))
        sns.violinplot(data=df, x=label_col, y=feat_col)
    else:
        df.boxplot(column=feat_col, by=label_col, figsize=(5, 4))
    plt.title(title)
    if not have_seaborn:
        # Blank the figure-level heading so only `title` is shown.
        plt.suptitle('')
    _tight()
    plt.show()
def plot_roc_pr(y_true: np.ndarray, y_prob: np.ndarray, title: str)->None:
    """Show ROC and precision-recall curves side by side.

    Args:
        y_true: True binary labels.
        y_prob: Predicted scores or probabilities for the positive class.
        title: Figure-level heading drawn above both panels.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    prec, rec, _ = precision_recall_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    ax[0].plot(fpr, tpr)
    ax[0].set_title(f'ROC AUC={auc:.3f}')
    ax[0].set_xlabel('FPR')
    ax[0].set_ylabel('TPR')
    ax[1].plot(rec, prec)
    ax[1].set_title('Precision–Recall')
    ax[1].set_xlabel('Recall')
    ax[1].set_ylabel('Precision')
    # `title` was previously accepted but never drawn; each panel keeps its
    # own title, so the caller's title goes on the figure.
    fig.suptitle(title)
    _tight()
    plt.show()
def plot_confusion(y_true: np.ndarray, y_hat: np.ndarray, title: str)->None:
    """Show the confusion matrix of ``y_hat`` vs ``y_true`` with cell counts."""
    matrix = confusion_matrix(y_true, y_hat)
    plt.figure(figsize=(4, 3))
    plt.imshow(matrix, cmap='Blues')
    plt.title(title)
    plt.colorbar()
    # Rows are true labels (y axis), columns are predictions (x axis).
    for (row, col), count in np.ndenumerate(matrix):
        plt.text(col, row, int(count), ha='center', va='center')
    plt.xlabel('Pred')
    plt.ylabel('True')
    _tight()
    plt.show()


## Processing and feature engineering

In [None]:
# Timing marker for this cell. Only function definitions run between this call
# and the matching log_step_end, so the reported elapsed time is close to zero.
log_step_start('Processing and feature engineering')
def _ttr(text: str) -> float:
    """Return the type-token ratio of ``text``: unique tokens / all tokens.

    Tokens are maximal runs of non-whitespace characters, compared after
    lower-casing. Text without any token yields 0.0.
    """
    tokens = re.findall(r'\S+', text.lower())
    if not tokens:
        return 0.0
    return float(len(set(tokens)) / len(tokens))
def _sentiment(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``sentiment_polarity`` and ``sentiment_subjectivity`` columns.

    Both columns are filled with 0.0 when TextBlob is unavailable. Otherwise
    each row of ``df['text']`` is analysed once and both scores are read
    from that single result.

    Args:
        df: Frame with a string ``text`` column; modified in place.

    Returns:
        The same ``df`` object, with the two columns added.
    """
    if TextBlob is None:
        df['sentiment_polarity'] = 0.0
        df['sentiment_subjectivity'] = 0.0
        return df
    # One TextBlob per row: the same `.sentiment` result carries both scores,
    # so the text is not analysed a second time for subjectivity.
    rows = []
    for text in tqdm(df['text'], total=len(df)):
        sentiment = TextBlob(text).sentiment
        rows.append((float(sentiment.polarity),
                     float(sentiment.subjectivity)))
    # reshape keeps the (n, 2) layout even when the frame is empty.
    scores = np.array(rows, dtype=np.float64).reshape(-1, 2)
    df['sentiment_polarity'] = scores[:, 0]
    df['sentiment_subjectivity'] = scores[:, 1]
    return df
def process_text_file(filename: str) -> pd.DataFrame:
    """Load a CSV and add hand-crafted text statistics as new columns.

    Args:
        filename: Path of a CSV file that has a ``text`` column.

    Returns:
        The loaded frame with these columns added, in this order:
        ``text_length``, ``word_count``, ``sentence_count``,
        ``avg_sentence_length`` (capped at 100), ``punct_count``,
        ``punct_ratio`` (clipped to [0, 0.3]), ``ttr``, ``digit_ratio``,
        ``upper_ratio``, ``bangs`` and ``questions``.

    Raises:
        AssertionError: If the CSV has no ``text`` column.
    """
    df = pd.read_csv(filename)
    assert 'text' in df.columns, 'CSV must contain a text column.'
    # Missing text becomes '' rather than the literal word 'nan' that
    # astype(str) would produce from NaN.
    df['text'] = df['text'].fillna('').astype(str)
    df['text_length'] = df['text'].str.len()
    # Shared denominator for the per-character ratios: 1 instead of 0 so that
    # empty text yields 0.0 rather than NaN.
    safe_length = df['text_length'].replace(0, 1)
    df['word_count'] = df['text'].str.split().str.len()
    # Runs of terminal punctuation count as one sentence end; at least 1.
    df['sentence_count'] = df['text'].str.count(r'[.!?]+').replace(0, 1)
    df['avg_sentence_length'] = (
        (df['word_count'] / df['sentence_count']).clip(upper=100))
    df['punct_count'] = df['text'].str.count(r'[^\w\s]')
    df['punct_ratio'] = (df['punct_count'] / safe_length).clip(0, 0.3)
    df['ttr'] = df['text'].apply(_ttr)
    df['digit_ratio'] = df['text'].str.count(r'\d') / safe_length
    df['upper_ratio'] = df['text'].str.count(r'[A-Z]') / safe_length
    df['bangs'] = df['text'].str.count(r'!')
    df['questions'] = df['text'].str.count(r'\?')
    return df
# Closes the timing marker opened at the top of this cell.
log_step_end('Processing and feature engineering')


## Load data

In [None]:
log_step_start('Load data')
# Load the three splits in a fixed order: train, validation, test.
train, val, test = map(
    process_text_file,
    ('data/train.csv', 'data/val.csv', 'data/test.csv'),
)
# Labelled splits must carry `label`; the test split must not, and it needs
# `id` for the submission file.
assert 'label' in train.columns
assert 'label' in val.columns
assert 'label' not in test.columns
assert 'id' in test.columns
print('Rows:', len(train), len(val), len(test))
log_step_end('Load data')


## Sentiment features

In [None]:
log_step_start('Sentiment features')
# Add the two sentiment columns to every split (train, then val, then test).
train = _sentiment(train)
val = _sentiment(val)
test = _sentiment(test)
log_step_end('Sentiment features')


## Numeric and TF–IDF features

In [None]:
log_step_start('Numeric and TF–IDF features')
# Hand-crafted numeric columns; they occupy the first columns of the stacked
# design matrices built below.
feature_cols: List[str] = [
    'text_length','word_count','ttr','sentence_count','avg_sentence_length',
    'punct_ratio','sentiment_polarity','sentiment_subjectivity',
    'digit_ratio','upper_ratio','bangs','questions'
]
Xtr_basic = csr_matrix(train[feature_cols].to_numpy(dtype=np.float32))
Xva_basic = csr_matrix(val[feature_cols].to_numpy(dtype=np.float32))
Xte_basic = csr_matrix(test[feature_cols].to_numpy(dtype=np.float32))
# The TF-IDF settings come from CFG, so editing the configuration cell takes
# effect here; the CFG defaults equal the previously hard-coded values.
vec_word = TfidfVectorizer(ngram_range=(1, CFG.tfidf_ngram_max),
                           max_features=CFG.tfidf_max_features,
                           min_df=CFG.min_df, stop_words='english',
                           dtype=np.float32)
# The vocabulary is learned from the training text only; val and test are
# transformed with it.
Xtr_w = vec_word.fit_transform(train['text'])
Xva_w = vec_word.transform(val['text'])
Xte_w = vec_word.transform(test['text'])
# Request CSR explicitly: predict_proba_chunks row-slices these matrices, and
# hstack's default output format is not guaranteed to support that.
X_train = hstack([Xtr_basic, Xtr_w], format='csr')
X_val = hstack([Xva_basic, Xva_w], format='csr')
X_test = hstack([Xte_basic, Xte_w], format='csr')
y_train = train['label'].astype(int).values
y_val = val['label'].astype(int).values
print('Shapes:', X_train.shape, X_val.shape, X_test.shape)
_gc()
log_step_end('Numeric and TF–IDF features')


## Stratified k-fold CV + tuning

In [None]:
# Start the timer for the whole tuning cell; the matching log_step_end comes
# after both randomized searches have run.
log_step_start('Stratified k-fold CV + tuning')
def _xgb_space() -> Dict[str, List]:
    """Return the discrete candidate values searched for each XGBoost setting."""
    space: Dict[str, List] = {}
    space['n_estimators'] = [300, 400, 500, 600, 700, 800, 900]
    space['max_depth'] = [3, 4, 5, 6, 7, 8]
    space['learning_rate'] = [0.03, 0.05, 0.07, 0.1]
    space['subsample'] = [0.7, 0.8, 0.9, 1.0]
    space['colsample_bytree'] = [0.6, 0.8, 1.0]
    space['reg_alpha'] = [0.0, 0.1, 0.5, 1.0]
    space['reg_lambda'] = [0.5, 1.0, 1.5, 2.0]
    return space
def _lr_space() -> Dict[str, List]:
    """Return the discrete candidate values searched for logistic regression."""
    return dict(
        C=[0.1, 0.3, 1.0, 3.0, 10.0],
        penalty=['l2'],
        solver=['liblinear', 'lbfgs'],
        class_weight=[None, 'balanced'],
    )
def tune_xgb(X, y) -> XGBClassifier:
    """Randomized hyperparameter search for XGBoost.

    The search budget, fold count and seed come from ``CFG.xgb_iter``,
    ``CFG.kfolds`` and ``RANDOM_SEED`` rather than literals; their defaults
    equal the previously hard-coded 25, 5 and 42.

    Args:
        X: Training feature matrix.
        y: Binary training labels.

    Returns:
        The best estimator, refitted on all of ``X`` (``refit=True``).
    """
    base = XGBClassifier(random_state=RANDOM_SEED, eval_metric='logloss',
                         tree_method='hist', n_jobs=1)
    # NOTE(review): scoring='f1' is the binary F1 of the positive class, while
    # the notebook introduction talks about macro-F1 — confirm which metric
    # is intended.
    # Single-process search with one dispatched job at a time.
    rs = RandomizedSearchCV(base, _xgb_space(), n_iter=CFG.xgb_iter,
                            scoring='f1', n_jobs=1, cv=CFG.kfolds, verbose=3,
                            random_state=RANDOM_SEED, pre_dispatch=1,
                            refit=True)
    log_step_start('XGB randomized search')
    rs.fit(X, y)
    log_step_end('XGB randomized search')
    print('Best XGB params:', rs.best_params_)
    return rs.best_estimator_
def tune_lr(X, y) -> LogisticRegression:
    """Randomized hyperparameter search for logistic regression.

    The search budget, fold count and seed come from ``CFG.lr_iter``,
    ``CFG.kfolds`` and ``RANDOM_SEED`` rather than literals; their defaults
    equal the previously hard-coded 25, 5 and 42.

    Args:
        X: Training feature matrix.
        y: Binary training labels.

    Returns:
        The best estimator, refitted on all of ``X`` (``refit=True``).
    """
    base = LogisticRegression(max_iter=2000, random_state=RANDOM_SEED)
    space = _lr_space()
    # The space is a small finite grid (every entry is a list), so never ask
    # for more candidates than there are distinct combinations.
    n_combos = int(np.prod([len(values) for values in space.values()]))
    n_iter = min(CFG.lr_iter, n_combos)
    # NOTE(review): scoring='f1' is the binary F1 of the positive class, while
    # the notebook introduction talks about macro-F1 — confirm which metric
    # is intended.
    rs = RandomizedSearchCV(base, space, n_iter=n_iter, scoring='f1',
                            n_jobs=1, cv=CFG.kfolds, verbose=3,
                            random_state=RANDOM_SEED, pre_dispatch=1,
                            refit=True)
    log_step_start('LR randomized search')
    rs.fit(X, y)
    log_step_end('LR randomized search')
    print('Best LR params:', rs.best_params_)
    return rs.best_estimator_
# Run both searches on the training split only; the validation split is not
# passed to either search.
xgb_tuned = tune_xgb(X_train, y_train)
lr_tuned = tune_lr(X_train, y_train)
_gc()
# Closes the timer opened at the top of this cell.
log_step_end('Stratified k-fold CV + tuning')


## Fit final models and calibrate

In [None]:
log_step_start('Fit final models and calibrate')
# Fit on the training split ONLY. The validation split must stay unseen by
# the base models because it is used right below to fit the sigmoid
# calibrators and, in the next cell, to pick the ensemble weight and decision
# threshold. Fitting on train+val (as this cell used to) makes the validation
# log-loss, the calibration and the tuned threshold all optimistic.
log_step_start('Fold 1/1 (single split)')
log_step_start('XGB training epochs')
# eval_set now reports log-loss on data the model has not been trained on.
xgb_tuned.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)
log_step_end('XGB training epochs')
log_step_start('LR fit')
lr_tuned.fit(X_train, y_train)
log_step_end('LR fit')
log_step_end('Fold 1/1 (single split)')
# cv='prefit' keeps the fitted base model as is and learns only the sigmoid
# mapping from the held-out validation split.
# NOTE(review): recent scikit-learn releases deprecate cv='prefit' in favour
# of wrapping the fitted model in FrozenEstimator — confirm against the
# installed version.
cal_xgb = CalibratedClassifierCV(xgb_tuned, method='sigmoid',
                                 cv='prefit')
cal_xgb.fit(X_val, y_val)
cal_lr = CalibratedClassifierCV(lr_tuned, method='sigmoid', cv='prefit')
cal_lr.fit(X_val, y_val)
_gc()
log_step_end('Fit final models and calibrate')


## Ensemble and threshold tuning on val

In [None]:
log_step_start('Ensemble and threshold tuning on val')
# Calibrated positive-class probabilities of each model on the validation set.
p_xgb = cal_xgb.predict_proba(X_val)[:, 1]
p_lr = cal_lr.predict_proba(X_val)[:, 1]
best_w, best_f1, best_thr = 0.5, -1.0, 0.5
# Exhaustive grid: 21 XGB weights (0.00..1.00) x thresholds from 0.10 in
# steps of 0.01. The first combination reaching the top F1 is kept.
weight_grid = np.linspace(0.0, 1.0, 21)
threshold_grid = np.arange(0.1, 0.91, 0.01)
for w in weight_grid:
    blended = w * p_xgb + (1.0 - w) * p_lr
    for thr in threshold_grid:
        score = f1_score(y_val, (blended >= thr).astype(int))
        if score <= best_f1:
            continue
        best_w, best_f1, best_thr = float(w), float(score), float(thr)
print(f'Ensemble w={best_w:.2f} thr={best_thr:.2f} F1={best_f1:.4f}')
log_step_end('Ensemble and threshold tuning on val')


## Validation diagnostics

In [None]:
log_step_start('Validation diagnostics')
# Blend with the chosen weight, then apply the chosen decision threshold.
p_ens = best_w * p_xgb + (1.0 - best_w) * p_lr
yhat_val = (p_ens >= best_thr).astype(int)
val_report = classification_report(y_val, yhat_val)
print(val_report)
residual_plot(y_val, p_ens, 'Residuals: validation ensemble')
val_residuals = y_val - p_ens
qq_plot(val_residuals, 'QQ: residuals (validation)')
plot_roc_pr(y_val, p_ens, 'Validation ROC/PR (ensemble)')
plot_confusion(y_val, yhat_val, 'Confusion (validation)')
# This last plot uses the training frame, not the validation split.
violin_by_label(train, 'label', 'text_length', 'Text length by label')
log_step_end('Validation diagnostics')


## Feature importance snapshots

In [None]:
log_step_start('Feature importance snapshots')
# Column names of the stacked design matrix: the numeric features first, then
# the TF-IDF vocabulary, in the same order the matrices were hstack-ed.
try:
    feat_names = list(feature_cols) + list(
        vec_word.get_feature_names_out())
except Exception as e:
    feat_names = []
    warnings.warn(f'Feature names unavailable: {e}')


def _feat_label(idx: int) -> str:
    """Return the readable name of column ``idx``, or '?' when unknown."""
    return str(feat_names[idx]) if 0 <= idx < len(feat_names) else '?'


# Best-effort reporting: a failure here only warns and never stops the run.
try:
    booster = xgb_tuned.get_booster()
    scores = booster.get_score(importance_type='gain')
    items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print('Top 25 XGB features by gain:')
    for k, v in items[:25]:
        # Keys of the form 'f<column index>' are resolved to a readable name;
        # any other key is printed unchanged.
        if k.startswith('f') and k[1:].isdigit():
            print(f'{k} [{_feat_label(int(k[1:]))}]: {v:.4f}')
        else:
            print(f'{k}: {v:.4f}')
except Exception as e:
    warnings.warn(f'XGB importance unavailable: {e}')
try:
    if hasattr(lr_tuned, 'coef_'):
        coef = lr_tuned.coef_.ravel()
        # Largest absolute coefficients first.
        idx = np.argsort(np.abs(coef))[::-1][:25]
        print('Top 25 |coef| for LR: indices and values:')
        for i in idx:
            print(i, _feat_label(int(i)), float(coef[i]))
except Exception as e:
    warnings.warn(f'LR coef summary failed: {e}')
log_step_end('Feature importance snapshots')


## Predict on test and save submission

In [None]:
log_step_start('Predict on test and save submission')
# Score the test set in chunks with each calibrated model, then blend with
# the weight chosen on validation and apply the chosen threshold.
p_xgb_te = predict_proba_chunks(cal_xgb, X_test)
p_lr_te = predict_proba_chunks(cal_lr, X_test)
p_ens_te = best_w * p_xgb_te + (1.0 - best_w) * p_lr_te
yhat_te = (p_ens_te >= best_thr).astype(int)
submission = pd.DataFrame({'id': test['id'], 'label': yhat_te})
# Make sure the output folder exists before writing the CSV.
outputs_dir = 'outputs'
submission_path = os.path.join(outputs_dir, 'submission_hvsm_prod_1.csv')
os.makedirs(outputs_dir, exist_ok=True)
submission.to_csv(submission_path, index=False)
n_rows = len(submission)
print('Saved', submission_path, 'with', n_rows, 'rows')
_gc()
log_step_end('Predict on test and save submission')


## Final checks

In [None]:
log_step_start('Final checks')
# Every predicted label in the submission must be 0 or 1.
labels_are_binary = submission['label'].isin([0, 1]).all()
assert labels_are_binary
print('Done. All checks passed.')
log_step_end('Final checks')
