# HVSM — GPU TF-IDF + cuML LR/NB

This notebook mirrors the baseline pipeline but runs GPU-first with Polars + cuML. It expects `train.csv`, `val.csv`, and `test.csv` to reside in `data/`. The outputs include validation reports and `outputs/submission_hvsm_prod_a.csv`.

The workflow:
1. Load CSVs and engineer basic text features.
2. Compute TF-IDF up to trigrams.
3. Train cuML Logistic Regression and MultinomialNB.
4. Calibrate via Platt scaling.
5. Ensemble (weighted average) and threshold tune on validation.
6. Generate predictions on `data/test.csv`.

Additional diagnostics: QQ plot, residual plot, violin plot, and a brief sanity audit of the inputs.


In [None]:
from __future__ import annotations

from typing import List, Tuple
import os
import re
import warnings
import gc
import time
from collections import Counter

import numpy as np
import polars as pl
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy import stats

try:
    import cupy as cp
    import cudf
    import cupyx.scipy.sparse as cpx_sparse
    import cuml
    from cuml.feature_extraction.text import TfidfVectorizer
    from cuml.linear_model import LogisticRegression
    from cuml.naive_bayes import MultinomialNB
except Exception as e:
    raise RuntimeError('cuML + CUDA (cupy/cudf) required for GPU-first run.') from e

try:
    import seaborn as sns  # optional, for violin plots
except Exception:  # pragma: no cover
    sns = None

try:
    from textblob import TextBlob
except Exception as e:  # pragma: no cover
    TextBlob = None
    warnings.warn(
        'TextBlob not available; sentiment features will be zeros.'
    )

cuml.set_global_output_type('cupy')
RANDOM_SEED = 42
from datetime import datetime


## Utilities and plotting

In [None]:
def _gc() -> None:
    """Run a Python garbage-collection pass and discard the collector's result."""
    gc.collect()
    return None


# perf_counter() readings of steps that have been started but not yet ended,
# keyed by step name.
_STEP_STARTS = {}


def log_step(msg: str) -> None:
    """Print ``msg`` prefixed with the current local timestamp, flushing at once."""
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(f"[{stamp}] {msg}", flush=True)


def log_step_start(name: str) -> None:
    """Remember when step ``name`` began and log a START line for it."""
    _STEP_STARTS[name] = time.perf_counter()
    log_step(f"START: {name}")


def log_step_end(name: str) -> None:
    """Log an END line for step ``name`` and release cached GPU memory.

    The elapsed time is included only when a matching ``log_step_start`` call
    was recorded; the recorded start is consumed either way.
    """
    began = _STEP_STARTS.pop(name, None)
    suffix = "" if began is None else f" (elapsed {time.perf_counter() - began:.1f}s)"
    log_step(f"END: {name}{suffix}")
    # Best effort: returning pooled CuPy blocks must never fail a step.
    try:
        pool = cp.get_default_memory_pool()
        pool.free_all_blocks()
    except Exception:
        pass


def predict_proba_chunks(model, X, chunk_size: int = 50000):
    """Return column 1 of ``model.predict_proba`` for ``X``, computed in row chunks.

    Rows are scored ``chunk_size`` at a time and written into one preallocated
    float32 CuPy vector of length ``X.shape[0]``; a garbage-collection pass
    runs after every chunk.
    """
    n_rows = X.shape[0]
    probs = cp.empty(n_rows, dtype=cp.float32)
    for lo in range(0, n_rows, chunk_size):
        hi = min(lo + chunk_size, n_rows)
        batch = model.predict_proba(X[lo:hi])
        probs[lo:hi] = batch[:, 1]
        _gc()
    return probs


def _to_numpy(x):
    """Return ``x`` as a NumPy array.

    NumPy arrays are returned unchanged; objects exposing a ``get`` method
    (as CuPy arrays do) are converted by calling it; anything else goes
    through ``np.asarray``.
    """
    if isinstance(x, np.ndarray):
        return x
    return x.get() if hasattr(x, 'get') else np.asarray(x)


def f1_score_np(y_true, y_pred) -> float:
    """Compute the F1 score of the positive class (label 1) on the host.

    Both inputs are converted with ``_to_numpy`` and cast to ``int``. A small
    epsilon in every denominator keeps the result finite (0.0) when there are
    no positive predictions or no positive labels.
    """
    truth = _to_numpy(y_true).astype(int)
    guess = _to_numpy(y_pred).astype(int)
    actual_pos = truth == 1
    predicted_pos = guess == 1
    tp = int((actual_pos & predicted_pos).sum())
    fp = int(((truth == 0) & predicted_pos).sum())
    fn = int((actual_pos & (guess == 0)).sum())
    eps = 1e-12
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    return float(2 * precision * recall / (precision + recall + eps))


def classification_report_np(y_true, y_pred) -> str:
    """Build a text classification report for binary labels 0 and 1.

    Args:
        y_true: Ground-truth labels (NumPy or CuPy array, or array-like).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        A multi-line string with per-class precision, recall, F1 and support,
        followed by accuracy, macro-average and support-weighted-average rows.
        Supports in the summary rows count only samples labelled 0 or 1.
    """
    y_true = _to_numpy(y_true).astype(int)
    y_pred = _to_numpy(y_pred).astype(int)

    def _prf(label):
        # One-vs-rest counts for `label`; the epsilon avoids division by zero.
        tp = int(((y_true == label) & (y_pred == label)).sum())
        fp = int(((y_true != label) & (y_pred == label)).sum())
        fn = int(((y_true == label) & (y_pred != label)).sum())
        precision = tp / (tp + fp + 1e-12)
        recall = tp / (tp + fn + 1e-12)
        f1 = 2 * precision * recall / (precision + recall + 1e-12)
        support = int((y_true == label).sum())
        return precision, recall, f1, support

    p0, r0, f0, s0 = _prf(0)
    p1, r1, f1, s1 = _prf(1)
    # mean() of an empty array is NaN (with a RuntimeWarning); report 0.0 instead.
    acc = float((y_true == y_pred).mean()) if y_true.size else 0.0
    macro_p = (p0 + p1) / 2
    macro_r = (r0 + r1) / 2
    macro_f = (f0 + f1) / 2
    total = s0 + s1
    w_p = (p0 * s0 + p1 * s1) / max(total, 1)
    w_r = (r0 * s0 + r1 * s1) / max(total, 1)
    w_f = (f0 * s0 + f1 * s1) / max(total, 1)
    lines = [
        '              precision    recall  f1-score   support',
        f'           0       {p0:0.3f}      {r0:0.3f}      {f0:0.3f}      {s0:5d}',
        f'           1       {p1:0.3f}      {r1:0.3f}      {f1:0.3f}      {s1:5d}',
        '',
        f'    accuracy                           {acc:0.3f}      {total:5d}',
        f'   macro avg       {macro_p:0.3f}      {macro_r:0.3f}      {macro_f:0.3f}      {total:5d}',
        f'weighted avg       {w_p:0.3f}      {w_r:0.3f}      {w_f:0.3f}      {total:5d}',
    ]
    # The separator must be the escape sequence: a raw line break inside a
    # single-quoted literal is a syntax error.
    return '\n'.join(lines)


def qq_plot(residuals: np.ndarray, title: str) -> None:
    """Show a normal probability (QQ) plot of ``residuals`` under ``title``."""
    plt.figure(figsize=(5, 4))
    # probplot draws onto the current pyplot figure because plot=plt.
    stats.probplot(residuals, plot=plt, dist='norm')
    plt.title(title)
    plt.tight_layout()
    plt.show()


def residual_plot(y_true: np.ndarray, y_prob: np.ndarray,
                  title: str) -> None:
    """Scatter the residuals ``y_true - y_prob`` against ``y_prob``.

    A dashed horizontal line marks a residual of zero.
    """
    residuals = y_true - y_prob
    plt.figure(figsize=(5, 4))
    plt.scatter(y_prob, residuals, s=8)
    plt.axhline(0.0, linestyle='--')
    plt.title(title)
    plt.xlabel('p(y=1)')
    plt.ylabel('residual')
    plt.tight_layout()
    plt.show()


def violin_by_label(df: pl.DataFrame, label_col: str,
                    feat_col: str, title: str) -> None:
    """Plot the distribution of ``feat_col`` for each value of ``label_col``.

    Draws a seaborn violin plot when seaborn was imported; otherwise falls
    back to a matplotlib box plot of the rows labelled 0 and 1.
    """
    labels = df.select(label_col).to_numpy().ravel()
    values = df.select(feat_col).to_numpy().ravel()

    plt.figure(figsize=(5, 4))
    if sns is not None:
        sns.violinplot(x=labels, y=values)
    else:
        plt.boxplot([values[labels == 0], values[labels == 1]], labels=['0', '1'])
    plt.title(title)
    plt.tight_layout()
    plt.show()


## Data loading and processing

In [None]:
# NOTE: this START and the matching END at the bottom of the cell enclose only
# the definitions of _ttr and process_text_file, so the logged elapsed time is
# definition time, not the time spent reading or processing any CSV.
log_step_start('Data loading and processing')
def _ttr(text: str) -> float:
    """Return the type-token ratio of ``text``.

    Tokens are the whitespace-separated chunks of the lower-cased text; the
    ratio is distinct tokens over total tokens, or 0.0 when there are none.
    """
    tokens = re.findall(r'\S+', text.lower())
    if not tokens:
        return 0.0
    return float(len(set(tokens)) / len(tokens))


def process_text_file(filename: str) -> pl.DataFrame:
    """Read a CSV with a ``text`` column and append basic text statistics.

    Added columns:
        text_length: number of characters.
        word_count: number of whitespace-separated tokens.
        sentence_count: runs of ``.``, ``!`` or ``?`` (at least 1).
        punct_count: characters that are neither word characters nor whitespace.
        avg_sentence_length: word_count / sentence_count, capped at 100.
        punct_ratio: punct_count / text_length, capped at 0.3.
        ttr: type-token ratio computed by ``_ttr``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The loaded frame with the columns above appended.

    Raises:
        AssertionError: If the CSV has no ``text`` column.
    """
    df = pl.read_csv(filename)
    assert 'text' in df.columns, 'CSV must contain a text column.'
    # Missing texts are replaced by '' so that every derived feature is a real
    # number; a null here would otherwise flow through all the expressions
    # below and surface as null/NaN in the model inputs.
    df = df.with_columns(pl.col('text').cast(pl.Utf8).fill_null(''))

    df = df.with_columns([
        pl.col('text').str.len_chars().alias('text_length'),
        pl.col('text').str.count_matches(r'\S+').alias('word_count'),
        pl.col('text').str.count_matches(r'[.!?]+').alias('sentence_count'),
        pl.col('text').str.count_matches(r'[^\w\s]').alias('punct_count'),
    ])
    # Floor both denominators at 1 so the ratios below never divide by zero.
    df = df.with_columns([
        pl.when(pl.col('sentence_count') == 0).then(1)
            .otherwise(pl.col('sentence_count')).alias('sentence_count'),
        pl.when(pl.col('text_length') == 0).then(1)
            .otherwise(pl.col('text_length')).alias('text_length_safe'),
    ])
    avg_sentence_expr = pl.col('word_count') / pl.col('sentence_count')
    punct_expr = pl.col('punct_count') / pl.col('text_length_safe')
    df = df.with_columns([
        pl.when(avg_sentence_expr > 100).then(100)
            .otherwise(avg_sentence_expr).alias('avg_sentence_length'),
        pl.when(punct_expr > 0.3).then(0.3)
            .otherwise(punct_expr).alias('punct_ratio'),
        pl.col('text').map_elements(_ttr, return_dtype=pl.Float64)
            .alias('ttr'),
    ])
    # text_length_safe was only a helper denominator.
    return df.drop(['text_length_safe'])
log_step_end('Data loading and processing')


## TF–IDF features

In [None]:
log_step_start('TF–IDF features')
def add_ngram_tfidf(
    train_texts: cudf.Series,
    valid_texts: cudf.Series,
    test_texts: cudf.Series,
    n: int = 3,
    max_features: int = 5000,
) -> Tuple[cpx_sparse.csr_matrix, cpx_sparse.csr_matrix, cpx_sparse.csr_matrix]:
    """Fit a TF-IDF vectorizer on the training texts and transform all splits.

    The vectorizer uses n-grams of length 1 through ``n``, English stop words
    and at most ``max_features`` terms. It is fitted on ``train_texts`` only;
    the validation and test texts are transformed with that fitted vocabulary.

    Returns:
        ``(train, valid, test)`` TF-IDF matrices, in that order.
    """
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, n),
        max_features=max_features,
    )
    train_matrix = tfidf.fit_transform(train_texts)
    valid_matrix = tfidf.transform(valid_texts)
    test_matrix = tfidf.transform(test_texts)
    return train_matrix, valid_matrix, test_matrix
log_step_end('TF–IDF features')


## Sentiment features

In [None]:
log_step_start('Sentiment features')
def add_sentiment_features(df: pl.DataFrame) -> pl.DataFrame:
    """Append TextBlob sentiment columns to ``df``.

    Adds ``sentiment_polarity`` and ``sentiment_subjectivity`` (Float64),
    computed from the ``text`` column. When TextBlob could not be imported,
    both columns are filled with 0.0. Null texts produce null sentiment values.

    Args:
        df: Frame containing a ``text`` column.

    Returns:
        ``df`` with the two sentiment columns added (or replaced).
    """
    if TextBlob is None:
        return df.with_columns([
            pl.lit(0.0).alias('sentiment_polarity'),
            pl.lit(0.0).alias('sentiment_subjectivity'),
        ])

    polarity = []
    subjectivity = []
    for text in df['text'].to_list():
        if text is None:
            polarity.append(None)
            subjectivity.append(None)
            continue
        # Analyse each text once and read both scores from the same result,
        # instead of building a separate TextBlob per output column.
        sentiment = TextBlob(text).sentiment
        polarity.append(float(sentiment.polarity))
        subjectivity.append(float(sentiment.subjectivity))

    return df.with_columns([
        pl.Series('sentiment_polarity', polarity, dtype=pl.Float64),
        pl.Series('sentiment_subjectivity', subjectivity, dtype=pl.Float64),
    ])
log_step_end('Sentiment features')


## Load data (train/val/test)

In [None]:
log_step_start('Load data (train/val/test)')
# Read the three splits from fixed paths under `data/`; each call also appends
# the basic text-statistics columns produced by process_text_file.
train = process_text_file('data/train.csv')
validation = process_text_file('data/val.csv')
test = process_text_file('data/test.csv')

# Schema checks: every split needs `text`; train and val need `label`;
# test must carry `id` and must not carry `label`.
for name, df in [('train', train), ('val', validation), ('test', test)]:
    assert 'text' in df.columns, f"{name} missing 'text' column"
assert 'label' in train.columns, 'train must have label'
assert 'label' in validation.columns, 'val must have label'
assert 'label' not in test.columns, 'test must NOT have label'
assert 'id' in test.columns, 'test must have id'

# Row counts per split, as a quick sanity check.
print('Rows: train', train.height, ' val', validation.height,
      ' test', test.height)
log_step_end('Load data (train/val/test)')


## Add sentiment features

In [None]:
log_step_start('Add sentiment features')
# Attach the sentiment columns to every split, in train / validation / test order.
train, validation, test = (
    add_sentiment_features(frame) for frame in (train, validation, test)
)
log_step_end('Add sentiment features')


## Assemble features

In [None]:
log_step_start('Assemble features')
# Hand-crafted numeric columns that are placed in front of the TF-IDF block.
# NOTE(review): sentiment_polarity can presumably be negative and text_length
# is unscaled; the combined matrix is later fitted by MultinomialNB — confirm
# that this is acceptable for that model.
feature_cols: List[str] = [
    'text_length',
    'word_count',
    'ttr',
    'sentence_count',
    'avg_sentence_length',
    'punct_ratio',
    'sentiment_polarity',
    'sentiment_subjectivity',
]

# Dense float32 feature tables copied to the GPU and wrapped as CSR so they
# can be stacked horizontally with the sparse TF-IDF matrices.
X_train_basic, X_valid_basic, X_test_basic = (
    cpx_sparse.csr_matrix(
        cp.asarray(frame.select(feature_cols).to_numpy().astype(np.float32))
    )
    for frame in (train, validation, test)
)

# Raw texts as cuDF series for the GPU vectorizer.
train_text, valid_text, test_text = (
    cudf.Series(frame['text'].to_list())
    for frame in (train, validation, test)
)

X_train_ngram, X_valid_ngram, X_test_ngram = add_ngram_tfidf(
    train_text, valid_text, test_text, n=3, max_features=5000
)

# Final design matrices: [basic features | TF-IDF n-grams].
X_train = cpx_sparse.hstack([X_train_basic, X_train_ngram]).tocsr()
X_valid = cpx_sparse.hstack([X_valid_basic, X_valid_ngram]).tocsr()
X_test = cpx_sparse.hstack([X_test_basic, X_test_ngram]).tocsr()

# Labels as int32 CuPy vectors (the test split has none).
y_train = cp.asarray(train['label'].to_numpy()).astype(cp.int32)
y_valid = cp.asarray(validation['label'].to_numpy()).astype(cp.int32)

print('Shapes:')
print('  X_train:', X_train.shape)
print('  X_valid:', X_valid.shape)
print('  X_test :', X_test.shape)
_gc()
log_step_end('Assemble features')


## Class balance and scale_pos_weight

In [None]:
log_step_start('Class balance and scale_pos_weight')
# tolist() yields plain Python ints, so the Counter is built from native values
# and its keys print as 0/1 rather than as NumPy scalars.
counter = Counter(_to_numpy(y_train).tolist())
# Require exactly the labels 0 and 1: the metric helpers in this notebook only
# account for those two values, so any extra label would be silently ignored.
assert set(counter) == {0, 1}, 'labels must be binary {0,1}'
# Ratio of negatives to positives. NOTE(review): no later cell visible here
# consumes scale_pos_weight — confirm whether it should feed the models.
scale_pos_weight = counter[0] / max(counter[1], 1)
print('Class counts:', counter)
print('scale_pos_weight:', scale_pos_weight)
log_step_end('Class balance and scale_pos_weight')


## Train cuML Logistic Regression and MultinomialNB


In [None]:
log_step_start('Train cuML Logistic Regression and MultinomialNB')
# cuml.linear_model.LogisticRegression does not document a `random_state`
# parameter (unlike scikit-learn's estimator), so passing one fails at
# construction time; the keyword is therefore omitted.
lr = LogisticRegression(max_iter=1000)
nb = MultinomialNB()

# Single train/validation split: both models are fitted once on the full
# training matrix.
log_step_start('Fold 1/1 (single split)')
log_step_start('LR fit')
lr.fit(X_train, y_train)
log_step_end('LR fit')
log_step_start('NB fit')
nb.fit(X_train, y_train)
log_step_end('NB fit')
log_step_end('Fold 1/1 (single split)')
print('Models trained.')
_gc()
log_step_end('Train cuML Logistic Regression and MultinomialNB')


## Calibrate with Platt scaling

In [None]:
log_step_start('Calibrate with Platt scaling')
class PlattCalibrator:
    """Calibrate scores by fitting a one-feature logistic regression on them.

    The fitted estimator is kept in ``self.model``.
    """

    def __init__(self):
        self.model = LogisticRegression(max_iter=1000)

    @staticmethod
    def _as_column(scores):
        """Return ``scores`` as a CuPy array of shape (n, 1)."""
        return cp.asarray(scores).reshape(-1, 1)

    def fit(self, scores, y):
        """Fit the calibrator on ``scores`` against labels ``y``; returns self."""
        self.model.fit(self._as_column(scores), y)
        return self

    def predict_proba(self, scores):
        """Return column 1 of the fitted model's ``predict_proba`` for ``scores``."""
        proba = self.model.predict_proba(self._as_column(scores))
        return proba[:, 1]

# Each calibrator is fitted on its base model's positive-class probabilities
# for the validation split.
# NOTE(review): the same validation split is used afterwards for threshold
# tuning and for the printed report, so those numbers are likely optimistic.
nb_valid_scores = nb.predict_proba(X_valid)[:, 1]
calibrated_nb = PlattCalibrator().fit(nb_valid_scores, y_valid)
lr_valid_scores = lr.predict_proba(X_valid)[:, 1]
calibrated_lr = PlattCalibrator().fit(lr_valid_scores, y_valid)
print('Models calibrated.')
_gc()
log_step_end('Calibrate with Platt scaling')


## Ensemble and threshold tuning

In [None]:
log_step_start('Ensemble and threshold tuning')
# Raw base-model probabilities on validation, then their calibrated versions.
scores_nb = predict_proba_chunks(nb, X_valid)
scores_lr = predict_proba_chunks(lr, X_valid)
val_pred_proba_nb = _to_numpy(calibrated_nb.predict_proba(scores_nb))
val_pred_proba_lr = _to_numpy(calibrated_lr.predict_proba(scores_lr))
# Fixed-weight blend: 60% Naive Bayes, 40% logistic regression.
val_pred_proba_ensemble = 0.6 * val_pred_proba_nb + 0.4 * val_pred_proba_lr

y_valid_np = _to_numpy(y_valid)
thresholds = np.arange(0.1, 0.9, 0.01)

# Validation F1 at every candidate cut-off; argmax keeps the first (lowest)
# threshold among ties, matching a strict "greater than" scan.
f1_by_threshold = [
    f1_score_np(y_valid_np, (val_pred_proba_ensemble >= cut).astype(int))
    for cut in thresholds
]
best_idx = int(np.argmax(f1_by_threshold))
best_f1 = f1_by_threshold[best_idx]
best_threshold = float(thresholds[best_idx])

print(
    f'Best threshold: {best_threshold:.2f} with F1: {best_f1:.4f}'
)
_gc()
log_step_end('Ensemble and threshold tuning')


## Validation report and diagnostics

In [None]:
log_step_start('Validation report and diagnostics')
# Hard predictions at the tuned threshold, then the text report.
val_pred_final = (val_pred_proba_ensemble >= best_threshold).astype(int)
val_report = classification_report_np(y_valid_np, val_pred_final)
print(val_report)

residual_plot(y_valid_np, val_pred_proba_ensemble,
              'Residuals: validation ensemble')
val_residuals = y_valid_np - val_pred_proba_ensemble
qq_plot(val_residuals, 'QQ plot: residuals (validation)')
# The violin plot is optional: any failure is downgraded to a warning.
try:
    violin_by_label(train, 'label', 'text_length',
                    'Text length by label (train)')
except Exception as exc:
    warnings.warn(f'Violin plot skipped: {exc}')
log_step_end('Validation report and diagnostics')


## Predict on test and write submission

In [None]:
log_step_start('Predict on test and write submission')
# Score the test split with both base models (chunked), calibrate each set of
# scores, and blend them with the same 0.6 / 0.4 weights used on validation.
scores_nb = predict_proba_chunks(nb, X_test)
scores_lr = predict_proba_chunks(lr, X_test)
p_nb_te = _to_numpy(calibrated_nb.predict_proba(scores_nb))
p_lr_te = _to_numpy(calibrated_lr.predict_proba(scores_lr))
p_ens_te = 0.6 * p_nb_te + 0.4 * p_lr_te
# Apply the threshold selected on the validation split.
yhat_te = (p_ens_te >= best_threshold).astype(int)

# Submission frame: the test ids paired with the predicted 0/1 labels.
submission = pl.DataFrame({'id': test['id'], 'label': yhat_te})
outputs_dir = 'outputs'
# Create the output directory if it does not exist yet.
os.makedirs(outputs_dir, exist_ok=True)
submission_path = os.path.join(outputs_dir, 'submission_hvsm_prod_a.csv')
submission.write_csv(submission_path)
print('Saved', submission_path, 'with', submission.height, 'rows')
_gc()
log_step_end('Predict on test and write submission')
