<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [2]</a>'.</span>

# HVSM: Local CSVs, Time‑boxed Training, Rich Features + Models

This notebook reads `./train.csv`, `./val.csv`, and `./test.csv` from the current folder. It implements non‑sklearn baselines and deep models in PyTorch, with strict time budgets per algorithm, robust early exits, and many seaborn visualizations. All model weights are saved under `./weights/`.


## 1. Environment and Installs

In [1]:
# %%capture
!pip -q install \
  transformers==4.44.2 datasets==2.21.0 torchtext==0.18.0 \
  ftfy==6.2.3 emoji==2.14.0 ipywidgets==8.1.5 xgboost==2.1.1 \
  spacy==3.7.5 spacy-lookups-data==1.0.5 wordfreq==3.1.1 \
  textstat==0.7.4 nltk==3.9.1 scipy==1.11.4 \
  requests_mock==1.11.0 requests==2.31.0 clyent==1.2.1

import warnings
warnings.filterwarnings('ignore')
print('Installs complete.')


[31mERROR: Cannot install datasets==2.21.0, requests==2.31.0 and transformers==4.44.2 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

Installs complete.


## 2. Imports, Device, Seeds

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [2]:
from __future__ import annotations

import json
import math
import os
import random
import re
import string
import time
import zlib
from collections import Counter
from dataclasses import dataclass
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from scipy import sparse as sp

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(context='notebook', style='whitegrid')

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
from wordfreq import zipf_frequency
import textstat
import xgboost as xgb

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
)

# Seed every RNG the notebook touches so shuffles and weight initialisation
# are repeatable from a fresh kernel.
RANDOM_SEED: int = 12345
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# Prefer the GPU when one is visible, otherwise fall back to CPU.
DEVICE: torch.device = (
    torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
)
print(f"Using device: {DEVICE}")

# Lexicon needed by the VADER sentiment analyser.
nltk.download('vader_lexicon', quiet=True)

# Load the spaCy pipeline once (NER disabled). The model is only downloaded
# when the first load attempt reports that it is missing; previously the
# pipeline was loaded twice, once just to probe for its presence.
try:
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
except OSError:
    from spacy.cli import download as _spacy_dl
    _spacy_dl('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
sia = SentimentIntensityAnalyzer()

# All checkpoints written by the training helpers go under ./weights.
os.makedirs('./weights', exist_ok=True)


ModuleNotFoundError: No module named 'spacy'

## 3. Load Local CSVs

In [None]:
# Input splits, resolved relative to the notebook's working directory.
TRAIN_PATH: str = './train.csv'
VAL_PATH: str = './val.csv'
TEST_PATH: str = './test.csv'
# Stop immediately if any of the three files is absent.
assert all(os.path.exists(p) for p in (TRAIN_PATH, VAL_PATH, TEST_PATH))

def clean_text(text: str) -> str:
    """Normalise line endings and strip control characters from ``text``.

    Args:
        text: Raw document text. Any non-string value (e.g. NaN) is treated
            as missing.

    Returns:
        The cleaned string, or ``''`` for non-string input. Newlines and tabs
        are kept; every other character below ``' '`` is removed.
    """
    if not isinstance(text, str):
        return ''
    # Collapse CRLF first so a Windows line ending becomes ONE newline rather
    # than two, then map any remaining bare CR.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Tabs are whitespace, not noise: deleting them would glue the neighbouring
    # words together and the CharCNN vocabulary explicitly contains '\t'.
    return ''.join(ch for ch in text if ch in '\n\t' or ch >= ' ')

# Read the three splits from disk.
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

# Replace missing text with '', coerce to str and run the cleaner, in place.
for df in (train_df, val_df, test_df):
    df['text'] = df['text'].fillna('').astype(str).map(clean_text)

# Preview the first rows of each split.
for df in (train_df, val_df, test_df):
    display(df.head())


## 4. Metrics and Cross‑Validation Utilities

In [None]:
def f1_macro(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Unweighted mean of per-class F1 over every label seen in either array.

    Both inputs are cast to ``int``. A tiny epsilon in the precision/recall
    denominators avoids division by zero; a class with zero precision and
    zero recall contributes an F1 of 0.
    """
    truth = y_true.astype(int)
    guess = y_pred.astype(int)
    per_class: List[float] = []
    for label in np.unique(np.concatenate([truth, guess])):
        is_true = truth == label
        is_pred = guess == label
        tp = np.sum(is_true & is_pred)
        fp = np.sum(~is_true & is_pred)
        fn = np.sum(is_true & ~is_pred)
        precision = tp / (tp + fp + 1e-12)
        recall = tp / (tp + fn + 1e-12)
        pr_sum = precision + recall
        if pr_sum == 0:
            per_class.append(0.0)
        else:
            per_class.append(float(2 * precision * recall / pr_sum))
    return float(np.mean(per_class))

def kfold_indices(n: int, k: int, seed: int) -> List[Tuple[np.ndarray,
                                                          np.ndarray]]:
    """Return ``k`` shuffled (train_idx, val_idx) splits of ``range(n)``.

    Indices are permuted with a generator seeded by ``seed`` and cut into
    ``k`` nearly equal folds; fold ``i`` is the validation part of split
    ``i`` and the remaining folds, in order, form its training part.
    """
    order = np.arange(n)
    np.random.default_rng(seed).shuffle(order)
    chunks = np.array_split(order, k)
    return [
        (np.concatenate(chunks[:i] + chunks[i + 1:]), chunks[i])
        for i in range(k)
    ]


## 5. EDA: Distributions, Correlations, Stylometry

In [None]:
def add_basic_feats(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with simple length and character-ratio columns.

    Adds ``n_chars``, ``n_words`` (whitespace split) and the share of
    punctuation, digit and uppercase characters. Ratios divide by
    ``n_chars`` clipped to at least 1 so empty texts yield 0.
    """
    out = df.copy()
    text = out['text']
    out['n_chars'] = text.str.len()
    out['n_words'] = text.str.split().map(len)
    denom = out['n_chars'].clip(lower=1)
    ratio_patterns = (
        ('pct_punct', r"[\.,;:!?]"),
        ('pct_digits', r"\d"),
        ('pct_upper', r"[A-Z]"),
    )
    for column, pattern in ratio_patterns:
        out[column] = text.str.count(pattern) / denom
    return out

train_feats = add_basic_feats(train_df)
fig, axes = plt.subplots(2, 3, figsize=(13, 7))

# Top-left panel: class balance.
sns.countplot(x='label', data=train_df, ax=axes[0, 0])
axes[0, 0].set_title('Label distribution')

# Remaining five panels (row-major order): per-class density of each feature.
kde_panels = (
    ('n_chars', 'Chars by class'),
    ('n_words', 'Words by class'),
    ('pct_punct', 'Punctuation ratio'),
    ('pct_digits', 'Digits ratio'),
    ('pct_upper', 'Uppercase ratio'),
)
for ax, (column, title) in zip(axes.ravel()[1:], kde_panels):
    sns.kdeplot(data=train_feats, x=column, hue='label', ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()


## 6. Rich Feature Engineering

In [None]:
def word_ngrams(tokens: List[str], n_min: int, n_max: int) -> List[str]:
    """List every token n-gram for n in ``[n_min, n_max]``, joined with '_'.

    Grams are ordered by n, then by start position. An empty range
    (``n_max < n_min``) yields an empty list.
    """
    return [
        '_'.join(tokens[start:start + size])
        for size in range(n_min, n_max + 1)
        for start in range(len(tokens) - size + 1)
    ]

def char_ngrams(text: str, n_min: int, n_max: int) -> List[str]:
    """List every character n-gram of ``text`` for n in ``[n_min, n_max]``.

    Grams are ordered by n, then by start offset. An empty range
    (``n_max < n_min``) yields an empty list.
    """
    return [
        text[start:start + size]
        for size in range(n_min, n_max + 1)
        for start in range(len(text) - size + 1)
    ]

def hash_token(tok: str, dim: int, seed: int) -> int:
    """Map a feature string to a stable bucket index in ``[0, dim)``.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    bucket ids used to change on every kernel restart and weights saved under
    ``./weights`` could not be matched to features again. CRC32 is
    deterministic across processes and platforms.

    Args:
        tok: Feature text (token, word n-gram or character n-gram).
        dim: Number of buckets; must be positive.
        seed: Salt, used as CRC32's starting value (masked to 32 bits).

    Returns:
        The bucket index for ``tok``.
    """
    # 'surrogatepass' keeps lone surrogates from raising during encoding.
    payload = tok.encode('utf-8', 'surrogatepass')
    return zlib.crc32(payload, seed & 0xFFFFFFFF) % dim

def build_hashed_csr(texts: List[str], dim: int, seed: int,
                     w_ng: Tuple[int, int], c_ng: Tuple[int, int]
                     ) -> sp.csr_matrix:
    """Hash the bag of features of each text into a ``(len(texts), dim)`` CSR.

    The features of a text are its whitespace tokens, plus word n-grams for
    the ``w_ng`` range, plus character n-grams for the ``c_ng`` range. Each
    feature is mapped to a column by ``hash_token`` and the cell stores how
    often that column was hit. The raw tokens are always included, whatever
    the two ranges are.
    """
    values: List[float] = []
    row_ids: List[int] = []
    col_ids: List[int] = []
    for r, text in enumerate(texts):
        words = text.split()
        features = (
            words
            + word_ngrams(words, w_ng[0], w_ng[1])
            + char_ngrams(text, c_ng[0], c_ng[1])
        )
        # Count hits per hashed column for this document.
        hits = Counter(hash_token(feat, dim, seed) for feat in features)
        for col, n_hits in hits.items():
            row_ids.append(r)
            col_ids.append(col)
            values.append(float(n_hits))
    coords = (np.array(row_ids), np.array(col_ids))
    return sp.csr_matrix((np.array(values), coords),
                         shape=(len(texts), dim), dtype=np.float32)

def tfidf_transform(mat: sp.csr_matrix) -> sp.csr_matrix:
    """Apply log-TF, smoothed IDF and L2 row normalisation to a count matrix.

    IDF is ``log((1 + n_docs) / (1 + df)) + 1`` with ``df`` computed from the
    matrix passed in, so each call uses the document frequencies of its own
    input. Rows with zero norm are left as all zeros. The input is not
    modified.
    """
    doc_freq = np.asarray((mat > 0).sum(axis=0)).ravel()
    idf = np.log((1 + mat.shape[0]) / (1 + doc_freq)) + 1.0
    # Dampen raw counts on a copy so the caller's matrix stays untouched.
    damped = mat.tocoo(copy=True)
    damped.data = np.log1p(damped.data)
    weighted = damped.tocsr(copy=True).multiply(idf)
    norms = np.sqrt(weighted.multiply(weighted).sum(axis=1)).A1
    norms[norms == 0] = 1.0
    return sp.diags(1.0 / norms).dot(weighted)

def pos_dep_feats(text: str) -> Dict[str, float]:
    """Share of tokens carrying each POS tag and each dependency label.

    Runs the module-level ``nlp`` pipeline on ``text``. Keys are ``pos_<tag>``
    and ``dep_<label>``; only tags that occur in the document are present.
    Proportions are relative to the token count (at least 1).
    """
    doc = nlp(text)
    denom = max(1, len(doc))
    feats: Dict[str, float] = {}
    for k, v in Counter(tok.pos_ for tok in doc).items():
        feats[f'pos_{k}'] = v / denom
    for k, v in Counter(tok.dep_ for tok in doc).items():
        feats[f'dep_{k}'] = v / denom
    return feats

def sentiment_feats(text: str) -> Dict[str, float]:
    """Polarity scores from the module-level ``sia`` analyser, keyed ``sent_<name>``."""
    feats: Dict[str, float] = {}
    for k, v in sia.polarity_scores(text).items():
        feats[f'sent_{k}'] = float(v)
    return feats

def readability_feats(text: str) -> Dict[str, float]:
    """Four ``textstat`` readability scores for ``text``.

    Computes Flesch reading ease, SMOG, ARI and Gunning fog. If any one of
    the calls raises, all four values fall back to 0.0.
    """
    keys = ('read_flesch', 'read_smog', 'read_ari', 'read_gunning')
    try:
        scores = (
            float(textstat.flesch_reading_ease(text)),
            float(textstat.smog_index(text)),
            float(textstat.automated_readability_index(text)),
            float(textstat.gunning_fog(text)),
        )
    except Exception:
        # Best-effort: a failure in one metric zeroes the whole group.
        scores = (0.0, 0.0, 0.0, 0.0)
    return dict(zip(keys, scores))

def lexical_richness(tokens: List[str]) -> Dict[str, float]:
    """Vocabulary-diversity statistics for a token list.

    Returns the type/token ratio, the share of types occurring exactly once
    (``hapax_ratio``) and exactly twice (``dis_ratio``), and Yule's K
    computed as ``1e4 * (sum(f^2) - N) / N^2``. Denominators are guarded so
    an empty list gives all zeros.
    """
    freq = Counter(tokens)
    n_types = len(freq)
    type_denom = max(1, n_types)
    once = sum(1 for count in freq.values() if count == 1)
    twice = sum(1 for count in freq.values() if count == 2)
    m1 = sum(freq.values())
    m2 = sum(count * count for count in freq.values())
    return {
        'ttr': float(n_types / max(1, len(tokens))),
        'hapax_ratio': float(once / type_denom),
        'dis_ratio': float(twice / type_denom),
        'yule_k': float(1e4 * (m2 - m1) / (m1 * m1 + 1e-12)),
    }

def zipf_feats(tokens: List[str]) -> Dict[str, float]:
    """Mean and standard deviation of the English Zipf frequency of the tokens.

    An empty token list returns zeros without looking anything up.
    """
    if tokens:
        scores = [zipf_frequency(t, 'en') for t in tokens]
        return {
            'zipf_mean': float(np.mean(scores)),
            'zipf_std': float(np.std(scores)),
        }
    return {'zipf_mean': 0.0, 'zipf_std': 0.0}

def build_dense_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build one row of hand-crafted features per text in ``df['text']``.

    Combines lexical richness, POS/dependency proportions, sentiment,
    readability and Zipf statistics. Features missing for a document are
    filled with 0.0. The column set depends on which POS/dependency tags
    occur in ``df``, so two different frames can yield different columns.
    """
    def featurize(text: str) -> Dict[str, float]:
        # Same update order as the feature groups listed above.
        words = text.split()
        feats: Dict[str, float] = {}
        for group in (
            lexical_richness(words),
            pos_dep_feats(text),
            sentiment_feats(text),
            readability_feats(text),
            zipf_feats(words),
        ):
            feats.update(group)
        return feats

    records: List[Dict[str, float]] = [
        featurize(t) for t in df['text'].tolist()
    ]
    return pd.DataFrame(records).fillna(0.0).reset_index(drop=True)

# Number of hash buckets per sparse block and the salt passed to hash_token.
W_DIM: int = 2**19
SEED_HASH: int = 13

def build_sparse_features(df: pd.DataFrame) -> sp.csr_matrix:
    """Hashed TF-IDF features: a word block and a char block, side by side.

    The word block uses word 1-2 grams with an empty char range; the char
    block uses char 3-5 grams with an empty word range. Each block has
    ``W_DIM`` columns and gets its own TF-IDF transform before the two are
    stacked horizontally.
    """
    texts = df['text'].tolist()
    blocks = [
        build_hashed_csr(texts, W_DIM, SEED_HASH, w_ng=(1, 2), c_ng=(0, -1)),
        build_hashed_csr(texts, W_DIM, SEED_HASH, w_ng=(0, -1), c_ng=(3, 5)),
    ]
    return sp.hstack([tfidf_transform(block) for block in blocks],
                     format='csr')

def build_all_features(df: pd.DataFrame) -> Tuple[sp.csr_matrix, pd.DataFrame]:
    """Return ``(sparse_features, dense_features)`` for ``df``.

    The dense frame is computed first, then the hashed sparse matrix.
    """
    dense_part = build_dense_features(df)
    sparse_part = build_sparse_features(df)
    return sparse_part, dense_part


## 7. Feature Visualization

In [None]:
# Correlation heatmap over all dense training features.
train_dense = build_dense_features(train_df)
corr = train_dense.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.title('Dense feature correlations')
plt.tight_layout()
plt.show()

# Per-class violins for the first eight POS-proportion columns.
pos_cols = [c for c in train_dense.columns if c.startswith('pos_')]
keep_cols = pos_cols[:8]
plt.figure(figsize=(10, 5))
pos_with_label = pd.concat([train_dense[keep_cols], train_df['label']],
                           axis=1)
melted = pos_with_label.melt(id_vars='label', var_name='feat',
                             value_name='val')
sns.violinplot(data=melted, x='feat', y='val', hue='label', split=True)
plt.xticks(rotation=30)
plt.title('POS proportions by class')
plt.tight_layout()
plt.show()


## 8. Classical Models (no sklearn)

In [None]:
class LinearClassifier(nn.Module):
    """Single affine layer mapping ``in_dim`` features to ``n_classes`` logits."""

    def __init__(self, in_dim: int, n_classes: int = 2) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=in_dim, out_features=n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (un-normalised) class scores for ``x``."""
        logits = self.fc(x)
        return logits

def train_linear_model(
    Xs: sp.csr_matrix,
    Xd: np.ndarray,
    y: np.ndarray,
    Xs_val: sp.csr_matrix,
    Xd_val: np.ndarray,
    y_val: np.ndarray,
    loss: str = 'log',
    epochs: int = 6,
    bs: int = 256,
    lr: float = 1e-2,
    name: str = 'linear'
) -> Tuple[LinearClassifier, float]:
    """Train a linear classifier on stacked sparse + dense features.

    Args:
        Xs, Xd: Sparse and dense training features (stacked column-wise).
        y: Training labels, used as class indices.
        Xs_val, Xd_val, y_val: Validation counterparts.
        loss: ``'hinge'`` for a one-vs-rest hinge loss; any other value uses
            cross-entropy.
        epochs: Maximum number of passes over the training data.
        bs: Mini-batch size for training and prediction.
        lr: Adam learning rate.
        name: Tag used in log messages, the plot title and the checkpoint
            file ``./weights/<name>.pt``.

    Returns:
        ``(model, best)`` where ``best`` is the highest validation macro-F1
        seen in any epoch. The saved/returned weights are those of the LAST
        epoch run, not necessarily the best one.
    """
    start = time.time()
    time_budget = 3 * 60 * 60  # 3 hours
    early_check = 30 * 60  # 30 minutes

    # Sparse and dense parts become one CSR matrix per split.
    X = sp.hstack([Xs, sp.csr_matrix(Xd)], format='csr')
    V = sp.hstack([Xs_val, sp.csr_matrix(Xd_val)], format='csr')
    in_dim = X.shape[1]
    model = LinearClassifier(in_dim).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    def batcher(A: sp.csr_matrix, b: np.ndarray, batch: int):
        """Yield shuffled (features, labels) tensor batches on DEVICE."""
        n = A.shape[0]
        idx = np.arange(n)
        # Uses NumPy's global RNG, so order depends on np.random.seed state.
        np.random.shuffle(idx)
        for i in range(0, n, batch):
            j = idx[i:i+batch]
            # Each batch is densified before being moved to the device.
            Xt = torch.tensor(A[j].toarray(), dtype=torch.float32,
                               device=DEVICE)
            yt = torch.tensor(b[j], dtype=torch.long, device=DEVICE)
            yield Xt, yt

    def predict(A: sp.csr_matrix) -> np.ndarray:
        """Return logits for every row of ``A`` as one stacked array."""
        outs: List[np.ndarray] = []
        model.eval()
        with torch.no_grad():
            for i in range(0, A.shape[0], bs):
                sl = slice(i, min(i+bs, A.shape[0]))
                Xt = torch.tensor(A[sl].toarray(), dtype=torch.float32,
                                   device=DEVICE)
                outs.append(model(Xt).cpu().numpy())
        return np.vstack(outs)

    best = -1.0
    history: List[Tuple[int, float]] = []
    for ep in range(1, epochs + 1):
        # The wall-clock budget is only checked at the start of an epoch.
        if time.time() - start > time_budget:
            print(f"[STOP] {name} exceeded 3h.")
            break
        model.train(); total = 0.0
        for Xt, yt in batcher(X, y, bs):
            opt.zero_grad(set_to_none=True)
            logits = model(Xt)
            if loss == 'hinge':
                # Map one-hot targets to +/-1 and penalise margins below 1
                # for both output columns.
                yt_oh = torch.nn.functional.one_hot(yt, num_classes=2)
                margins = 1 - logits * (2 * yt_oh - 1)
                l = torch.clamp(margins, min=0.0).mean()
            else:
                l = nn.CrossEntropyLoss()(logits, yt)
            l.backward(); opt.step(); total += float(l.item())
        val_logits = predict(V)
        val_pred = val_logits.argmax(1)
        f1 = f1_macro(y_val, val_pred)
        history.append((ep, f1))
        print(f"Epoch {ep} | val F1: {f1:.4f} | loss: {total:.2f}")
        best = max(best, f1)
        # Give up on models still below 0.65 macro-F1 once 30 minutes passed.
        if time.time() - start > early_check and best < 0.65:
            print(f"[DROP] {name} <0.65 F1 after 30 min. Aborting.")
            break
    # Save weights
    torch.save(model.state_dict(), f'./weights/{name}.pt')
    # Learning curve
    if history:
        epc = [e for e, _ in history]
        f1s = [v for _, v in history]
        plt.figure(figsize=(5, 3))
        sns.lineplot(x=epc, y=f1s)
        plt.xlabel('epoch'); plt.ylabel('val F1')
        plt.title(f'Learning curve: {name}')
        plt.tight_layout(); plt.show()
    return model, best

def random_projection(X: sp.csr_matrix, out_dim: int, seed: int
                     ) -> np.ndarray:
    """Project ``X`` onto ``out_dim`` Gaussian random directions.

    The projection matrix is regenerated from ``seed`` on every call, so the
    same seed and input width always give the same basis. Returns a dense
    float32 array of shape ``(X.shape[0], out_dim)``.
    """
    generator = np.random.default_rng(seed)
    basis = generator.standard_normal((X.shape[1], out_dim))
    basis = basis.astype(np.float32)
    projected = X.dot(basis)
    return projected.astype(np.float32)

def knn_predict(X_tr: np.ndarray, y_tr: np.ndarray, X_te: np.ndarray,
                k: int = 11) -> np.ndarray:
    """Binary k-nearest-neighbour prediction using cosine similarity.

    Args:
        X_tr: Training vectors, one row per example.
        y_tr: Training labels; the vote assumes values 0/1.
        X_te: Query vectors, one row per example.
        k: Neighbours per vote. Values larger than the number of training
            rows are clamped (previously this raised inside
            ``np.argpartition``).

    Returns:
        Integer array of predicted labels, one per query row (empty integer
        array when ``X_te`` has no rows). A query is labelled 1 when at
        least half of its neighbours are labelled 1.

    Raises:
        ValueError: If ``X_tr`` has no rows.
    """
    n_train = X_tr.shape[0]
    if n_train == 0:
        raise ValueError('knn_predict requires at least one training row.')
    k_eff = min(k, n_train)
    y_tr = np.asarray(y_tr)

    def norm_rows(A: np.ndarray) -> np.ndarray:
        # L2-normalise rows; all-zero rows are left untouched.
        n = np.linalg.norm(A, axis=1, keepdims=True)
        n[n == 0] = 1.0
        return A / n

    A = norm_rows(X_tr)
    B = norm_rows(X_te)
    chunks: List[np.ndarray] = []
    bs = 1024  # queries per block, bounds the size of the similarity matrix
    for i in range(0, B.shape[0], bs):
        sims = B[i:i + bs] @ A.T
        # Unordered indices of the k_eff most similar training rows.
        idx = np.argpartition(sims, -k_eff, axis=1)[:, -k_eff:]
        votes = y_tr[idx]
        chunks.append((votes.mean(1) >= 0.5).astype(int))
    if not chunks:
        return np.array([], dtype=int)
    return np.concatenate(chunks)

def train_xgboost(Xs: sp.csr_matrix, Xd: np.ndarray, y: np.ndarray,
                  Xs_v: sp.csr_matrix, Xd_v: np.ndarray, y_v: np.ndarray,
                  name: str = 'xgboost', rounds: int = 200
                  ) -> Tuple[xgb.Booster, float]:
    """Train a binary XGBoost model on stacked sparse + dense features.

    Args:
        Xs, Xd, y: Sparse features, dense features and labels for training.
        Xs_v, Xd_v, y_v: Validation counterparts.
        name: Checkpoint stem; the model is saved to ``./weights/<name>.json``.
        rounds: Maximum number of boosting rounds.

    Returns:
        ``(booster, val_f1)`` with macro-F1 computed at a 0.5 threshold.
        When training took more than 30 minutes and F1 is below 0.65 the
        model is returned without being saved.
    """
    start = time.time()
    time_budget = 3 * 60 * 60
    early_check = 30 * 60

    X = sp.hstack([Xs, sp.csr_matrix(Xd)], format='csr')
    V = sp.hstack([Xs_v, sp.csr_matrix(Xd_v)], format='csr')
    dtr = xgb.DMatrix(X, label=y)
    dva = xgb.DMatrix(V, label=y_v)
    params = {
        'objective': 'binary:logistic', 'tree_method': 'hist',
        'eval_metric': 'logloss', 'max_depth': 7, 'eta': 0.12,
        'subsample': 0.9, 'colsample_bytree': 0.9
    }

    class _TimeBudget(xgb.callback.TrainingCallback):
        """Stop boosting once the wall-clock budget is used up."""

        def after_iteration(self, model, epoch, evals_log) -> bool:
            # Returning True tells xgboost to stop training.
            if time.time() - start > time_budget:
                print('[STOP] XGBoost exceeded 3h.')
                return True
            return False

    # The budget used to be computed but never checked; the callback enforces
    # it between boosting rounds.
    bst = xgb.train(params, dtr, num_boost_round=rounds,
                    evals=[(dtr, 'train'), (dva, 'val')],
                    verbose_eval=50, callbacks=[_TimeBudget()])

    preds = (bst.predict(dva) >= 0.5).astype(int)
    f1 = f1_macro(y_v, preds)
    if time.time() - start > early_check and f1 < 0.65:
        print('[DROP] XGBoost <0.65 F1 after 30 min. Aborting.')
        return bst, f1
    # Honour `name` so several boosters can be kept side by side.
    bst.save_model(f'./weights/{name}.json')
    print(f"XGB val F1: {f1:.4f}")
    return bst, f1


## 9. PyTorch Text Models with Time Guards

In [None]:
class HashBagModel(nn.Module):
    """Linear model over hashed features, implemented as a summed EmbeddingBag.

    Each hashed feature id owns an ``n_classes``-wide row; the logits of a
    document are the sum of the rows of its feature ids.
    """

    def __init__(self, dim: int, n_classes: int = 2) -> None:
        super().__init__()
        self.emb = nn.EmbeddingBag(num_embeddings=dim,
                                   embedding_dim=n_classes, mode='sum')

    def forward(self, idxs: torch.Tensor, offs: torch.Tensor
                ) -> torch.Tensor:
        """Sum embedding rows per bag; ``offs`` marks where each bag starts in ``idxs``."""
        logits = self.emb(idxs, offs)
        return logits

def build_indices(text: str, dim: int, seed: int,
                  w_ng: Tuple[int, int], c_ng: Tuple[int, int]
                  ) -> List[int]:
    """Return the distinct hashed feature ids of ``text``.

    Features are the whitespace tokens plus word n-grams (``w_ng`` range)
    plus character n-grams (``c_ng`` range); duplicates collapse because the
    ids are collected in a set.
    """
    words = text.split()
    features = (
        words
        + word_ngrams(words, w_ng[0], w_ng[1])
        + char_ngrams(text, c_ng[0], c_ng[1])
    )
    buckets = set()
    for feat in features:
        buckets.add(hash_token(feat, dim, seed))
    return list(buckets)

class HashedDataset(Dataset):
    """Dataset yielding ``(hashed_feature_ids, label_or_None)`` per row.

    Feature ids are computed on the fly from the ``text`` column; the label
    is ``None`` when the frame has no ``label`` column or the value is NaN.
    """

    def __init__(self, df: pd.DataFrame, dim: int, seed: int,
                 w_ng: Tuple[int, int], c_ng: Tuple[int, int]) -> None:
        self.df = df.reset_index(drop=True)
        self.dim = int(dim)
        self.seed = int(seed)
        self.w_ng = w_ng
        self.c_ng = c_ng

    def __len__(self) -> int:
        return int(len(self.df))

    def __getitem__(self, i: int) -> Tuple[torch.Tensor, Optional[int]]:
        row = self.df.iloc[i]
        feature_ids = build_indices(row['text'], self.dim, self.seed,
                                    self.w_ng, self.c_ng)
        has_label = 'label' in row and not pd.isna(row['label'])
        y: Optional[int] = int(row['label']) if has_label else None
        return torch.tensor(feature_ids, dtype=torch.long), y

def collate_hashed(
    batch: List[Tuple[torch.Tensor, Optional[int]]]
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """Pack variable-length id tensors into EmbeddingBag inputs.

    Returns ``(ids, offsets, labels)``: all ids concatenated, the start
    position of each sample inside ``ids``, and a label tensor that is only
    built when every sample in the batch carries a label (otherwise None).
    """
    tensors = [x for x, _ in batch]
    starts: List[int] = []
    cursor = 0
    for t in tensors:
        starts.append(cursor)
        cursor += int(t.numel())
    labels = [int(y) for _, y in batch if y is not None]

    if tensors:
        feats_cat = torch.cat(tensors)
    else:
        feats_cat = torch.tensor([], dtype=torch.long)
    offs = torch.tensor(starts, dtype=torch.long)
    y_t: Optional[torch.Tensor] = (
        torch.tensor(labels, dtype=torch.long)
        if len(labels) == len(batch) else None
    )
    return feats_cat, offs, y_t

def train_hash_model(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    dim: int = 2**19,
    epochs: int = 3,
    bs: int = 128,
    lr: float = 5e-3,
    seed: int = 13,
    name: str = 'embbag'
) -> Tuple[HashBagModel, float]:
    """Train the hashed bag-of-n-grams model with time guards.

    Args:
        train_df, val_df: Frames with ``text`` and ``label`` columns.
        dim: Number of hash buckets (rows of the embedding table).
        epochs: Maximum number of epochs.
        bs: Batch size.
        lr: Adam learning rate.
        seed: Salt for feature hashing.
        name: Checkpoint stem; weights go to ``./weights/<name>.pt``. The
            argument used to be ignored in favour of a fixed file name; the
            default keeps the old path.

    Returns:
        ``(model, best)`` where ``best`` is the highest validation macro-F1
        over the epochs run. The saved weights are those of the last epoch.
    """
    start = time.time()
    early_check = 30 * 60
    time_budget = 3 * 60 * 60

    # Word 1-2 grams and char 3-5 grams, hashed lazily per sample.
    tr_ds = HashedDataset(train_df, dim, seed, (1, 2), (3, 5))
    va_ds = HashedDataset(val_df, dim, seed, (1, 2), (3, 5))
    tr_dl = DataLoader(tr_ds, batch_size=bs, shuffle=True,
                       collate_fn=collate_hashed)
    va_dl = DataLoader(va_ds, batch_size=bs, shuffle=False,
                       collate_fn=collate_hashed)

    model = HashBagModel(dim).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    best = -1.0
    hist: List[Tuple[int, float]] = []

    for ep in range(1, epochs + 1):
        # Budget is checked once per epoch, before training starts.
        if time.time() - start > time_budget:
            print('[STOP] EmbBag exceeded 3h.')
            break

        model.train()
        total = 0.0
        for x, offs, y in tr_dl:
            x = x.to(DEVICE)
            offs = offs.to(DEVICE)
            assert y is not None
            y = y.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            loss = loss_fn(model(x, offs), y)
            loss.backward()
            opt.step()
            total += float(loss.item())

        model.eval()
        y_t, y_p = [], []
        with torch.no_grad():
            for x, offs, y in va_dl:
                p = model(x.to(DEVICE), offs.to(DEVICE)).argmax(1)
                # Batches without a full set of labels are skipped.
                if y is not None:
                    y_t.extend(list(y.numpy()))
                    y_p.extend(list(p.cpu().numpy()))
        f1 = f1_macro(np.array(y_t), np.array(y_p))
        hist.append((ep, f1))
        print(f"Epoch {ep} | val F1: {f1:.4f} | loss: {total:.2f}")
        best = max(best, f1)

        # Drop models that are still weak after 30 minutes.
        if time.time() - start > early_check and best < 0.65:
            print('[DROP] EmbBag <0.65 F1 after 30 min. Aborting.')
            break

    torch.save(model.state_dict(), f'./weights/{name}.pt')
    if hist:
        epc = [e for e, _ in hist]
        f1s = [v for _, v in hist]
        plt.figure(figsize=(5, 3))
        sns.lineplot(x=epc, y=f1s)
        plt.title('Learning curve: EmbBag')
        plt.tight_layout()
        plt.show()
    return model, best


In [None]:
class CharCNN(nn.Module):
    """Character-level CNN: embedding, two conv layers, global max-pool, linear head.

    Characters in ``vocab`` get ids starting at 1; id 0 is shared by padding
    and by characters outside the vocabulary.
    """

    def __init__(self, vocab: str, emb_dim: int = 32, n_classes: int = 2
                 ) -> None:
        super().__init__()
        self.vocab = vocab
        self.map = {ch: pos + 1 for pos, ch in enumerate(vocab)}
        self.emb = nn.Embedding(len(vocab) + 1, emb_dim)
        layers = [
            nn.Conv1d(emb_dim, 128, 7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Conv1d(128, 256, 5, padding=2),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
        ]
        self.conv = nn.Sequential(*layers)
        self.fc = nn.Linear(256, n_classes)

    def encode(self, text: str, max_len: int = 1024) -> List[int]:
        """Turn ``text`` into exactly ``max_len`` ids (truncate, then zero-pad)."""
        ids = [self.map.get(ch, 0) for ch in text[:max_len]]
        return ids + [0] * (max_len - len(ids))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of id sequences to class logits."""
        # Conv1d wants channels before the sequence axis.
        channels_first = self.emb(x).transpose(1, 2)
        pooled = self.conv(channels_first).squeeze(-1)
        return self.fc(pooled)

class CharDataset(Dataset):
    """Dataset yielding ``(encoded_text, label_or_None)`` per row.

    ``enc`` converts a text into a list of ints; the label is ``None`` when
    the frame has no ``label`` column or the value is NaN.
    """

    def __init__(self, df: pd.DataFrame, enc: Callable[[str], List[int]]) -> None:
        self.df = df.reset_index(drop=True)
        self.enc = enc

    def __len__(self) -> int:
        return int(len(self.df))

    def __getitem__(self, i: int) -> Tuple[torch.Tensor, Optional[int]]:
        row = self.df.iloc[i]
        has_label = 'label' in row and not pd.isna(row['label'])
        y: Optional[int] = int(row['label']) if has_label else None
        return torch.tensor(self.enc(row['text']), dtype=torch.long), y

def train_char_cnn(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    epochs: int = 3,
    bs: int = 128,
    lr: float = 2e-3,
    name: str = 'charcnn'
) -> Tuple[CharCNN, float]:
    """Train the character CNN with time guards.

    Args:
        train_df, val_df: Frames with ``text`` and ``label`` columns.
        epochs: Maximum number of epochs.
        bs: Batch size.
        lr: Adam learning rate.
        name: Checkpoint stem; weights go to ``./weights/<name>.pt``. The
            argument used to be ignored in favour of a fixed file name; the
            default keeps the old path.

    Returns:
        ``(model, best)`` where ``best`` is the highest validation macro-F1
        over the epochs run. The saved weights are those of the last epoch.
    """
    start = time.time()
    early_check = 30 * 60
    time_budget = 3 * 60 * 60

    # ASCII letters, digits, punctuation and basic whitespace; every other
    # character shares id 0 with padding.
    vocab = string.ascii_letters + string.digits + string.punctuation + ' \n\t'
    model = CharCNN(vocab).to(DEVICE)
    enc = lambda s: model.encode(s, 1024)
    tr_dl = DataLoader(CharDataset(train_df, enc), batch_size=bs,
                       shuffle=True)
    va_dl = DataLoader(CharDataset(val_df, enc), batch_size=bs,
                       shuffle=False)

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    best = -1.0
    hist: List[Tuple[int, float]] = []

    for ep in range(1, epochs + 1):
        # Budget is checked once per epoch, before training starts.
        if time.time() - start > time_budget:
            print('[STOP] CharCNN exceeded 3h.')
            break

        model.train()
        tot = 0.0
        for x, y in tr_dl:
            x = x.to(DEVICE)
            assert y is not None
            y = y.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            loss = loss_fn(model(x), y)
            loss.backward()
            opt.step()
            tot += float(loss.item())

        model.eval()
        y_t, y_p = [], []
        with torch.no_grad():
            for x, y in va_dl:
                p = model(x.to(DEVICE)).argmax(1).cpu().numpy()
                if y is not None:
                    y_t.extend(list(y.numpy()))
                    y_p.extend(list(p))
        f1 = f1_macro(np.array(y_t), np.array(y_p))
        hist.append((ep, f1))
        print(f"Epoch {ep} | val F1: {f1:.4f} | loss: {tot:.2f}")
        best = max(best, f1)

        # Drop models that are still weak after 30 minutes.
        if time.time() - start > early_check and best < 0.65:
            print('[DROP] CharCNN <0.65 F1 after 30 min. Aborting.')
            break

    torch.save(model.state_dict(), f'./weights/{name}.pt')
    if hist:
        epc = [e for e, _ in hist]
        f1s = [v for _, v in hist]
        plt.figure(figsize=(5, 3))
        sns.lineplot(x=epc, y=f1s)
        plt.title('Learning curve: CharCNN')
        plt.tight_layout()
        plt.show()
    return model, best


In [None]:
def tokenize_batch(tokenizer: Any, texts: List[str], max_len: int
                  ) -> Dict[str, torch.Tensor]:
    """Tokenise ``texts`` with padding and truncation to ``max_len``.

    Calls ``tokenizer`` asking for PyTorch tensors and returns its items as a
    plain ``dict``.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    return dict(encoded.items())

class HFClsDataset(Dataset):
    """Dataset yielding ``(text, label_or_None)``; tokenisation happens in the collate step."""

    def __init__(self, df: pd.DataFrame) -> None:
        self.df = df.reset_index(drop=True)

    def __len__(self) -> int:
        return int(len(self.df))

    def __getitem__(self, i: int) -> Tuple[str, Optional[int]]:
        row = self.df.iloc[i]
        has_label = 'label' in row and not pd.isna(row['label'])
        y: Optional[int] = int(row['label']) if has_label else None
        return str(row['text']), y

def collate_hf(
    batch: List[Tuple[str, Optional[int]]],
    tokenizer: Any,
    max_len: int
) -> Tuple[Dict[str, torch.Tensor], Optional[torch.Tensor]]:
    """Tokenise a batch of ``(text, label)`` pairs.

    Returns the tokenizer output and a long tensor of labels; the labels are
    ``None`` unless every sample in the batch has one.
    """
    texts = [text for text, _ in batch]
    enc = tokenize_batch(tokenizer, texts, max_len)
    targets = [label for _, label in batch]
    if any(label is None for label in targets):
        return enc, None
    return enc, torch.tensor([int(label) for label in targets],
                             dtype=torch.long)

def train_transformer(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    model_name: str = 'roberta-base',
    epochs: int = 2,
    bs: int = 16,
    lr: float = 2e-5,
    max_len: int = 512,
    name: str = 'roberta'
) -> Tuple[nn.Module, Any, float]:
    """Fine-tune a Hugging Face sequence classifier with time guards.

    Args:
        train_df, val_df: Frames with ``text`` and ``label`` columns.
        model_name: Checkpoint passed to ``from_pretrained``.
        epochs: Maximum number of epochs.
        bs: Batch size.
        lr: AdamW learning rate (linear schedule, 10% warm-up).
        max_len: Truncation length in tokens.
        name: Output stem; model and tokenizer are saved under
            ``./weights/<name>``. The argument used to be ignored in favour
            of a fixed directory; the default keeps the old path.

    Returns:
        ``(model, tokenizer, best)`` where ``best`` is the highest validation
        macro-F1 over the epochs run. The saved weights are those of the
        last epoch.
    """
    start = time.time()
    early_check = 30 * 60
    time_budget = 3 * 60 * 60

    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    ).to(DEVICE)

    tr_ds = HFClsDataset(train_df)
    va_ds = HFClsDataset(val_df)
    # Tokenise per batch so padding only extends to the longest sample.
    coll = lambda b: collate_hf(b, tok, max_len)
    tr_dl = DataLoader(tr_ds, batch_size=bs, shuffle=True,
                       collate_fn=coll)
    va_dl = DataLoader(va_ds, batch_size=bs, shuffle=False,
                       collate_fn=coll)

    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    steps = max(1, len(tr_dl) * epochs)
    sch = get_linear_schedule_with_warmup(opt, int(0.1 * steps), steps)
    loss_fn = nn.CrossEntropyLoss()
    best = -1.0
    hist: List[Tuple[int, float]] = []

    for ep in range(1, epochs + 1):
        # Budget is checked once per epoch, before training starts.
        if time.time() - start > time_budget:
            print('[STOP] Transformer exceeded 3h.')
            break

        model.train()
        tot = 0.0
        for enc, y in tr_dl:
            enc = {k: v.to(DEVICE) for k, v in enc.items()}
            assert y is not None
            y = y.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            out = model(**enc)
            loss = loss_fn(out.logits, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            sch.step()
            tot += float(loss.item())

        model.eval()
        y_t, y_p = [], []
        with torch.no_grad():
            for enc, y in va_dl:
                enc = {k: v.to(DEVICE) for k, v in enc.items()}
                p = model(**enc).logits.argmax(1).cpu().numpy()
                if y is not None:
                    y_t.extend(list(y.numpy()))
                    y_p.extend(list(p))
        f1 = f1_macro(np.array(y_t), np.array(y_p))
        hist.append((ep, f1))
        print(f"Epoch {ep} | val F1: {f1:.4f} | loss: {tot:.2f}")
        best = max(best, f1)

        # Drop models that are still weak after 30 minutes.
        if time.time() - start > early_check and best < 0.65:
            print('[DROP] Transformer <0.65 F1 after 30 min. Aborting.')
            break

    model.save_pretrained(f'./weights/{name}')
    tok.save_pretrained(f'./weights/{name}')
    if hist:
        epc = [e for e, _ in hist]
        f1s = [v for _, v in hist]
        plt.figure(figsize=(5, 3))
        sns.lineplot(x=epc, y=f1s)
        plt.title('Learning curve: RoBERTa')
        plt.tight_layout()
        plt.show()
    return model, tok, best


## 10. Build Features, CV, and Run Experiments

In [None]:
X_train_s, X_train_d = build_all_features(train_df)
X_val_s, X_val_d = build_all_features(val_df)
# build_dense_features only emits a pos_*/dep_* column for tags that occur in
# the frame it is given, so the validation frame can have different columns
# (and a different order) than the training frame. Align it to the training
# layout so both matrices have the same width and every column means the same
# thing; tags never seen in validation become 0.0, tags never seen in
# training are dropped.
X_val_d = X_val_d.reindex(columns=X_train_d.columns, fill_value=0.0)
y_train = train_df['label'].values
y_val = val_df['label'].values

def cv_linear(loss: str, tag: str) -> float:
    """5-fold CV of the linear model on the training features.

    Uses the module-level ``X_train_s``, ``X_train_d`` and ``y_train``.
    ``loss`` is forwarded to ``train_linear_model``; every fold trains under
    the name ``<tag>_cv``. Returns the mean of the per-fold best macro-F1.
    """
    scores: List[float] = []
    folds = kfold_indices(X_train_s.shape[0], 5, RANDOM_SEED)
    for i, (tr, va) in enumerate(folds):
        dense_tr = X_train_d.iloc[tr].to_numpy()
        dense_va = X_train_d.iloc[va].to_numpy()
        _, f1 = train_linear_model(
            X_train_s[tr], dense_tr, y_train[tr],
            X_train_s[va], dense_va, y_train[va],
            loss=loss, epochs=4, bs=256, lr=1e-2, name=f'{tag}_cv'
        )
        scores.append(f1)
        print(f"Fold {i+1}/5 F1: {f1:.4f}")
    return float(np.mean(scores))

def cv_knn() -> float:
    """5-fold CV of the random-projection kNN on the training features.

    Uses the module-level ``X_train_s``, ``X_train_d`` and ``y_train``. Each
    fold stacks sparse and dense features, projects them to 512 dimensions
    with seed 13 and classifies with k=11. Returns the mean macro-F1.
    """
    def project(rows: np.ndarray) -> np.ndarray:
        # Sparse block + dense block for the selected rows, then projection.
        stacked = sp.hstack(
            [X_train_s[rows], sp.csr_matrix(X_train_d.iloc[rows].to_numpy())],
            format='csr')
        return random_projection(stacked, 512, 13)

    scores: List[float] = []
    folds = kfold_indices(X_train_s.shape[0], 5, RANDOM_SEED)
    for i, (tr, va) in enumerate(folds):
        A = project(tr)
        B = project(va)
        pred = knn_predict(A, y_train[tr], B, k=11)
        f1 = f1_macro(y_train[va], pred)
        scores.append(f1)
        print(f"Fold {i+1}/5 F1: {f1:.4f}")
    return float(np.mean(scores))

# --- Cross-validated scores for the classical models (training split only) ---
# NOTE(review): `results` is filled here but the summary table below is built
# from the individual variables instead — confirm whether it is still needed.
results: List[Dict[str, Any]] = []
cv_lr = cv_linear('log', 'logreg')
results.append({'model': 'LogReg', 'cv_f1': cv_lr})
cv_svm = cv_linear('hinge', 'linsvm')
results.append({'model': 'LinSVM', 'cv_f1': cv_svm})
cv_kn = cv_knn()
results.append({'model': 'KNN(Proj)', 'cv_f1': cv_kn})

# Fit on full training and evaluate on held-out val
m_lr, f1_lr = train_linear_model(
    X_train_s, X_train_d.to_numpy(), y_train,
    X_val_s, X_val_d.to_numpy(), y_val,
    loss='log', epochs=6, bs=256, lr=1e-2, name='logreg'
)
m_svm, f1_svm = train_linear_model(
    X_train_s, X_train_d.to_numpy(), y_train,
    X_val_s, X_val_d.to_numpy(), y_val,
    loss='hinge', epochs=6, bs=256, lr=1e-2, name='linsvm'
)
# kNN: project train and val with the same seed (13) so both share one basis.
A_tr = random_projection(sp.hstack([X_train_s,
                                    sp.csr_matrix(X_train_d.to_numpy())],
                                   format='csr'), 512, 13)
A_va = random_projection(sp.hstack([X_val_s,
                                    sp.csr_matrix(X_val_d.to_numpy())],
                                   format='csr'), 512, 13)
pred_kn = knn_predict(A_tr, y_train, A_va, k=11)
f1_kn = f1_macro(y_val, pred_kn)

bst, f1_xg = train_xgboost(
    X_train_s, X_train_d.to_numpy(), y_train,
    X_val_s, X_val_d.to_numpy(), y_val,
    name='xgboost', rounds=200
)

# Neural text models work on the raw frames and use their default settings.
mh, f1_hash = train_hash_model(train_df, val_df)
mc, f1_char = train_char_cnn(train_df, val_df)
mt, tok, f1_tr = train_transformer(train_df, val_df)

# Summary table: CV score where available (NaN otherwise) next to val score.
res = pd.DataFrame([
    {'model': 'LogReg', 'cv_f1': cv_lr, 'val_f1': f1_lr},
    {'model': 'LinSVM', 'cv_f1': cv_svm, 'val_f1': f1_svm},
    {'model': 'KNN(Proj)', 'cv_f1': cv_kn, 'val_f1': f1_kn},
    {'model': 'XGBoost', 'cv_f1': np.nan, 'val_f1': f1_xg},
    {'model': 'EmbBag', 'cv_f1': np.nan, 'val_f1': f1_hash},
    {'model': 'CharCNN', 'cv_f1': np.nan, 'val_f1': f1_char},
    {'model': 'RoBERTa', 'cv_f1': np.nan, 'val_f1': f1_tr},
])
display(res)
# Grouped bars: one bar per (model, split) pair.
plt.figure(figsize=(9, 4))
sns.barplot(data=res.melt(id_vars='model', value_vars=['cv_f1','val_f1'],
                         var_name='split', value_name='f1'),
            x='model', y='f1', hue='split')
plt.xticks(rotation=20); plt.title('Model comparison (macro-F1)')
plt.tight_layout(); plt.show()


## 11. Error Analysis with More Visuals

In [None]:
# Row of the summary table with the highest held-out validation macro-F1.
best_row = res.sort_values('val_f1', ascending=False).iloc[0]
print('Best model:', best_row['model'])

def confusion(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Build the 2x2 confusion matrix for binary labels.

    Args:
        y_true: Ground-truth labels; each value is cast with ``int``.
        y_pred: Predicted labels, paired element-wise with ``y_true``.

    Returns:
        Integer array of shape (2, 2) where entry ``[t, p]`` counts the
        samples whose true label is ``t`` and predicted label is ``p``.
    """
    pairs = [(int(t), int(p)) for t, p in zip(y_true, y_pred)]
    counts = np.zeros((2, 2), dtype=int)
    if pairs:
        true_idx, pred_idx = zip(*pairs)
        # Unbuffered scatter-add so repeated (t, p) pairs accumulate.
        np.add.at(counts, (list(true_idx), list(pred_idx)), 1)
    return counts

def preds_from_best() -> np.ndarray:
    """Predict validation labels with the model ranked best in ``res``.

    Dispatches on ``best_row['model']``. Any name that is not matched
    explicitly falls through to the final (RoBERTa) branch. All torch
    inference is run in eval mode under ``torch.no_grad()``; the argmax is
    taken on the tensor before it is converted to numpy, because calling
    ``.numpy()`` on an output that still requires grad raises a
    ``RuntimeError``.

    Returns:
        1-D array with one predicted label per validation row, in
        validation order (every loader uses ``shuffle=False``).
    """
    best = best_row['model']

    def _frozen(model):
        # Switch off dropout / batch-norm updates. Guarded with hasattr
        # because the trainers live elsewhere in the notebook and are only
        # presumed to return nn.Module instances.
        if hasattr(model, 'eval'):
            model.eval()
        return model

    def _argmax_over(batches: Iterable[Any],
                     forward: Callable[[Any], torch.Tensor]) -> np.ndarray:
        # Run `forward` on every batch without autograd and collect the
        # per-row argmax as plain integers.
        labels: List[int] = []
        with torch.no_grad():
            for batch in batches:
                labels.extend(forward(batch).argmax(1).cpu().tolist())
        return np.array(labels, dtype=np.int64)

    if best in ('LogReg', 'LinSVM'):
        linear = _frozen(m_lr if best == 'LogReg' else m_svm)
        V = sp.hstack([X_val_s, sp.csr_matrix(X_val_d.to_numpy())],
                      format='csr')
        # Densify a slab of rows at a time rather than the whole sparse
        # matrix, which keeps peak memory bounded.
        dense_rows = 1024
        row_blocks = (
            torch.tensor(V[i:i + dense_rows].toarray(),
                         dtype=torch.float32, device=DEVICE)
            for i in range(0, V.shape[0], dense_rows)
        )
        return _argmax_over(row_blocks, linear)

    if best == 'KNN(Proj)':
        # Already computed when the kNN baseline was scored.
        return pred_kn

    if best == 'XGBoost':
        dva = xgb.DMatrix(sp.hstack([X_val_s,
                                     sp.csr_matrix(X_val_d.to_numpy())],
                                    format='csr'))
        # NOTE(review): the 0.5 cut-off presumes the booster emits
        # probabilities (e.g. binary:logistic) -- confirm in train_xgboost.
        return (bst.predict(dva) >= 0.5).astype(int)

    if best == 'EmbBag':
        _frozen(mh)
        # NOTE(review): these hashing arguments presumably have to match
        # the ones used inside train_hash_model -- verify.
        ds = HashedDataset(val_df, 2**19, 13, (1, 2), (3, 5))
        dl = DataLoader(ds, batch_size=128, shuffle=False,
                        collate_fn=collate_hashed)
        # Each batch is (token ids, bag offsets, labels).
        return _argmax_over(
            dl, lambda b: mh(b[0].to(DEVICE), b[1].to(DEVICE)))

    if best == 'CharCNN':
        _frozen(mc)
        enc = lambda s: mc.encode(s, 1024)
        dl = DataLoader(CharDataset(val_df, enc), batch_size=128,
                        shuffle=False)
        # Each batch is (encoded text, labels).
        return _argmax_over(dl, lambda b: mc(b[0].to(DEVICE)))

    # Fallback: transformer classifier.
    _frozen(mt)
    ds = HFClsDataset(val_df)
    coll = lambda b: collate_hf(b, tok, 512)
    dl = DataLoader(ds, batch_size=32, shuffle=False, collate_fn=coll)
    # Each batch is (tokenizer encoding dict, labels).
    return _argmax_over(
        dl,
        lambda b: mt(**{k: v.to(DEVICE) for k, v in b[0].items()}).logits,
    )

# Validation predictions of the best model and their confusion matrix.
y_hat = preds_from_best()
cm = confusion(y_val, y_hat)

plt.figure(figsize=(4, 3))
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cbar=False)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title('Confusion (val)')
plt.tight_layout()
plt.show()

# Dense validation features with the prediction and gold label attached;
# `assign` returns a new frame, so `X_val_d` itself is left untouched.
val_dense = X_val_d.assign(pred=y_hat, label=y_val)

plt.figure(figsize=(7, 4))
ttr_ax = sns.kdeplot(data=val_dense, x='ttr', hue='label')
ttr_ax.set_title('TTR by label')
plt.tight_layout()
plt.show()

plt.figure(figsize=(7, 4))
flesch_ax = sns.violinplot(data=val_dense, x='pred', y='read_flesch')
flesch_ax.set_title('Flesch vs prediction')
plt.tight_layout()
plt.show()

# Misclassified validation rows, first dozen shown for manual inspection.
err = (
    val_df
    .assign(pred=y_hat)
    .loc[lambda d: d['label'] != d['pred']]
    .copy()
)
display(err[['id','label','pred','text']].head(12))


## 12. Final Train on Train+Val and Save Submission

In [None]:
# Merge train and validation rows (fresh 0..n-1 index) for the final refit,
# then build the sparse/dense feature pairs for the merged set and for test.
full_df = pd.concat([train_df, val_df], ignore_index=True)
(X_full_s, X_full_d) = build_all_features(full_df)
(X_test_s, X_test_d) = build_all_features(test_df)
y_full = full_df['label'].values

def fit_and_predict(best: str) -> pd.DataFrame:
    """Refit the chosen model family on train+val and label the test set.

    Args:
        best: Model name as it appears in the ``model`` column of ``res``
            ('LogReg', 'LinSVM', 'KNN(Proj)', 'XGBoost', 'EmbBag',
            'CharCNN'); any other value selects the transformer branch.

    Returns:
        DataFrame with columns ``id`` (taken from ``test_df``) and
        ``label`` (the predicted class per test row).

    Side effects:
        The EmbBag and CharCNN branches write a state dict under
        ``./weights/``; the transformer branch writes model and tokenizer
        to ``./weights/roberta_full``. Weights are saved before predicting
        so that a failure during inference does not discard the training.
    """
    # torch.save does not create missing parent directories.
    os.makedirs('./weights', exist_ok=True)

    def _stack(X_s, X_d):
        # Sparse block followed by the dense feature block, as CSR.
        return sp.hstack([X_s, sp.csr_matrix(X_d.to_numpy())], format='csr')

    def _submission(labels) -> pd.DataFrame:
        return pd.DataFrame({'id': test_df['id'], 'label': labels})

    def _frozen(model):
        # Switch off dropout / batch-norm updates. Guarded with hasattr
        # because the trainers are only presumed to return nn.Modules.
        if hasattr(model, 'eval'):
            model.eval()
        return model

    def _argmax_over(batches: Iterable[Any],
                     forward: Callable[[Any], torch.Tensor]) -> np.ndarray:
        # Run `forward` on every batch without autograd and collect the
        # per-row argmax as plain integers.
        labels: List[int] = []
        with torch.no_grad():
            for batch in batches:
                labels.extend(forward(batch).argmax(1).cpu().tolist())
        return np.array(labels, dtype=np.int64)

    if best in ('LogReg', 'LinSVM'):
        if best == 'LogReg':
            loss, tag = 'log', 'logreg_full'
        else:
            loss, tag = 'hinge', 'linsvm_full'
        # A single validation row is passed only to satisfy the trainer's
        # signature; validation rows are already part of the training set.
        m, _ = train_linear_model(
            X_full_s, X_full_d.to_numpy(), y_full,
            X_val_s[:1], X_val_d[:1].to_numpy(), y_val[:1],
            loss=loss, epochs=6, bs=256, lr=1e-2, name=tag
        )
        _frozen(m)
        V = _stack(X_test_s, X_test_d)
        # Densify a slab of rows at a time rather than the whole matrix.
        dense_rows = 1024
        row_blocks = (
            torch.tensor(V[i:i + dense_rows].toarray(),
                         dtype=torch.float32, device=DEVICE)
            for i in range(0, V.shape[0], dense_rows)
        )
        return _submission(_argmax_over(row_blocks, m))

    if best == 'KNN(Proj)':
        # Same projection arguments for both matrices so that train and
        # test land in the same projected space.
        A = random_projection(_stack(X_full_s, X_full_d), 512, 13)
        B = random_projection(_stack(X_test_s, X_test_d), 512, 13)
        return _submission(knn_predict(A, y_full, B, k=11))

    if best == 'XGBoost':
        bst, _ = train_xgboost(
            X_full_s, X_full_d.to_numpy(), y_full,
            X_val_s[:1], X_val_d[:1].to_numpy(), y_val[:1],
            name='xgboost_full', rounds=250
        )
        dt = xgb.DMatrix(_stack(X_test_s, X_test_d))
        # NOTE(review): the 0.5 cut-off presumes probability outputs --
        # confirm the objective configured in train_xgboost.
        return _submission((bst.predict(dt) >= 0.5).astype(int))

    if best == 'EmbBag':
        # NOTE(review): the deep trainers receive an EMPTY validation
        # frame here (the linear ones get one row) -- confirm they
        # tolerate it.
        mh_full, _ = train_hash_model(full_df, val_df.iloc[:0])
        torch.save(mh_full.state_dict(), './weights/embbag_full.pt')
        _frozen(mh_full)
        ds = HashedDataset(test_df, 2**19, 13, (1, 2), (3, 5))
        dl = DataLoader(ds, batch_size=128, shuffle=False,
                        collate_fn=collate_hashed)
        # Each batch is (token ids, bag offsets, labels).
        return _submission(_argmax_over(
            dl, lambda b: mh_full(b[0].to(DEVICE), b[1].to(DEVICE))))

    if best == 'CharCNN':
        mc_full, _ = train_char_cnn(full_df, val_df.iloc[:0])
        torch.save(mc_full.state_dict(), './weights/charcnn_full.pt')
        _frozen(mc_full)
        enc = lambda s: mc_full.encode(s, 1024)
        dl = DataLoader(CharDataset(test_df, enc), batch_size=128,
                        shuffle=False)
        # Each batch is (encoded text, labels).
        return _submission(
            _argmax_over(dl, lambda b: mc_full(b[0].to(DEVICE))))

    # RoBERTa (also the fallback for any unrecognised name)
    mt_full, tok, _ = train_transformer(full_df, val_df.iloc[:0])
    mt_full.save_pretrained('./weights/roberta_full')
    tok.save_pretrained('./weights/roberta_full')
    _frozen(mt_full)
    ds = HFClsDataset(test_df)
    coll = lambda b: collate_hf(b, tok, 512)
    dl = DataLoader(ds, batch_size=32, shuffle=False, collate_fn=coll)
    # Each batch is (tokenizer encoding dict, labels).
    return _submission(_argmax_over(
        dl,
        lambda b: mt_full(
            **{k: v.to(DEVICE) for k, v in b[0].items()}).logits,
    ))

# Pick the model with the highest validation macro-F1, refit it on
# train+val, and write its test predictions to disk.
SUB_PATH = './submission.csv'
res_ranked = res.sort_values('val_f1', ascending=False)
best_name = str(res_ranked['model'].iloc[0])
sub_df = fit_and_predict(best_name)
sub_df.to_csv(SUB_PATH, index=False)
print('Saved submission:', SUB_PATH)
display(sub_df.head())


## 13. Save Results

In [None]:
# Persist the model comparison table (without the row index) next to the
# notebook so the scores can be inspected without re-running training.
res.to_csv(path_or_buf='./model_results.csv', index=False)
print('Saved results to ./model_results.csv')
