# 🧪 Stability & Statistical Significance Test (TabM)

This notebook performs a **5-seed evaluation** of the TabM model to ensure that reported performance metrics (AUC, PR-AUC, etc.) are statistically robust and not a result of "seed luck."

In [1]:
import os
import time

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score, f1_score, brier_score_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Setup ---
# Filesystem locations, written relative to the notebooks directory.
DATA_PATH = "../data/raw/online_shoppers_intention.csv"
MODELS_PATH = "../backend/models"

# Device the model is placed on.
DEVICE = "cpu"

print(f"Using device: {DEVICE}")

Using device: cpu


## 1. Model Architecture

Re-defining the **TabM (K=4)** architecture for the stability test.

In [2]:
class FeatureEmbedding(nn.Module):
    """Embed each categorical column with its own lookup table.

    Args:
        cat_dims: iterable giving the number of embedding rows for each
            categorical column, in column order.
        embed_dim: size of the embedding vector produced per column.
    """

    def __init__(self, cat_dims, embed_dim):
        super().__init__()
        # One table per column, created in column order.
        tables = []
        for cardinality in cat_dims:
            tables.append(nn.Embedding(cardinality, embed_dim))
        self.embeddings = nn.ModuleList(tables)

    def forward(self, x_cat):
        """Return the per-column embeddings stacked along dim 1.

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th table.
        """
        per_column = []
        for col, table in enumerate(self.embeddings):
            per_column.append(table(x_cat[:, col]))
        return torch.stack(per_column, dim=1)

class TabM_K4(nn.Module):
    """Ensemble of independent MLP members whose logits are averaged.

    Categorical columns are embedded, flattened, concatenated with the numeric
    features, batch-normalised, and fed to every member. Each member has its
    own two-layer block and its own single-logit head.

    Args:
        cat_dims: number of embedding rows per categorical column.
        num_dim: number of numeric features.
        hidden_dim: width of both hidden layers in every member.
        n_ensemble: number of ensemble members.
        embed_dim: embedding size per categorical column. Defaults to 4, the
            value that was previously hard-coded in two places.
        dropout: dropout probability after the first hidden layer. Defaults
            to 0.1, the previously hard-coded value.
    """

    def __init__(self, cat_dims, num_dim, hidden_dim=128, n_ensemble=4, embed_dim=4, dropout=0.1):
        super().__init__()
        self.embedding = FeatureEmbedding(cat_dims, embed_dim=embed_dim)
        # Flattened embeddings contribute embed_dim values per categorical column.
        input_dim = (len(cat_dims) * embed_dim) + num_dim
        self.bn_in = nn.BatchNorm1d(input_dim)
        # Layer order inside each Sequential is unchanged so parameter names
        # (and therefore any saved state dicts) stay the same.
        self.ensemble_blocks = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.GELU(), nn.Dropout(dropout),
                nn.Linear(hidden_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.GELU()
            ) for _ in range(n_ensemble)
        ])
        self.heads = nn.ModuleList([nn.Linear(hidden_dim, 1) for _ in range(n_ensemble)])

    def forward(self, x_cat, x_num):
        """Return the member-averaged logit for each row, shape ``(batch, 1)``."""
        emb_cat = self.embedding(x_cat).flatten(1)
        x_in = self.bn_in(torch.cat([emb_cat, x_num], dim=1))
        outputs = [head(block(x_in)) for block, head in zip(self.ensemble_blocks, self.heads)]
        # Stack members on a new leading axis, then average over it.
        return torch.stack(outputs, dim=0).mean(dim=0)

## 2. Data Preparation

Loading and preprocessing the Shopper Intention dataset.

In [3]:
# Resolve the dataset location: first the configured path, then the
# root-relative fallback for when the notebook is run from the repository root.
_data_candidates = [DATA_PATH, "data/raw/online_shoppers_intention.csv"]
for _candidate in _data_candidates:
    if os.path.exists(_candidate):
        DATA_PATH = _candidate
        break
else:
    # Fail with every location that was tried rather than only the last one.
    raise FileNotFoundError(f"Dataset not found; tried: {_data_candidates}")

df = pd.read_csv(DATA_PATH)
df['Revenue'] = df['Revenue'].astype(int)
df['Weekend'] = df['Weekend'].astype(int)

cat_cols = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']
num_cols = ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 
            'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

# Pre-train setup
# Integer-encode every categorical column; the number of distinct codes per
# column is what sizes the embedding tables.
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])
cat_dims = [df[col].nunique() for col in cat_cols]

# NOTE(review): the scaler is fit on all rows, before train_eval_seed draws its
# per-seed train/test split, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

X_cat = torch.tensor(df[cat_cols].values, dtype=torch.long)
X_num = torch.tensor(df[num_cols].values, dtype=torch.float32)
y = torch.tensor(df['Revenue'].values, dtype=torch.float32)

print(f"Data loaded. Shape: {df.shape}")

Data loaded. Shape: (12330, 18)


## 3. Evaluation Loop

Training and evaluating the model once per seed, each on its own stratified 90/10 train/test split, then reporting the mean and standard deviation of every metric across seeds.

In [4]:
def train_eval_seed(seed, n_epochs=30):
    """Train a fresh TabM_K4 on one seeded split and score it on the held-out rows.

    The seed drives both the stratified train/test split and torch's random
    number generator, so weight initialisation and dropout are repeatable for
    a given seed.

    Args:
        seed: integer seed for the split and for torch.
        n_epochs: number of full-batch optimisation steps (default 30).

    Returns:
        dict with keys 'AUC', 'PR-AUC', 'Recall', 'F1' and 'Brier' computed on
        the test split; Recall and F1 use a 0.5 probability threshold.
    """
    print(f"‚ö° Processing Seed {seed}...")

    # Without this only the split depended on `seed`; model init and dropout
    # drew from whatever state the global torch RNG happened to be in.
    torch.manual_seed(seed)

    labels_np = y.cpu().numpy()
    indices = np.arange(len(labels_np))
    train_idx, test_idx = train_test_split(indices, test_size=0.1, stratify=labels_np, random_state=seed)

    model = TabM_K4(cat_dims, len(num_cols)).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()

    # The training slice is identical every epoch: build it once and place it
    # on the same device as the model.
    train_cat = X_cat[train_idx].to(DEVICE)
    train_num = X_num[train_idx].to(DEVICE)
    train_y = y[train_idx].to(DEVICE)

    model.train()
    for _ in range(n_epochs):
        optimizer.zero_grad()
        # squeeze(-1) drops only the logit axis, never a batch axis of size 1.
        logits = model(train_cat, train_num).squeeze(-1)
        loss = criterion(logits, train_y)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        test_logits = model(X_cat[test_idx].to(DEVICE), X_num[test_idx].to(DEVICE)).squeeze(-1)
        probs = torch.sigmoid(test_logits).cpu().numpy()
    preds = (probs > 0.5).astype(int)

    y_true = labels_np[test_idx]
    return {
        'AUC': roc_auc_score(y_true, probs),
        'PR-AUC': average_precision_score(y_true, probs),
        'Recall': recall_score(y_true, preds),
        'F1': f1_score(y_true, preds),
        'Brier': brier_score_loss(y_true, probs)
    }

# Seeds for the stability run; each one gets its own split and a fresh model.
seeds = [1, 7, 21, 42, 99]

# Collect one metrics dict per seed, reporting wall-clock time as we go.
results = []
for s in seeds:
    start_time = time.time()
    res = train_eval_seed(s)
    results.append(res)
    print(f"   Done in {time.time()-start_time:.1f}s | AUC: {res['AUC']:.4f}")

# One row per seed, one column per metric.
res_df = pd.DataFrame(results)

# Transpose so each metric is a row with 'mean' and 'std' columns.
summary = res_df.agg(['mean', 'std']).transpose()

print("\nüèÜ STABILITY SUMMARY (TabM)")
display(summary)

‚ö° Processing Seed 1...
   Done in 85.5s | AUC: 0.9027
‚ö° Processing Seed 7...
   Done in 27.7s | AUC: 0.9052
‚ö° Processing Seed 21...
   Done in 29.3s | AUC: 0.9017
‚ö° Processing Seed 42...
   Done in 28.6s | AUC: 0.8980
‚ö° Processing Seed 99...
   Done in 29.7s | AUC: 0.9021

üèÜ STABILITY SUMMARY (TabM)


Unnamed: 0,mean,std
AUC,0.901925,0.002582
PR-AUC,0.649302,0.020397
Recall,0.525654,0.038044
F1,0.587971,0.022722
Brier,0.101404,0.002343


## 4. Save results

In [5]:
# DATA_PATH has the layout <root>/data/raw/<file>, so three dirname() calls
# recover the project root that was actually used to load the data. Writing
# under that root keeps the report inside the repository whether the notebook
# was run from the notebooks directory ("..") or from the repository root ("").
project_root = os.path.dirname(os.path.dirname(os.path.dirname(DATA_PATH)))
metrics_dir = os.path.join(project_root, "reports", "metrics")

os.makedirs(metrics_dir, exist_ok=True)
summary.to_json(os.path.join(metrics_dir, "stability_test.json"))
print("Results saved to reports/metrics/stability_test.json")

Results saved to reports/metrics/stability_test.json
