# Phase 2: TabM Deep Learning Model Training

This notebook implements the training pipeline for a **TabM**-style model. TabM is a recent MLP-based architecture designed specifically for tabular data: it produces multiple predictions per sample with an efficient ensemble of parallel MLPs and combines them into a single output.

### 🧠 Model Overview:
1. **Ensemble Architecture**: Runs several parallel MLP blocks, each with its own output head, behind a shared input-normalization layer, and averages their outputs to provide robust predictions.
2. **PyTorch Implementation**: Leveraging GPU acceleration (if available) and the AdamW optimizer.
3. **Comparison Against Baselines**: Directly benchmarks the results against the Random Forest and XGBoost models from Phase 1.

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from datetime import datetime

# --- Plot styling -----------------------------------------------------------
PRIMARY_COLOR = '#1E4FA8'
SECONDARY_COLOR = '#E8F0FF'
plt.style.use('seaborn-v0_8-whitegrid')

# --- Compute device: use the GPU whenever PyTorch reports one ----------------
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# --- Input / output locations (relative paths) -------------------------------
DATA_PATH = "../data/raw/online_shoppers_intention.csv"
MODELS_PATH = "../backend/models"
REPORTS_PATH = "../reports/metrics"

# Create the output folders up front so the save steps later in the notebook
# do not fail on a missing directory.
for _out_dir in (MODELS_PATH, REPORTS_PATH):
    os.makedirs(_out_dir, exist_ok=True)

print(f"‚úÖ Environment Ready! Using device: {DEVICE}")

## 1. Advanced Data Loading
We implement the same **80/10/10 split** used in the baseline to ensure a fair comparison. The data is then converted into PyTorch tensors.

In [None]:
# Read the raw sessions table from disk.
df = pd.read_csv(DATA_PATH)

# 1. Features Handling: split the target ('Revenue') from the predictors
y = df['Revenue'].astype(int)
X = df.drop('Revenue', axis=1)

categorical_cols = ['Month', 'VisitorType', 'Weekend']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Integer-encode each categorical column (values are stringified first);
# the fitted encoders are kept, keyed by column name.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    X[col] = le.fit_transform(X[col].astype(str))

# 2. 80/10/10 Split Strategy: hold out 20%, then halve the hold-out.
#    Both splits are stratified on the target and use a fixed random_state.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# 3. Scaling: statistics are fitted on the training rows only and then
#    re-used for the test and validation rows.
scaler = StandardScaler()
X_train_scaled, X_test_scaled, X_val_scaled = X_train.copy(), X_test.copy(), X_val.copy()

X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
for _raw_part, _scaled_part in ((X_test, X_test_scaled), (X_val, X_val_scaled)):
    _scaled_part[numerical_cols] = scaler.transform(_raw_part[numerical_cols])


# 4. PyTorch Conversion
def _as_float_tensor(table):
    """Return the values of a DataFrame/Series as a FloatTensor placed on DEVICE."""
    return torch.FloatTensor(table.values).to(DEVICE)


X_train_t, X_val_t, X_test_t = (
    _as_float_tensor(part) for part in (X_train_scaled, X_val_scaled, X_test_scaled)
)
y_train_t, y_val_t, y_test_t = (
    _as_float_tensor(part) for part in (y_train, y_val, y_test)
)

print(f"‚úÖ Data Split Complete!")
print(f"   ‚Üí Training:   {len(X_train_t)}")
print(f"   ‚Üí Validation: {len(X_val_t)}")
print(f"   ‚Üí Testing:    {len(X_test_t)}")

## 2. Model Architecture: TabM
This TabM-style model consists of an ensemble of MLP blocks. Every block has its own weights and output head, all blocks share a single input normalization layer, and their logits are averaged into one prediction.

In [None]:
class MLPBlock(nn.Module):
    """Two-stage perceptron block: (Linear -> BatchNorm1d -> GELU) applied twice,
    with dropout between the two stages.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Output size of both linear layers.
        dropout: Dropout probability used after the first activation.
    """

    def __init__(self, in_features, hidden_dim, dropout=0.1):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        """Run the block on `x` and return the hidden representation."""
        # Stage 1: project, normalize, activate, then regularize.
        hidden = self.fc1(x)
        hidden = self.bn1(hidden)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        # Stage 2: project, normalize, activate (no dropout afterwards).
        hidden = self.fc2(hidden)
        hidden = self.bn2(hidden)
        return self.activation(hidden)

class TabMModel(nn.Module):
    """Ensemble of `MLPBlock` members whose per-member outputs are averaged.

    The input first passes through one shared BatchNorm1d layer; each ensemble
    member then applies its own `MLPBlock` followed by its own single-output
    linear head.

    Args:
        n_features: Number of input features.
        hidden_dim: Hidden width of every ensemble member.
        n_ensemble: Number of parallel members.
        dropout: Dropout probability forwarded to every member.
    """

    def __init__(self, n_features, hidden_dim=128, n_ensemble=8, dropout=0.1):
        super().__init__()
        self.input_bn = nn.BatchNorm1d(n_features)

        # All blocks are created before any head, keeping the parameter
        # initialisation order (and therefore seeded results) unchanged.
        member_blocks = []
        for _ in range(n_ensemble):
            member_blocks.append(MLPBlock(n_features, hidden_dim, dropout))
        self.ensemble_blocks = nn.ModuleList(member_blocks)

        member_heads = []
        for _ in range(n_ensemble):
            member_heads.append(nn.Linear(hidden_dim, 1))
        self.output_heads = nn.ModuleList(member_heads)

    def forward(self, x):
        """Return the mean of the members' outputs with the trailing size-1 dim removed."""
        normalized = self.input_bn(x)

        member_outputs = []
        for block, head in zip(self.ensemble_blocks, self.output_heads):
            member_outputs.append(head(block(normalized)))

        # Stack along a new leading (member) axis, average over it, and drop
        # the last dimension produced by the single-output heads.
        averaged = torch.stack(member_outputs, dim=0).mean(dim=0)
        return averaged.squeeze(-1)

print("‚úÖ Architecture Defined!")

## 3. Training Loop
Applying early stopping and cosine annealing for optimal convergence.

In [None]:
# Hyper-parameters for this run. The dict is stored inside the checkpoint later,
# so the seed is recorded together with the other settings.
config = {
    'hidden_dim': 128,
    'n_ensemble': 4,
    'dropout': 0.15,
    'lr': 0.001,
    'epochs': 100,
    'batch_size': 256,
    'patience': 15,
    'seed': 42
}

# Seed inside this cell so that re-running it reproduces the same weight
# initialisation and the same DataLoader shuffling order.
torch.manual_seed(config['seed'])

model = TabMModel(X_train_t.shape[1], config['hidden_dim'], config['n_ensemble'], config['dropout']).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=0.01)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['epochs'])

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=config['batch_size'], shuffle=True)

history = {'train_loss': [], 'val_auc': []}
best_val_auc = 0
best_weights = None
patience_counter = 0

# Validation labels never change, so move them to NumPy once instead of every epoch.
y_val_np = y_val_t.cpu().numpy()

for epoch in range(config['epochs']):
    # Training Mode
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # One scheduler step per epoch (T_max is expressed in epochs).
    scheduler.step()

    # Validation Mode
    model.eval()
    with torch.no_grad():
        val_preds = torch.sigmoid(model(X_val_t)).cpu().numpy()
    val_auc = roc_auc_score(y_val_np, val_preds)

    history['train_loss'].append(epoch_loss / len(train_loader))
    history['val_auc'].append(val_auc)

    if val_auc > best_val_auc:
        best_val_auc = val_auc
        # state_dict() hands back references to the live parameter tensors and
        # dict.copy() is shallow, so a plain copy would keep changing as the
        # optimizer updates the model in place. Clone every tensor to freeze a
        # real snapshot of the best epoch.
        best_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        patience_counter = 0
    else:
        patience_counter += 1

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:03d} | Loss: {epoch_loss/len(train_loader):.4f} | Val AUC: {val_auc:.4f}")

    if patience_counter >= config['patience']:
        print(f"‚èπÔ∏è Early stopping at epoch {epoch+1}")
        break

# Restore the best snapshot; skipped only if no epoch ever improved on the
# initial AUC of 0 (e.g. the metric was NaN), in which case the final weights stay.
if best_weights is not None:
    model.load_state_dict(best_weights)
print(f"‚≠ê Best Val AUC: {best_val_auc:.4f}")

## 4. Performance Evaluation
Comparing the TabM results with our traditional baseline models.

In [None]:
# Score the test split with the trained model (inference mode, no gradients).
model.eval()
with torch.no_grad():
    test_logits = model(X_test_t)
    test_probs = torch.sigmoid(test_logits).cpu().numpy()
# Hard labels use a 0.5 probability cut-off.
test_preds = (test_probs >= 0.5).astype(int)

# Ground-truth labels as a NumPy array, shared by every metric below.
y_test_np = y_test_t.cpu().numpy()

# AUC is computed from probabilities; the remaining metrics from hard labels.
tabm_metrics = {'model': 'TabM (Deep Learning)'}
tabm_metrics['auc_roc'] = float(roc_auc_score(y_test_np, test_probs))
for metric_name, scorer in (
    ('f1', f1_score),
    ('precision', precision_score),
    ('recall', recall_score),
):
    tabm_metrics[metric_name] = float(scorer(y_test_np, test_preds))

# Visualize Training History: loss on the left axis, validation AUC on the right.
loss_color, auc_color = 'tab:red', 'tab:blue'
fig, ax1 = plt.subplots(figsize=(10, 5))

ax1.plot(history['train_loss'], color=loss_color, label='Train Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss', color=loss_color)
ax1.tick_params(axis='y', labelcolor=loss_color)

ax2 = ax1.twinx()
ax2.plot(history['val_auc'], color=auc_color, label='Val AUC')
ax2.set_ylabel('AUC-ROC', color=auc_color)
ax2.tick_params(axis='y', labelcolor=auc_color)

plt.title('TabM Training History')
plt.show()

## 5. Final Comparison Table
We load the previous baseline results and add TabM to the leaderboard.

In [None]:
# Load the baseline results that were written to the reports folder.
baseline_report = f"{REPORTS_PATH}/baseline_comparison.json"
with open(baseline_report, "r") as f:
    comparison = json.load(f)

# Add the TabM test metrics as one more entry.
comparison.append(tabm_metrics)

# One row per model, best AUC-ROC first.
df_comp = (
    pd.DataFrame(comparison)
    .set_index('model')
    .sort_values('auc_roc', ascending=False)
)

print("üèÜ Leaderboard updated with TabM!")
df_comp

## 6. Archival
Saving the trained weights and the full comparison report.

In [None]:
# Store the weights as CPU tensors so the checkpoint can be loaded on a machine
# without a GPU even when training ran on CUDA.
cpu_state_dict = {
    name: tensor.detach().cpu()
    for name, tensor in model.state_dict().items()
}

torch.save({
    'state_dict': cpu_state_dict,
    'config': config,
    'n_features': X_train_t.shape[1],
    # Column layout the network was trained on, so inference code can build
    # its input in the same order.
    'feature_names': list(X_train_scaled.columns),
    'categorical_cols': list(categorical_cols),
    'numerical_cols': list(numerical_cols)
}, f"{MODELS_PATH}/tabm_model.pt")

# The model only produces meaningful outputs for inputs encoded and scaled
# exactly as during training, so the fitted encoders and scaler are saved
# next to the weights (under a TabM-specific file name).
joblib.dump(
    {'label_encoders': label_encoders, 'scaler': scaler},
    f"{MODELS_PATH}/tabm_preprocessing.joblib"
)

# Leaderboard including the TabM entry.
with open(f"{REPORTS_PATH}/full_model_comparison.json", "w") as f:
    json.dump(comparison, f, indent=4)

print("‚úÖ Model and comparison report saved!")