# Entrenamiento con MLP Residual

Este cuaderno se centra exclusivamente en un MLP residual rápido para datos tabulares preprocesados. Incluye Optuna para tuning, validación estratificada (K=5), métricas por fold y exportación de artefactos.


In [None]:

!pip install -q lightning optuna torchmetrics seaborn matplotlib

In [None]:
import gc
import json
import math
import random
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from lightning.pytorch.loggers import CSVLogger
from torchmetrics.classification import MulticlassAccuracy

# Seed the RNGs handled by Lightning up front; workers=True additionally asks
# Lightning to derive seeds for DataLoader worker processes.
pl.seed_everything(42, workers=True)
# Only surface Optuna warnings and errors (hides per-trial INFO lines).
optuna.logging.set_verbosity(optuna.logging.WARNING)

INFO: Seed set to 42
INFO:lightning.fabric.utilities.seed:Seed set to 42


In [None]:
# Global configuration and relevant paths
DATA_PATH = Path("processed_train.parquet")
TARGET_COL = "RENDIMIENTO_GLOBAL"
# The position in this list is the integer label used by the model.
CLASS_NAMES = ["alto", "medio-alto", "medio-bajo", "bajo"]
CLASS2IDX = {cls: idx for idx, cls in enumerate(CLASS_NAMES)}
IDX2CLASS = {idx: cls for cls, idx in CLASS2IDX.items()}

N_SPLITS = 5
RANDOM_STATE = 42
# NOTE(review): N_JOBS is not referenced by any cell visible here.
N_JOBS = 2
# Candidate batch sizes offered to Optuna during tuning.
BATCH_SIZE_SPACE = [512, 1024, 2048, 4096]
ARTIFACT_DIR = Path("./artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True, parents=True)

# Plain string ("cuda"/"cpu"); later cells compare against it and pass it to .to().
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device} | Artifacts: {ARTIFACT_DIR.resolve()}")

Device: cuda | Artifacts: /content/artifacts


In [None]:
# Utilidades compartidas

def set_seed(seed: int = RANDOM_STATE) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator; defaults to ``RANDOM_STATE``.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # The CUDA generators are only touched when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def make_dataloaders(X: np.ndarray, y: np.ndarray, train_idx: np.ndarray,
                     val_idx: np.ndarray, batch_size: int, num_workers: int = 2):
    """Build the training and validation DataLoaders for one split.

    Args:
        X: Feature array; the selected rows are converted to float tensors.
        y: Label array; the selected entries are converted to long tensors.
        train_idx: Indices of the training rows.
        val_idx: Indices of the validation rows.
        batch_size: Training batch size; validation batches are capped at 2048.
        num_workers: Worker processes used by both loaders.

    Returns:
        A ``(train_loader, val_loader)`` tuple. Only the training loader
        shuffles; memory pinning is enabled when the global ``device`` is
        ``"cuda"``.
    """
    use_pinned = device == "cuda"

    def _as_dataset(rows: np.ndarray) -> TensorDataset:
        # Materialise the selected rows as (features, targets) tensors.
        features = torch.from_numpy(X[rows]).float()
        targets = torch.from_numpy(y[rows]).long()
        return TensorDataset(features, targets)

    train_ds = _as_dataset(train_idx)
    val_ds = _as_dataset(val_idx)

    shared_kwargs = {"num_workers": num_workers, "pin_memory": use_pinned}
    train_loader = DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        **shared_kwargs,
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=min(batch_size, 2048),
        shuffle=False,
        **shared_kwargs,
    )
    return train_loader, val_loader


def collect_fold_metrics(name: str, fold_scores: list[dict]) -> tuple[pd.DataFrame, dict]:
    """Tabulate per-fold scores and summarise their accuracy.

    Args:
        name: Model name stored under the ``"model"`` key of the summary.
        fold_scores: One dict per fold; each must contain an ``"accuracy"`` key.

    Returns:
        A ``(df, summary)`` tuple: ``df`` is the per-fold table and ``summary``
        holds the mean, population standard deviation (``ddof=0``), minimum
        and maximum of the accuracy column.

    Raises:
        KeyError: If the resulting table has no ``"accuracy"`` column
            (for example when ``fold_scores`` is empty).
    """
    df = pd.DataFrame(fold_scores)
    acc = df["accuracy"]
    summary = {
        "model": name,
        "mean_acc": acc.mean(),
        "std_acc": acc.std(ddof=0),
        "min_acc": acc.min(),
        "max_acc": acc.max(),
    }
    return df, summary


def logits_to_numpy(logits_list):
    """Concatenate logit batches and return their softmax as a NumPy array.

    Args:
        logits_list: Sequence of logit tensors, concatenated along dim 0.

    Returns:
        Array of probabilities; softmax is taken over dim 1.
    """
    stacked = torch.cat(logits_list)
    probabilities = torch.softmax(stacked, dim=1)
    return probabilities.cpu().numpy()


def describe_class_balance(labels: np.ndarray):
    """Show how many samples carry each label, ordered by label index.

    The integer index is renamed through ``IDX2CLASS`` before the counts are
    handed to ``display`` (presumably the IPython/notebook built-in — it is not
    imported in the visible cells).
    """
    per_class = pd.Series(labels).value_counts()
    per_class = per_class.sort_index()
    named = per_class.rename(index=IDX2CLASS)
    display(named)


In [None]:
# Load the preprocessed data (no pipeline object is loaded in this cell)
assert DATA_PATH.exists(), f"No se encontró {DATA_PATH}"

df = pd.read_parquet(DATA_PATH)
print(f"Shape: {df.shape} | Memoria ~{df.memory_usage().sum() / 1e6:.1f} MB")

# Labels become integer indices via CLASS2IDX; every other column is a feature.
y = df[TARGET_COL].map(CLASS2IDX).to_numpy(dtype=np.int64)
X = df.drop(columns=[TARGET_COL]).to_numpy(dtype=np.float32)

describe_class_balance(y)
print(f"Feature dims: {X.shape[1]}")

# The DataFrame is no longer needed once X and y exist; free it explicitly.
del df
_ = gc.collect()

Shape: (692500, 28) | Memoria ~155.1 MB


Unnamed: 0,count
alto,175619
medio-alto,171619
medio-bajo,172275
bajo,172987


Feature dims: 27


In [None]:
# Stratified K-fold definition (K = N_SPLITS)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
# Materialise the (train_idx, val_idx) pairs once so tuning and CV share them.
folds = list(skf.split(X, y))
print(f"Folds preparados: {len(folds)}")

Folds preparados: 5


In [None]:
# --- MLP Residual (ResNet-like) ---------------------------------------------

class ResidualBlock(nn.Module):
    """Feed-forward block with an identity skip connection.

    The transformation is LayerNorm -> Linear -> GELU -> Dropout -> Linear ->
    Dropout, and its output is added back to the input, so the feature width
    ``dim`` is unchanged.

    Args:
        dim: Input and output feature width.
        dropout: Dropout probability applied after the activation.
        residual_dropout: Dropout probability applied to the branch output
            before it is added to the input.
    """

    def __init__(self, dim: int, dropout: float, residual_dropout: float):
        super().__init__()
        layers = [
            nn.LayerNorm(dim),
            nn.Linear(dim, dim),
            nn.GELU(),
            nn.Dropout(p=dropout),
            nn.Linear(dim, dim),
            nn.Dropout(p=residual_dropout),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` plus the transformed branch."""
        branch = self.block(x)
        return x + branch


class ResidualMLP(nn.Module):
    """MLP made of an input projection, residual blocks and a linear head.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width used by the projection and every residual block.
        depth: Number of ``ResidualBlock`` layers.
        dropout: Dropout probability inside each block.
        residual_dropout: Dropout probability on each block's branch output.
        n_classes: Number of output logits.
    """

    def __init__(self, input_dim: int, hidden_dim: int, depth: int,
                 dropout: float, residual_dropout: float, n_classes: int):
        super().__init__()
        # Stem: normalise the raw features, then project to the hidden width.
        stem = [nn.LayerNorm(input_dim), nn.Linear(input_dim, hidden_dim), nn.GELU()]
        self.input_proj = nn.Sequential(*stem)

        trunk = []
        for _ in range(depth):
            trunk.append(ResidualBlock(hidden_dim, dropout, residual_dropout))
        self.blocks = nn.ModuleList(trunk)

        # Head: final normalisation followed by the class logits.
        self.head = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, n_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch of feature rows."""
        hidden = self.input_proj(x)
        for layer in self.blocks:
            hidden = layer(hidden)
        logits = self.head(hidden)
        return logits


class ResidualMLPModule(pl.LightningModule):
    """LightningModule that trains a ``ResidualMLP`` with cross-entropy loss.

    All constructor arguments are stored through ``save_hyperparameters`` so
    they are available as ``self.hparams`` and written into checkpoints.

    Args:
        input_dim: Number of input features.
        n_classes: Number of target classes.
        hidden_dim: Hidden width of the network.
        depth: Number of residual blocks.
        dropout: Dropout probability inside each block.
        residual_dropout: Dropout probability on each block's branch output.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
    """

    # Cosine horizon used when no Trainer is attached or it has no finite
    # ``max_epochs`` (this is the value that used to be hard-coded).
    DEFAULT_T_MAX = 10

    def __init__(self, input_dim: int, n_classes: int, hidden_dim: int, depth: int,
                 dropout: float, residual_dropout: float, lr: float, weight_decay: float):
        super().__init__()
        self.save_hyperparameters()
        self.model = ResidualMLP(input_dim, hidden_dim, depth, dropout, residual_dropout, n_classes)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the class logits produced by the wrapped network."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.log("train_loss", loss, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_acc`` for one validation batch.

        Both metrics are logged with ``on_epoch=True`` so that an epoch-level
        value is available to callbacks monitoring ``val_acc``.

        Returns:
            Dict with the detached ``logits`` and ``targets`` of the batch.
        """
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y).float().mean()
        self.log("val_loss", loss, prog_bar=True, on_epoch=True, sync_dist=True)
        self.log("val_acc", acc, prog_bar=True, on_epoch=True, sync_dist=True)
        return {"logits": logits.detach(), "targets": y.detach()}

    def _scheduler_t_max(self) -> int:
        """Return the cosine half-period, in epochs, for the LR scheduler.

        Uses the attached Trainer's ``max_epochs`` so the learning rate decays
        over the whole run. With a horizon shorter than the run, the cosine
        schedule would start increasing the learning rate again.
        """
        try:
            max_epochs = self.trainer.max_epochs
        except (RuntimeError, AttributeError):
            # No Trainer attached (e.g. the method is called manually).
            return self.DEFAULT_T_MAX
        if max_epochs is None or max_epochs < 1:
            # Unbounded training: there is no finite horizon to anneal over.
            return self.DEFAULT_T_MAX
        return int(max_epochs)

    def configure_optimizers(self):
        """Create the AdamW optimizer and a cosine-annealing LR scheduler."""
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.hparams.lr,
            weight_decay=self.hparams.weight_decay,
        )
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self._scheduler_t_max()
        )
        return {"optimizer": optimizer, "lr_scheduler": scheduler}


In [None]:
# Optuna + entrenamiento para el MLP residual

def tune_residual_mlp(X, y, folds, n_trials: int = 15):
    """Search ResidualMLP hyperparameters with Optuna on the first fold.

    Each trial trains a fresh ``ResidualMLPModule`` on ``folds[0]`` and is
    scored by the ``val_acc`` found in the trainer's callback metrics after
    fitting (0.0 if the metric is missing).

    Args:
        X: Feature array; ``X.shape[1]`` is used as the model input size.
        y: Integer label array aligned with ``X``.
        folds: Sequence of ``(train_idx, val_idx)`` pairs; only the first is used.
        n_trials: Number of Optuna trials to run.

    Returns:
        A ``(study, best_params)`` tuple.
    """
    input_dim = X.shape[1]
    on_gpu = device == "cuda"

    def objective(trial: optuna.Trial) -> float:
        params = {
            "hidden_dim": trial.suggest_categorical("hidden_dim", [256, 384, 512, 640]),
            "depth": trial.suggest_int("depth", 3, 6),
            "dropout": trial.suggest_float("dropout", 0.0, 0.3),
            "residual_dropout": trial.suggest_float("residual_dropout", 0.0, 0.2),
            "lr": trial.suggest_float("lr", 5e-4, 5e-3, log=True),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True),
            "batch_size": trial.suggest_categorical("batch_size", BATCH_SIZE_SPACE),
            "max_epochs": trial.suggest_int("max_epochs", 12, 30),
        }
        train_idx, val_idx = folds[0]
        train_loader, val_loader = make_dataloaders(X, y, train_idx, val_idx, params["batch_size"])
        module = ResidualMLPModule(
            input_dim=input_dim,
            n_classes=len(CLASS_NAMES),
            hidden_dim=params["hidden_dim"],
            depth=params["depth"],
            dropout=params["dropout"],
            residual_dropout=params["residual_dropout"],
            lr=params["lr"],
            weight_decay=params["weight_decay"],
        )
        trainer = pl.Trainer(
            accelerator="gpu" if on_gpu else "cpu",
            devices=1,
            precision="16-mixed" if on_gpu else "32-true",
            max_epochs=params["max_epochs"],
            enable_checkpointing=False,
            logger=False,
            enable_progress_bar=False,
            # Same clipping as the final cross-validation trainer, so the
            # hyperparameters are tuned under the regime they are later used in.
            gradient_clip_val=1.0,
        )
        trainer.fit(module, train_loader, val_loader)
        val_acc = float(trainer.callback_metrics.get("val_acc", torch.tensor(0.0)).cpu())

        # Drop per-trial objects before the next trial allocates its own.
        del trainer, module, train_loader, val_loader
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return val_acc

    # Seed the sampler explicitly: it keeps its own RNG, so the notebook's
    # global seeding does not make the suggested hyperparameters repeatable.
    sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
    study = optuna.create_study(direction="maximize", study_name="residual_mlp", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study, study.best_params


def train_residual_mlp_cv(best_params: dict, X, y, folds):
    """Train one ResidualMLP per fold and collect out-of-fold predictions.

    For every fold the model is fitted with early stopping and checkpointing
    on ``val_acc``; the best checkpoint is reloaded and used to predict the
    fold's validation rows.

    Args:
        best_params: Hyperparameters including ``batch_size`` and
            ``max_epochs``; the remaining entries are passed to
            ``ResidualMLPModule``. The dict itself is not modified.
        X: Feature array.
        y: Integer label array aligned with ``X``.
        folds: Sequence of ``(train_idx, val_idx)`` pairs.

    Returns:
        Dict with the model name, the given ``best_params``, per-fold
        accuracies, the out-of-fold probability matrix and the best
        checkpoint path of each fold.
    """
    model_kwargs = dict(best_params)
    batch_size = model_kwargs.pop("batch_size")
    max_epochs = model_kwargs.pop("max_epochs")
    n_features = X.shape[1]
    on_gpu = device == "cuda"

    oof_pred = np.zeros((len(y), len(CLASS_NAMES)), dtype=np.float32)
    fold_scores, checkpoints = [], []

    for fold_id, (tr_idx, val_idx) in enumerate(folds):
        print(f"[ResidualMLP] Fold {fold_id}")
        train_loader, val_loader = make_dataloaders(X, y, tr_idx, val_idx, batch_size)
        module = ResidualMLPModule(input_dim=n_features, n_classes=len(CLASS_NAMES), **model_kwargs)

        early_stop = EarlyStopping(monitor="val_acc", patience=5, mode="max")
        ckpt_cb = ModelCheckpoint(
            dirpath=ARTIFACT_DIR / f"residual_mlp_fold{fold_id}",
            filename="res-mlp-{epoch:02d}-{val_acc:.4f}",
            monitor="val_acc",
            mode="max",
            save_top_k=1,
        )
        logger = CSVLogger(save_dir=ARTIFACT_DIR / f"logs_res_fold{fold_id}", name="resmlp")
        trainer = pl.Trainer(
            accelerator="gpu" if on_gpu else "cpu",
            devices=1,
            precision="16-mixed" if on_gpu else "32-true",
            max_epochs=max_epochs,
            callbacks=[early_stop, ckpt_cb],
            logger=logger,
            gradient_clip_val=1.0,
        )
        trainer.fit(module, train_loader, val_loader)

        # Predict with the best checkpoint, not with the last-epoch weights.
        best_ckpt = ckpt_cb.best_model_path
        checkpoints.append(best_ckpt)
        best_model = ResidualMLPModule.load_from_checkpoint(best_ckpt)
        best_model.to(device).eval()

        with torch.no_grad():
            val_logits = [best_model(xb.to(device)).cpu() for xb, _ in val_loader]
        probs = torch.softmax(torch.cat(val_logits), dim=1).numpy()

        # The validation loader is not shuffled, so rows line up with val_idx.
        oof_pred[val_idx] = probs
        fold_acc = accuracy_score(y[val_idx], probs.argmax(axis=1))
        fold_scores.append({"fold": fold_id, "accuracy": fold_acc})

        del best_model
        gc.collect()
        torch.cuda.empty_cache()

    return {
        "name": "ResidualMLP",
        "best_params": best_params,
        "fold_metrics": fold_scores,
        "oof_predictions": oof_pred,
        "model_paths": checkpoints,
    }


In [None]:
# Tuning and cross-validated training of the residual MLP
N_TRIALS_RES = 20
res_study, res_best_params = tune_residual_mlp(X, y, folds, n_trials=N_TRIALS_RES)
print(f"Residual params: {json.dumps(res_best_params, indent=2)}")

# Retrain on every fold with the best trial's hyperparameters.
res_results = train_residual_mlp_cv(res_best_params, X, y, folds)
res_fold_df, res_summary = collect_fold_metrics(res_results["name"], res_results["fold_metrics"])
# Last expression of the cell: the notebook renders the per-fold table.
res_fold_df

  0%|          | 0/20 [00:00<?, ?it/s]

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/lightning/pytorch/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.
INFO: 
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | ResidualMLP      | 2.1 M  | train
1 | criterion | CrossEntropyLoss | 0      | train
--------------------------

In [None]:
# Final evaluation of the Residual MLP: accuracy summary across folds
res_summary

In [None]:
# Confusion matrix and classification report of the Residual MLP
# (computed on the out-of-fold predictions gathered during cross-validation)
best_oof = res_results["oof_predictions"]
y_pred = best_oof.argmax(axis=1)
acc = accuracy_score(y, y_pred)
print(f"Accuracy OOF global Residual MLP: {acc:.4f}")

# Rows are true classes and columns are predicted classes (see axis labels).
conf_mat = confusion_matrix(y, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues",
            xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.title("Matriz de confusión - Residual MLP")
plt.tight_layout()
plt.show()

print(classification_report(y, y_pred, target_names=CLASS_NAMES))

In [None]:
# Export and inference with the Residual MLP
# NOTE(review): neither `joblib` nor `PIPELINE_PATH` is imported/defined in any
# cell visible here; as written this line raises NameError — confirm where
# they are supposed to come from.
pipeline = joblib.load(PIPELINE_PATH)

# Keep only the arguments accepted by the model constructor
# (batch_size and max_epochs are training-loop settings).
best_hparams = {k: v for k, v in res_results["best_params"].items()
                if k in ["hidden_dim", "depth", "dropout", "residual_dropout", "lr", "weight_decay"]}

# Use the first fold's best checkpoint as the example model, if any exists.
sample_checkpoint = res_results["model_paths"][0] if res_results["model_paths"] else None
print(f"Checkpoint de ejemplo: {sample_checkpoint}")


def predict_with_residual_mlp(raw_df: pd.DataFrame, checkpoint_path: str) -> pd.DataFrame:
    """Predict class labels and probabilities for raw rows from a checkpoint.

    The rows are transformed with the global ``pipeline``, scored by the model
    restored from ``checkpoint_path``, and returned as a copy of ``raw_df``
    with the predicted label and one probability column per class appended.

    Args:
        raw_df: Unprocessed rows accepted by ``pipeline.transform``.
        checkpoint_path: Path to a ``ResidualMLPModule`` checkpoint.

    Returns:
        Copy of ``raw_df`` with ``RENDIMIENTO_GLOBAL_PRED`` and
        ``proba_<class>`` columns added.

    Raises:
        ValueError: If ``checkpoint_path`` is ``None``.
    """
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if checkpoint_path is None:
        raise ValueError("No hay checkpoint disponible")

    processed = pipeline.transform(raw_df)
    tensor = torch.from_numpy(np.asarray(processed, dtype=np.float32))
    model = ResidualMLPModule.load_from_checkpoint(
        checkpoint_path,
        input_dim=tensor.shape[1],
        n_classes=len(CLASS_NAMES),
        **best_hparams,
    )
    model.eval()
    with torch.no_grad():
        # The restored model may live on the GPU it was trained on, so the
        # input follows the model's device and the result returns to the CPU
        # before the NumPy conversion.
        logits = model(tensor.to(model.device))
        probas = torch.softmax(logits, dim=1).cpu().numpy()
    preds = probas.argmax(axis=1)

    result = raw_df.copy()
    result["RENDIMIENTO_GLOBAL_PRED"] = [IDX2CLASS[idx] for idx in preds]
    for idx, cls in enumerate(CLASS_NAMES):
        result[f"proba_{cls}"] = probas[:, idx]
    return result

# Ejemplo de uso:
# sample = some_raw_df.head(3)
# predict_with_residual_mlp(sample, sample_checkpoint)

## Notas prácticas y bibliografía resumida
- **Boosting vs. deep tabular**: LightGBM/CatBoost siguen dominando con datos medianos-grandes, especialmente cuando las features ya fueron cuidadosamente codificadas. Requieren poca ingeniería de hiperparámetros y ofrecen interpretabilidad vía importancia de variables.
- **FT-Transformer / SAINT**: útiles cuando hay fuertes interacciones no lineales entre atributos heterogéneos y se dispone de GPU. Regulariza con dropout y weight decay; usa lotes grandes (512–4096) y AMP para saturar la T4.
- **ResNet tabular**: baseline estable para escenarios donde se quiera una red poco compleja y rápida de ajustar; añadir stochastic depth o mixup ayuda si aparece overfitting.
- **TabPFN**: excelente para prototipar cuando el dataset es pequeño (<50k muestras) o se requiere una predicción rápida sin tuning, pero escala de manera cuadrática, por lo que con un dataset de este tamaño habría que limitarlo a un muestreo.
- **Regularización**: prioriza early stopping + bagging en boosting; en deep models combina dropout, weight decay y augmentation (CutMix tabular) si la precisión se estanca.
- **Reporte final**: comunica accuracy macro, matriz de confusión, curvas de calibración/ROC por clase (si aplican) y tiempos de entrenamiento por modelo.

**Bibliografía 2020–2025**
1. Gorishniy et al., *Revisiting Deep Learning Models for Tabular Data* (NeurIPS 2021). https://arxiv.org/abs/2106.11959
2. Gorishniy et al., *FT-Transformer: Fast and Accurate Modeling of Tabular Data* (ICML 2021 Workshop). https://arxiv.org/abs/2106.01126
3. Somepalli et al., *SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pretraining* (NeurIPS 2021). https://arxiv.org/abs/2106.01342
4. Hollmann et al., *TabPFN: A Transformer that Solves Small Tabular Classification Problems in a Second* (NeurIPS 2022). https://arxiv.org/abs/2207.01848
5. Misra et al., *A Survey on Deep Learning for Tabular Data* (ACM Computing Surveys 2023). https://arxiv.org/abs/2207.07454