# Entrenamiento multiclase para `RENDIMIENTO_GLOBAL`
Este cuaderno está optimizado para Google Colab T4 (GPU) e implementa un pipeline completo para comparar boosting (LightGBM), FT-Transformer y un MLP residual rápido sobre datos tabulares ya preprocesados, con TabPFN como baseline adicional entrenado sobre un subconjunto. Se incluyen Optuna, validación estratificada (K=5), métricas detalladas y artefactos exportables.

> **Atajos operativos**: Ejecuta las celdas en orden, ajusta `N_TRIALS_*` según el tiempo disponible y revisa el resumen final para elegir el modelo a desplegar.

In [None]:
%%capture
# Dependencias necesarias (ejecutar una vez por sesión de Colab T4)
%pip install -q pytorch-lightning==2.4.0 optuna==3.6.1 lightgbm==4.3.0 catboost==1.2.5 \
    tabpfn==0.1.10 rtdl==0.0.13 torchmetrics==1.4.0 seaborn==0.13.2 matplotlib==3.8.4

In [None]:
import gc
import json
import math
import random
from collections import defaultdict
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset

import lightgbm as lgb
from tabpfn import TabPFNClassifier
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger
from torchmetrics.classification import MulticlassAccuracy

# Seed the global RNGs through Lightning; `workers=True` is passed so that
# DataLoader workers are presumably seeded as well (see Lightning docs).
pl.seed_everything(42, workers=True)
# Raise Optuna's logging threshold to WARNING to keep trial output quiet.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Global configuration: input locations, label encoding, CV / search settings
# and the folder where exported artifacts are written.
TARGET_COL = "RENDIMIENTO_GLOBAL"
DATA_PATH = Path("/mnt/data/processed_train.parquet")
PIPELINE_PATH = Path("/mnt/data/preprocessing_pipeline.joblib")

# The position of each label in this list is its integer class id.
CLASS_NAMES = ["alto", "medio-alto", "medio-bajo", "bajo"]
IDX2CLASS = dict(enumerate(CLASS_NAMES))
CLASS2IDX = {label: position for position, label in IDX2CLASS.items()}

RANDOM_STATE = 42
N_SPLITS = 5
N_JOBS = 2
BATCH_SIZE_SPACE = [512, 1024, 2048, 4096]

# Created eagerly so later cells can write into it without checking.
ARTIFACT_DIR = Path("./artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Plain string flag ("cuda" / "cpu") reused by the loaders and trainers below.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device} | Artifacts: {ARTIFACT_DIR.resolve()}")

In [None]:
# Utilidades compartidas

def set_seed(seed: int = RANDOM_STATE) -> None:
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    When CUDA is available, every CUDA device generator is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def make_dataloaders(X: np.ndarray, y: np.ndarray, train_idx: np.ndarray,
                     val_idx: np.ndarray, batch_size: int, num_workers: int = 2):
    """Build the train / validation DataLoaders for one fold.

    Features are converted to float tensors and labels to long tensors.
    The training loader shuffles and keeps the last partial batch; the
    validation loader is ordered and its batch size is capped at 2048.
    Memory pinning is enabled only when the module-level `device` is "cuda".

    Returns:
        A `(train_loader, val_loader)` tuple.
    """
    def _as_dataset(rows: np.ndarray) -> TensorDataset:
        features = torch.from_numpy(X[rows]).float()
        targets = torch.from_numpy(y[rows]).long()
        return TensorDataset(features, targets)

    train_set = _as_dataset(train_idx)
    val_set = _as_dataset(val_idx)
    use_pinned_memory = device == "cuda"

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=use_pinned_memory,
        drop_last=False,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=min(batch_size, 2048),
        shuffle=False,
        num_workers=num_workers,
        pin_memory=use_pinned_memory,
    )
    return train_loader, val_loader


def collect_fold_metrics(name: str, fold_scores: list[dict]) -> tuple[pd.DataFrame, dict]:
    """Tabulate per-fold scores and summarize their accuracy.

    Args:
        name: Model label stored under the "model" key of the summary.
        fold_scores: One dict per fold; each must contain an "accuracy" entry.

    Returns:
        A `(fold_df, summary)` tuple: `fold_df` holds one row per fold and
        `summary` holds the mean, population standard deviation (ddof=0),
        minimum and maximum of the fold accuracies.

    Raises:
        KeyError: If the fold dicts carry no "accuracy" key (including the
            case of an empty `fold_scores`).
    """
    df = pd.DataFrame(fold_scores)
    accuracy = df["accuracy"]
    summary = {
        "model": name,
        "mean_acc": accuracy.mean(),
        "std_acc": accuracy.std(ddof=0),
        "min_acc": accuracy.min(),
        "max_acc": accuracy.max(),
    }
    return df, summary


def logits_to_numpy(logits_list):
    """Concatenate a list of logit batches and return class probabilities.

    The batches are joined along the first dimension, soft-maxed over
    dimension 1, moved to the CPU and returned as a NumPy array.
    """
    stacked = torch.cat(logits_list)
    probabilities = torch.softmax(stacked, dim=1)
    return probabilities.cpu().numpy()


def describe_class_balance(labels: np.ndarray):
    """Display how many samples each class has, ordered by class index.

    Integer labels are replaced by their class names (via `IDX2CLASS`)
    before the counts are shown with the notebook's `display`.
    """
    per_class = pd.Series(labels).value_counts().sort_index()
    named_counts = per_class.rename(index=IDX2CLASS)
    display(named_counts)


In [None]:
# Load the preprocessed dataset and the reference preprocessing pipeline.
# Fail fast if either input file is missing.
assert DATA_PATH.exists(), f"No se encontró {DATA_PATH}"
assert PIPELINE_PATH.exists(), f"No se encontró {PIPELINE_PATH}"

pipeline = joblib.load(PIPELINE_PATH)
df = pd.read_parquet(DATA_PATH)
print(f"Shape: {df.shape} | Memoria ~{df.memory_usage().sum() / 1e6:.1f} MB")

# Encode the target labels as class indices; every other column is a feature.
# NOTE(review): labels missing from CLASS2IDX would map to NaN before the
# int64 conversion — confirm the parquet only holds the four known classes.
y = df[TARGET_COL].map(CLASS2IDX).to_numpy(dtype=np.int64)
X = df.drop(columns=[TARGET_COL]).to_numpy(dtype=np.float32)

describe_class_balance(y)
print(f"Feature dims: {X.shape[1]}")

# The DataFrame is no longer needed once X and y exist; free its memory.
del df
_ = gc.collect()

In [None]:
# Define the stratified K=5 folds
# The (train_idx, val_idx) pairs are materialized once so that every model
# below is trained and evaluated on exactly the same splits.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
folds = list(skf.split(X, y))
print(f"Folds preparados: {len(folds)}")

In [None]:
# --- LightGBM + Optuna -----------------------------------------------------

def tune_lightgbm(X, y, folds, n_trials: int = 20):
    """Search LightGBM hyper-parameters with Optuna, maximizing accuracy.

    Each trial trains on the first two folds only (to speed up the search)
    with early stopping on the validation log-loss, and is scored by the mean
    validation accuracy of the arg-max class.

    Args:
        X: Feature matrix indexed by the fold indices.
        y: Integer class labels aligned with `X`.
        folds: Sequence of `(train_idx, val_idx)` pairs.
        n_trials: Number of Optuna trials to run.

    Returns:
        `(study, best_params)`, where `best_params` is the best trial's
        sampled values extended with the fixed multiclass settings.
    """
    def _fixed_settings() -> dict:
        return {
            "objective": "multiclass",
            "num_class": len(CLASS_NAMES),
            "metric": "multi_logloss",
        }

    def _validation_accuracy(params: dict, tr_idx, val_idx) -> float:
        booster = lgb.train(
            params,
            lgb.Dataset(X[tr_idx], label=y[tr_idx]),
            valid_sets=[lgb.Dataset(X[val_idx], label=y[val_idx])],
            num_boost_round=3000,
            callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)],
        )
        probabilities = booster.predict(X[val_idx], num_iteration=booster.best_iteration)
        return accuracy_score(y[val_idx], probabilities.argmax(axis=1))

    def objective(trial: optuna.Trial) -> float:
        params = _fixed_settings()
        # The suggest_* calls are kept in this exact order on purpose.
        params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.2, log=True)
        params["num_leaves"] = trial.suggest_int("num_leaves", 31, 255)
        params["feature_fraction"] = trial.suggest_float("feature_fraction", 0.6, 1.0)
        params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.6, 1.0)
        params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
        params["min_child_samples"] = trial.suggest_int("min_child_samples", 20, 200)
        params["lambda_l1"] = trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True)
        params["lambda_l2"] = trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True)
        params["max_depth"] = trial.suggest_int("max_depth", 3, 12)
        params["num_threads"] = N_JOBS

        # Only the first two folds are used to keep each trial cheap.
        scores = [
            _validation_accuracy(params, tr_idx, val_idx)
            for tr_idx, val_idx in folds[:2]
        ]
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", study_name="lgbm_study")
    study.optimize(objective, n_trials=n_trials, n_jobs=N_JOBS, show_progress_bar=True)
    best_params = {**study.best_params, **_fixed_settings(), "num_threads": N_JOBS}
    return study, best_params


def train_lightgbm_cv(params: dict, X: np.ndarray, y: np.ndarray, folds):
    """Train one LightGBM booster per fold and collect out-of-fold output.

    For every `(train_idx, val_idx)` pair a booster is trained with early
    stopping, its validation probabilities are written into the out-of-fold
    matrix, its accuracy is recorded and the model is saved under
    `ARTIFACT_DIR`.

    Returns:
        A dict with the model name, the parameters used, per-fold metrics,
        the out-of-fold probability matrix and the saved model paths.
    """
    n_classes = len(CLASS_NAMES)
    oof_pred = np.zeros((len(y), n_classes), dtype=np.float32)
    fold_scores, model_paths = [], []

    for fold_id, (tr_idx, val_idx) in enumerate(folds):
        print(f"[LightGBM] Fold {fold_id}")
        train_set = lgb.Dataset(X[tr_idx], label=y[tr_idx])
        valid_set = lgb.Dataset(X[val_idx], label=y[val_idx])
        booster = lgb.train(
            params,
            train_set,
            num_boost_round=5000,
            valid_sets=[train_set, valid_set],
            valid_names=["train", "valid"],
            callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=False)],
        )

        best_iteration = booster.best_iteration
        fold_proba = booster.predict(X[val_idx], num_iteration=best_iteration)
        oof_pred[val_idx] = fold_proba
        fold_accuracy = accuracy_score(y[val_idx], fold_proba.argmax(axis=1))
        fold_scores.append(
            dict(fold=fold_id, accuracy=fold_accuracy, best_iteration=best_iteration)
        )

        destination = ARTIFACT_DIR / f"lightgbm_fold{fold_id}.txt"
        booster.save_model(str(destination))
        model_paths.append(destination)

    return dict(
        name="LightGBM",
        best_params=params,
        fold_metrics=fold_scores,
        oof_predictions=oof_pred,
        model_paths=model_paths,
    )


In [None]:
# LightGBM tuning and training
# N_TRIALS_LGB controls the Optuna budget; adjust it to the time available.
N_TRIALS_LGB = 20
lgb_study, lgb_best_params = tune_lightgbm(X, y, folds, n_trials=N_TRIALS_LGB)
print(f"LightGBM best params: {json.dumps(lgb_best_params, indent=2)}")

# Retrain with the best parameters on all folds and tabulate per-fold metrics.
lgb_results = train_lightgbm_cv(lgb_best_params, X, y, folds)
lgb_fold_df, lgb_summary = collect_fold_metrics(lgb_results["name"], lgb_results["fold_metrics"])
# Last expression of the cell: the notebook renders the per-fold table.
lgb_fold_df

In [None]:
# --- FT-Transformer Lightning Module ----------------------------------------

class NumericalFeatureTokenizer(nn.Module):
    """Turn each numerical feature into its own `d_token`-sized embedding.

    Every feature owns a weight vector and a bias vector; the token of a
    feature is `value * weight + bias`.
    """

    def __init__(self, input_dim: int, d_token: int):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(input_dim, d_token))
        self.bias = nn.Parameter(torch.zeros(input_dim, d_token))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, dim) -> tokens: (batch, dim, d_token)
        return x.unsqueeze(-1) * self.weight.unsqueeze(0) + self.bias.unsqueeze(0)


class FTTransformerBackbone(nn.Module):
    """Transformer encoder over per-feature tokens with a CLS read-out.

    The input features are tokenized, a learnable CLS token is prepended,
    learnable positional embeddings are added, and the encoded CLS position
    is normalized, passed through dropout and projected to class logits.

    Args:
        input_dim: Number of numerical input features.
        n_classes: Number of output classes.
        d_token: Token / model width (must be divisible by `n_heads`).
        n_layers: Number of encoder layers.
        n_heads: Number of attention heads.
        dropout: Dropout used inside the encoder layers.
        ffn_dropout: Dropout applied to the CLS representation before the head.
    """

    def __init__(self, input_dim: int, n_classes: int, d_token: int = 192,
                 n_layers: int = 3, n_heads: int = 8, dropout: float = 0.2,
                 ffn_dropout: float = 0.2):
        super().__init__()
        self.input_dim = input_dim
        self.tokenizer = NumericalFeatureTokenizer(input_dim, d_token)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_heads,
            batch_first=True,
            dropout=dropout,
            dim_feedforward=d_token * 4,
            activation="gelu",
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))
        # One position per feature plus one for the CLS token.
        self.pos_embedding = nn.Parameter(torch.randn(1, input_dim + 1, d_token) * 0.02)
        self.norm = nn.LayerNorm(d_token)
        self.head = nn.Linear(d_token, n_classes)
        self.ffn_dropout = nn.Dropout(ffn_dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a `(batch, input_dim)` tensor to `(batch, n_classes)` logits."""
        feature_tokens = self.tokenizer(x)
        cls_tokens = self.cls_token.expand(x.size(0), -1, -1)
        tokens = torch.cat([cls_tokens, feature_tokens], dim=1)
        # The slice length must come from the sequence *after* the CLS token
        # was prepended (input_dim + 1). Measuring it before concatenation
        # yields one position too few and the addition fails on shape.
        tokens = tokens + self.pos_embedding[:, : tokens.size(1), :]
        encoded = self.encoder(tokens)
        cls_rep = self.norm(encoded[:, 0])
        cls_rep = self.ffn_dropout(cls_rep)
        return self.head(cls_rep)


class FTTransformerModule(pl.LightningModule):
    """Lightning module that trains `FTTransformerBackbone` with cross-entropy.

    All constructor arguments are stored through `save_hyperparameters()`, so
    a checkpoint can rebuild the module without extra arguments.

    Args:
        input_dim: Number of input features.
        n_classes: Number of target classes.
        d_token, n_layers, n_heads, dropout, ffn_dropout: Forwarded to the
            backbone.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
    """

    def __init__(self, input_dim: int, n_classes: int, d_token: int, n_layers: int,
                 n_heads: int, dropout: float, ffn_dropout: float,
                 lr: float, weight_decay: float):
        super().__init__()
        self.save_hyperparameters()
        self.model = FTTransformerBackbone(
            input_dim=input_dim,
            n_classes=n_classes,
            d_token=d_token,
            n_layers=n_layers,
            n_heads=n_heads,
            dropout=dropout,
            ffn_dropout=ffn_dropout,
        )
        self.criterion = nn.CrossEntropyLoss()
        # Registered as a submodule, so it moves with the module; no explicit
        # `.to(...)` on a notebook-level global is needed (that would also
        # break loading a checkpoint where the global is not defined).
        self.val_acc = MulticlassAccuracy(num_classes=n_classes)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the epoch-level training loss for one batch."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log epoch-level `val_loss` / `val_acc` and return logits and targets."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y).float().mean()
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True)
        self.log("val_acc", acc, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True)
        return {"logits": logits.detach(), "targets": y.detach()}

    def configure_optimizers(self):
        """Return AdamW plus a cosine schedule spanning the whole training run."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        # A fixed T_max=10 makes the learning rate climb back up after epoch
        # 10 whenever training runs longer; anneal over the trainer's epoch
        # budget instead, falling back to 10 when that budget is unknown.
        try:
            max_epochs = self.trainer.max_epochs
        except RuntimeError:  # module is not attached to a Trainer
            max_epochs = None
        t_max = max_epochs if max_epochs and max_epochs > 0 else 10
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}


In [None]:
# Búsqueda Optuna para FT-Transformer

def tune_ft_transformer(X, y, folds, n_trials: int = 15):
    """Search FT-Transformer hyper-parameters with Optuna on the first fold.

    Each trial trains one model on `folds[0]` with early stopping on
    `val_acc` and is scored by the last logged validation accuracy.

    Args:
        X: Feature matrix indexed by the fold indices.
        y: Integer class labels aligned with `X`.
        folds: Sequence of `(train_idx, val_idx)` pairs; only the first is used.
        n_trials: Number of Optuna trials to run.

    Returns:
        `(study, best_params)`; `best_params` also contains the sampled
        `batch_size` and `max_epochs`.
    """
    input_dim = X.shape[1]

    def objective(trial: optuna.Trial) -> float:
        params = {
            "d_token": trial.suggest_categorical("d_token", [128, 192, 256]),
            "n_layers": trial.suggest_int("n_layers", 2, 4),
            "n_heads": trial.suggest_categorical("n_heads", [4, 8]),
            "dropout": trial.suggest_float("dropout", 0.0, 0.3),
            "ffn_dropout": trial.suggest_float("ffn_dropout", 0.0, 0.3),
            "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True),
            "batch_size": trial.suggest_categorical("batch_size", BATCH_SIZE_SPACE),
            "max_epochs": trial.suggest_int("max_epochs", 15, 35),
        }
        train_idx, val_idx = folds[0]
        train_loader, val_loader = make_dataloaders(X, y, train_idx, val_idx, params["batch_size"])
        module = FTTransformerModule(
            input_dim=input_dim,
            n_classes=len(CLASS_NAMES),
            d_token=params["d_token"],
            n_layers=params["n_layers"],
            n_heads=params["n_heads"],
            dropout=params["dropout"],
            ffn_dropout=params["ffn_dropout"],
            lr=params["lr"],
            weight_decay=params["weight_decay"],
        )
        callbacks = [
            EarlyStopping(monitor="val_acc", patience=5, mode="max"),
        ]
        trainer = pl.Trainer(
            accelerator="gpu" if device == "cuda" else "cpu",
            devices=1,
            precision="16-mixed" if device == "cuda" else "32-true",
            max_epochs=params["max_epochs"],
            # The early-stopping callback has to be handed to the Trainer;
            # building the list without passing it leaves it inactive.
            callbacks=callbacks,
            enable_checkpointing=False,
            logger=False,
            enable_progress_bar=False,
        )
        trainer.fit(module, train_loader, val_loader)
        val_acc = float(trainer.callback_metrics.get("val_acc", torch.tensor(0.0)).cpu())
        torch.cuda.empty_cache()
        return val_acc

    study = optuna.create_study(direction="maximize", study_name="ft_transformer")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study, study.best_params


def train_ft_transformer_cv(best_params: dict, X, y, folds):
    """Train an FT-Transformer per fold and collect out-of-fold probabilities.

    `batch_size` and `max_epochs` are taken out of a copy of `best_params`;
    the remaining entries are forwarded to `FTTransformerModule`. For each
    fold the best checkpoint (by `val_acc`) is reloaded and used to predict
    the validation rows.

    Returns:
        A dict with the model name, the original `best_params`, per-fold
        accuracies, the out-of-fold probability matrix and checkpoint paths.
    """
    model_kwargs = dict(best_params)
    batch_size = model_kwargs.pop("batch_size")
    max_epochs = model_kwargs.pop("max_epochs")
    n_features = X.shape[1]
    n_classes = len(CLASS_NAMES)

    oof_pred = np.zeros((len(y), n_classes), dtype=np.float32)
    fold_scores, checkpoints = [], []

    for fold_id, (tr_idx, val_idx) in enumerate(folds):
        print(f"[FT-Transformer] Fold {fold_id}")
        train_loader, val_loader = make_dataloaders(X, y, tr_idx, val_idx, batch_size)
        module = FTTransformerModule(input_dim=n_features, n_classes=n_classes, **model_kwargs)

        early_stop_cb = EarlyStopping(monitor="val_acc", patience=6, mode="max")
        checkpoint_cb = ModelCheckpoint(
            dirpath=ARTIFACT_DIR / f"ft_transformer_fold{fold_id}",
            filename="ft-transformer-{epoch:02d}-{val_acc:.4f}",
            monitor="val_acc",
            mode="max",
            save_top_k=1,
        )
        lr_monitor_cb = LearningRateMonitor(logging_interval="epoch")
        csv_logger = CSVLogger(save_dir=ARTIFACT_DIR / f"logs_ft_fold{fold_id}", name="ft")

        on_gpu = device == "cuda"
        trainer = pl.Trainer(
            accelerator="gpu" if on_gpu else "cpu",
            devices=1,
            precision="16-mixed" if on_gpu else "32-true",
            max_epochs=max_epochs,
            callbacks=[early_stop_cb, checkpoint_cb, lr_monitor_cb],
            logger=csv_logger,
            gradient_clip_val=1.0,
            deterministic=False,
        )
        trainer.fit(module, train_loader, val_loader)

        # Predict the validation rows with the best checkpoint of this fold.
        best_ckpt = checkpoint_cb.best_model_path
        checkpoints.append(best_ckpt)
        best_model = FTTransformerModule.load_from_checkpoint(best_ckpt)
        best_model.to(device)
        best_model.eval()
        with torch.no_grad():
            fold_logits = [best_model(features.to(device)).cpu() for features, _ in val_loader]
        probs = torch.softmax(torch.cat(fold_logits), dim=1).numpy()

        oof_pred[val_idx] = probs
        fold_accuracy = accuracy_score(y[val_idx], probs.argmax(axis=1))
        fold_scores.append({"fold": fold_id, "accuracy": fold_accuracy})

        # Release the reloaded model before the next fold starts.
        del best_model
        gc.collect()
        torch.cuda.empty_cache()

    return dict(
        name="FT-Transformer",
        best_params=best_params,
        fold_metrics=fold_scores,
        oof_predictions=oof_pred,
        model_paths=checkpoints,
    )


In [None]:
# FT-Transformer tuning and training
# N_TRIALS_FT controls the Optuna budget; adjust it to the time available.
N_TRIALS_FT = 15
ft_study, ft_best_params = tune_ft_transformer(X, y, folds, n_trials=N_TRIALS_FT)
print(f"FT params: {json.dumps(ft_best_params, indent=2)}")

# Retrain with the best parameters on all folds and tabulate per-fold metrics.
ft_results = train_ft_transformer_cv(ft_best_params, X, y, folds)
ft_fold_df, ft_summary = collect_fold_metrics(ft_results["name"], ft_results["fold_metrics"])
# Last expression of the cell: the notebook renders the per-fold table.
ft_fold_df

In [None]:
# --- MLP Residual (ResNet-like) ---------------------------------------------

class ResidualBlock(nn.Module):
    """Pre-norm two-layer MLP whose output is added back to its input.

    Layout: LayerNorm -> Linear -> GELU -> Dropout(dropout) -> Linear ->
    Dropout(residual_dropout), followed by the skip connection.
    """

    def __init__(self, dim: int, dropout: float, residual_dropout: float):
        super().__init__()
        layers = [
            nn.LayerNorm(dim),
            nn.Linear(dim, dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
            nn.Dropout(residual_dropout),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        update = self.block(x)
        return x + update


class ResidualMLP(nn.Module):
    """Residual MLP classifier: input projection, `depth` residual blocks, head.

    The input is layer-normalized and projected to `hidden_dim`, passed
    through the residual blocks in order, then layer-normalized again and
    mapped to `n_classes` logits.
    """

    def __init__(self, input_dim: int, hidden_dim: int, depth: int,
                 dropout: float, residual_dropout: float, n_classes: int):
        super().__init__()
        stem_layers = [
            nn.LayerNorm(input_dim),
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
        ]
        self.input_proj = nn.Sequential(*stem_layers)

        residual_stack = [
            ResidualBlock(hidden_dim, dropout, residual_dropout)
            for _ in range(depth)
        ]
        self.blocks = nn.ModuleList(residual_stack)

        head_layers = [nn.LayerNorm(hidden_dim), nn.Linear(hidden_dim, n_classes)]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        hidden = self.input_proj(x)
        for layer in self.blocks:
            hidden = layer(hidden)
        logits = self.head(hidden)
        return logits


class ResidualMLPModule(pl.LightningModule):
    """Lightning module that trains `ResidualMLP` with cross-entropy.

    All constructor arguments are stored through `save_hyperparameters()`, so
    a checkpoint can rebuild the module without extra arguments.

    Args:
        input_dim: Number of input features.
        n_classes: Number of target classes.
        hidden_dim, depth, dropout, residual_dropout: Forwarded to the network.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
    """

    def __init__(self, input_dim: int, n_classes: int, hidden_dim: int, depth: int,
                 dropout: float, residual_dropout: float, lr: float, weight_decay: float):
        super().__init__()
        self.save_hyperparameters()
        self.model = ResidualMLP(input_dim, hidden_dim, depth, dropout, residual_dropout, n_classes)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.log("train_loss", loss, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log `val_loss` / `val_acc` and return the batch logits and targets."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y).float().mean()
        self.log("val_loss", loss, prog_bar=True, on_epoch=True, sync_dist=True)
        self.log("val_acc", acc, prog_bar=True, on_epoch=True, sync_dist=True)
        return {"logits": logits.detach(), "targets": y.detach()}

    def configure_optimizers(self):
        """Return AdamW plus a cosine schedule spanning the whole training run."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        # A fixed T_max=10 makes the learning rate climb back up after epoch
        # 10 whenever training runs longer; anneal over the trainer's epoch
        # budget instead, falling back to 10 when that budget is unknown.
        try:
            max_epochs = self.trainer.max_epochs
        except RuntimeError:  # module is not attached to a Trainer
            max_epochs = None
        t_max = max_epochs if max_epochs and max_epochs > 0 else 10
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}


In [None]:
# Optuna + entrenamiento para el MLP residual

def tune_residual_mlp(X, y, folds, n_trials: int = 15):
    """Search residual-MLP hyper-parameters with Optuna on the first fold.

    Each trial trains one model on `folds[0]` for the sampled number of
    epochs and is scored by the last logged validation accuracy.

    Returns:
        `(study, best_params)`; `best_params` also contains the sampled
        `batch_size` and `max_epochs`.
    """
    n_features = X.shape[1]

    def objective(trial: optuna.Trial) -> float:
        # The suggest_* calls are kept in this exact order on purpose.
        hidden_dim = trial.suggest_categorical("hidden_dim", [256, 384, 512, 640])
        depth = trial.suggest_int("depth", 3, 6)
        dropout = trial.suggest_float("dropout", 0.0, 0.3)
        residual_dropout = trial.suggest_float("residual_dropout", 0.0, 0.2)
        lr = trial.suggest_float("lr", 5e-4, 5e-3, log=True)
        weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
        batch_size = trial.suggest_categorical("batch_size", BATCH_SIZE_SPACE)
        max_epochs = trial.suggest_int("max_epochs", 12, 30)

        tr_idx, va_idx = folds[0]
        loaders = make_dataloaders(X, y, tr_idx, va_idx, batch_size)
        candidate = ResidualMLPModule(
            input_dim=n_features,
            n_classes=len(CLASS_NAMES),
            hidden_dim=hidden_dim,
            depth=depth,
            dropout=dropout,
            residual_dropout=residual_dropout,
            lr=lr,
            weight_decay=weight_decay,
        )

        on_gpu = device == "cuda"
        trainer = pl.Trainer(
            accelerator="gpu" if on_gpu else "cpu",
            devices=1,
            precision="16-mixed" if on_gpu else "32-true",
            max_epochs=max_epochs,
            enable_checkpointing=False,
            logger=False,
            enable_progress_bar=False,
        )
        trainer.fit(candidate, *loaders)

        last_val_acc = trainer.callback_metrics.get("val_acc", torch.tensor(0.0))
        score = float(last_val_acc.cpu())
        torch.cuda.empty_cache()
        return score

    study = optuna.create_study(direction="maximize", study_name="residual_mlp")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study, study.best_params


def train_residual_mlp_cv(best_params: dict, X, y, folds):
    """Train a residual MLP per fold and collect out-of-fold probabilities.

    `batch_size` and `max_epochs` are taken out of a copy of `best_params`;
    the remaining entries are forwarded to `ResidualMLPModule`. For each fold
    the best checkpoint (by `val_acc`) is reloaded and used to predict the
    validation rows.

    Returns:
        A dict with the model name, the original `best_params`, per-fold
        accuracies, the out-of-fold probability matrix and checkpoint paths.
    """
    model_kwargs = dict(best_params)
    batch_size = model_kwargs.pop("batch_size")
    max_epochs = model_kwargs.pop("max_epochs")
    n_features = X.shape[1]
    n_classes = len(CLASS_NAMES)

    oof_pred = np.zeros((len(y), n_classes), dtype=np.float32)
    fold_scores, checkpoints = [], []

    for fold_id, (tr_idx, val_idx) in enumerate(folds):
        print(f"[ResidualMLP] Fold {fold_id}")
        train_loader, val_loader = make_dataloaders(X, y, tr_idx, val_idx, batch_size)
        module = ResidualMLPModule(input_dim=n_features, n_classes=n_classes, **model_kwargs)

        early_stop_cb = EarlyStopping(monitor="val_acc", patience=5, mode="max")
        checkpoint_cb = ModelCheckpoint(
            dirpath=ARTIFACT_DIR / f"residual_mlp_fold{fold_id}",
            filename="res-mlp-{epoch:02d}-{val_acc:.4f}",
            monitor="val_acc",
            mode="max",
            save_top_k=1,
        )
        csv_logger = CSVLogger(save_dir=ARTIFACT_DIR / f"logs_res_fold{fold_id}", name="resmlp")

        on_gpu = device == "cuda"
        trainer = pl.Trainer(
            accelerator="gpu" if on_gpu else "cpu",
            devices=1,
            precision="16-mixed" if on_gpu else "32-true",
            max_epochs=max_epochs,
            callbacks=[early_stop_cb, checkpoint_cb],
            logger=csv_logger,
            gradient_clip_val=1.0,
        )
        trainer.fit(module, train_loader, val_loader)

        # Predict the validation rows with the best checkpoint of this fold.
        best_ckpt = checkpoint_cb.best_model_path
        checkpoints.append(best_ckpt)
        best_model = ResidualMLPModule.load_from_checkpoint(best_ckpt)
        best_model.to(device).eval()
        with torch.no_grad():
            fold_logits = [best_model(features.to(device)).cpu() for features, _ in val_loader]
        probs = torch.softmax(torch.cat(fold_logits), dim=1).numpy()

        oof_pred[val_idx] = probs
        fold_accuracy = accuracy_score(y[val_idx], probs.argmax(axis=1))
        fold_scores.append({"fold": fold_id, "accuracy": fold_accuracy})

        # Release the reloaded model before the next fold starts.
        del best_model
        gc.collect()
        torch.cuda.empty_cache()

    return dict(
        name="ResidualMLP",
        best_params=best_params,
        fold_metrics=fold_scores,
        oof_predictions=oof_pred,
        model_paths=checkpoints,
    )


In [None]:
# Residual MLP tuning and training
# N_TRIALS_RES controls the Optuna budget; adjust it to the time available.
N_TRIALS_RES = 15
res_study, res_best_params = tune_residual_mlp(X, y, folds, n_trials=N_TRIALS_RES)
print(f"Residual params: {json.dumps(res_best_params, indent=2)}")

# Retrain with the best parameters on all folds and tabulate per-fold metrics.
res_results = train_residual_mlp_cv(res_best_params, X, y, folds)
res_fold_df, res_summary = collect_fold_metrics(res_results["name"], res_results["fold_metrics"])
# Last expression of the cell: the notebook renders the per-fold table.
res_fold_df

In [None]:
# --- TabPFN baseline (subset para rapidez) -----------------------------------

def evaluate_tabpfn(X, y, folds, max_train_samples: int = 8000):
    """Score a TabPFN baseline on every fold using a training subsample.

    For each fold at most `max_train_samples` training rows are drawn without
    replacement (from NumPy's global RNG), TabPFN is fitted on them and the
    accuracy on the full validation part is recorded.

    Args:
        X: Feature matrix indexed by the fold indices.
        y: Integer class labels aligned with `X`.
        folds: Sequence of `(train_idx, val_idx)` pairs.
        max_train_samples: Upper bound on the rows used to fit each fold.

    Returns:
        A dict with the model name, per-fold accuracies and
        `oof_predictions=None` (no probabilities are collected).
    """
    accs = []
    for fold_id, (tr_idx, val_idx) in enumerate(folds):
        print(f"[TabPFN] Fold {fold_id}")
        sampled_idx = np.random.choice(tr_idx, size=min(max_train_samples, len(tr_idx)), replace=False)
        clf = TabPFNClassifier(
            device="cuda" if torch.cuda.is_available() else "cpu",
            N_ensemble_configurations=32,
            # The tabpfn 0.1.x constructor names this `batch_size_inference`;
            # a plain `batch_size` keyword is rejected with a TypeError.
            batch_size_inference=256,
        )
        # tabpfn 0.1.x refuses to fit on more than 1024 rows unless the
        # caller explicitly acknowledges the warning.
        clf.fit(X[sampled_idx], y[sampled_idx], overwrite_warning=True)
        preds = clf.predict(X[val_idx])
        acc = accuracy_score(y[val_idx], preds)
        accs.append({"fold": fold_id, "accuracy": acc})
    return {
        "name": "TabPFN (subset)",
        "fold_metrics": accs,
        "oof_predictions": None,
    }


# Run the TabPFN baseline on every fold and tabulate its per-fold accuracy.
tabpfn_results = evaluate_tabpfn(X, y, folds)
tabpfn_fold_df, tabpfn_summary = collect_fold_metrics(tabpfn_results["name"], tabpfn_results["fold_metrics"])
# Last expression of the cell: the notebook renders the per-fold table.
tabpfn_fold_df

In [None]:
# Comparative summary of all models, best mean accuracy first
result_summaries = [lgb_summary, ft_summary, res_summary, tabpfn_summary]
summary_df = pd.DataFrame(result_summaries).sort_values(by="mean_acc", ascending=False)
summary_df

In [None]:
# Confusion matrix and classification report for the best available model
# Only models that produced out-of-fold predictions are eligible; TabPFN is
# not in the lookup because it returns no OOF probabilities.
model_lookup = {res["name"]: res for res in [lgb_results, ft_results, res_results]}
best_name = None
# summary_df is sorted by mean accuracy, so the first eligible row wins.
for _, row in summary_df.iterrows():
    candidate = model_lookup.get(row["model"])
    if candidate and candidate.get("oof_predictions") is not None:
        best_name = row["model"]
        break
assert best_name is not None, "No hay modelo con predicciones OOF"

best_result = model_lookup[best_name]
# Arg-max over the OOF probability matrix gives one predicted class per row.
y_pred = best_result["oof_predictions"].argmax(axis=1)
acc = accuracy_score(y, y_pred)
print(f"Modelo seleccionado: {best_name} | Acc global OOF: {acc:.4f}")

conf_mat = confusion_matrix(y, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues",
            xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.title(f"Matriz de confusión - {best_name}")
plt.tight_layout()
plt.show()

report = classification_report(y, y_pred, target_names=CLASS_NAMES, output_dict=False)
print(report)

In [None]:
# Export + inference example (using LightGBM)
# Persist the class-name order so predicted indices can be decoded later.
joblib.dump(CLASS_NAMES, ARTIFACT_DIR / "class_names.joblib")
print(f"Clases guardadas en {ARTIFACT_DIR}")

# Paths of the per-fold boosters saved during cross-validation.
lightgbm_models = [str(p) for p in lgb_results["model_paths"]]
print("Modelos LightGBM:", lightgbm_models)

# Preprocessing pipeline applied to raw rows before prediction.
pipeline = joblib.load(PIPELINE_PATH)

def predict_with_lightgbm(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Predict classes for raw rows by averaging the per-fold LightGBM models.

    The rows go through the module-level `pipeline`, every booster listed in
    `lightgbm_models` predicts class probabilities, and the probabilities are
    averaged across boosters.

    Args:
        raw_df: Raw input rows accepted by `pipeline.transform`.

    Returns:
        A copy of `raw_df` with a `RENDIMIENTO_GLOBAL_PRED` column (class
        name) and one `proba_<class>` column per class.

    Raises:
        ValueError: If `lightgbm_models` is empty; averaging over zero models
            would otherwise yield NaN probabilities silently.
    """
    if not lightgbm_models:
        raise ValueError("No hay modelos LightGBM disponibles para la inferencia")

    processed = pipeline.transform(raw_df)
    # A pipeline may return a SciPy sparse matrix; np.asarray cannot turn
    # that into a float array, so densify it first.
    if hasattr(processed, "toarray"):
        processed = processed.toarray()
    processed = np.asarray(processed, dtype=np.float32)

    probas = np.zeros((processed.shape[0], len(CLASS_NAMES)), dtype=np.float32)
    for model_path in lightgbm_models:
        booster = lgb.Booster(model_file=model_path)
        probas += booster.predict(processed)
    probas /= len(lightgbm_models)

    preds = probas.argmax(axis=1)
    result = raw_df.copy()
    result["RENDIMIENTO_GLOBAL_PRED"] = [IDX2CLASS[idx] for idx in preds]
    result[[f"proba_{cls}" for cls in CLASS_NAMES]] = probas
    return result

# Ejemplo de uso (suponiendo raw_df con las columnas originales sin descartar):
# sample_raw = some_raw_df.head(3)
# prediction_df = predict_with_lightgbm(sample_raw)
# display(prediction_df)

## Notas prácticas y bibliografía resumida
- **Boosting vs. deep tabular**: LightGBM/CatBoost siguen dominando con datos medianos-grandes, especialmente cuando las features ya fueron cuidadosamente codificadas. Requiere poca ingeniería de hiperparámetros y ofrece interpretabilidad vía importancia de variables.
- **FT-Transformer / SAINT**: útiles cuando hay fuertes interacciones no lineales entre atributos heterogéneos y se dispone de GPU. Regulariza con dropout y weight decay; usa lotes grandes (512–4096) y AMP para saturar la T4.
- **ResNet tabular**: baseline estable para escenarios donde se quiera una red poco compleja y rápida de ajustar; añadir stochastic depth o mixup ayuda si aparece overfitting.
- **TabPFN**: excelente para prototipar cuando el dataset es pequeño (<50k muestras) o se requiere una predicción rápida sin tuning, pero escala de manera cuadrática por lo que aquí se limita a un muestreo.
- **Regularización**: prioriza early stopping + bagging en boosting; en deep models combina dropout, weight decay y augmentation (CutMix tabular) si la precisión se estanca.
- **Reporte final**: comunica la accuracy global y por clase, la matriz de confusión, las curvas de calibración/ROC por clase (si aplican) y los tiempos de entrenamiento por modelo.

**Bibliografía 2020–2025**
1. Gorishniy et al., *Revisiting Deep Learning Models for Tabular Data* (NeurIPS 2021). https://arxiv.org/abs/2106.11959
2. Gorishniy et al., *On Embeddings for Numerical Features in Tabular Deep Learning* (NeurIPS 2022). https://arxiv.org/abs/2203.05556
3. Somepalli et al., *SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pretraining* (NeurIPS 2021). https://arxiv.org/abs/2106.01342
4. Hollmann et al., *TabPFN: A Transformer that Solves Small Tabular Classification Problems in a Second* (NeurIPS 2022). https://arxiv.org/abs/2207.01848
5. Misra et al., *A Survey on Deep Learning for Tabular Data* (ACM Computing Surveys 2023). https://arxiv.org/abs/2207.07454