# Entrenamiento con CatBoost

Este notebook implementa un pipeline completo para entrenar CatBoost y comparar sus resultados sobre datos tabulares ya preprocesados. Incluye búsqueda de hiperparámetros con Optuna, validación cruzada estratificada (K=5), métricas detalladas por fold y artefactos exportables.


In [2]:
# Dependencias necesarias (ejecutar una vez por sesión de Colab T4)
!pip install -q catboost optuna seaborn matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import gc
import json
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold

# Only show Optuna messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed Python's `random` module and NumPy's global RNG at import time.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(42)

In [4]:
# Global configuration and relevant paths
DATA_PATH = Path("processed_train.parquet")
TARGET_COL = "RENDIMIENTO_GLOBAL"

# Label vocabulary: the position in CLASS_NAMES is the integer code of the class.
CLASS_NAMES = ["alto", "medio-alto", "medio-bajo", "bajo"]
IDX2CLASS = dict(enumerate(CLASS_NAMES))
CLASS2IDX = {name: code for code, name in IDX2CLASS.items()}

# Cross-validation / reproducibility settings.
N_SPLITS = 5
RANDOM_STATE = 42
# NOTE(review): N_JOBS is not referenced by any cell visible in this notebook.
N_JOBS = 2

# Output folder for trained models; created up front if it does not exist.
ARTIFACT_DIR = Path("./artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Artifacts: {ARTIFACT_DIR.resolve()}")

Artifacts: /content/artifacts


In [5]:
# Utilidades compartidas

def set_seed(seed: int = RANDOM_STATE) -> None:
    """Seed Python's `random` module and NumPy's global RNG with `seed`.

    Only these two generators are touched; nothing else is seeded here.

    Args:
        seed: Value passed to both seeding functions. Defaults to RANDOM_STATE.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


def collect_fold_metrics(name: str, fold_scores: list[dict]) -> tuple[pd.DataFrame, dict]:
    """Tabulate per-fold scores and summarize their accuracy.

    Args:
        name: Model label stored under the ``"model"`` key of the summary.
        fold_scores: One dict per fold; each must contain an ``"accuracy"`` entry.

    Returns:
        A ``(df, summary)`` tuple: ``df`` is a DataFrame with one row per fold,
        and ``summary`` holds the model name plus mean, population standard
        deviation (``ddof=0``), minimum and maximum accuracy as plain floats.

    Raises:
        KeyError: If the fold records have no ``"accuracy"`` column
            (including the case of an empty ``fold_scores``).
    """
    df = pd.DataFrame(fold_scores)
    acc = df["accuracy"]
    # Cast every statistic to a built-in float so the summary is uniform
    # (the reductions otherwise return a mix of NumPy scalars and floats).
    summary = {
        "model": name,
        "mean_acc": float(acc.mean()),
        "std_acc": float(acc.std(ddof=0)),
        "min_acc": float(acc.min()),
        "max_acc": float(acc.max()),
    }
    return df, summary


def describe_class_balance(labels: np.ndarray):
    """Display how many samples each class has.

    Counts the occurrences of every value in `labels`, orders them by label
    value, relabels the index through IDX2CLASS, and shows the result with
    the notebook's `display`.

    Args:
        labels: Array of class labels to count.
    """
    per_class = pd.Series(labels).value_counts().sort_index()
    per_class = per_class.rename(index=IDX2CLASS)
    display(per_class)

In [6]:
# Load the preprocessed data and build the feature matrix / label vector
assert DATA_PATH.exists(), f"No se encontró {DATA_PATH}"

df = pd.read_parquet(DATA_PATH)
approx_mb = df.memory_usage().sum() / 1e6
print(f"Shape: {df.shape} | Memoria ~{approx_mb:.1f} MB")

# Labels are mapped through CLASS2IDX to integer codes; every other column is a feature.
y = df[TARGET_COL].map(CLASS2IDX).to_numpy(dtype=np.int64)
X = df.drop(columns=[TARGET_COL]).to_numpy(dtype=np.float32)

describe_class_balance(y)
print(f"Feature dims: {X.shape[1]}")

# The DataFrame is no longer needed once X and y exist; drop it and collect garbage.
del df
_ = gc.collect()

Shape: (692500, 28) | Memoria ~155.1 MB


Unnamed: 0,count
alto,175619
medio-alto,171619
medio-bajo,172275
bajo,172987


Feature dims: 27


In [7]:
# Build the stratified folds once (N_SPLITS, shuffled with a fixed seed) and
# materialize them as a list so later cells can index and reuse the same splits.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
folds = [(train_idx, valid_idx) for train_idx, valid_idx in skf.split(X, y)]
print(f"Folds preparados: {len(folds)}")

Folds preparados: 5


In [10]:
# --- CatBoost + Optuna ------------------------------------------------------

def tune_catboost(X, y, folds, n_trials: int = 20, n_tuning_folds: int = 2):
    """Search CatBoost hyperparameters with Optuna, maximizing validation accuracy.

    Each trial trains one GPU CatBoost model per tuning fold and returns the
    mean accuracy over those folds. The validation part of a fold is used both
    as the early-stopping `eval_set` and to compute the reported accuracy.

    Args:
        X: Feature matrix, indexed by the row indices stored in `folds`.
        y: Label vector aligned with `X`.
        folds: Sequence of `(train_idx, val_idx)` pairs.
        n_trials: Number of Optuna trials to run.
        n_tuning_folds: How many leading folds to evaluate per trial. The
            default of 2 keeps the search fast.

    Returns:
        `(study, best_params)`: the finished Optuna study and its best
        parameter dict.

    Raises:
        ValueError: If `n_tuning_folds` is below 1 or `folds` is empty.
    """
    if n_tuning_folds < 1:
        raise ValueError(f"n_tuning_folds must be >= 1, got {n_tuning_folds}")
    tuning_folds = folds[:n_tuning_folds]
    if len(tuning_folds) == 0:
        # Without folds the objective would be the mean of an empty list (NaN).
        raise ValueError("folds is empty; nothing to tune on")

    set_seed(RANDOM_STATE)

    def objective(trial: optuna.Trial) -> float:
        # Keys are CatBoostClassifier keyword names so the dict can be unpacked directly.
        params = {
            "iterations": trial.suggest_int("iterations", 500, 2500, step=250),
            "depth": trial.suggest_int("depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
            "border_count": trial.suggest_int("border_count", 32, 255),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 5.0),
            "leaf_estimation_iterations": trial.suggest_int("leaf_estimation_iterations", 1, 10),
            "random_strength": trial.suggest_float("random_strength", 0.5, 5.0),
        }
        fold_accs = []
        for tr_idx, val_idx in tuning_folds:
            train_pool = Pool(X[tr_idx], y[tr_idx])
            val_pool = Pool(X[val_idx], y[val_idx])
            model = CatBoostClassifier(
                loss_function="MultiClass",
                eval_metric="Accuracy",
                task_type="GPU",
                devices="0",
                auto_class_weights="Balanced",
                random_seed=RANDOM_STATE,
                verbose=False,
                **params,
            )
            model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=150, verbose=False)
            # Flatten in case predict returns a column vector rather than a 1-D array.
            preds = np.asarray(model.predict(val_pool)).ravel()
            fold_accs.append(accuracy_score(y[val_idx], preds))
        return float(np.mean(fold_accs))

    # Seed the sampler explicitly: it keeps its own RNG, so seeding `random` and
    # NumPy through set_seed does not make the sequence of trials repeatable.
    sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
    study = optuna.create_study(
        direction="maximize",
        study_name="catboost_study",
        sampler=sampler,
    )
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)
    best_params = study.best_params
    return study, best_params


def train_catboost_cv(best_params: dict, X: np.ndarray, y: np.ndarray, folds):
    """Train one GPU CatBoost model per fold and gather out-of-fold results.

    For every `(train_idx, val_idx)` pair a model is fitted with `best_params`,
    using the validation part as the early-stopping `eval_set`. The same
    validation part is then scored, its class probabilities are written into
    the out-of-fold matrix, and the model is saved under ARTIFACT_DIR.

    Args:
        best_params: Extra keyword arguments forwarded to CatBoostClassifier.
        X: Feature matrix, indexed by the row indices stored in `folds`.
        y: Label vector aligned with `X`.
        folds: Iterable of `(train_idx, val_idx)` pairs.

    Returns:
        A dict with keys ``"name"``, ``"best_params"``, ``"fold_metrics"``
        (list of per-fold dicts with fold id, accuracy and best iteration),
        ``"oof_predictions"`` (float32 array of shape
        ``(len(y), len(CLASS_NAMES))``) and ``"model_paths"``.
    """
    set_seed(RANDOM_STATE)
    n_classes = len(CLASS_NAMES)
    oof_pred = np.zeros((len(y), n_classes), dtype=np.float32)
    fold_scores, model_paths = [], []

    for fold_no, (train_rows, valid_rows) in enumerate(folds):
        print(f"[CatBoost] Fold {fold_no}")
        fit_data = Pool(X[train_rows], y[train_rows])
        holdout = Pool(X[valid_rows], y[valid_rows])

        # Each fold gets its own seed, offset from the global one by the fold number.
        booster = CatBoostClassifier(
            loss_function="MultiClass",
            eval_metric="Accuracy",
            task_type="GPU",
            devices="0",
            auto_class_weights="Balanced",
            random_seed=RANDOM_STATE + fold_no,
            verbose=200,
            **best_params,
        )
        booster.fit(fit_data, eval_set=holdout, early_stopping_rounds=200, verbose=200)

        # Store the held-out probabilities; the predicted class is the arg-max column.
        holdout_proba = booster.predict_proba(holdout)
        oof_pred[valid_rows] = holdout_proba
        holdout_labels = holdout_proba.argmax(axis=1)
        fold_scores.append(
            dict(
                fold=fold_no,
                accuracy=accuracy_score(y[valid_rows], holdout_labels),
                best_iteration=booster.best_iteration_,
            )
        )

        # Persist the fold model so it can be reloaded later.
        saved_to = ARTIFACT_DIR / f"catboost_fold{fold_no}.cbm"
        booster.save_model(saved_to)
        model_paths.append(saved_to)

    return dict(
        name="CatBoost",
        best_params=best_params,
        fold_metrics=fold_scores,
        oof_predictions=oof_pred,
        model_paths=model_paths,
    )


In [11]:
# Hyperparameter search, then cross-validated training with the best parameters
N_TRIALS_CAT = 20
cat_study, cat_best_params = tune_catboost(X, y, folds, n_trials=N_TRIALS_CAT)
best_params_json = json.dumps(cat_best_params, indent=2)
print(f"CatBoost best params: {best_params_json}")

cat_results = train_catboost_cv(cat_best_params, X, y, folds)
cat_fold_df, cat_summary = collect_fold_metrics(
    name=cat_results["name"],
    fold_scores=cat_results["fold_metrics"],
)
# Last expression: show the per-fold metrics table as the cell output.
cat_fold_df

  0%|          | 0/20 [00:00<?, ?it/s]

CatBoost best params: {
  "iterations": 2250,
  "depth": 8,
  "learning_rate": 0.019207257946801497,
  "l2_leaf_reg": 0.06499697508902857,
  "border_count": 198,
  "bagging_temperature": 0.24068201778271264,
  "leaf_estimation_iterations": 3,
  "random_strength": 2.4522629037447214
}
[CatBoost] Fold 0
0:	learn: 0.3891597	test: 0.3875984	best: 0.3875984 (0)	total: 49.3ms	remaining: 1m 50s
200:	learn: 0.4195663	test: 0.4182839	best: 0.4182839 (200)	total: 6.45s	remaining: 1m 5s
400:	learn: 0.4261565	test: 0.4237854	best: 0.4237854 (400)	total: 10.9s	remaining: 50.4s
600:	learn: 0.4304575	test: 0.4264408	best: 0.4264985 (599)	total: 16.4s	remaining: 45.1s
800:	learn: 0.4345708	test: 0.4286487	best: 0.4287216 (799)	total: 20.9s	remaining: 37.7s
1000:	learn: 0.4390961	test: 0.4304008	best: 0.4304370 (999)	total: 28.5s	remaining: 35.5s
1200:	learn: 0.4430537	test: 0.4317725	best: 0.4318001 (1179)	total: 33.1s	remaining: 28.9s
1400:	learn: 0.4467680	test: 0.4323748	best: 0.4324335 (1396)	tota

Unnamed: 0,fold,accuracy,best_iteration
0,0,0.434751,1846
1,1,0.432996,2138
2,2,0.432397,2249
3,3,0.433264,2100
4,4,0.432332,1733


In [12]:
# Aggregate accuracy statistics (mean / std / min / max) over the CV folds
cat_summary

{'model': 'CatBoost',
 'mean_acc': np.float64(0.4331480144404332),
 'std_acc': 0.00087579769095689,
 'min_acc': 0.4323321299638989,
 'max_acc': 0.4347509025270758}

In [None]:
# Reload one of the per-fold models written by train_catboost_cv.
# Training saves them as ARTIFACT_DIR / "catboost_fold{k}.cbm"; no cell in
# this notebook writes a file named "catboost_model.cbm".
from catboost import CatBoostClassifier

LOAD_FOLD = 0  # which fold's model to reload
model_path = ARTIFACT_DIR / f"catboost_fold{LOAD_FOLD}.cbm"
assert model_path.exists(), f"No se encontró {model_path}"

# Create an empty instance and populate it from the .cbm file
loaded_model = CatBoostClassifier()
loaded_model.load_model(str(model_path))

## Notas prácticas y bibliografía resumida
- **Boosting vs. deep tabular**: LightGBM/CatBoost siguen dominando en conjuntos de datos medianos y grandes, especialmente cuando las features ya fueron cuidadosamente codificadas. Requieren poca ingeniería de hiperparámetros y ofrecen interpretabilidad vía importancia de variables.

## **Bibliografía**
1. Gorishniy et al., *Revisiting Deep Learning Models for Tabular Data* (NeurIPS 2021). https://arxiv.org/abs/2106.11959
2. Gorishniy et al., *FT-Transformer: Fast and Accurate Modeling of Tabular Data* (ICML 2021 Workshop). https://arxiv.org/abs/2106.01126
3. Somepalli et al., *SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pretraining* (NeurIPS 2021). https://arxiv.org/abs/2106.01342
4. Hollmann et al., *TabPFN: A Transformer that Solves Small Tabular Classification Problems in a Second* (NeurIPS 2022). https://arxiv.org/abs/2207.01848
5. Misra et al., *A Survey on Deep Learning for Tabular Data* (ACM Computing Surveys 2023). https://arxiv.org/abs/2207.07454