# 05 - Stacking avanzado (XGBoost + LightGBM + GradientBoosting)
Construir un stacking seguro en memoria que entrene XGBoost, LightGBM y GradientBoosting
y genere predicciones OOF (out-of-fold) para un meta-modelo (LogisticRegression).

In [None]:
# === Kaggle configuration ===
import os
# Point the Kaggle CLI at the current directory for its kaggle.json credentials.
os.environ['KAGGLE_CONFIG_DIR'] = '.'
# Restrict the credentials file to owner read/write; `|| true` keeps the cell
# running even if the command fails.
!chmod 600 ./kaggle.json || true

# Download the competition archive into ./data, extract it silently
# (overwriting existing files) and list the resulting directory.
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia -p ./data || true
!unzip -o ./data/udea*.zip -d ./data > /dev/null 2>&1 || true
!ls -la ./data || true


Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to ./data
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.40GB/s]
total 233348
drwxr-xr-x 2 root root      4096 Nov 18 04:34 .
drwxr-xr-x 1 root root      4096 Nov 18 04:34 ..
-rw-r--r-- 1 root root   4716673 Sep 16 01:46 submission_example.csv
-rw-r--r-- 1 root root  59185238 Sep 16 01:46 test.csv
-rw-r--r-- 1 root root 143732437 Sep 16 01:46 train.csv
-rw-r--r-- 1 root root  31301114 Sep 16 01:46 udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip


In [None]:
# === Load data ===
import pandas as pd
# Read the train and test CSV files from ./data into DataFrames.
train = pd.read_csv("./data/train.csv")
test  = pd.read_csv("./data/test.csv")
print("train:", train.shape, "test:", test.shape)
# Last expression of the cell: displays the first rows of train as a preview.
train.head()


train: (692500, 21) test: (296786, 20)


Unnamed: 0,ID,PERIODO_ACADEMICO,E_PRGM_ACADEMICO,E_PRGM_DEPARTAMENTO,E_VALORMATRICULAUNIVERSIDAD,E_HORASSEMANATRABAJA,F_ESTRATOVIVIENDA,F_TIENEINTERNET,F_EDUCACIONPADRE,F_TIENELAVADORA,...,E_PRIVADO_LIBERTAD,E_PAGOMATRICULAPROPIO,F_TIENECOMPUTADOR,F_TIENEINTERNET.1,F_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,...,N,No,Si,No,Técnica o tecnológica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOTÁ,Entre 2.5 millones y menos de 4 millones,Más de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294


In [None]:

# === Preprocesado ligero y consistente ===
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import numpy as np

target_col = "RENDIMIENTO_GLOBAL"

# Separate and encode the target.
# Missing labels are filled BEFORE the cast to str: casting first turns NaN
# into the literal string "nan", so a later fillna("missing") never matches.
y = train[target_col].fillna("missing").astype(str)
le = LabelEncoder()
y_enc = le.fit_transform(y)  # integer codes; le.classes_ keeps the label names

# Feature frames: train without the target, and a copy of test.
X = train.drop(columns=[target_col]).copy()
X_test = test.copy()

# Detect numeric and categorical columns by dtype.
# NOTE(review): every int64/float64 column becomes a feature, including any
# numeric identifier column (e.g. an ID) - confirm this is intended.
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols))
print("Ejemplos cat:", cat_cols[:10])

num_cols: 6 cat_cols: 14
Ejemplos cat: ['E_PRGM_ACADEMICO', 'E_PRGM_DEPARTAMENTO', 'E_VALORMATRICULAUNIVERSIDAD', 'E_HORASSEMANATRABAJA', 'F_ESTRATOVIVIENDA', 'F_TIENEINTERNET', 'F_EDUCACIONPADRE', 'F_TIENELAVADORA', 'F_TIENEAUTOMOVIL', 'E_PRIVADO_LIBERTAD']


In [None]:

# Preprocesado para modelos de árbol: imputación numérica + OrdinalEncoder para categóricas
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# Numeric branch: fill missing values with the column median.
median_imputer = SimpleImputer(strategy='median')
num_transform = Pipeline(steps=[('imputer_num', median_imputer)])

# Categorical branch: fill missing values with the constant 'missing', then
# map each category to an integer code (categories unseen at fit time -> -1).
constant_imputer = SimpleImputer(strategy='constant', fill_value='missing')
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
cat_transform = Pipeline(steps=[
    ('imputer_cat', constant_imputer),
    ('ord', ordinal_encoder),
])

# Route each column group to its branch; columns in neither list are dropped.
preprocessor_tree = ColumnTransformer(
    transformers=[
        ('num', num_transform, num_cols),
        ('cat', cat_transform, cat_cols),
    ],
    remainder='drop',
)

# Fit the transformers on train and test stacked together so the encoder
# sees the categories of both sets (avoids category mismatches).
feature_cols = num_cols + cat_cols
X_all = pd.concat(
    [frame[feature_cols] for frame in (X, X_test)],
    axis=0,
    ignore_index=True,
)
preprocessor_tree.fit(X_all)
print('Preprocessor ajustado (train+test)')

Preprocessor ajustado (train+test)


In [None]:

# Apply the fitted preprocessor to train and test, producing the numeric
# matrices consumed by the tree-based models.
model_columns = num_cols + cat_cols
X_proc = preprocessor_tree.transform(X.loc[:, model_columns])
X_test_proc = preprocessor_tree.transform(X_test.loc[:, model_columns])

print("X_proc shape:", X_proc.shape, "X_test_proc shape:", X_test_proc.shape)


X_proc shape: (692500, 20) X_test_proc shape: (296786, 20)


In [None]:

# === Modelos y función OOF (out-of-fold) ===
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Intentar importar xgboost y lightgbm
try:
    import xgboost as xgb
except Exception:
    !pip install xgboost -q
    import xgboost as xgb

try:
    import lightgbm as lgb
except Exception:
    !pip install lightgbm -q
    import lightgbm as lgb

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def get_oof_preds(clf, X, y, X_test, n_splits=5, random_state=42):
    """Generate out-of-fold (OOF) class probabilities for stacking.

    The data is split with a shuffled StratifiedKFold. On every fold ``clf``
    is fitted on the training part and used to predict probabilities for the
    held-out part and for the whole test set.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``. It is refitted
            in place on every fold, so after the call it holds the model
            trained on the last fold.
        X: Training features with a ``shape`` attribute; either positionally
            indexable by an integer array or a pandas object with ``.iloc``.
        y: Training labels (array-like); converted to a NumPy array so that
            fold indices are always applied positionally.
        X_test: Test features, passed unchanged to ``predict_proba``.
        n_splits: Number of folds.
        random_state: Seed for the fold shuffling.

    Returns:
        Tuple ``(oof_train, oof_test)``: ``oof_train`` has shape
        (n_samples, n_classes) and holds, for each row, the probabilities
        predicted by the model that did not train on it; ``oof_test`` has
        shape (n_test, n_classes) and is the average of the per-fold test
        probabilities.
    """
    def _rows(data, idx):
        # pandas objects need positional .iloc; plain [] would look up labels/columns.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    # A pandas Series would index by label; an ndarray indexes by position.
    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_classes = len(np.unique(y))
    oof_train = np.zeros((X.shape[0], n_classes))
    oof_test = np.zeros((X_test.shape[0], n_classes))
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"Fold {fold}/{n_splits}")
        clf.fit(_rows(X, tr_idx), y[tr_idx])
        # Each training row is filled exactly once, by the fold that held it out.
        oof_train[val_idx] = clf.predict_proba(_rows(X, val_idx))
        # Accumulate the running mean of the test predictions across folds.
        oof_test += clf.predict_proba(X_test) / n_splits
    return oof_train, oof_test


In [None]:

# === Define base models ===
# Number of target classes, passed to the two boosters that take `num_class`.
n_target_classes = len(le.classes_)

# scikit-learn gradient boosting.
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "random_state": 42,
}
gb_clf = GradientBoostingClassifier(**gb_params)

# XGBoost: multiclass soft-probability objective with row/column subsampling.
xgb_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "use_label_encoder": False,
    "num_class": n_target_classes,
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "verbosity": 0,
}
xgb_clf = XGBClassifier(**xgb_params)

# LightGBM: multiclass objective.
lgb_params = {
    "objective": "multiclass",
    "num_class": n_target_classes,
    "n_estimators": 500,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "random_state": 42,
}
lgb_clf = LGBMClassifier(**lgb_params)

print("Modelos base definidos")


Modelos base definidos


In [None]:

# === Generate OOF predictions for each base model ===
X_for_models, X_test_for_models, y_array = X_proc, X_test_proc, y_enc


def _oof_for(model):
    """Run the shared 5-fold OOF routine for one base model."""
    return get_oof_preds(model, X_for_models, y_array, X_test_for_models, n_splits=5)


# One statement per model, so results already computed stay assigned
# if a later model fails.
oof_gb, test_gb = _oof_for(gb_clf)
oof_xgb, test_xgb = _oof_for(xgb_clf)
oof_lgb, test_lgb = _oof_for(lgb_clf)

print("OOF shapes:", *(preds.shape for preds in (oof_gb, oof_xgb, oof_lgb)))

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
Fold 1/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1489
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 20
[LightGBM] [Info] Start training from score -1.371986
[LightGBM] [Info] Start training from score -1.387096
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.391216




Fold 2/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1496
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 20
[LightGBM] [Info] Start training from score -1.371993
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.391216




Fold 3/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093970 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1493
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 20
[LightGBM] [Info] Start training from score -1.371993
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.391216




Fold 4/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062853 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1493
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 20
[LightGBM] [Info] Start training from score -1.371993
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.391216




Fold 5/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1494
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 20
[LightGBM] [Info] Start training from score -1.371993
[LightGBM] [Info] Start training from score -1.387096
[LightGBM] [Info] Start training from score -1.395026
[LightGBM] [Info] Start training from score -1.391216




OOF shapes: (692500, 4) (692500, 4) (692500, 4)


In [None]:
# === Build the dataset for the meta-model ===
# Place the per-model probability blocks side by side (column-wise), in the
# same model order for train and test.
train_blocks = (oof_gb, oof_xgb, oof_lgb)
test_blocks = (test_gb, test_xgb, test_lgb)
X_meta = np.concatenate(train_blocks, axis=1)      # shape: (n_train, n_classes * 3)
X_meta_test = np.concatenate(test_blocks, axis=1)  # shape: (n_test, n_classes * 3)

print("X_meta shape:", X_meta.shape, "X_meta_test shape:", X_meta_test.shape)

X_meta shape: (692500, 12) X_meta_test shape: (296786, 12)


In [None]:
# === Train the meta-model (multinomial logistic regression) ===
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# `multi_class` is not passed: with the default lbfgs solver scikit-learn
# already fits a multinomial model for multiclass targets, and the explicit
# argument is deprecated in recent releases.
meta_clf = LogisticRegression(max_iter=1000)

# Estimate accuracy from cross-validated predictions. Predicting on the same
# rows the meta-model was fitted on would report an in-sample score.
meta_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds_meta = cross_val_predict(meta_clf, X_meta, y_array, cv=meta_cv)
acc_meta = accuracy_score(y_array, oof_preds_meta)
print("Meta-model OOF accuracy (aprox):", acc_meta)

# Final fit on all meta-features; this fitted model is the one used for the
# test predictions (cross_val_predict works on clones and leaves it unfitted).
meta_clf.fit(X_meta, y_array)



Meta-model OOF accuracy (aprox): 0.43617039711191335


In [None]:
# === Final prediction and submission file ===
# Predict encoded classes with the meta-model and map them back to the
# original label names.
test_meta_preds = meta_clf.predict(X_meta_test)
test_labels = le.inverse_transform(test_meta_preds)

# Start from the test IDs and attach the predicted label column.
submission = test[["ID"]].copy()
submission["RENDIMIENTO_GLOBAL"] = test_labels

submission.to_csv("./data/submission_ensemble_stacking.csv", index=False)
# Last expression of the cell: preview of the submission.
submission.head()

Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,medio-alto
1,98545,medio-bajo
2,499179,alto
3,782980,bajo
4,785185,bajo


In [None]:
# === Submit to Kaggle ===
# NOTE(review): the printed message reads like an instruction to run the command
# manually, but the next line performs the submission as soon as this cell runs.
print("Para enviar a Kaggle ejecuta:")
# Upload the stacking submission file to the competition with a short message.
!kaggle competitions submit -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia -f data/submission_ensemble_stacking.csv -m "Stacking LGBM+XGB+GB"

Para enviar a Kaggle ejecuta:
100% 4.13M/4.13M [00:00<00:00, 5.67MB/s]
Successfully submitted to UDEA/ai4eng 20252 - Pruebas Saber Pro Colombia