# 04 - modelado avanzado (GradientBoosting + XGBoost + LightGBM + Ensamble)
Preprocesado ligero (imputación + codificación one-hot) y tres modelos de boosting, combinados en un ensamble que promedia sus probabilidades.

In [None]:

# === Kaggle configuration ===
# Tell the Kaggle CLI to look for its credentials file (kaggle.json) in the
# current working directory.
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'
# Restrict kaggle.json to owner read/write. `|| true` keeps the cell going
# even if chmod fails (e.g. the file is not there).
!chmod 600 ./kaggle.json || true

# === Download the data from Kaggle ===
# Every step below is best-effort (`|| true`): a failed download or unzip does
# NOT stop the cell, so use the directory listing at the end to confirm that
# train.csv and test.csv are actually present before continuing.
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia -p ./data || true
# -o overwrites files from a previous run; unzip's own output is discarded.
!unzip -o ./data/udea*.zip -d ./data > /dev/null 2>&1 || true
!ls -la ./data || true


Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to ./data
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.69GB/s]
total 233348
drwxr-xr-x 2 root root      4096 Nov 18 03:11 .
drwxr-xr-x 1 root root      4096 Nov 18 03:11 ..
-rw-r--r-- 1 root root   4716673 Sep 16 01:46 submission_example.csv
-rw-r--r-- 1 root root  59185238 Sep 16 01:46 test.csv
-rw-r--r-- 1 root root 143732437 Sep 16 01:46 train.csv
-rw-r--r-- 1 root root  31301114 Sep 16 01:46 udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip


In [None]:

import pandas as pd

# Read both competition splits from the local data directory in one pass.
train, test = map(pd.read_csv, ("./data/train.csv", "./data/test.csv"))

# Show (rows, columns) for each split, then preview the first training rows.
print(train.shape, test.shape)
train.head()


(692500, 21) (296786, 20)


Unnamed: 0,ID,PERIODO_ACADEMICO,E_PRGM_ACADEMICO,E_PRGM_DEPARTAMENTO,E_VALORMATRICULAUNIVERSIDAD,E_HORASSEMANATRABAJA,F_ESTRATOVIVIENDA,F_TIENEINTERNET,F_EDUCACIONPADRE,F_TIENELAVADORA,...,E_PRIVADO_LIBERTAD,E_PAGOMATRICULAPROPIO,F_TIENECOMPUTADOR,F_TIENEINTERNET.1,F_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,...,N,No,Si,No,Técnica o tecnológica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOTÁ,Entre 2.5 millones y menos de 4 millones,Más de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294


In [None]:

# === Preprocessing ===
# Builds the feature matrices, the shared (unfitted) preprocessing recipe and
# a stratified train/validation split.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

target = "RENDIMIENTO_GLOBAL"
id_col = "ID"
y = train[target]

# "ID" is a row identifier, not a student attribute. Leaving it in X would feed
# it to the models as a numeric feature, so it is excluded from both matrices.
# The submission cell reads the identifiers from `test["ID"]`, not from X_test.
X = train.drop(columns=[target, id_col])
X_test = test.drop(columns=[id_col])

# Fail early if the test split lacks a column the models will be trained on.
missing_in_test = [col for col in X.columns if col not in X_test.columns]
if missing_in_test:
    raise ValueError(f"Columns missing from the test set: {missing_in_test}")

# Every numeric dtype is treated as numeric and everything else as categorical.
# A fixed dtype whitelist (int64/float64/object) would let any other dtype fall
# through the ColumnTransformer's default remainder='drop' without notice.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

# Numeric columns: fill gaps with the median.
numeric = Pipeline([("imputer", SimpleImputer(strategy="median"))])
# Categorical columns: fill gaps with the mode, then one-hot encode; categories
# unseen during fit are encoded as all zeros instead of raising.
categorical = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", numeric, num_cols),
    ("cat", categorical, cat_cols)
])

# 80/20 hold-out split, stratified on the target so class proportions match.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:

# === Model 1: GradientBoostingClassifier ===
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Pipeline does not copy its steps, so passing `preprocess` directly would make
# every pipeline built from it share ONE fitted transformer: refitting any of
# them would silently change the encoding used by the others. clone() gives
# this pipeline its own unfitted copy with the same settings.
gb = Pipeline([
    ("prep", clone(preprocess)),
    # Fixed seed so a Restart & Run All reproduces the same model.
    ("clf", GradientBoostingClassifier(random_state=42))
])

# Fit on the training split (original string labels) and score on the hold-out.
gb.fit(X_train, y_train)
pred_gb = gb.predict(X_val)
acc_gb = accuracy_score(y_val, pred_gb)
print("GradientBoosting Accuracy:", acc_gb)


GradientBoosting Accuracy: 0.4111913357400722


In [None]:

# === Modelo 2: XGBoost ===
!pip install xgboost -q || true
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

xgb = Pipeline([
    ("prep", preprocess),
    ("clf", XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=len(le.classes_),
        max_depth=6,
        learning_rate=0.1,
        n_estimators=300,
        subsample=0.8,
        colsample_bytree=0.8
    ))
])

xgb.fit(X_train, y_train_enc)
pred_xgb = xgb.predict(X_val)
pred_xgb_labels = le.inverse_transform(pred_xgb)

acc_xgb = accuracy_score(y_val, pred_xgb_labels)
print("XGBoost Accuracy:", acc_xgb)


XGBoost Accuracy: 0.43


In [None]:

# === Modelo 3: LightGBM ===
!pip install lightgbm -q || true
import lightgbm as lgb

lgbm = Pipeline([
    ("prep", preprocess),
    ("clf", lgb.LGBMClassifier(
        objective='multiclass',
        num_class=len(y.unique()),
        learning_rate=0.1,
        n_estimators=400,
        max_depth=-1
    ))
])

lgbm.fit(X_train, y_train)
pred_lgbm = lgbm.predict(X_val)
acc_lgbm = accuracy_score(y_val, pred_lgbm)
print("LightGBM Accuracy:", acc_lgbm)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2861
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 855
[LightGBM] [Info] Start training from score -1.371993
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.391216




LightGBM Accuracy: 0.43579783393501803


In [None]:

# === Simple ensemble ===
# Soft voting: average the class probabilities of the three models and pick
# the most probable class for each validation row.
import numpy as np

# The averaging below is positional: column k must mean the same class in all
# three matrices. GB and LightGBM expose their own class order; XGBoost was
# trained on integers produced by `le`, so its column k corresponds to
# le.classes_[k]. Verify the three orders agree before mixing them. This also
# catches running this cell after the models were refitted on other labels.
classes = gb.named_steps["clf"].classes_
class_orders = {
    "XGBoost (LabelEncoder)": le.classes_,
    "LightGBM": lgbm.named_steps["clf"].classes_,
}
for model_name, other_classes in class_orders.items():
    if list(classes) != list(other_classes):
        raise ValueError(
            f"Class order mismatch between GradientBoosting {list(classes)} "
            f"and {model_name} {list(other_classes)}; probabilities cannot be averaged."
        )

# Use predict_proba so probabilities (not hard labels) are averaged.
proba_gb   = gb.predict_proba(X_val)
proba_xgb  = xgb.predict_proba(X_val)
proba_lgbm = lgbm.predict_proba(X_val)

avg_proba = (proba_gb + proba_xgb + proba_lgbm) / 3
pred_ensamble = avg_proba.argmax(axis=1)
# Map the winning column index back to its class name.
pred_ens_labels = classes[pred_ensamble]

acc_ens = accuracy_score(y_val, pred_ens_labels)
print("Accuracy Ensamble:", acc_ens)




Accuracy Ensamble: 0.4324476534296029


In [None]:

# === Train the final models on the full training set ===
from sklearn.base import clone

# All three final models are trained on the same integer encoding, so their
# probability columns line up and can be decoded with `le` at the end.
y_enc = le.transform(y)

# Fit fresh clones instead of refitting gb/xgb/lgbm in place. Refitting would
# overwrite the validated models with ones trained on integer labels, so
# re-running the ensemble evaluation cell afterwards would compare integer
# classes against string labels. clone() copies the whole pipeline (including
# its preprocessing step) unfitted, leaving the validated models untouched.
gb_final   = clone(gb).fit(X, y_enc)
xgb_final  = clone(xgb).fit(X, y_enc)
lgbm_final = clone(lgbm).fit(X, y_enc)

p_gb   = gb_final.predict_proba(X_test)
p_xgb  = xgb_final.predict_proba(X_test)
p_lgbm = lgbm_final.predict_proba(X_test)

# Soft voting on the test set.
avg = (p_gb + p_xgb + p_lgbm) / 3
if avg.shape[1] != len(le.classes_):
    raise ValueError(
        f"Expected {len(le.classes_)} probability columns, got {avg.shape[1]}"
    )
pred_final = avg.argmax(axis=1)
# Column index -> original class name.
labels_final = le.inverse_transform(pred_final)

# Identifiers come straight from the raw test frame, in the same row order as
# the predictions.
submission = pd.DataFrame({
    "ID": test["ID"],
    "RENDIMIENTO_GLOBAL": labels_final
})
submission.to_csv("./data/submission_ensemble.csv", index=False)
submission.head()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2949
[LightGBM] [Info] Number of data points in the train set: 692500, number of used features: 898
[LightGBM] [Info] Start training from score -1.371991
[LightGBM] [Info] Start training from score -1.387092
[LightGBM] [Info] Start training from score -1.395031
[LightGBM] [Info] Start training from score -1.391216




Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,bajo
1,98545,medio-alto
2,499179,alto
3,782980,bajo
4,785185,bajo


In [None]:

# Upload data/submission_ensemble.csv to the competition through the Kaggle CLI.
# The -m text is the description attached to the submission.
print("Enviar a Kaggle:")
!kaggle competitions submit -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia -f data/submission_ensemble.csv -m "Ensamble GB + XGB + LGBM"


Enviar a Kaggle:
100% 4.04M/4.04M [00:00<00:00, 8.82MB/s]
Successfully submitted to UDEA/ai4eng 20252 - Pruebas Saber Pro Colombia