In [1]:
# ================================================================================
# Descarga de datos desde Kaggle
# ================================================================================

import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'
!chmod 600 ./kaggle.json
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
!unzip udea*.zip > /dev/null

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.37GB/s]


In [2]:
# ================================================================================
#     NOVA — MODELO ULTRA OPTIMIZADO v4 (LightGBM + XGBoost + CatBoost)
# ================================================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'catboost'

In [None]:
# ================================================================================
# 1. Load data
# ================================================================================

train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Name of the label column to predict.
target = "RENDIMIENTO_GLOBAL"

# Drop the duplicated columns named "<name>.1". Match the suffix only, so a column
# that merely contains ".1" somewhere in its name (e.g. "X.10") is not dropped.
dups = [c for c in train.columns if c.endswith(".1")]
train = train.drop(columns=dups, errors="ignore")
test  = test.drop(columns=dups, errors="ignore")

# Work on a copy so `train` keeps the loaded values (minus the duplicate columns).
df = train.copy()

In [None]:
# ================================================================================
# 2. Preprocessing: detect categorical columns and fill missing values
# ================================================================================

# Text columns are the categorical features. Columns that this cell has already
# converted to "category" are matched too, so re-running the cell produces the same
# list instead of an empty one.
categorical_cols = [
    col for col in df.columns
    if col != target and str(df[col].dtype) in ("object", "category")
]

# Categorical NaN -> explicit "MISSING" level (original note: CatBoost does not
# accept NaN in categorical features).
for col in categorical_cols:
    df[col] = df[col].astype("object").fillna("MISSING")
    if col in test.columns:
        test[col] = test[col].astype("object").fillna("MISSING")

# NaN in the remaining (non-categorical) feature columns -> sentinel value -9999.
for col in df.columns:
    if col not in categorical_cols and col != target:
        df[col] = df[col].fillna(-9999)
        if col in test.columns:
            test[col] = test[col].fillna(-9999)

# Finally convert the categorical features to pandas "category" dtype.
for col in categorical_cols:
    df[col] = df[col].astype("category")
    if col in test.columns:
        test[col] = test[col].astype("category")

print(f"Categorías detectadas: {len(categorical_cols)}")

# Encode the target as integer class ids. The encoder is always fitted on the raw
# labels kept in `train`, and `df` is only transformed while it still holds raw
# labels; re-running the cell therefore never re-fits the encoder on already-encoded
# integers (which would make `le.inverse_transform` return integers, not labels).
le = LabelEncoder().fit(train[target])
if df[target].dtype == object:
    df[target] = le.transform(df[target])

In [None]:
# ================================================================================
# 3. Train / validation split
# ================================================================================

# Separate the encoded target from the feature columns.
y = df[target]
X = df.drop(columns=[target])

# Hold out 15% of the rows for validation, stratified on the target, with a fixed
# seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

In [None]:
# ================================================================================
# 4. MODEL 1 — LIGHTGBM
# ================================================================================

print("\nEntrenando LightGBM...")

lgb_model = lgb.LGBMClassifier(
    n_estimators=600,
    learning_rate=0.03,
    num_leaves=45,
    max_depth=9,
    min_child_samples=15,
    subsample=0.9,
    # LightGBM only applies `subsample` when `subsample_freq` is > 0 (its default of 0
    # disables bagging). Resample every iteration so the 0.9 row fraction takes effect.
    subsample_freq=1,
    colsample_bytree=0.9,
    reg_alpha=1.3,
    reg_lambda=2.5,
    objective="multiclass",
    class_weight="balanced",
    random_state=42
)

# Train on the training part; the categorical feature names are passed explicitly.
lgb_model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_cols
)

# Accuracy on the held-out validation rows.
lgb_val_pred = lgb_model.predict(X_val)
lgb_acc = accuracy_score(y_val, lgb_val_pred)

print(f" LightGBM Accuracy: {lgb_acc:.4f}")

In [None]:
# ================================================================================
# 5. MODEL 2 — XGBOOST
# ================================================================================

print("\nEntrenando XGBoost...")

# XGBoost is trained on integer codes instead of category/text columns.
X_train_xgb = X_train.copy()
X_val_xgb   = X_val.copy()
test_xgb    = test.copy()

for col in X_train_xgb.columns:
    if str(X_train_xgb[col].dtype) in ["category", "object"]:
        # Use the training column's categories for all three frames so that a given
        # code means the same value everywhere. Encoding `test` with its own
        # categories (as `.astype("category").cat.codes` does) can assign different
        # codes to the same value. Values not present in the training categories
        # get code -1.
        cat_dtype = X_train_xgb[col].astype("category").dtype
        X_train_xgb[col] = X_train_xgb[col].astype(cat_dtype).cat.codes
        X_val_xgb[col]   = X_val_xgb[col].astype(cat_dtype).cat.codes
        test_xgb[col]    = test_xgb[col].astype(cat_dtype).cat.codes

xgb_model = xgb.XGBClassifier(
    n_estimators=550,
    learning_rate=0.035,
    max_depth=8,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_weight=3,
    reg_alpha=1.2,
    reg_lambda=3.0,
    gamma=0.2,
    objective="multi:softprob",
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

xgb_model.fit(X_train_xgb, y_train)
xgb_val_pred = xgb_model.predict(X_val_xgb)

# Accuracy on the held-out validation rows.
xgb_acc = accuracy_score(y_val, xgb_val_pred)

print(f" XGBoost Accuracy: {xgb_acc:.4f}")

In [None]:
# ================================================================================
# 6. MODEL 3 — CATBOOST
# ================================================================================

print("\nEntrenando CatBoost...")

# Multiclass CatBoost on CPU, silent training, fixed seed.
cat_model = CatBoostClassifier(
    loss_function="MultiClass",
    task_type="CPU",
    iterations=300,
    learning_rate=0.06,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=0,
)

# Fit on the training part; the validation part is supplied as the evaluation set.
cat_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    cat_features=categorical_cols,
    verbose=0,
)

# Collapse the prediction output to a 1-D integer vector before scoring.
cat_val_pred = cat_model.predict(X_val).reshape(-1).astype(int)
cat_acc = accuracy_score(y_val, cat_val_pred)

print(f" CatBoost Accuracy: {cat_acc:.4f}")

In [None]:
# ================================================================================
# 7. ENSEMBLE (BLENDING)
# ================================================================================

print("\nCalculando Ensemble final...")

# Class probabilities from each model on the validation rows.
lgb_p = lgb_model.predict_proba(X_val)
xgb_p = xgb_model.predict_proba(X_val_xgb)
cat_p = cat_model.predict_proba(X_val)

# Blend weights for LightGBM, XGBoost and CatBoost respectively.
w_lgb, w_xgb, w_cat = 0.55, 0.20, 0.25

# Weighted sum of the probability matrices; the predicted class is the column with
# the highest blended probability.
blend = (w_lgb * lgb_p) + (w_xgb * xgb_p) + (w_cat * cat_p)
blend_pred = blend.argmax(axis=1)

blend_acc = accuracy_score(y_val, blend_pred)
print(f" Ensemble Accuracy: {blend_acc:.4f}")

In [None]:
# ================================================================================
# 8. FINAL PREDICTION ON TEST
# ================================================================================

print("Generando predicciones finales...\n")

# Select exactly the training feature columns, in training order, before predicting.
# This guards against a test file with extra or differently ordered columns shifting
# features under the models; a missing column fails loudly with a KeyError.
lgb_test = lgb_model.predict_proba(test[X_train.columns])
xgb_test = xgb_model.predict_proba(test_xgb[X_train_xgb.columns])
cat_test = cat_model.predict_proba(test[X_train.columns])

# Same blend weights as used on the validation set.
final_blend = (w_lgb*lgb_test) + (w_xgb*xgb_test) + (w_cat*cat_test)
final_pred = np.argmax(final_blend, axis=1)

# Map the integer class ids back to the original target labels.
final_labels = le.inverse_transform(final_pred)

submission = pd.DataFrame({
    "ID": test["ID"],
    "RENDIMIENTO_GLOBAL": final_labels
})

submission.to_csv("my_submission.csv", index=False)

In [None]:
# ================================================================================
# Envío del submission a Kaggle
# ================================================================================

import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'
!chmod 600 ./kaggle.json

!kaggle competitions submit -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia \
-f my_submission.csv -m "Entrega version final - Equipo Juan y Jose"