<a href="https://colab.research.google.com/github/Godshley/Kaggle-IA/blob/main/04_modelo_con_preprocesado_y_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# 04 - MODELO ALTERNATIVO: XGBOOST

import os
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

print("Imports cargados correctamente.")

Imports cargados correctamente.


In [2]:
os.environ['KAGGLE_CONFIG_DIR'] = "."

In [3]:
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.52GB/s]


In [4]:
!unzip udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


In [5]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

TARGET = "RENDIMIENTO_GLOBAL"
ID = "ID"

X = train.drop(columns=[TARGET])
y = train[TARGET]
X_test = test.copy()

print("Datos cargados. Shapes:", X.shape, X_test.shape)

Datos cargados. Shapes: (692500, 20) (296786, 20)


In [6]:
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numéricas: {len(num_cols)}, Categóricas: {len(cat_cols)}")

Numéricas: 6, Categóricas: 14


In [7]:
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

In [8]:
label_enc = LabelEncoder()
label_cols = [c for c in cat_cols if X[c].nunique() <= 10]

for col in label_cols:
    X[col] = label_enc.fit_transform(X[col])
    X_test[col] = label_enc.transform(X_test[col])

In [9]:
onehot_cols = [c for c in cat_cols if c not in label_cols]
X = pd.get_dummies(X, columns=onehot_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=onehot_cols, drop_first=True)
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

print("Shapes después de OHE:", X.shape, X_test.shape)

Shapes después de OHE: (692500, 1015) (296786, 1015)


In [10]:
constant_cols = [c for c in X.columns if X[c].nunique() == 1]
X = X.drop(columns=constant_cols)
X_test = X_test.drop(columns=constant_cols, errors="ignore")

print("Columnas constantes eliminadas:", len(constant_cols))

Columnas constantes eliminadas: 0


In [11]:
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Clases codificadas:", dict(zip(le.classes_, le.transform(le.classes_))))

Clases codificadas: {'alto': np.int64(0), 'bajo': np.int64(1), 'medio-alto': np.int64(2), 'medio-bajo': np.int64(3)}


In [14]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [15]:
# ============================================================
# ENTRENAR XGBOOST CON SUBSET DEL 5% PARA EVITAR RAM Y LENTITUD
# ============================================================

# Tomar un subset representativo del 5% del dataset
X_sample = X.sample(frac=0.05, random_state=42)
y_sample = y_encoded[X_sample.index]   # y_encoded ya está hecho antes

# Split train/val
X_tr, X_val, y_tr, y_val = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
)

model = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softmax",
    num_class=4,
    eval_metric="mlogloss",
    tree_method="hist",
    device="cpu"
)

print("Entrenando XGBoost con subset del 5%...")
model.fit(X_tr, y_tr)
print("Entrenamiento completado.")

# Evaluación
val_pred = model.predict(X_val)
acc = accuracy_score(y_val, val_pred)

print(f"Accuracy (XGBoost con subset 5%): {acc:.4f}")

Entrenando XGBoost con subset del 5%...
Entrenamiento completado.
Accuracy (XGBoost con subset 5%): 0.4095


In [16]:
val_pred = model.predict(X_val)
acc = accuracy_score(y_val, val_pred)
print(f"Accuracy en validación: {acc:.4f}")

Accuracy en validación: 0.4095


In [17]:
test_pred = model.predict(X_test)

submission = pd.DataFrame({
    ID: test[ID],
    TARGET: test_pred
})

submission.to_csv("submission_xgboost.csv", index=False)

print("Archivo submission_xgboost.csv generado correctamente.")

Archivo submission_xgboost.csv generado correctamente.
