<a href="https://colab.research.google.com/github/MariaPosadaBotero/Entregas-del-proyecto-de-AI-para-ingenieros---Udea-2025-2/blob/main/04-modelo_con_preprocesado_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## MODELO LIGHTGBM


In [1]:
# 1. IMPORTAR LIBRERIAS

import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# 2. LOAD DATA
# Both CSV files are read from the notebook's working directory.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Report (rows, columns) for each frame as a quick sanity check.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")


Train shape: (692500, 21)
Test shape: (296786, 20)


In [3]:
# 3. DEFINE THE TARGET VARIABLE
# Name of the label column in the training frame.
target_col = "RENDIMIENTO_GLOBAL"

# Show the distinct labels present in the target column.
target_labels = train[target_col].unique()
print("Valores únicos del target:", target_labels)


Valores únicos del target: ['medio-alto' 'bajo' 'alto' 'medio-bajo']


In [4]:
# 4. DROP ROWS WITH A MISSING TARGET
# Keep only the rows whose target label is present; the original index is preserved.
has_target = train[target_col].notna()
train = train.loc[has_target]


In [5]:
# 5. SPLIT INTO FEATURES (X) AND LABELS (y)
# y is the target column; X is every remaining column of train.
# NOTE(review): nothing here removes identifier columns, so an "ID" column
# (if train has one) stays in X as a feature — confirm that is intended.
y = train[target_col]
X = train.drop(columns=target_col)


In [6]:
# 6. IDENTIFY CATEGORICAL COLUMNS
# Every object-dtype column of X is treated as categorical.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
print("Columnas categóricas:", cat_cols)

# Cast those columns to the pandas "category" dtype in both X and test,
# using one column -> dtype mapping for the two frames.
category_casts = dict.fromkeys(cat_cols, "category")
X = X.astype(category_casts)
test = test.astype(category_casts)



Columnas categóricas: ['E_PRGM_ACADEMICO', 'E_PRGM_DEPARTAMENTO', 'E_VALORMATRICULAUNIVERSIDAD', 'E_HORASSEMANATRABAJA', 'F_ESTRATOVIVIENDA', 'F_TIENEINTERNET', 'F_EDUCACIONPADRE', 'F_TIENELAVADORA', 'F_TIENEAUTOMOVIL', 'E_PRIVADO_LIBERTAD', 'E_PAGOMATRICULAPROPIO', 'F_TIENECOMPUTADOR', 'F_TIENEINTERNET.1', 'F_EDUCACIONMADRE']


In [7]:
# 7. SPLIT INTO TRAIN / VALIDATION
# Hold out 20% of the rows for validation, stratified on y, with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [8]:
# 8. CREATE THE LIGHTGBM MODEL
# Keyword arguments for the classifier; num_class is the number of distinct
# labels found in y.
params = dict(
    objective="multiclass",
    num_class=y.nunique(),
    learning_rate=0.08,
    num_leaves=40,
    max_depth=-1,
    n_estimators=200,
    verbose=-1,
)

model = lgb.LGBMClassifier(**params)


In [9]:
# 9. TRAIN
# Fit on the training split; the validation split is passed as eval_set and
# scored with the multi_logloss metric.
validation_sets = [(X_val, y_val)]
model.fit(X_train, y_train, eval_set=validation_sets, eval_metric="multi_logloss")


In [10]:

# 10. EVALUATION: TRAIN AND VALIDATION

# Predict on both splits first, then score each against its true labels.
pred_train, pred_val = model.predict(X_train), model.predict(X_val)
acc_train = accuracy_score(y_train, pred_train)
acc_val = accuracy_score(y_val, pred_val)

# Comparing the two accuracies shows how far the model drifts from train to held-out data.
print("Accuracy en TRAIN:", acc_train)
print("Accuracy en VALIDACIÓN:", acc_val)

# Per-class precision / recall / F1 on the validation split.
val_report = classification_report(y_val, pred_val)
print("\n===== CLASSIFICATION REPORT VALIDACIÓN =====")
print(val_report)

Accuracy en TRAIN: 0.4901606498194946
Accuracy en VALIDACIÓN: 0.43937184115523464

===== CLASSIFICATION REPORT VALIDACIÓN =====
              precision    recall  f1-score   support

        alto       0.57      0.62      0.59     35124
        bajo       0.47      0.57      0.52     34597
  medio-alto       0.33      0.27      0.30     34324
  medio-bajo       0.33      0.28      0.31     34455

    accuracy                           0.44    138500
   macro avg       0.43      0.44      0.43    138500
weighted avg       0.43      0.44      0.43    138500



In [11]:

# 11. PREDICTION FOR KAGGLE

# Select the feature columns by name, in the same order as the frame the model
# was fitted on, so the prediction does not depend on the column layout of
# test.csv. A column missing from test raises a KeyError here, naming the
# column, instead of failing later inside the model.
feature_cols = list(X_train.columns)
pred_test = model.predict(test[feature_cols])

# One predicted label per test row, keyed by the test ID.
submission = pd.DataFrame({
    "ID": test["ID"],
    "RENDIMIENTO_GLOBAL": pred_test
})

# index=False keeps the pandas row index out of the submission file.
submission.to_csv("submission_lightgbm.csv", index=False)
print("\nArchivo submission_lightgbm.csv generado")


Archivo submission_lightgbm.csv generado
