<a href="https://colab.research.google.com/github/JhoanVillaNO/proyectoKaggle/blob/main/03_modelo_con_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalación de dependencias

In [1]:
# Install the third-party packages used by this notebook via the %pip magic.
# NOTE(review): versions are unpinned, so a later run may resolve different
# releases — consider pinning. matplotlib is installed but not imported in the
# cells visible here.
%pip install pandas numpy scikit-learn lightgbm matplotlib



## Carga de librerías y archivos

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb


In [3]:
import os
# Point the Kaggle CLI at the current directory for its configuration;
# kaggle.json is expected to sit next to the notebook.
os.environ['KAGGLE_CONFIG_DIR'] = '.'
# Restrict the credentials file to owner read/write before calling the CLI.
!chmod 600 ./kaggle.json
# Download the competition archive (a .zip, per the cell output).
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 931MB/s]


In [4]:
!unzip udea*.zip > /dev/null

In [15]:
import pandas as pd
# Read train.csv from the working directory (presumably extracted by the
# unzip cell above — confirm).
train = pd.read_csv("train.csv")
# Print the column / dtype / non-null summary, then show (rows, columns) as
# the cell's result.
train.info()
train.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692500 entries, 0 to 692499
Data columns (total 21 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ID                           692500 non-null  int64  
 1   PERIODO_ACADEMICO            692500 non-null  int64  
 2   E_PRGM_ACADEMICO             692500 non-null  object 
 3   E_PRGM_DEPARTAMENTO          692500 non-null  object 
 4   E_VALORMATRICULAUNIVERSIDAD  686213 non-null  object 
 5   E_HORASSEMANATRABAJA         661643 non-null  object 
 6   F_ESTRATOVIVIENDA            660363 non-null  object 
 7   F_TIENEINTERNET              665871 non-null  object 
 8   F_EDUCACIONPADRE             669322 non-null  object 
 9   F_TIENELAVADORA              652727 non-null  object 
 10  F_TIENEAUTOMOVIL             648877 non-null  object 
 11  E_PRIVADO_LIBERTAD           692500 non-null  object 
 12  E_PAGOMATRICULAPROPIO        686002 non-null  object 
 13 

(692500, 21)

## Selección de variables y división de datos

In [17]:
# Columns treated as categorical; they are coerced to strings here and
# one-hot encoded by the preprocessing step further down.
categorical_features = [
    "E_VALORMATRICULAUNIVERSIDAD",
    "E_HORASSEMANATRABAJA",
    "F_ESTRATOVIVIENDA",
    "F_EDUCACIONPADRE",
    "F_EDUCACIONMADRE",
    "E_PRGM_DEPARTAMENTO",
    "F_TIENEINTERNET.1",
    "E_PAGOMATRICULAPROPIO",
    "E_PRGM_ACADEMICO"
]

# Build a derived frame (``train`` itself is left untouched) in which every
# listed categorical column that exists is converted to string.
# NOTE(review): where astype(str) renders NaN as the text "nan", the fillna
# that follows has nothing left to fill — confirm which missing-value label is
# intended. The submission cell applies the same expression, so both frames
# stay consistent either way.
temp = train.assign(**{
    name: train[name].astype(str).fillna('missing')
    for name in categorical_features
    if name in train.columns
})

# Features exclude the target plus the identifier and period columns.
X = temp.drop(columns=["RENDIMIENTO_GLOBAL", "ID", "PERIODO_ACADEMICO"])
y = temp["RENDIMIENTO_GLOBAL"]

# First cut: 80% training / 20% held out, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Second cut of the held-out part: 87.5% validation / 12.5% test, i.e. roughly
# 17.5% and 2.5% of all rows respectively.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.125,
    random_state=42,
    stratify=y_temp,
)

## Procesamiento para LightGBM

In [18]:
# Numeric inputs: columns of X_train with a numeric dtype that are not in the
# categorical list.
numeric_dtype_columns = X_train.select_dtypes(include=[np.number]).columns
numeric_features = [name for name in numeric_dtype_columns if name not in categorical_features]

# Numeric columns are standardised; categorical columns are one-hot encoded
# into a dense array, and categories not seen during fit are ignored at
# transform time.
numeric_branch = Pipeline([("scaler", StandardScaler())])
categorical_branch = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessor = ColumnTransformer([
    ("num", numeric_branch, numeric_features),
    ("cat", categorical_branch, categorical_features),
])

# Full pipeline: preprocessing followed by the LightGBM classifier, fitted on
# the training split.
classifier = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.1, random_state=42)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", classifier),
])
pipeline.fit(X_train, y_train)


def _predict_and_score(features, target):
    """Predict with the fitted pipeline and return ``(predictions, accuracy)``."""
    predicted = pipeline.predict(features)
    return predicted, accuracy_score(target, predicted)


# Model evaluation on the validation split.
y_val_pred, val_accuracy = _predict_and_score(X_val, y_val)
print(f"Accuracy validación: {val_accuracy:.4f}")
print(classification_report(y_val, y_val_pred))

# Same report for the held-out test split.
y_test_pred, test_accuracy = _predict_and_score(X_test, y_test)
print(f"Accuracy test: {test_accuracy:.4f}")
print(classification_report(y_test, y_test_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2507
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 864
[LightGBM] [Info] Start training from score -1.371993
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.391216




Accuracy validación: 0.4327
              precision    recall  f1-score   support

        alto       0.56      0.63      0.59     30733
        bajo       0.46      0.55      0.50     30272
  medio-alto       0.32      0.27      0.29     30034
  medio-bajo       0.33      0.28      0.30     30148

    accuracy                           0.43    121187
   macro avg       0.42      0.43      0.42    121187
weighted avg       0.42      0.43      0.42    121187





Accuracy test: 0.4308
              precision    recall  f1-score   support

        alto       0.56      0.62      0.59      4391
        bajo       0.46      0.54      0.50      4325
  medio-alto       0.33      0.28      0.30      4290
  medio-bajo       0.32      0.28      0.30      4307

    accuracy                           0.43     17313
   macro avg       0.42      0.43      0.42     17313
weighted avg       0.42      0.43      0.42     17313



## Kaggle Submission

### Carga de datos

In [19]:
# Read test.csv from the working directory; the submission cell below scores
# these rows with the fitted pipeline.
test_data = pd.read_csv("test.csv")

In [20]:
# Apply the same string coercion used on the training frame so the encoder
# receives categorical values in the representation it was fitted on.
for col in categorical_features:
    if col in test_data.columns:
        test_data[col] = test_data[col].astype(str).fillna('missing')

# Drop the same non-feature columns that were removed before training. The
# training frame names the period column "PERIODO_ACADEMICO"; the earlier
# "PERIODO" entry matched nothing there and was silently skipped by
# errors='ignore'. It is kept in the list only in case a file uses the short
# name. The target column is not expected in the test file.
X_test_kaggle = test_data.drop(
    columns=["ID", "PERIODO_ACADEMICO", "PERIODO"],
    errors='ignore',
)
predictions = pipeline.predict(X_test_kaggle)

# Submission layout: one row per test ID with its predicted class label.
submission_df = test_data[["ID"]].copy()
submission_df["RENDIMIENTO_GLOBAL"] = predictions
submission_df.to_csv("submission_lightgbm.csv", index=False)
print("Archivo de submission generado: submission_lightgbm.csv")
print(submission_df.head())



Archivo de submission generado: submission_lightgbm.csv
       ID RENDIMIENTO_GLOBAL
0  550236               bajo
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185               bajo
