In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("Datos_icfes_limpios.csv")

In [4]:
threshold = df['punt_global'].quantile(0.25)
df['bajo_desempeno'] = (df['punt_global'] <= threshold).astype(int)

In [5]:
cols_utiles = [col for col in df.columns if (
    col.startswith('fami_') or
    (col.startswith('cole_bilingue')) or
    col.startswith('estu_genero')
)]
X = df[cols_utiles].copy()
y = df['bajo_desempeno']

In [6]:
X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1, how='all')
X = X.dropna()
y = y.loc[X.index]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(weights))


In [8]:
print("Columnas usadas en X:", X.columns.tolist())
print("Forma de X:", X.shape)

Columnas usadas en X: ['cole_bilingue-N', 'cole_bilingue-S', 'estu_genero-F', 'estu_genero-M', 'fami_cuartoshogar-Cinco', 'fami_cuartoshogar-Cuatro', 'fami_cuartoshogar-Dos', 'fami_cuartoshogar-Seis o mas', 'fami_cuartoshogar-Tres', 'fami_cuartoshogar-Uno', 'fami_educacionmadre-Educación profesional completa', 'fami_educacionmadre-Educación profesional incompleta', 'fami_educacionmadre-Ninguno', 'fami_educacionmadre-Postgrado', 'fami_educacionmadre-Primaria completa', 'fami_educacionmadre-Primaria incompleta', 'fami_educacionmadre-Secundaria (Bachillerato) completa', 'fami_educacionmadre-Secundaria (Bachillerato) incompleta', 'fami_educacionmadre-Técnica o tecnológica completa', 'fami_educacionmadre-Técnica o tecnológica incompleta', 'fami_educacionpadre-Educación profesional completa', 'fami_educacionpadre-Educación profesional incompleta', 'fami_educacionpadre-Ninguno', 'fami_educacionpadre-Postgrado', 'fami_educacionpadre-Primaria completa', 'fami_educacionpadre-Primaria incompleta'

In [None]:
model = Sequential([
    Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [14]:
history = model.fit(X_train_scaled, y_train,
          epochs=20,
          batch_size=32,
          validation_split=0.2,
          class_weight=class_weights,
          verbose=1)



Epoch 1/20
[1m10248/10248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.6532 - loss: 0.6171 - val_accuracy: 0.6379 - val_loss: 0.6002
Epoch 2/20
[1m10248/10248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - accuracy: 0.6294 - loss: 0.6129 - val_accuracy: 0.6307 - val_loss: 0.6113
Epoch 3/20
[1m10248/10248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - accuracy: 0.6343 - loss: 0.6120 - val_accuracy: 0.6014 - val_loss: 0.6346
Epoch 4/20
[1m10248/10248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 0.6365 - loss: 0.6103 - val_accuracy: 0.6425 - val_loss: 0.6001
Epoch 5/20
[1m10248/10248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 0.6380 - loss: 0.6110 - val_accuracy: 0.6125 - val_loss: 0.6265
Epoch 6/20
[1m10248/10248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 0.6366 - loss: 0.6104 - val_accuracy: 0.6328 - val_loss: 0.616

In [18]:
y_pred_prob = model.predict(X_test_scaled).ravel()
y_pred = (y_pred_prob >= 0.35).astype(int)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred_prob))


[1m3203/3203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 743us/step
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.33      0.48     76607
           1       0.31      0.91      0.47     25872

    accuracy                           0.48    102479
   macro avg       0.62      0.62      0.48    102479
weighted avg       0.77      0.48      0.48    102479

Confusion Matrix:
[[25135 51472]
 [ 2262 23610]]
ROC AUC Score:
0.7218842637585843
