In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [13]:
df = pd.read_csv("Datos_icfes_limpios.csv")

In [14]:
threshold = df['punt_global'].quantile(0.25)
df['bajo_desempeno'] = (df['punt_global'] <= threshold).astype(int)

In [15]:
cols_utiles = [col for col in df.columns if (
    col.startswith('fami_') or
    (col.startswith('cole_bilingue')) or
    col.startswith('estu_genero')
)]
X = df[cols_utiles].copy()
y = df['bajo_desempeno']

In [16]:
X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1, how='all')
X = X.dropna()
y = y.loc[X.index]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(weights))


In [19]:
print("Columnas usadas en X:", X.columns.tolist())
print("Forma de X:", X.shape)

Columnas usadas en X: ['cole_bilingue-N', 'cole_bilingue-S', 'estu_genero-F', 'estu_genero-M', 'fami_cuartoshogar-Cinco', 'fami_cuartoshogar-Cuatro', 'fami_cuartoshogar-Dos', 'fami_cuartoshogar-Seis o mas', 'fami_cuartoshogar-Tres', 'fami_cuartoshogar-Uno', 'fami_educacionmadre-Educación profesional completa', 'fami_educacionmadre-Educación profesional incompleta', 'fami_educacionmadre-Ninguno', 'fami_educacionmadre-Postgrado', 'fami_educacionmadre-Primaria completa', 'fami_educacionmadre-Primaria incompleta', 'fami_educacionmadre-Secundaria (Bachillerato) completa', 'fami_educacionmadre-Secundaria (Bachillerato) incompleta', 'fami_educacionmadre-Técnica o tecnológica completa', 'fami_educacionmadre-Técnica o tecnológica incompleta', 'fami_educacionpadre-Educación profesional completa', 'fami_educacionpadre-Educación profesional incompleta', 'fami_educacionpadre-Ninguno', 'fami_educacionpadre-Postgrado', 'fami_educacionpadre-Primaria completa', 'fami_educacionpadre-Primaria incompleta'

In [36]:
import mlflow
import mlflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Recall
from sklearn.metrics import recall_score
import numpy as np

mlflow.set_experiment("experimento_red_neuronal_icfes")

with mlflow.start_run():
    # Parámetros
    n = 128
    d = 0.3
    lr = 0.001
    mlflow.log_param("neuronas", n)
    mlflow.log_param("dropout", d)
    mlflow.log_param("learning_rate", lr)

    # Modelo
    model = Sequential([
        Dense(n, input_shape=(X_train_scaled.shape[1],), activation='relu'),
        Dropout(d),
        Dense(n//2, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=lr),
        metrics=['accuracy', Recall(name='recall')]
    )

    # Entrenamiento
    history = model.fit(X_train_scaled, y_train,
          epochs=10,
          batch_size=32,
          validation_split=0.2,
          class_weight=class_weights,
          verbose=0)

    # Evaluación en test
    loss, accuracy, recall_keras = model.evaluate(X_test_scaled, y_test, verbose=0)

    # Métricas sklearn (más controladas)
    y_pred_prob = model.predict(X_test_scaled).ravel()
    y_pred = (y_pred_prob >= 0.35).astype(int)
    recall_sklearn = recall_score(y_test, y_pred)

    # Registrar métricas en MLflow
    mlflow.log_metric("loss", loss)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("recall_keras", recall_keras)
    mlflow.log_metric("recall_sklearn", recall_sklearn)

    # Guardar modelo

    mlflow.keras.log_model(model, "modelo_keras")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3203/3203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step




In [10]:
model = Sequential([
                Dense(256, input_dim=X_train_scaled.shape[1], activation='relu'),
                Dropout(0.3),
                Dense(256//2, activation='relu'),
                Dense(1, activation='sigmoid')
                ])
model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(learning_rate=0.0005),
            metrics=['accuracy']
                )


In [10]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [11]:
history = model.fit(X_train_scaled, y_train,
          epochs=10,
          batch_size=32,
          validation_split=0.2,
          class_weight=class_weights,
          verbose=0)



KeyboardInterrupt: 

In [37]:
y_pred_prob = model.predict(X_test_scaled).ravel()
y_pred = (y_pred_prob >= 0.35).astype(int)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred_prob))


[1m3203/3203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.32      0.47     76607
           1       0.31      0.92      0.47     25872

    accuracy                           0.47    102479
   macro avg       0.62      0.62      0.47    102479
weighted avg       0.77      0.47      0.47    102479

Confusion Matrix:
[[24404 52203]
 [ 2107 23765]]
ROC AUC Score:
0.7221588954476219
