In [None]:
# Hipótesis n.º 2 a desarrollar en el ejercicio por Irenes
# ¿Qué modelos entre Random Forest, Árbol de Decisión Optimizado, clustering KMeans (no supervisado) y LSTM son más efectivos para predecir
# si un estudiante pasa, utilizando datos de comportamiento, demografía y desempeño académico del dataset OULAD sobre su comportamiento en la plataforma virtual (VLE)?

In [None]:
# 1. Standard library
import os

# 2. Data handling and numerical computing
import numpy as np
import pandas as pd
import scipy.stats as stats

# 3. Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# 4. Machine learning - scikit-learn
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay,
    accuracy_score, auc, average_precision_score, confusion_matrix,
    f1_score, mean_squared_error, precision_recall_curve, precision_score,
    r2_score, recall_score, roc_auc_score, roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# 5. Deep learning - TensorFlow/Keras
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed, to_categorical

In [None]:
# Step 2: load the CSV files
# Base folders -- change these paths if your files live somewhere else.
ruta = "./oulad"
rutaEdaImg = "./edaimg"
rutaDataPred = "./datatopredict"
rutaOrdMapping = "./ordmapping"
rutaResult_Pred = "./result_pred"
rutaMetrics_models = "./metrics_models"

# Later cells write figures, metrics and predictions into these folders.
# Create them up front so to_csv()/savefig() do not fail on a fresh checkout.
for carpeta_salida in (rutaEdaImg, rutaResult_Pred, rutaMetrics_models):
    os.makedirs(carpeta_salida, exist_ok=True)

# Load the source tables
student_info = pd.read_csv(os.path.join(ruta, "studentInfo.csv"))
student_registration = pd.read_csv(os.path.join(ruta, "studentRegistration.csv"))
student_assessment = pd.read_csv(os.path.join(ruta, "studentAssessment.csv"))
assessments = pd.read_csv(os.path.join(ruta, "assessments.csv"))
student_vle = pd.read_csv(os.path.join(ruta, "studentVle.csv"))
vle = pd.read_csv(os.path.join(ruta, "vle.csv"))
courses = pd.read_csv(os.path.join(ruta, "courses.csv"))

# Sanity check: report the shape and first rows of every loaded table
dataframes = {
    "studentInfo": student_info,
    "studentRegistration": student_registration,
    "studentAssessment": student_assessment,
    "assessments": assessments,
    "studentVle": student_vle,
    "vle": vle,
    "courses": courses
}

# The loop variable is deliberately not called `df`: that name is the main
# modelling dataset in later cells and must not be left pointing at `courses`.
for nombre, tabla in dataframes.items():
    print(f"\n✅ {nombre} cargado correctamente.")
    print(f"Dimensiones: {tabla.shape}")
    print(tabla.head(3))

In [None]:
# Step 3: join and clean the data

# Make sure 'score' is numeric; unparseable entries become NaN.
student_assessment["score"] = pd.to_numeric(student_assessment["score"], errors="coerce")

# Attach the assessment metadata (including code_module / code_presentation)
# to every submitted assessment.
student_assessment_full = pd.merge(student_assessment, assessments, on="id_assessment", how="left")

# One enrolment = one student in one module presentation.
claves_matricula = ["id_student", "code_module", "code_presentation"]

# studentInfo + studentRegistration, then the per-assessment rows on top.
info_reg = pd.merge(student_info, student_registration, on=claves_matricula, how="left")
info_reg_asess = pd.merge(info_reg, student_assessment_full, on=claves_matricula, how="left")

# Aggregate per ENROLMENT, not per student. The row kept below (and its
# final_result label) belongs to a single enrolment, so grouping only by
# id_student would mix assessments from the student's other courses into
# the features that are supposed to explain this particular result.
por_matricula = student_assessment_full.groupby(claves_matricula)
total_assessments = por_matricula["id_assessment"].count().reset_index(name="total_assessments")
average_score = por_matricula["score"].mean().reset_index(name="average_score")
failed_assessments = (
    student_assessment_full[student_assessment_full["score"] < 40]  # scores below 40 count as failed
    .groupby(claves_matricula)["score"]
    .count()
    .reset_index(name="assignment_failed")
)

# Main dataset: first enrolment of every student plus its own aggregates.
df = info_reg.drop_duplicates(subset=["id_student"]).copy()
for agregado in (total_assessments, average_score, failed_assessments):
    df = pd.merge(df, agregado, on=claves_matricula, how="left")

# Enrolments without any (failed) assessment have no aggregate row -> 0.
columnas_agregadas = ["total_assessments", "average_score", "assignment_failed"]
df[columnas_agregadas] = df[columnas_agregadas].fillna(0)

# Validation
print("✅ Unión y limpieza completadas.")
print("Dimensiones del dataset final:", df.shape)
df[["id_student", "highest_education", "final_result", "total_assessments", "average_score", "assignment_failed"]].head()

In [None]:
# =========================
# 1. Target variable
# =========================
# Binary 'passed': 1 when final_result is 'Pass' or 'Distinction', 0 otherwise.
# Guarded so the cell can be re-run after 'final_result' was dropped in step 4.
if "final_result" in df.columns:
    df["passed"] = df["final_result"].isin(["Pass", "Distinction"]).astype(int)


def _is_raw(column):
    """Return True while df[column] still holds raw labels rather than numeric codes."""
    return not pd.api.types.is_numeric_dtype(df[column])


# =========================
# 2. Ordinal encoding
# =========================
# Every encoding is applied only while the column is still raw. Re-running the
# cell would otherwise map already-encoded values a second time and silently
# turn whole columns into NaN.

# 'gender' as a binary flag
if _is_raw("gender"):
    df["gender"] = df["gender"].astype(str).str.strip().str.upper().map({"M": 0, "F": 1})

# 'region' coded by alphabetical order of its labels.
# NOTE(review): alphabetical codes impose an arbitrary order; confirm that the
# models are meant to treat this column as ordinal.
if _is_raw("region"):
    region_order = {region: idx for idx, region in enumerate(sorted(df["region"].dropna().unique()))}
    df["region"] = df["region"].map(region_order)

# 'age_band' as ordinal
age_band_order = {"0-35": 0, "35-55": 1, "55<=": 2}
if _is_raw("age_band"):
    df["age_band"] = df["age_band"].map(age_band_order)

# 'highest_education' coded by alphabetical order of its labels
# (same caveat as 'region').
if _is_raw("highest_education"):
    education_order = {edu: idx for idx, edu in enumerate(sorted(df["highest_education"].dropna().unique()))}
    df["highest_education"] = df["highest_education"].map(education_order)

# =========================
# 3. Feature selection
# =========================
features = [
    "gender", "region", "age_band", "highest_education",
    "total_assessments", "average_score", "assignment_failed"
]

X = df[features]
y = df["passed"]

# .map() yields NaN for any label missing from its dictionary; surface that
# here instead of letting it reach the models unnoticed.
faltantes = X.isna().sum()
if faltantes.any():
    print("⚠️ Valores faltantes tras la codificación:", faltantes[faltantes > 0].to_dict())

# =========================
# 4. Drop unneeded columns
# =========================
# errors="ignore" keeps the cell re-runnable once the columns are gone.
df = df.drop(columns=["imd_band", "final_result", "date_unregistration", "disability"], errors="ignore")

# =========================
# 5. Validation
# =========================
print("✅ Dataset preparado correctamente.")
print("Variables predictoras:", list(X.columns))
print("Distribución de la variable objetivo:")
print(y.value_counts())

# =========================
# 6. Save dataset
# =========================
df.to_csv(os.path.join(ruta,"H2 - oulad_dataset_summary_vle_data.csv"), index=False)


In [None]:
# Hold out 30% of the rows as the test set; the fixed seed makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Random Forest model

# Fit on the training split and predict the held-out rows
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix; labels=[0, 1] guarantees a 2x2 result so the four-way
# unpack cannot fail when one class is absent from y_test / y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

# Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report
print("Resultados del modelo Random Forest:")
print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Metrics table
metrics = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "MSE", "RMSE", "R² Score"],
    "Value": [accuracy, precision, recall, f1, mse, rmse, r2]
}
df_metrics = pd.DataFrame(metrics)
# Previously written to "H1 - logistic_regression_metrics.csv" (copy-paste
# leftover), which mislabels these Random Forest metrics as another model's.
df_metrics.to_csv(os.path.join(rutaMetrics_models, "H2 - random_forest_metrics.csv"), index=False)

# Save the test-set predictions next to the true labels
results_df = pd.DataFrame({
    "y_test": y_test.values,
    "y_pred": y_pred
})

results_df.to_csv(os.path.join(rutaResult_Pred,"H2 - predicciones_rf.csv"), index=False)
print("✅ Archivo 'H2 - predicciones_rf.csv' guardado correctamente.")


In [None]:
# Random Forest result plots: confusion matrix, precision-recall curve and
# feature importances side by side.

# --- data behind the three panels ---
y_scores = model.predict_proba(X_test)[:, 1]  # second probability column
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_scores)
conf_matrix = confusion_matrix(y_test, y_pred)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # most important feature first

# --- one row, three panels ---
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
ax_matriz, ax_pr, ax_importancia = axes
etiquetas_clase = ['No pasó', 'Pasó']

# Panel 1: confusion matrix
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=etiquetas_clase,
    yticklabels=etiquetas_clase,
    ax=ax_matriz,
)
ax_matriz.set(
    title='Matriz de Confusión - Random Forest',
    xlabel='Predicción',
    ylabel='Real',
)

# Panel 2: precision-recall curve
ax_pr.plot(recall_vals, precision_vals, label='Curva Precision-Recall', color='blue')
ax_pr.set(
    xlabel='Recall',
    ylabel='Precision',
    title='Curva Precision-Recall - Random Forest',
)
ax_pr.grid(True)
ax_pr.legend()

# Panel 3: feature importances, sorted in descending order
posiciones = range(len(importances))
nombres_ordenados = [features[i] for i in indices]
ax_importancia.bar(posiciones, importances[indices], color='skyblue')
ax_importancia.set_xticks(posiciones)
ax_importancia.set_xticklabels(nombres_ordenados, rotation=45, ha='right')
ax_importancia.set(
    title='Importancia de Características - Random Forest',
    ylabel='Importancia',
)

plt.tight_layout()
plt.savefig(os.path.join(rutaEdaImg, "H2 - Resultados_modelo_random_forest.png"))
plt.show()


In [None]:
# LSTM (Long Short-Term Memory) neural network

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling repeat across runs (the other models use random_state=42).
set_random_seed(42)

# Split FIRST, then fit the scaler on the training rows only. Fitting it on
# the full matrix leaks test-set mean/variance into training. test_size and
# random_state match the split used by the other models.
X_train_raw, X_test_raw, y_train_lstm, y_test_lstm = train_test_split(
    X, y, test_size=0.3, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# LSTM expects 3D input: (samples, timesteps, features).
# The data is not a time series, so a single timestep is used.
n_features = X_train_scaled.shape[1]
X_train_lstm = X_train_scaled.reshape((-1, 1, n_features))
X_test_lstm = X_test_scaled.reshape((-1, 1, n_features))

# Model
model_lstm = Sequential([
    LSTM(32, input_shape=(1, n_features), activation='tanh'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training. The test split is passed as validation data only to record the
# val_* curves; nothing here (no early stopping / checkpointing) selects a
# model based on it.
history = model_lstm.fit(
    X_train_lstm, y_train_lstm,
    epochs=10, batch_size=64,
    validation_data=(X_test_lstm, y_test_lstm),
    verbose=1
)

# Persist the per-epoch training history
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(os.path.join(rutaMetrics_models, "H2 - history_lstm.csv"), index=False)

# Evaluation
loss, accuracy = model_lstm.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print(f"✅ LSTM Accuracy: {accuracy:.4f}")

# Predict once: probabilities first, then threshold at 0.5 for class labels
y_pred_probs = model_lstm.predict(X_test_lstm)
y_pred_lstm = (y_pred_probs > 0.5).astype(int).flatten()

# Metrics on the thresholded predictions
mse = mean_squared_error(y_test_lstm, y_pred_lstm)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_lstm, y_pred_lstm)
accuracy = accuracy_score(y_test_lstm, y_pred_lstm)
precision = precision_score(y_test_lstm, y_pred_lstm)
recall = recall_score(y_test_lstm, y_pred_lstm)
f1 = f1_score(y_test_lstm, y_pred_lstm)

# Confusion matrix; labels=[0, 1] guarantees the 2x2 shape needed to unpack
tn, fp, fn, tp = confusion_matrix(y_test_lstm, y_pred_lstm, labels=[0, 1]).ravel()

# Report
print("Resultados del modelo LSTM:")
print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Metrics table
metrics = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "MSE", "RMSE", "R² Score"],
    "Value": [accuracy, precision, recall, f1, mse, rmse, r2]
}
df_metrics = pd.DataFrame(metrics)
df_metrics.to_csv(os.path.join(rutaMetrics_models,"H2 - lstm_metrics.csv"), index=False)

# Save true labels, predicted labels and predicted probabilities
results_df = pd.DataFrame({
    "y_test": np.ravel(y_test_lstm),
    "y_pred": y_pred_lstm.flatten(),
    "y_pred_probs": y_pred_probs.flatten()
})

results_df.to_csv(os.path.join(rutaResult_Pred, "H2 - predicciones_lstm.csv"), index=False)


In [None]:
# LSTM result plots: training curves, ROC, confusion matrix,
# precision-recall curve and predicted-probability histogram.

fig, axes = plt.subplots(2, 3, figsize=(22, 10))

# The network outputs an (n, 1) array; the metric and plotting calls below
# are given a flat 1-D vector.
probs_lstm = np.ravel(y_pred_probs)

# Accuracy per epoch
axes[0, 0].plot(history.history['accuracy'], label='Entrenamiento', marker='o')
axes[0, 0].plot(history.history['val_accuracy'], label='Validación', marker='o')
axes[0, 0].set_title('Accuracy durante el entrenamiento LSTM')
axes[0, 0].set_xlabel('Época')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].legend()
axes[0, 0].grid(True)

# Loss per epoch
axes[0, 1].plot(history.history['loss'], label='Entrenamiento', marker='o')
axes[0, 1].plot(history.history['val_loss'], label='Validación', marker='o')
axes[0, 1].set_title('Loss durante el entrenamiento LSTM')
axes[0, 1].set_xlabel('Época')
axes[0, 1].set_ylabel('Loss')
axes[0, 1].legend()
axes[0, 1].grid(True)

# ROC curve
roc_auc = roc_auc_score(y_test_lstm, probs_lstm)
fpr, tpr, _ = roc_curve(y_test_lstm, probs_lstm)
axes[0, 2].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc:.3f}')
axes[0, 2].plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')  # chance diagonal
axes[0, 2].set_xlim([0.0, 1.0])
axes[0, 2].set_ylim([0.0, 1.05])
axes[0, 2].set_xlabel('False Positive Rate')
axes[0, 2].set_ylabel('True Positive Rate')
axes[0, 2].set_title('Curva ROC - LSTM')
axes[0, 2].legend(loc="lower right")

# Confusion matrix
conf_matrix_lstm = confusion_matrix(y_test_lstm, y_pred_lstm)
sns.heatmap(conf_matrix_lstm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No pasó', 'Pasó'],
            yticklabels=['No pasó', 'Pasó'],
            ax=axes[1, 0])
axes[1, 0].set_title('Matriz de Confusión - LSTM')
axes[1, 0].set_xlabel('Predicción')
axes[1, 0].set_ylabel('Real')

# Precision-recall curve. The curve arrays get their own names so they do not
# overwrite the scalar `precision` / `recall` metrics used by the model cells.
precision_curve_lstm, recall_curve_lstm, _ = precision_recall_curve(y_test_lstm, probs_lstm)
average_precision = average_precision_score(y_test_lstm, probs_lstm)

axes[1, 1].plot(recall_curve_lstm, precision_curve_lstm, color='blue', lw=2,
                label=f'AP = {average_precision:.3f}')
axes[1, 1].set_xlabel('Recall')
axes[1, 1].set_ylabel('Precision')
axes[1, 1].set_title('Curva Precision-Recall - LSTM')
axes[1, 1].legend(loc='lower left')
axes[1, 1].grid(True)

# Distribution of predicted probabilities
axes[1, 2].hist(probs_lstm, bins=30, color='purple', alpha=0.7, edgecolor='black')
axes[1, 2].set_title('Distribución de Probabilidades Predichas - LSTM')
axes[1, 2].set_xlabel('Probabilidad de pasar')
axes[1, 2].set_ylabel('Frecuencia')

plt.tight_layout()
plt.savefig(os.path.join(rutaEdaImg, "H2 - Resultados_modelo_lstm.png"))
plt.show()

In [None]:
# Optimised Decision Tree model

# Base estimator
dt = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid to search
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['gini', 'entropy']
}

# Grid search with 5-fold cross-validation, selecting on F1
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best estimator found by the search
best_dt = grid_search.best_estimator_

# Predictions on the held-out split
y_pred_dt = best_dt.predict(X_test)

# Confusion matrix for THIS model. Without this line tp/fp/tn/fn still hold
# the values computed for the previously run model, and those stale counts
# would be printed and written to the decision-tree metrics file.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_dt, labels=[0, 1]).ravel()

# Evaluation metrics (computed once, reused for printing and saving)
mse = mean_squared_error(y_test, y_pred_dt)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_dt)
accuracy = accuracy_score(y_test, y_pred_dt)
precision = precision_score(y_test, y_pred_dt)
recall = recall_score(y_test, y_pred_dt)
f1 = f1_score(y_test, y_pred_dt)

# Report
print("Resultados del Árbol de Decisión Optimizado:")
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"MSE: {mse:.4f}")

# Metrics table
metrics = {
    "Metric": ["TP","FP","TN","FN","Accuracy", "Precision", "Recall", "F1 Score", "MSE", "RMSE", "R² Score"],
    "Value": [tp, fp, tn, fn, accuracy, precision, recall, f1, mse, rmse, r2]
}
df_metrics = pd.DataFrame(metrics)
df_metrics.to_csv(os.path.join(rutaMetrics_models,"H2 - decision_tree_metrics.csv"), index=False)

# Test features together with the predicted label
df_predictions = X_test.copy()
df_predictions['Prediccion'] = y_pred_dt

df_predictions.to_csv(os.path.join(rutaResult_Pred, "H2 - predicciones_arbol_decision.csv"), index=False)



In [None]:
# Optimised Decision Tree result plots.
# Every panel is computed from `best_dt` / `y_pred_dt`. Earlier versions of
# this cell reused variables left behind by the Random Forest and LSTM cells
# (y_pred, model, y_scores, precision_vals/recall_vals, roc_auc), so the
# figure saved as the decision-tree result showed other models' data.

# --- decision-tree quantities behind the panels ---
y_scores_dt = best_dt.predict_proba(X_test)[:, 1]  # second probability column
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_scores_dt)
roc_auc_dt = auc(fpr_dt, tpr_dt)
precision_vals_dt, recall_vals_dt, _ = precision_recall_curve(y_test, y_scores_dt)
importances_dt = best_dt.feature_importances_
indices_dt = np.argsort(importances_dt)[::-1]  # most important feature first
errores = y_test - y_pred_dt

fig, axes = plt.subplots(2, 3, figsize=(22, 12))

# 1. Confusion matrix
sns.heatmap(confusion_matrix(y_test, y_pred_dt), annot=True, fmt='d', cmap='Blues',
            xticklabels=['No pasó', 'Pasó'],
            yticklabels=['No pasó', 'Pasó'],
            ax=axes[0, 0])
axes[0, 0].set_title('Matriz de Confusión')
axes[0, 0].set_xlabel('Predicción')
axes[0, 0].set_ylabel('Real')

# 2. Feature importances
axes[0, 1].bar(range(len(importances_dt)), importances_dt[indices_dt], color='skyblue')
axes[0, 1].set_xticks(range(len(importances_dt)))
axes[0, 1].set_xticklabels([X.columns[i] for i in indices_dt], rotation=45, ha='right')
axes[0, 1].set_title('Importancia de Características')
axes[0, 1].set_ylabel('Importancia')

# 3. Error distribution. Errors can only be -1, 0 or 1, so the bin edges are
# fixed to centre one bar on each value even when one of them never occurs.
axes[0, 2].hist(errores, bins=[-1.5, -0.5, 0.5, 1.5], color='orange', edgecolor='black', rwidth=0.8)
axes[0, 2].set_title('Distribución de Errores')
axes[0, 2].set_xlabel('Error (y_test - y_pred)')
axes[0, 2].set_ylabel('Frecuencia')
axes[0, 2].set_xticks([-1, 0, 1])

# 4. ROC curve
axes[1, 0].plot(fpr_dt, tpr_dt, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc_dt:.3f}')
axes[1, 0].plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')  # chance diagonal
axes[1, 0].set_xlim([0.0, 1.0])
axes[1, 0].set_ylim([0.0, 1.05])
axes[1, 0].set_xlabel('False Positive Rate')
axes[1, 0].set_ylabel('True Positive Rate')
axes[1, 0].set_title('Curva ROC')
axes[1, 0].legend(loc="lower right")

# 5. Precision-recall curve
axes[1, 1].plot(recall_vals_dt, precision_vals_dt, label='Curva Precision-Recall', color='blue')
axes[1, 1].set_xlabel('Recall')
axes[1, 1].set_ylabel('Precision')
axes[1, 1].set_title('Curva Precision-Recall')
axes[1, 1].grid(True)
axes[1, 1].legend()

# 6. Distribution of predicted probabilities
axes[1, 2].hist(y_scores_dt, bins=30, color='purple', alpha=0.7, edgecolor='black')
axes[1, 2].set_title('Distribución de Probabilidades Predichas')
axes[1, 2].set_xlabel('Probabilidad de pasar')
axes[1, 2].set_ylabel('Frecuencia')

plt.tight_layout()
plt.savefig(os.path.join(rutaEdaImg, "H2 - Resultados_modelo_decision_tree.png"))
plt.show()

In [None]:
# KMeans clustering model (unsupervised)
#
# KMeans yields cluster ids, not pass/fail labels. To score it like the
# supervised models, each cluster is mapped to the majority class of its
# TRAINING members and that mapping is applied to the test rows. Earlier
# versions of this cell scored `y_pred_dt` (the decision tree) and saved the
# result as KMeans metrics.

# Features used for clustering
clustering_features = ["total_assessments", "average_score", "assignment_failed"]
X_cluster = df[clustering_features].copy()

# Fit the scaler and KMeans on the training rows only, so that the test rows
# stay unseen and the evaluation is comparable with the other models.
X_cluster_train = X_cluster.loc[X_train.index]
scaler_cluster = StandardScaler()
scaler_cluster.fit(X_cluster_train)
X_scaled_cluster = scaler_cluster.transform(X_cluster)

# n_init is set explicitly so the result does not depend on the
# scikit-learn version's default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(scaler_cluster.transform(X_cluster_train))

# Cluster id for every row of the dataset
clusters = kmeans.predict(X_scaled_cluster)
df["cluster"] = clusters

# Cluster -> class mapping learned from the training labels (majority vote).
# reindex() covers the corner case of a cluster without training members.
tasa_aprobacion = y_train.groupby(df.loc[y_train.index, "cluster"]).mean()
cluster_to_label = (
    (tasa_aprobacion >= 0.5)
    .astype(int)
    .reindex(range(kmeans.n_clusters), fill_value=0)
)
y_pred_kmeans = df.loc[y_test.index, "cluster"].map(cluster_to_label).to_numpy()

# Evaluation on the test rows
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_kmeans, labels=[0, 1]).ravel()

mse = mean_squared_error(y_test, y_pred_kmeans)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_kmeans)
accuracy = accuracy_score(y_test, y_pred_kmeans)
precision = precision_score(y_test, y_pred_kmeans)
recall = recall_score(y_test, y_pred_kmeans)
f1 = f1_score(y_test, y_pred_kmeans)

print("Resultados del modelo clustering KMeans:")
print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"MSE: {mse:.4f}")

# Metrics table
metrics = {
    "Metric": ["TP","FP","TN","FN","Accuracy", "Precision", "Recall", "F1 Score", "MSE", "RMSE", "R² Score"],
    "Value": [tp, fp, tn, fn, accuracy, precision, recall, f1, mse, rmse, r2]
}
df_metrics = pd.DataFrame(metrics)
df_metrics.to_csv(os.path.join(rutaMetrics_models,"H2 - kmeans_metrics.csv"), index=False)

# True vs. cluster-derived labels for the test rows
df_predictions = pd.DataFrame({
    "Index": y_test.index,
    "Actual": y_test.values,
    "Predicted": y_pred_kmeans
})

df_predictions.to_csv(os.path.join(rutaResult_Pred, "H2 - kmeans_predictions.csv"), index=False)


In [None]:
# KMeans result plots and metrics.
# Everything is derived from the KMeans clusters stored in df["cluster"].
# Earlier versions plotted the decision-tree predictions (`y_pred_dt`) and the
# Random Forest feature importances under KMeans titles.

sns.set(style="whitegrid", context="talk", palette="husl")

# Turn cluster ids into pass/fail predictions: each cluster takes the
# majority class of its training members, then the mapping is applied to the
# test rows. reindex() covers a cluster without any training member.
tasa_aprobacion = y_train.groupby(df.loc[y_train.index, "cluster"]).mean()
cluster_to_label = (
    (tasa_aprobacion >= 0.5)
    .astype(int)
    .reindex(range(kmeans.n_clusters), fill_value=0)
)
y_pred_kmeans = df.loc[y_test.index, "cluster"].map(cluster_to_label)

# Curve data. Predictions are hard 0/1 labels, so both curves have only a
# handful of points.
fpr, tpr, _ = roc_curve(y_test, y_pred_kmeans)
precision_curve_km, recall_curve_km, _ = precision_recall_curve(y_test, y_pred_kmeans)
errors = y_test - y_pred_kmeans

# 2 rows x 3 columns
fig, axs = plt.subplots(2, 3, figsize=(18, 10))

# 1. Confusion matrix
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_kmeans, ax=axs[0, 0])
axs[0, 0].set_title("Matriz de Confusión")

# 2. Cluster centroids. KMeans has no feature importances; the centroid
# coordinates (in the standardised feature space the model was fit on) show
# what separates the clusters.
centroides = kmeans.cluster_centers_
posiciones = np.arange(len(clustering_features))
alto_barra = 0.8 / len(centroides)
for k, centro in enumerate(centroides):
    axs[0, 1].barh(posiciones + k * alto_barra, centro, height=alto_barra, label=f"Cluster {k}")
axs[0, 1].set_yticks(posiciones + alto_barra * (len(centroides) - 1) / 2)
axs[0, 1].set_yticklabels(clustering_features)
axs[0, 1].set_title("Centroides por cluster (escala estandarizada)")
axs[0, 1].legend()

# 3. Error distribution (errors can only be -1, 0 or 1)
axs[0, 2].hist(errors, bins=[-1.5, -0.5, 0.5, 1.5], color='skyblue', edgecolor='black')
axs[0, 2].set_title("Distribución de Errores")
axs[0, 2].set_xlabel("Error")
axs[0, 2].set_ylabel("Frecuencia")
axs[0, 2].set_xticks([-1, 0, 1])

# 4. ROC curve
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc(fpr, tpr)).plot(ax=axs[1, 0])
axs[1, 0].set_title("Curva ROC")

# 5. Precision-recall curve
PrecisionRecallDisplay(precision=precision_curve_km, recall=recall_curve_km).plot(ax=axs[1, 1])
axs[1, 1].set_title("Curva Precision-Recall")

# 6. 3D scatter of the clusters. The 2D axes created by plt.subplots() in
# this slot is removed first; otherwise the 3D axes is drawn on top of it.
axs[1, 2].remove()
ax3d = fig.add_subplot(2, 3, 6, projection='3d')
scatter = ax3d.scatter(
    df["total_assessments"],
    df["average_score"],
    df["assignment_failed"],
    c=df["cluster"],
    cmap='viridis'
)
ax3d.set_title("Clusters KMeans (3D)")
ax3d.set_xlabel("Total Assessments")
ax3d.set_ylabel("Average Score")
ax3d.set_zlabel("Assignment Failed")

plt.tight_layout()
# Saved like the other models' result figures.
plt.savefig(os.path.join(rutaEdaImg, "H2 - Resultados_modelo_kmeans.png"))
plt.show()
