# In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model 
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
from statsmodels.stats.contingency_tables import mcnemar 

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model paths configured
# in this script all live under that mount point.
drive.mount('/content/drive')
# Root folder of the evaluation images (read with flow_from_directory).
VALIDATION_DATA_DIR = '/content/drive/MyDrive/dataset'

# Saved model files to compare.
MODEL1_PATH = '/content/drive/MyDrive/models/Model_1_Training.h5'
MODEL2_PATH = '/content/drive/MyDrive/models/Model_2_Training.h5'
MODEL3_PATH = '/content/drive/MyDrive/models/Model_3_Training.keras'

# Display names used for plot ticks and the classification report.
# Example: adjust to your real classes.
# NOTE(review): the order presumably has to match the label indices the
# directory generator assigns (it prints class_indices) - confirm.
CLASSES = ['cataract', 'diabetic_retinopathy', 'glaucoma', 'normal']

# One entry per model: display name, file path and the input image size
# (height, width) that is used when reading images for that model.
models_to_evaluate = [
    {'name': 'Modelo 1', 'path': MODEL1_PATH, 'img_size': (128, 128)},
    {'name': 'Modelo 2', 'path': MODEL2_PATH, 'img_size': (128, 128)},
    {'name': 'Modelo 3', 'path': MODEL3_PATH, 'img_size': (256, 256)},
]

# Load every configured model up front. A model that fails to load is stored
# as None so that later steps can skip it instead of aborting the whole run.
loaded_models = {}

print("Cargando modelos con sus tamaños de imagen específicos...")
for model_info in models_to_evaluate:
    model_name = model_info['name']
    model_path = model_info['path']
    print(f"Intentando cargar {model_name} desde {model_path}...")
    try:
        # Only the load itself is guarded; it is the one call expected to fail.
        loaded_models[model_name] = load_model(model_path)
    except Exception as e:  # deliberate best-effort: report and keep going
        print(f"Error al cargar {model_name} desde {model_path}: {e}")
        loaded_models[model_name] = None
    else:
        print(f"{model_name} cargado exitosamente.")

print("Carga de modelos completada.")

# Image generator whose only preprocessing is multiplying pixels by 1/255.
validation_datagen = ImageDataGenerator(rescale=1./255)

import os

# Ground-truth labels; stays None unless the directory scan below succeeds.
y_true = None

if not os.path.exists(VALIDATION_DATA_DIR):
    print(f"Error: Directory not found at {VALIDATION_DATA_DIR}")

if models_to_evaluate and os.path.exists(VALIDATION_DATA_DIR):
    try:
        # A throwaway, unshuffled generator is built only to read the label
        # array; the first model's image size is used for it.
        label_scan_options = dict(
            target_size=models_to_evaluate[0]['img_size'],
            batch_size=1,
            class_mode='categorical',
            shuffle=False,
        )
        temp_generator = validation_datagen.flow_from_directory(
            VALIDATION_DATA_DIR, **label_scan_options
        )
        y_true = temp_generator.classes
        print(f"Clases inferidas por el generador para etiquetas: {temp_generator.class_indices}")
    except Exception as e:
        print(f"Error creating temporary generator to get true labels: {e}")
        y_true = None

def _macro_specificity(cm):
    """Return the unweighted mean of the per-class one-vs-rest specificities.

    For class i, TN is the sum of every cell outside row i and column i, and
    FP is column i minus its diagonal cell. A class whose TN + FP is zero
    contributes 0 to the mean.
    """
    cm = np.asarray(cm)
    num_classes = cm.shape[0]
    if num_classes == 0:
        return 0
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    tn = cm.sum() - tp - fp - fn
    negatives = tn + fp
    per_class = np.divide(
        tn, negatives,
        out=np.zeros(num_classes, dtype=float),
        where=negatives > 0,
    )
    return per_class.sum() / num_classes


def evaluate_model_performance(model, y_true_labels, model_name, class_names, validation_data_dir, target_img_size, batch_size=32):
    """Predict on a directory of images and summarise the model's metrics.

    The images are read with an unshuffled generator (pixels multiplied by
    1/255, resized to ``target_img_size``), the arg-max of each prediction is
    compared against ``y_true_labels``, a confusion-matrix heatmap is saved as
    ``<model_name with spaces as underscores>_confusion_matrix.png`` and then
    displayed, and the summary metrics are returned.

    Args:
        model: Object exposing ``predict``; ``None`` skips the evaluation.
        y_true_labels: Integer class index per image; ``None`` skips the
            evaluation. Presumably in the same order the unshuffled generator
            yields the files - confirm against the caller.
        model_name: Label used in messages, the plot title and the file name.
        class_names: Class display names; index ``i`` names class ``i``.
        validation_data_dir: Image root directory, one sub-folder per class.
        target_img_size: Size images are resized to before prediction.
        batch_size: Batch size of the prediction generator.

    Returns:
        ``(metrics, y_pred_labels)`` where ``metrics`` is a dict with the keys
        'Modelo', 'Precision', 'Sensibilidad', 'Especificidad', 'F1-Score' and
        'MCC' (values formatted with three decimals) and ``y_pred_labels`` is
        the predicted class index per image; ``(None, None)`` when the
        evaluation is skipped.

    Raises:
        ValueError: If a true or predicted label index has no entry in
            ``class_names``.
    """
    if model is None:
        print(f"\n--- Skipping evaluation for {model_name} as it failed to load ---")
        return None, None
    if y_true_labels is None:
        print(f"\n--- Skipping evaluation for {model_name} as true labels could not be obtained ---")
        return None, None
    if not os.path.exists(validation_data_dir):
        print(f"\n--- Skipping evaluation for {model_name} as validation data directory not found ---")
        return None, None

    print(f"\n--- Evaluando {model_name} ---")

    model_datagen = ImageDataGenerator(rescale=1./255)
    model_generator = model_datagen.flow_from_directory(
        validation_data_dir,
        target_size=target_img_size,
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=False
    )

    steps = int(np.ceil(model_generator.n / model_generator.batch_size))
    predictions_raw = model.predict(model_generator, steps=steps, verbose=1)
    y_pred_labels = np.argmax(predictions_raw, axis=1)

    num_classes = len(class_names)
    label_ids = np.arange(num_classes)

    # Fail loudly on a class-list mismatch instead of silently dropping samples
    # once the label set is pinned below.
    seen_labels = np.union1d(y_true_labels, y_pred_labels)
    if seen_labels.size and seen_labels.max() >= num_classes:
        raise ValueError(
            f"Label index {int(seen_labels.max())} has no entry in class_names "
            f"({num_classes} classes given)."
        )

    # Pin the label set so the matrix is always num_classes x num_classes,
    # even when a class never appears in the true or predicted labels.
    cm = confusion_matrix(y_true_labels, y_pred_labels, labels=label_ids)

    fig, ax = plt.subplots(figsize=(num_classes + 2, num_classes + 2))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'Matriz de Confusión - {model_name}')
    ax.set_ylabel('Etiqueta Verdadera')
    ax.set_xlabel('Etiqueta Predicha')
    fig.tight_layout()

    # Save through the figure object *before* showing it: after plt.show() the
    # current figure may already be released, which yields a blank image.
    confusion_matrix_filename = f"{model_name.replace(' ', '_')}_confusion_matrix.png"
    fig.savefig(confusion_matrix_filename)
    print(f"Matriz de confusión guardada como {confusion_matrix_filename}")
    plt.show()
    plt.close(fig)

    report = classification_report(y_true_labels, y_pred_labels,
                                   labels=label_ids, target_names=class_names,
                                   output_dict=True, zero_division=0)

    # Precision / recall / F1 are the support-weighted averages of the report.
    weighted = report['weighted avg']
    precision = weighted['precision']
    sensibilidad = weighted['recall']
    f1_score_val = weighted['f1-score']
    mcc = matthews_corrcoef(y_true_labels, y_pred_labels)
    specificity = _macro_specificity(cm)

    metrics = {
        'Modelo': model_name,
        'Precision': f"{precision:.3f}",
        'Sensibilidad': f"{sensibilidad:.3f}",
        'Especificidad': f"{specificity:.3f}",
        'F1-Score': f"{f1_score_val:.3f}",
        'MCC': f"{mcc:.3f}"
    }

    return metrics, y_pred_labels

# Metric rows (one dict per evaluated model) and the predicted labels of each
# model, kept for the paired comparison that follows.
results = []
model_predictions = {}

if y_true is None:
    print("Skipping model evaluation because true labels could not be obtained.")
else:
    for model_info in models_to_evaluate:
        model_name = model_info['name']
        # .get() yields None for a model that was never registered, which the
        # evaluation function treats as "skip".
        model = loaded_models.get(model_name)
        model_img_size = model_info['img_size']
        result, predictions = evaluate_model_performance(
            model,
            y_true,
            model_name,
            CLASSES,
            VALIDATION_DATA_DIR,
            model_img_size,
            batch_size=32,
        )
        if not result:
            continue
        results.append(result)
        if predictions is not None:
            model_predictions[model_name] = predictions

# Print the per-model metrics side by side, or say so when nothing was evaluated.
if not results:
    print("\nNo results to display.")
else:
    df_results = pd.DataFrame(results)
    print("\n--- Tabla Comparativa de Métricas ---")
    print(df_results.to_string(index=False))

print("\n--- Resultados de la Prueba de McNemar (Comparación Pareada de Modelos) ---")

if len(model_predictions) >= 2:
    from itertools import combinations

    # With exact=True the reported statistic is min(b, c) from the binomial
    # test, not a chi-square value, so the column is labelled accordingly.
    STATISTIC_COLUMN = 'Statistic (min(b, c), exact test)'

    model_names = list(model_predictions)
    mcnemar_results = []

    # Every unordered pair of models, in insertion order (i < j).
    for model1_name, model2_name in combinations(model_names, 2):
        correct1 = model_predictions[model1_name] == y_true
        correct2 = model_predictions[model2_name] == y_true

        # 2x2 agreement table: rows = model 1 right/wrong,
        # columns = model 2 right/wrong.
        a = np.sum(correct1 & correct2)
        b = np.sum(correct1 & ~correct2)
        c = np.sum(~correct1 & correct2)
        d = np.sum(~correct1 & ~correct2)
        contingency_table = [[a, b], [c, d]]

        pair_label = f"{model1_name} vs {model2_name}"
        try:
            result = mcnemar(contingency_table, exact=True)
        except ValueError as e:
            mcnemar_results.append({
                'Modelos Comparados': pair_label,
                STATISTIC_COLUMN: 'N/A',
                'P-value': 'N/A',
                'Significancia (alpha=0.05)': f"Error: {e}"
            })
        else:
            mcnemar_results.append({
                'Modelos Comparados': pair_label,
                STATISTIC_COLUMN: f"{result.statistic:.0f}",
                'P-value': f"{result.pvalue:.4f}",
                'Significancia (alpha=0.05)': 'Sí' if result.pvalue < 0.05 else 'No'
            })

    # At least one pair exists here and every pair appends a row, so the
    # table is never empty.
    df_mcnemar = pd.DataFrame(mcnemar_results)
    print(df_mcnemar.to_string(index=False))

else:
    print("Not enough models with valid predictions to perform McNemar's test.")