In [1]:
# Celda 1: Instalar librerías
!pip install --quiet boto3

In [2]:
# Celda 2: Import required libraries
import pandas as pd
import numpy as np
import pickle
import boto3
import yaml
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# Celda 3: Cargar Credenciales
# Parse the local credentials file once; the resulting mapping is kept in the
# module-level `config`, which the S3 helpers and the bucket name read from.
with open("credentials.yaml", mode="r") as f:
    config = yaml.safe_load(f)

In [4]:
# Celda 4: Funcion para cargar modelo
def load_model(bucket, bucket_path):
    """
    Download a pickled object from S3 and deserialize it.

    Parameters:
    bucket (str): Name of the S3 bucket
    bucket_path (str): Key of the pickle file inside the bucket

    Returns:
    object: The unpickled object (this notebook uses the helper for models,
        the test dataset and metrics tables alike)

    Note:
    pickle.loads can execute arbitrary code while deserializing, so only
    use this helper with buckets whose contents you trust.
    """
    s3_credentials = config['s3']
    session = boto3.Session(
        aws_access_key_id=s3_credentials['aws_access_key_id'],
        aws_secret_access_key=s3_credentials['aws_secret_access_key'],
        aws_session_token=s3_credentials['aws_session_token'],
    )

    s3_object = session.resource('s3').Object(bucket, bucket_path)
    # Read the whole object body into memory before unpickling.
    raw_bytes = s3_object.get()['Body'].read()
    return pickle.loads(raw_bytes)

In [5]:
# Celda 5: Cargar Modelo y plotear ROC
def plot_roc_curve(model, X_test, y_test):
    """
    Plot the ROC curve of a model on the test set and return its AUC.

    The score used for the curve is the second column of
    model.predict_proba(X_test).

    Parameters:
    model: Trained model exposing predict_proba
    X_test: Test features
    y_test: Test labels

    Returns:
    float: Area under the ROC curve
    """
    # Imported locally because the notebook's import cell never brings in
    # matplotlib; without this the first call fails with NameError on `plt`.
    import matplotlib.pyplot as plt

    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line (TPR == FPR).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    return roc_auc

In [6]:
# Celda 6: Generar Metricas
def generate_metrics_table(model, X_test, y_test):
    """
    Build a per-threshold table of binary classification metrics.

    Each threshold returned by precision_recall_curve is applied to the
    second column of model.predict_proba(X_test); the resulting confusion
    matrix is summarized into one row of the table.

    Parameters:
    model: Trained model exposing predict_proba
    X_test: Test features
    y_test: Test labels, binary and encoded as 0/1 (1 is the positive class)

    Returns:
    pd.DataFrame: One row per threshold with the columns threshold, tnr,
        fnr, precision, recall and f1_score. A rate whose denominator is
        zero is reported as 0.0.

    Raises:
    ValueError: If y_test contains values other than 0 and 1
    """
    # Validate first: predictions are always 0/1, so any other label value
    # would make the confusion matrix below meaningless.
    y_true = np.asarray(y_test)
    if not np.isin(y_true, (0, 1)).all():
        raise ValueError("y_test must be binary and encoded as 0/1")

    y_pred_proba = model.predict_proba(X_test)[:, 1]
    # Only the thresholds are needed; the metrics are recomputed per threshold.
    _, _, thresholds = precision_recall_curve(y_true, y_pred_proba)

    metrics_list = []
    for threshold in thresholds:
        y_pred = (y_pred_proba >= threshold).astype(int)
        # labels=[0, 1] forces a 2x2 matrix even when only one class is
        # present, so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        tnr = tn / (tn + fp) if (tn + fp) > 0 else 0.0  # Specificity
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0.0  # Miss rate
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0  # Precision
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0  # Recall
        f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0  # F1 score

        metrics_list.append({
            'threshold': threshold,
            'tnr': tnr,
            'fnr': fnr,
            'precision': prec,
            'recall': rec,
            'f1_score': f1
        })

    return pd.DataFrame(metrics_list)

In [7]:
# Celda 7: Traer Metricas de S3
def get_metrics(bucket, metrics_path):
    """
    Fetch a pickled metrics table from S3.

    Thin wrapper around load_model, which does the download and unpickling.

    Parameters:
    bucket (str): Name of the S3 bucket
    metrics_path (str): Key of the metrics file inside the bucket

    Returns:
    pd.DataFrame: Metrics table
    """
    metrics_table = load_model(bucket, metrics_path)
    return metrics_table

In [8]:
# Celda 8: Función para elegir el mejor modelo
def select_best_model(min_recall, metrics_paths, bucket_name=None):
    """
    Select the best model based on business constraints.

    For every metrics table, rows whose recall is below min_recall are
    discarded; among the remaining rows of all tables the one with the
    highest F1 score wins.

    Parameters:
    min_recall (float): Minimum recall a threshold must reach to be eligible
    metrics_paths (list): S3 keys of the pickled metrics tables to compare
    bucket_name (str, optional): Bucket that holds the metrics files. When
        omitted, the notebook-level `bucket` variable is used, which keeps
        existing two-argument calls working.

    Returns:
    tuple: (model_name, threshold, metrics) of the winning row, where
        model_name is the file name without the 'metricas_' prefix and the
        '.pkl' suffix. (None, None, None) when no row reaches min_recall.

    Raises:
    NameError: If a metrics file has to be loaded but neither bucket_name
        nor a notebook-level `bucket` is available
    """
    if bucket_name is None:
        # Legacy fallback: earlier versions read the global `bucket` directly.
        bucket_name = globals().get('bucket')

    best_f1 = -1
    best_model = None
    best_threshold = None
    best_metrics = None

    for path in metrics_paths:
        # Checked here (not up front) so that an empty path list still
        # returns normally without needing any bucket.
        if bucket_name is None:
            raise NameError(
                "name 'bucket' is not defined: pass bucket_name or define "
                "`bucket` before calling select_best_model"
            )

        metrics = get_metrics(bucket_name, path)
        valid_metrics = metrics[metrics['recall'] >= min_recall]
        if valid_metrics.empty:
            continue

        candidate = valid_metrics.loc[valid_metrics['f1_score'].idxmax()]
        if candidate['f1_score'] > best_f1:
            best_f1 = candidate['f1_score']
            best_model = path.split('/')[-1].replace('metricas_', '').replace('.pkl', '')
            best_threshold = candidate['threshold']
            best_metrics = candidate

    return (best_model, best_threshold, best_metrics)

In [9]:
# Celda 9: Función para guardar el threshold del mejor modelo
def save_threshold_best_model(best_model_tuple, bucket):
    """
    Pickle the model-selection result and upload it to S3.

    The object is always written to the key 'best_model/threshold.pkl'.

    Parameters:
    best_model_tuple (tuple): Output from select_best_model
    bucket (str): Name of the destination S3 bucket
    """
    s3_credentials = config['s3']
    session = boto3.Session(
        aws_access_key_id=s3_credentials['aws_access_key_id'],
        aws_secret_access_key=s3_credentials['aws_secret_access_key'],
        aws_session_token=s3_credentials['aws_session_token'],
    )

    s3 = session.resource('s3')
    serialized = pickle.dumps(best_model_tuple)
    destination = s3.Object(bucket, 'best_model/threshold.pkl')
    destination.put(Body=serialized)
    
# Función para guardar modelos en S3
def save_model(bucket, bucket_path, model):
    """
    Pickle an object and upload it to S3.

    Parameters:
    bucket (str): Name of the destination S3 bucket
    bucket_path (str): Key under which the pickle is stored
    model: Object to serialize (the notebook also uses this helper for
        metrics tables)
    """
    s3_credentials = config['s3']
    session = boto3.Session(
        aws_access_key_id=s3_credentials['aws_access_key_id'],
        aws_secret_access_key=s3_credentials['aws_secret_access_key'],
        aws_session_token=s3_credentials['aws_session_token'],
    )

    s3 = session.resource('s3')
    serialized = pickle.dumps(model)
    destination = s3.Object(bucket, bucket_path)
    destination.put(Body=serialized)

In [10]:
# Celda 11: Guardar Mejor Modelo
def save_best_model(bucket, model_path, model):
    """
    Save the selected model to S3.

    Parameters:
    bucket (str): Name of the S3 bucket
    model_path (str): Key under which the model is stored
    model: Model to save
    """
    # The upload logic is identical to save_model, so delegate to it instead
    # of keeping a second copy of the session and pickle code.
    save_model(bucket, model_path, model)

In [11]:
# Celda 12
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Target bucket: fixed prefix plus the `matricula` value from the credentials file.
bucket = "aplicaciones-cd-2-" + config['iexe']['matricula']

print("Cargando datos de prueba...")
# The test set is split positionally: every column but the last is treated
# as a feature, the last column as the label.
test_data = load_model(bucket, "dataset/test/test_dataset.pkl")
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

# Encode the categorical labels as integers for the metric computations.
le = LabelEncoder()
y_test_encoded = le.fit_transform(y_test)
print("Datos de prueba cargados correctamente")

# Session used to create the evaluation "folder" (an empty object whose key
# ends in '/').
session = boto3.Session(**{
    credential_key: config['s3'][credential_key]
    for credential_key in (
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
    )
})
s3 = session.resource('s3')

try:
    s3.Object(bucket, 'evaluacion/').put(Body='')
except Exception as e:
    # Best effort: report the problem and carry on with the evaluation.
    print(f"La carpeta de evaluación ya existe o hubo un error: {e}\n")
else:
    print("Carpeta de evaluación creada\n")

# Models to evaluate: short name -> S3 key of the pickled model.
models = dict(arbol='models/decision_tree_best_model.pkl')

def generate_metrics_table_modified(model, X_test, y_test):
    """
    Generate a per-threshold metrics table for integer-encoded binary labels.

    Each threshold returned by precision_recall_curve (with label 1 as the
    positive class) is applied to the second column of
    model.predict_proba(X_test); the resulting confusion matrix is
    summarized into one row of the table.

    Parameters:
    model: Trained model exposing predict_proba
    X_test: Test features
    y_test: Test labels already encoded as 0/1 (1 is the positive class)

    Returns:
    pd.DataFrame: One row per threshold with the columns threshold, tnr,
        fnr, precision, recall and f1_score. A rate whose denominator is
        zero is reported as 0.

    Raises:
    ValueError: If y_test contains values other than 0 and 1
    """
    # Validate first: predictions are always 0/1, so any other label value
    # would make the confusion matrix below meaningless.
    y_true = np.asarray(y_test)
    if not np.isin(y_true, (0, 1)).all():
        raise ValueError("y_test must be binary and encoded as 0/1")

    # Predicted score for the positive class
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Only the thresholds are needed; the metrics are recomputed per threshold
    _, _, thresholds = precision_recall_curve(y_true, y_pred_proba, pos_label=1)

    metrics_list = []
    for threshold in thresholds:
        y_pred = (y_pred_proba >= threshold).astype(int)
        # labels=[0, 1] forces a 2x2 matrix even when only one class is
        # present, so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        tnr = tn / (tn + fp) if (tn + fp) > 0 else 0  # Specificity
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # Miss rate
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0  # Precision
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0  # Recall
        f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0  # F1 score

        metrics_list.append({
            'threshold': threshold,
            'tnr': tnr,
            'fnr': fnr,
            'precision': prec,
            'recall': rec,
            'f1_score': f1
        })

    return pd.DataFrame(metrics_list)

# Minimum recall a threshold must reach to be eligible during model selection.
MIN_RECALL = 0.5

# Evaluate every configured model and store its metrics table in S3.
metrics_generated = False
metrics_paths = []  # S3 keys of the metrics tables actually written in this run
for model_name, model_path in models.items():
    try:
        print(f"Procesando modelo: {model_name}")
        model = load_model(bucket, model_path)
        print(f"Modelo {model_name} cargado correctamente")

        print(f"Generando métricas para {model_name}...")
        metrics = generate_metrics_table_modified(model, X_test, y_test_encoded)
        metrics_path = f"evaluacion/metricas_{model_name}.pkl"
        save_model(bucket, metrics_path, metrics)
        print(f"Métricas guardadas para {model_name}\n")

        # Only models whose metrics were stored take part in the selection.
        metrics_paths.append(metrics_path)
        metrics_generated = True

    except Exception as e:
        # Best effort: a failing model is reported and skipped so the
        # remaining models can still be evaluated.
        print(f"Error procesando {model_name}: {str(e)}\n")

print("Proceso de generación de métricas completado")

if not metrics_generated:
    # Raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down rather than simply stopping this cell.
    raise RuntimeError("No se pudieron generar métricas para ningún modelo")

# Pick the model/threshold with the best F1 among those meeting MIN_RECALL.
print("\nSeleccionando mejor modelo...")
best_model_info = select_best_model(MIN_RECALL, metrics_paths)

# select_best_model always returns a 3-tuple (truthy even when nothing
# qualified), so the model name is what tells whether a model was found.
best_model_name = best_model_info[0]

if best_model_name is not None:
    # Persist the selection result (name, threshold, metrics row).
    save_threshold_best_model(best_model_info, bucket)
    print("Threshold del mejor modelo guardado")

    # Copy the winning model to the selected-model location.
    try:
        # Look the winner up in the registry instead of hard-coding its path.
        selected_model = load_model(bucket, models[best_model_name])
        save_best_model(bucket, "selected-model/selected_model.pkl", selected_model)
        print("Modelo seleccionado guardado exitosamente")
    except Exception as e:
        print(f"Error guardando el modelo seleccionado: {str(e)}")
else:
    print("No se encontró un modelo que cumpla con los criterios mínimos de recall")

Cargando datos de prueba...
Datos de prueba cargados correctamente
Carpeta de evaluación creada

Procesando modelo: arbol
Modelo arbol cargado correctamente
Generando métricas para arbol...
Métricas guardadas para arbol

Proceso de generación de métricas completado

Seleccionando mejor modelo...
Threshold del mejor modelo guardado
Modelo seleccionado guardado exitosamente
