In [2]:
import json, joblib
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix

# Project root: one level above the current working directory (notebooks/).
BASE = Path.cwd().parent
DATA_PROC = BASE.joinpath("data", "processed")
MODELS_DIR = BASE.joinpath("models")

# Input tables read by this notebook and the combined tables it writes.
VAL_IN = DATA_PROC.joinpath("val_table_final.csv")
TEST_IN = DATA_PROC.joinpath("test_table_final.csv")
VAL_OUT = DATA_PROC.joinpath("val_combined.csv")
TEST_OUT = DATA_PROC.joinpath("test_combined.csv")

# Threshold tables (JSON) and the serialized model bundle.
CHI2_JSON = DATA_PROC.joinpath("threshold_pp.json")
ISO_JSON = MODELS_DIR.joinpath("threshold_iso.json")
ISO_PKL = MODELS_DIR.joinpath("iso_model.pkl")

# Threshold presets - easy to modify.
# Each preset names the percentile used to build the lookup keys
# "chi2_pp_threshold_<chi2_percentile>" and "score_threshold_p<iso_percentile>".
THRESHOLD_CONFIGS = {
    # Very conservative: few false positives.
    "conservative": dict(
        chi2_percentile=99,
        iso_percentile=1,
        description="Muy conservador - pocos falsos positivos",
    ),
    # Balance between precision and recall.
    "balanced": dict(
        chi2_percentile=95,
        iso_percentile=5,
        description="Balance entre precisión y recall",
    ),
    # More sensitive: flags more anomalies.
    "sensitive": dict(
        chi2_percentile=90,
        iso_percentile=10,
        description="Más sensible - detecta más anomalías",
    ),
    # User-defined preset.
    "custom": dict(
        chi2_percentile=97.5,
        iso_percentile=3,
        description="Configuración personalizada",
    ),
}

# Pick the active preset here: "conservative", "balanced", "sensitive" or "custom".
SELECTED_CONFIG = "balanced"

# Load the available threshold tables.
# Context managers close the file handles deterministically instead of
# leaving them open until garbage collection.
with open(CHI2_JSON, encoding="utf-8") as chi2_file:
    chi2_thresholds = json.load(chi2_file)
with open(ISO_JSON, encoding="utf-8") as iso_file:
    iso_thresholds = json.load(iso_file)

def get_threshold_values(config_name):
    """Resolve a named preset into concrete threshold values.

    Args:
        config_name: Key of ``THRESHOLD_CONFIGS`` to resolve.

    Returns:
        A dict with ``chi2_threshold`` and ``iso_threshold`` (looked up in the
        module-level ``chi2_thresholds`` / ``iso_thresholds`` tables) plus the
        preset's ``chi2_percentile``, ``iso_percentile`` and ``description``.

    Raises:
        KeyError: If ``config_name`` is not a preset, or if the key derived
            from a percentile is missing from its threshold table.
    """
    config = THRESHOLD_CONFIGS[config_name]

    # Lookup keys are derived from the preset's percentiles.
    chi2_key = f"chi2_pp_threshold_{config['chi2_percentile']}"
    iso_key = f"score_threshold_p{config['iso_percentile']}"

    # Fail with a descriptive message before touching the tables.
    if chi2_key not in chi2_thresholds:
        raise KeyError(f"Umbral chi2 no encontrado: {chi2_key}")
    if iso_key not in iso_thresholds:
        raise KeyError(f"Umbral ISO no encontrado: {iso_key}")

    resolved = dict(
        chi2_threshold=chi2_thresholds[chi2_key],
        iso_threshold=iso_thresholds[iso_key],
    )
    # Echo the preset's own fields so callers can print them.
    for field in ("chi2_percentile", "iso_percentile", "description"):
        resolved[field] = config[field]
    return resolved

def add_iso_score(df, latent_npy_path):
    """Attach Isolation Forest scores to ``df`` as the ``score_IF`` column.

    The array stored at ``latent_npy_path`` is passed through the module-level
    ``scaler`` and then scored with the module-level ``iso`` model.

    Args:
        df: DataFrame to annotate; it is modified in place.
        latent_npy_path: Path of the ``.npy`` file to load with ``np.load``.

    Returns:
        The same ``df`` object, now carrying ``score_IF``.
    """
    latent = np.load(latent_npy_path)
    df["score_IF"] = iso.score_samples(scaler.transform(latent))
    return df

def evaluate_combination(df, chi2_thresh, iso_thresh, rare_classes={64, 53}):
    """Flag events with both detectors and report metrics per combination.

    Adds the integer (0/1) columns ``is_chi2``, ``is_iso``, ``is_event_OR``,
    ``is_event_AND``, ``is_event_CHI2_ONLY`` and ``is_event_ISO_ONLY`` to
    ``df`` in place. When ``df`` has a ``true_target`` column, precision,
    recall, F1 and confusion counts for the positive class are printed for
    each combination.

    Args:
        df: DataFrame with ``chi2_pp`` and ``score_IF`` columns; modified in
            place.
        chi2_thresh: Rows with ``chi2_pp`` strictly above this are flagged.
        iso_thresh: Rows with ``score_IF`` strictly below this are flagged.
        rare_classes: Values of ``true_target`` counted as positives. The
            default set is only read, never mutated.

    Returns:
        A dict mapping each combination label to its prediction Series, or
        ``None`` when ``true_target`` is absent.
    """
    # Individual detectors.
    df["is_chi2"] = (df["chi2_pp"] > chi2_thresh).astype(int)
    # Isolation Forest: low scores mean anomalies.
    # NOTE(review): scikit-learn documents IsolationForest.score_samples as
    # returning the negated anomaly score (values <= 0). If `score_IF` comes
    # from it, a positive `iso_thresh` flags every row - confirm the sign
    # convention of the stored ISO thresholds.
    df["is_iso"] = (df["score_IF"] < iso_thresh).astype(int)

    # Combinations.
    df["is_event_OR"] = (df["is_chi2"] | df["is_iso"]).astype(int)
    df["is_event_AND"] = (df["is_chi2"] & df["is_iso"]).astype(int)
    df["is_event_CHI2_ONLY"] = df["is_chi2"]
    df["is_event_ISO_ONLY"] = df["is_iso"]

    # Metrics need ground truth; without it only the flag columns are added.
    if "true_target" not in df.columns:
        print("⚠️ Columna 'true_target' no encontrada - no se pueden calcular métricas")
        return None

    y_true = df["true_target"].isin(rare_classes).astype(int)
    is_positive = y_true == 1
    is_negative = y_true == 0

    methods = {
        "Chi2 OR ISO": df["is_event_OR"],
        "Chi2 AND ISO": df["is_event_AND"],
        "Chi2 Only": df["is_event_CHI2_ONLY"],
        "ISO Only": df["is_event_ISO_ONLY"],
    }

    n_rows = len(df)

    print("\n" + "="*60)
    print("📊 EVALUACIÓN DE COMBINACIONES DE UMBRALES")
    print("="*60)

    for method_name, y_pred in methods.items():
        print(f"\n🔍 {method_name}")
        print("-" * 30)

        # labels=[0, 1] guarantees the '1' entry exists even when neither the
        # ground truth nor the predictions contain a positive; otherwise the
        # report omits that class and report['1'] raises KeyError.
        report = classification_report(
            y_true, y_pred, labels=[0, 1], output_dict=True, zero_division=0
        )
        positive = report['1']
        precision = positive['precision']
        recall = positive['recall']
        f1 = positive['f1-score']

        # Confusion counts, computed with vectorized sums.
        flagged = y_pred == 1
        tp = int((is_positive & flagged).sum())
        fp = int((is_negative & flagged).sum())
        fn = int((is_positive & ~flagged).sum())
        tn = int((is_negative & ~flagged).sum())

        detections = int(y_pred.sum())
        # Guard the percentage against an empty frame.
        detection_pct = detections / n_rows * 100 if n_rows else 0.0

        print(f"  Precision: {precision:.3f}")
        print(f"  Recall:    {recall:.3f}")
        print(f"  F1-score:  {f1:.3f}")
        print(f"  Detecciones: {detections} ({detection_pct:.1f}%)")
        print(f"  TP:{tp:3d} | FP:{fp:3d} | FN:{fn:3d} | TN:{tn:3d}")

    return methods

def test_multiple_configs(df, latent_path, rare_classes={64, 53}):
    """Evaluate every preset in ``THRESHOLD_CONFIGS`` and print a comparison.

    The input frame is copied, scored once with ``add_iso_score``, and then
    each preset is evaluated on its own copy via ``evaluate_combination``.
    When ground truth is available, a summary table sorted by F1-score is
    printed at the end.

    Args:
        df: DataFrame to evaluate; the caller's object is not modified.
        latent_path: Path forwarded to ``add_iso_score``.
        rare_classes: Values of ``true_target`` counted as positives. The
            default set is only read, never mutated.

    Returns:
        None. All results are printed.
    """
    print("\n" + "="*80)
    print("🧪 COMPARACIÓN DE CONFIGURACIONES DE UMBRALES")
    print("="*80)

    df = add_iso_score(df.copy(), latent_path)

    # Ground truth is the same for every preset, so derive it once.
    if "true_target" in df.columns:
        y_true = df["true_target"].isin(rare_classes).astype(int)
    else:
        y_true = None

    results_summary = []

    for config_name in THRESHOLD_CONFIGS:
        # A preset whose thresholds are missing is reported and skipped so the
        # remaining presets are still evaluated.
        try:
            thresholds = get_threshold_values(config_name)

            print(f"\n🎯 CONFIGURACIÓN: {config_name.upper()}")
            print(f"📝 {thresholds['description']}")
            print(f"   Chi2 percentil {thresholds['chi2_percentile']}%: {thresholds['chi2_threshold']:.1f}")
            print(f"   ISO percentil {thresholds['iso_percentile']}%:   {thresholds['iso_threshold']:.4f}")

            # Evaluate on a copy so flag columns from one preset never leak
            # into the next.
            methods = evaluate_combination(
                df.copy(),
                thresholds['chi2_threshold'],
                thresholds['iso_threshold'],
                rare_classes,
            )

            # `methods` is None when there is no ground truth to score against.
            if not methods:
                continue

            for method_name, y_pred in methods.items():
                # labels=[0, 1] keeps the '1' entry present even when no
                # positives occur; previously the resulting KeyError was
                # caught below and misreported as a configuration error.
                report = classification_report(
                    y_true, y_pred, labels=[0, 1], output_dict=True, zero_division=0
                )
                positive = report['1']
                results_summary.append({
                    'config': config_name,
                    'method': method_name,
                    'precision': positive['precision'],
                    'recall': positive['recall'],
                    'f1': positive['f1-score'],
                    'detections': int(y_pred.sum()),
                })

        except KeyError as e:
            print(f"❌ Error en configuración {config_name}: {e}")

    # Nothing to summarize without ground truth or valid presets.
    if not results_summary:
        return

    print("\n" + "="*80)
    print("📈 RESUMEN COMPARATIVO (ordenado por F1-score)")
    print("="*80)

    summary_df = pd.DataFrame(results_summary).sort_values('f1', ascending=False)

    print(f"{'Config':<12} {'Method':<15} {'Precision':<10} {'Recall':<8} {'F1':<8} {'Detections':<11}")
    print("-" * 80)

    for _, row in summary_df.iterrows():
        print(f"{row['config']:<12} {row['method']:<15} {row['precision']:<10.3f} "
              f"{row['recall']:<8.3f} {row['f1']:<8.3f} {row['detections']:<11.0f}")

# Main execution.

# Load the serialized bundle and pull out the model and the scaler that
# add_iso_score reads as module-level names.
bundle = joblib.load(ISO_PKL)
iso, scaler = bundle["iso"], bundle["scaler"]

# List every available preset with its description.
print("🔧 CONFIGURACIONES DISPONIBLES:")
for name, config in THRESHOLD_CONFIGS.items():
    print(f"  • {name}: {config['description']}")

# Resolve the thresholds of the selected preset.
selected_thresholds = get_threshold_values(SELECTED_CONFIG)

# Report the selection; one line per entry, emitted in a single print call.
selection_lines = [
    f"\n✅ CONFIGURACIÓN SELECCIONADA: {SELECTED_CONFIG}",
    f"📝 {selected_thresholds['description']}",
    f"🎯 Chi2 umbral (percentil {selected_thresholds['chi2_percentile']}%): {selected_thresholds['chi2_threshold']:.1f}",
    f"🎯 ISO umbral (percentil {selected_thresholds['iso_percentile']}%): {selected_thresholds['iso_threshold']:.4f}",
]
print("\n".join(selection_lines))

# ===== VALIDATION SET =====
val_banner = "=" * 50
print("\n" + val_banner)
print("🔍 PROCESANDO CONJUNTO DE VALIDACIÓN")
print(val_banner)

# Option 1: evaluate only the selected configuration.
val_df = add_iso_score(pd.read_csv(VAL_IN), DATA_PROC / "latent_val.npy")
val_chi2_cut = selected_thresholds['chi2_threshold']
val_iso_cut = selected_thresholds['iso_threshold']
evaluate_combination(val_df, val_chi2_cut, val_iso_cut)

# Option 2: evaluate ALL configurations (uncomment to use).
# test_multiple_configs(pd.read_csv(VAL_IN), DATA_PROC / "latent_val.npy")

# The OR combination is the default flag that gets saved.
val_chi2_hit = val_df["chi2_pp"].gt(val_chi2_cut)
val_iso_hit = val_df["score_IF"].lt(val_iso_cut)
val_df["is_event"] = (val_chi2_hit | val_iso_hit).astype(int)

val_df.to_csv(VAL_OUT, index=False)
print(f"\n✅ Guardado {VAL_OUT}")

# ===== TEST SET =====
print("\n" + "="*50)
print("🔍 PROCESANDO CONJUNTO DE TEST")
print("="*50)

test_df = pd.read_csv(TEST_IN)
test_df = add_iso_score(test_df, DATA_PROC / "latent_test.npy")

# Apply the selected thresholds.
# NOTE(review): `score_IF` is filled from `iso.score_samples`; if `iso` is a
# scikit-learn IsolationForest those scores are <= 0, so a positive ISO
# threshold would flag every row - confirm the threshold's sign convention.
test_df["is_chi2"] = (test_df["chi2_pp"] > selected_thresholds['chi2_threshold']).astype(int)
test_df["is_iso"] = (test_df["score_IF"] < selected_thresholds['iso_threshold']).astype(int)
test_df["is_event_OR"] = (test_df["is_chi2"] | test_df["is_iso"]).astype(int)
test_df["is_event_AND"] = (test_df["is_chi2"] & test_df["is_iso"]).astype(int)

# Main column kept for compatibility; it mirrors the OR combination.
test_df["is_event99"] = test_df["is_event_OR"]

# Vectorized sums instead of the builtin sum(), which iterates the Series
# element by element in Python.
detections_or = int(test_df["is_event_OR"].sum())
detections_and = int(test_df["is_event_AND"].sum())
detections_chi2 = int(test_df["is_chi2"].sum())
detections_iso = int(test_df["is_iso"].sum())

# Denominator for the percentages; falls back to 1 so an empty test table
# prints 0.0% instead of raising ZeroDivisionError.
test_rows = len(test_df) or 1

print("📊 Detecciones en TEST:")
print(f"   Chi2 OR ISO:  {detections_or:4d} ({detections_or/test_rows*100:.1f}%)")
print(f"   Chi2 AND ISO: {detections_and:4d} ({detections_and/test_rows*100:.1f}%)")
print(f"   Solo Chi2:    {detections_chi2:4d} ({detections_chi2/test_rows*100:.1f}%)")
print(f"   Solo ISO:     {detections_iso:4d} ({detections_iso/test_rows*100:.1f}%)")

test_df.to_csv(TEST_OUT, index=False)
print(f"\n✅ Guardado {TEST_OUT}")

# Closing hints. They refer to names rather than line numbers: the previously
# hard-coded numbers (35 and 169) did not match where SELECTED_CONFIG and the
# commented-out test_multiple_configs call sit in this cell.
print(f"\n🎉 PROCESO COMPLETADO usando configuración '{SELECTED_CONFIG}'")
print("💡 Para probar otras configuraciones, cambia SELECTED_CONFIG")
print("💡 Para comparar todas las configuraciones, descomenta la llamada a test_multiple_configs")

🔧 CONFIGURACIONES DISPONIBLES:
  • conservative: Muy conservador - pocos falsos positivos
  • balanced: Balance entre precisión y recall
  • sensitive: Más sensible - detecta más anomalías
  • custom: Configuración personalizada

✅ CONFIGURACIÓN SELECCIONADA: balanced
📝 Balance entre precisión y recall
🎯 Chi2 umbral (percentil 95%): 4652.1
🎯 ISO umbral (percentil 5%): 0.3035

🔍 PROCESANDO CONJUNTO DE VALIDACIÓN

📊 EVALUACIÓN DE COMBINACIONES DE UMBRALES

🔍 Chi2 OR ISO
------------------------------
  Precision: 0.017
  Recall:    1.000
  F1-score:  0.033
  Detecciones: 785 (100.0%)
  TP: 13 | FP:772 | FN:  0 | TN:  0

🔍 Chi2 AND ISO
------------------------------
  Precision: 0.000
  Recall:    0.000
  F1-score:  0.000
  Detecciones: 40 (5.1%)
  TP:  0 | FP: 40 | FN: 13 | TN:732

🔍 Chi2 Only
------------------------------
  Precision: 0.000
  Recall:    0.000
  F1-score:  0.000
  Detecciones: 40 (5.1%)
  TP:  0 | FP: 40 | FN: 13 | TN:732

🔍 ISO Only
------------------------------
  Pre