# Calibración de Confianza

Este cuaderno permite:

1. Cargar datasets generados con `scripts/confidence/build_dataset.py`.
2. Entrenar múltiples calibradores (Platt / Isotónico) por régimen.
3. Visualizar curvas de confiabilidad y métricas (Brier / ECE).
4. Exportar los modelos aprobados como artefactos versionados.

> Ejecuta siempre este cuaderno con la misma versión del código y del dataset para garantizar la reproducibilidad.


In [None]:
import json
from datetime import datetime
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split

# Resolve the repository root: when the working directory is named
# "confidence", step one level up; otherwise use the working directory itself.
_cwd = Path.cwd()
if _cwd.name == "confidence":
    REPO_ROOT = _cwd.parents[0]
else:
    REPO_ROOT = _cwd


In [None]:
# Location of the calibration dataset. The last path segment is a placeholder
# that must be replaced with the real dataset name before loading it.
_datasets_dir = REPO_ROOT.joinpath("artifacts", "confidence", "datasets")
dataset_path = _datasets_dir / "<reemplaza_con_dataset>"
dataset_path


In [None]:
# Load the calibration dataset and fail fast when a column read by the
# training cell is missing, instead of surfacing a KeyError mid-loop.
dataset = pd.read_parquet(dataset_path, engine="pyarrow")

required_columns = {"market_regime", "confidence_norm", "hit"}
missing_columns = sorted(required_columns - set(dataset.columns))
if missing_columns:
    raise ValueError(
        f"Al dataset {dataset_path} le faltan columnas requeridas: {missing_columns}"
    )

dataset.head()


In [None]:
from app.confidence.calibrator import IsotonicCalibrator, PlattCalibrator, expected_calibration_error

# Fit every calibrator on each market regime and collect hold-out metrics.
MIN_SAMPLES_PER_REGIME = 200  # regimes with fewer rows are skipped
TEST_FRACTION = 0.2           # share of each regime held out for evaluation
SPLIT_SEED = 42               # fixed seed so the split is reproducible

results = {}
skipped_regimes = {}  # regime -> reason, reported after the loop
for regime, df_regime in dataset.groupby(dataset["market_regime"].str.lower()):
    if len(df_regime) < MIN_SAMPLES_PER_REGIME:
        skipped_regimes[regime] = (
            f"solo {len(df_regime)} muestras (< {MIN_SAMPLES_PER_REGIME})"
        )
        continue
    X = df_regime["confidence_norm"].to_numpy().reshape(-1, 1)
    y = df_regime["hit"].astype(int).to_numpy()

    # train_test_split(stratify=y) raises when a class has fewer than two rows,
    # and a single-class target leaves nothing to calibrate against.
    _, class_counts = np.unique(y, return_counts=True)
    if len(class_counts) < 2 or class_counts.min() < 2:
        skipped_regimes[regime] = "clases insuficientes para una partición estratificada"
        continue

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_FRACTION, stratify=y, random_state=SPLIT_SEED
    )

    calibrators = {
        "platt": PlattCalibrator(),
        "isotonic": IsotonicCalibrator(),
    }
    regime_results = {}
    for name, calibrator in calibrators.items():
        calibrator.fit(X_train, y_train)
        # Clip so downstream metrics and binning always see values in [0, 1].
        prob = np.clip(calibrator.predict_proba(X_test), 0.0, 1.0)
        regime_results[name] = {
            "model": calibrator,
            "brier": brier_score_loss(y_test, prob),
            "ece": expected_calibration_error(y_test, prob),
            "y_test": y_test,
            "prob": prob,
        }
    results[regime] = regime_results

# Make skipped regimes visible instead of dropping them silently.
for skipped_regime, reason in skipped_regimes.items():
    print(f"Régimen omitido '{skipped_regime}': {reason}")

results


In [None]:
def plot_reliability(regime: str, calibrator_name: str, n_bins: int = 10) -> None:
    """Plot the reliability curve of one calibrator for one regime.

    Reads the hold-out labels and predicted probabilities stored in the
    notebook-level ``results`` mapping, groups the predictions into
    equal-width probability bins over [0, 1] and plots the mean observed
    outcome against the mean predicted probability of each non-empty bin.

    Args:
        regime: Key of the regime in ``results``.
        calibrator_name: Key of the calibrator inside ``results[regime]``.
        n_bins: Number of equal-width bins (defaults to 10).

    Raises:
        KeyError: If ``regime`` or ``calibrator_name`` is not in ``results``.
    """
    payload = results[regime][calibrator_name]
    y_true = payload["y_test"]
    y_prob = payload["prob"]
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    accuracies = []
    confidences = []
    for i in range(n_bins):
        lower, upper = edges[i], edges[i + 1]
        # Bins are half-open [lower, upper) except the last one, which is
        # closed so predictions equal to 1.0 are not dropped from the curve.
        if i == n_bins - 1:
            mask = (y_prob >= lower) & (y_prob <= upper)
        else:
            mask = (y_prob >= lower) & (y_prob < upper)
        if mask.any():
            accuracies.append(np.mean(y_true[mask]))
            confidences.append(np.mean(y_prob[mask]))
    plt.figure(figsize=(6, 6))
    # Diagonal reference: a perfectly calibrated model lies on this line.
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.plot(confidences, accuracies, marker="o")
    plt.title(f"Reliability Curve | {regime} | {calibrator_name}")
    plt.xlabel("Confidence")
    plt.ylabel("Accuracy")
    plt.grid(True)
    plt.show()

# Preview the Platt curve of the first calibrated regime. Skip the plot when
# no regime was calibrated so the cell does not fail with an IndexError.
if results:
    plot_reliability(regime=next(iter(results)), calibrator_name="platt")
else:
    print("No hay regímenes calibrados; no se puede graficar.")


In [None]:
# Generate the full HTML validation report: a per-regime summary table plus
# the reliability curve of the best calibrator, embedded as base64 PNG images.
import base64
import html
from datetime import datetime, timezone
from io import BytesIO

import matplotlib
matplotlib.use('Agg')  # Render figures without a display


def _best_candidate(candidates):
    """Return the ``(name, payload)`` pair with the lowest ECE."""
    return min(candidates.items(), key=lambda kv: kv[1]["ece"])


def _reliability_points(y_true, y_prob, n_bins=10):
    """Return ``(confidences, accuracies)`` for every non-empty probability bin.

    Bins are half-open ``[lower, upper)`` except the last one, which is closed
    so predictions equal to 1.0 are not dropped from the curve.
    """
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    confidences = []
    accuracies = []
    for i in range(n_bins):
        lower, upper = edges[i], edges[i + 1]
        if i == n_bins - 1:
            mask = (y_prob >= lower) & (y_prob <= upper)
        else:
            mask = (y_prob >= lower) & (y_prob < upper)
        if mask.any():
            confidences.append(np.mean(y_prob[mask]))
            accuracies.append(np.mean(y_true[mask]))
    return confidences, accuracies


def _status(ece, brier):
    """Return the ``(css_class, label)`` pair shown in the Status column."""
    if ece < 0.05 and brier < 0.08:
        return "good", "✓ Good"
    if ece < 0.10:
        return "warning", "⚠ Warning"
    return "bad", "✗ Poor"


# One timestamp shared by the title, the body and the file name.
# datetime.utcnow() is deprecated; dropping tzinfo afterwards keeps the
# rendered strings in the same naive-UTC format as before.
generated_at = datetime.now(timezone.utc).replace(tzinfo=None)
title_stamp = generated_at.strftime('%Y-%m-%d %H:%M:%S')
iso_stamp = generated_at.isoformat()
file_stamp = generated_at.strftime('%Y%m%d_%H%M%S')

# Pick the winner of every regime once; both report sections reuse it.
best_by_regime = {
    regime: _best_candidate(candidates) for regime, candidates in results.items()
}

# Collect fragments and join once at the end instead of repeated `+=`.
report_parts = [f"""
<!DOCTYPE html>
<html>
<head>
    <title>Confidence Calibration Report - {title_stamp} UTC</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1 {{ color: #333; }}
        h2 {{ color: #666; margin-top: 30px; }}
        table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #f2f2f2; }}
        .metric {{ font-weight: bold; }}
        .good {{ color: green; }}
        .warning {{ color: orange; }}
        .bad {{ color: red; }}
        img {{ max-width: 100%; height: auto; margin: 10px 0; }}
    </style>
</head>
<body>
    <h1>Confidence Calibration Validation Report</h1>
    <p><strong>Generated:</strong> {iso_stamp} UTC</p>
    <p><strong>Dataset:</strong> {html.escape(str(dataset_path))}</p>
    
    <h2>Summary by Regime</h2>
    <table>
        <tr>
            <th>Regime</th>
            <th>Calibrator</th>
            <th>Brier Score</th>
            <th>ECE</th>
            <th>Test Samples</th>
            <th>Status</th>
        </tr>
"""]

for regime, (name, payload) in best_by_regime.items():
    brier = payload["brier"]
    ece = payload["ece"]
    n_samples = len(payload["y_test"])
    status_class, status_text = _status(ece, brier)

    # Regime and calibrator names come from data, so escape them before
    # interpolating into markup.
    report_parts.append(f"""
        <tr>
            <td>{html.escape(str(regime))}</td>
            <td>{html.escape(str(name))}</td>
            <td class="metric">{brier:.4f}</td>
            <td class="metric">{ece:.4f}</td>
            <td>{n_samples}</td>
            <td class="{status_class}">{status_text}</td>
        </tr>
    """)

report_parts.append("""
    </table>
    
    <h2>Reliability Curves</h2>
""")

# Render one reliability curve per regime and embed it as a base64 PNG.
for regime, (name, payload) in best_by_regime.items():
    confidences, accuracies = _reliability_points(payload["y_test"], payload["prob"])

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Perfect Calibration")
        ax.plot(confidences, accuracies, marker="o", label=f"{name} (ECE={payload['ece']:.4f})")
        ax.set_xlabel("Confidence")
        ax.set_ylabel("Accuracy")
        ax.set_title(f"Reliability Curve | {regime} | {name}")
        ax.grid(True)
        ax.legend()

        buf = BytesIO()
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close(fig)
    img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')

    regime_html = html.escape(str(regime))
    name_html = html.escape(str(name))
    report_parts.append(f"""
    <h3>{regime_html} - {name_html}</h3>
    <img src="data:image/png;base64,{img_base64}" alt="Reliability Curve {regime_html} {name_html}" />
    """)

report_parts.append("""
</body>
</html>
""")

html_report = "".join(report_parts)

# Save the report
report_path = REPO_ROOT / "artifacts" / "confidence" / f"validation_report_{file_stamp}.html"
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(html_report, encoding='utf-8')
print(f"Reporte HTML guardado en: {report_path}")


In [None]:
# Export, for every regime, the calibrator with the lowest ECE together with
# its metadata (as a joblib bundle and as a standalone JSON file).
from datetime import datetime, timezone

artifacts_root = REPO_ROOT / "artifacts" / "confidence"
artifacts_root.mkdir(parents=True, exist_ok=True)

# One timestamp for the whole export batch. datetime.utcnow() is deprecated;
# dropping tzinfo keeps "created_at" in the same naive-UTC ISO format as before.
created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

for regime, candidates in results.items():
    name, payload = min(candidates.items(), key=lambda kv: kv[1]["ece"])
    metadata = {
        "regime": regime,
        "calibrator_type": name,
        "brier": float(payload["brier"]),  # cast to plain float for JSON
        "ece": float(payload["ece"]),
        "created_at": created_at,
    }
    target_dir = artifacts_root / regime
    target_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump({"calibrator": payload["model"], "metadata": metadata}, target_dir / "calibrator.pkl")
    (target_dir / "metadata.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")

# Do not claim success when there was nothing to export.
if results:
    print("Artefactos actualizados.")
else:
    print("No se exportó ningún artefacto: no hay regímenes calibrados.")
