In [1]:
# -*- coding: utf-8 -*-
"""
AIB pre-falla desde Excel multi-sheet
- Cada hoja = un pozo (sheet name = well_id)
- Columnas en cada hoja: Fecha, VIBR
- Si alguna fila trae 'FALLA' en VIBR, se usa como evento y se remueve de la señal
- Resampleo (30 min), features rolling, etiquetado pre-falla (72h), modelo y métricas

Requisitos: pandas, numpy, scikit-learn
"""

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

# ===================== CONFIG =====================
INPUT_XLSX = "vibraciones_multi.xlsx"  # <<< your Excel file
RESAMPLE_RULE = "30min"                # e.g. "15min", "1h"
# Rolling window used for the features. Lowercase "h" on purpose: the
# uppercase "H" alias is deprecated in recent pandas releases.
FEATURE_WINDOW = "12h"
FEATURE_MIN_SAMPLES = 6                # min_periods of the rolling window
ALERTA_HORAS = 72                      # pre-failure horizon labelled as positive
COOLDOWN_HORAS = 6                     # post-failure hours excluded from the labels
N_SPLITS = 4                           # upper bound for the GroupKFold folds
UMBRAL_ALERTA = 0.35                   # probability threshold for the printed report
SEED = 42

OUT_PROBAS_CSV = "aib_pred_probas.csv"
OUT_IMPORTANCES_CSV = "aib_feature_importances.csv"
OUT_EVENTOS_DETECTADOS = "aib_eventos_detectados.csv"
# ==================================================


def cargar_excel_multipozo(path_xlsx: str):
    """Read every sheet of the workbook as one well.

    The sheet name becomes ``well_id``. Each sheet must expose a time column
    (``fecha`` / ``timestamp`` / ``datetime``) and a vibration column
    (``vibr`` / ``vibraciones`` / ``valor`` / ``value``), matched
    case-insensitively after stripping whitespace.

    Rows whose vibration cell contains the text ``FALLA`` are recorded as
    failure events and removed from the signal.

    Returns:
        (vib, ev): ``vib`` has columns ``well_id``, ``timestamp``, ``vibr``
        sorted by well and time; ``ev`` has columns ``well_id`` and
        ``failure_time`` (an empty DataFrame when no failure was found).

    Raises:
        ValueError: if a non-empty sheet lacks recognizable columns, or if no
            sheet yields any numeric vibration sample.
    """
    # dtype=str keeps the 'FALLA' markers readable next to the numeric cells.
    xls = pd.read_excel(path_xlsx, sheet_name=None, dtype=str)
    frames = []
    eventos = []

    for sheet_name, df in xls.items():
        if df is None or df.empty:
            continue

        # Basic column-name normalization
        df = df.copy()
        df.columns = [str(c).strip() for c in df.columns]
        colmap = {}
        for c in df.columns:
            cl = c.lower()
            if cl in ("fecha", "timestamp", "datetime"):
                colmap[c] = "timestamp"
            elif cl in ("vibr", "vibraciones", "valor", "value"):
                colmap[c] = "vibr"
        df = df.rename(columns=colmap)

        if "timestamp" not in df.columns or "vibr" not in df.columns:
            raise ValueError(
                f"La hoja '{sheet_name}' no tiene columnas reconocibles de Fecha/VIBR."
            )

        # Day-first parsing (dd/mm/yyyy hh:mm); unparseable cells become NaT
        df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True, errors="coerce")

        # Failure markers: keep only those with a valid timestamp
        falla_mask = df["vibr"].astype(str).str.contains("FALLA", case=False, na=False)
        for t in df.loc[falla_mask, "timestamp"]:
            if pd.notna(t):
                eventos.append({"well_id": sheet_name, "failure_time": t})

        # Keep only rows with a numeric vibration value and a valid timestamp
        df_num = df.loc[~falla_mask].copy()
        df_num["vibr"] = pd.to_numeric(df_num["vibr"], errors="coerce")
        df_num = df_num.dropna(subset=["timestamp", "vibr"])

        if df_num.empty:
            continue

        df_num.insert(0, "well_id", sheet_name)
        frames.append(df_num[["well_id", "timestamp", "vibr"]])

    # pd.concat([]) would fail with an unrelated-looking message; be explicit.
    if not frames:
        raise ValueError("No se encontraron datos numéricos de vibración en el Excel.")

    vib = pd.concat(frames, ignore_index=True).sort_values(["well_id", "timestamp"])
    ev = pd.DataFrame(eventos)
    if not ev.empty:
        ev["failure_time"] = pd.to_datetime(ev["failure_time"])
    return vib, ev


def resamplear_y_features(vib: pd.DataFrame, rule=None, window=None,
                          min_samples=None) -> pd.DataFrame:
    """Resample each well to a regular grid and compute rolling features.

    Args:
        vib: samples with columns ``well_id``, ``timestamp`` and ``vibr``.
        rule: resampling frequency; defaults to ``RESAMPLE_RULE``.
        window: time-based rolling window; defaults to ``FEATURE_WINDOW``.
        min_samples: ``min_periods`` of the rolling window; defaults to
            ``FEATURE_MIN_SAMPLES``.

    Returns:
        One row per resampled timestamp and well (column ``t_end``), sorted by
        well and time. Rows where the basic statistics (mean, std, rms, p2p)
        are undefined are dropped.

    Raises:
        ValueError: if ``vib`` contains no wells.
    """
    rule = RESAMPLE_RULE if rule is None else rule
    window = FEATURE_WINDOW if window is None else window
    min_samples = FEATURE_MIN_SAMPLES if min_samples is None else min_samples

    # The helpers receive plain ndarrays (raw=True below), so x[-1] / x[0] are
    # positional. On a datetime-indexed Series that lookup is deprecated.
    def _mean_sq(x):
        return np.mean(np.square(x)) if len(x) > 0 else np.nan

    def _mad(x):
        return np.median(np.abs(x - np.median(x))) if len(x) > 0 else np.nan

    def _diff_mean(x):
        return np.mean(np.abs(np.diff(x))) if len(x) > 1 else np.nan

    def _slope(x):
        n = len(x)
        return np.nan if n < 2 else (x[-1] - x[0]) / (n - 1)

    feats = []
    for well, g in vib.groupby("well_id", sort=False):
        g = g.sort_values("timestamp").set_index("timestamp")
        # Uniform time grid: median per bin, gaps filled by time interpolation
        gs = g["vibr"].resample(rule).median()
        gs = gs.interpolate(method="time", limit_direction="both")

        r = gs.rolling(window, min_periods=min_samples)

        # Robust features
        f = pd.DataFrame({
            "well_id": well,
            "t_end": gs.index,
            "vib_last": gs.values,
            "vib_mean": r.mean().values,
            "vib_std": r.std().values,
            "vib_rms": np.sqrt(r.apply(_mean_sq, raw=True).values),
            "vib_p2p": (r.max() - r.min()).values,
            "vib_iqr": (r.quantile(0.75) - r.quantile(0.25)).values,
            "vib_med": r.median().values,
            "vib_mad": r.apply(_mad, raw=True).values,
            "vib_diff_mean": r.apply(_diff_mean, raw=True).values,
        })

        # Robust z-score (MAD == 0 yields NaN instead of inf) and rough slope
        f["vib_rz"] = (f["vib_last"] - f["vib_med"]) / f["vib_mad"].replace(0, np.nan)
        f["vib_slope"] = r.apply(_slope, raw=True).values
        f["vib_cv"] = f["vib_std"] / (f["vib_mean"].abs() + 1e-9)

        f = f.dropna(subset=["vib_mean", "vib_std", "vib_rms", "vib_p2p"])
        feats.append(f)

    if not feats:
        raise ValueError("No se pudieron construir features (¿pocos datos o NaNs?)")

    F = pd.concat(feats, ignore_index=True).sort_values(["well_id", "t_end"])
    return F


def etiquetar(F: pd.DataFrame, eventos: pd.DataFrame,
              alerta_h=72, cooldown_h=6) -> pd.DataFrame:
    """Label pre-failure windows and drop the post-failure cooldown.

    ``y = 1`` when ``t_end`` lies in ``[failure_time - alerta_h, failure_time]``
    (both ends included) for any failure of the same well. Rows in
    ``(failure_time, failure_time + cooldown_h]`` are removed. The cooldown
    always wins, even when a later failure's positive window overlaps it.

    Args:
        F: feature table with at least ``well_id`` and ``t_end``.
        eventos: failures with columns ``well_id`` and ``failure_time``.
        alerta_h: pre-failure horizon in hours.
        cooldown_h: post-failure exclusion in hours.

    Returns:
        A copy of ``F`` with an integer ``y`` column and ``min_dt_h``, the
        absolute distance in hours to the nearest failure of the well
        (``inf`` when the well has none).
    """
    F = F.copy()
    F["y"] = 0
    F["min_dt_h"] = np.inf

    if eventos is None or eventos.empty:
        print("Aviso: no hay eventos de falla detectados.")
        return F.dropna(subset=["vib_mean"])

    # Work on positional arrays: no NaN is ever written into the integer label
    # column, and exclusions are applied once at the end so they cannot be
    # overwritten by another event's positive window.
    n = len(F)
    y = np.zeros(n, dtype=int)
    excluir = np.zeros(n, dtype=bool)
    min_dt = np.full(n, np.inf)

    eventos = eventos.sort_values(["well_id", "failure_time"])
    for well, ge in eventos.groupby("well_id"):
        filas = np.flatnonzero((F["well_id"] == well).to_numpy())
        if filas.size == 0:
            continue
        t_end = F["t_end"].iloc[filas]

        for ft in ge["failure_time"]:
            if pd.isna(ft):
                continue
            # Hours from each window end to the failure (negative = after it)
            dt_h = ((ft - t_end).dt.total_seconds() / 3600.0).to_numpy()

            y[filas[(dt_h >= 0) & (dt_h <= alerta_h)]] = 1
            excluir[filas[(dt_h < 0) & (dt_h >= -cooldown_h)]] = True
            min_dt[filas] = np.minimum(min_dt[filas], np.abs(dt_h))

    F["y"] = y
    F["min_dt_h"] = min_dt
    return F.loc[~excluir].copy()


def entrenar_y_evaluar(DS: pd.DataFrame,
                       umbral=0.35, n_splits=4, seed=42):
    """Cross-validate grouped by well, print metrics and fit a final model.

    Every column except the identifiers, the label, ``min_dt_h``, ``vib_med``
    and ``vib_mad`` is used as a feature. Balanced class weights are passed as
    sample weights. Pooled out-of-fold predictions feed the printed ROC-AUC,
    PR-AUC and classification report (the latter thresholded at ``umbral``).
    A final model is then fitted on all rows, and a correlation-based
    importance proxy is written to ``OUT_IMPORTANCES_CSV``.

    Returns:
        (model_full, feat_cols)
    """
    no_features = {"well_id", "t_end", "y", "min_dt_h", "vib_med", "vib_mad"}
    feat_cols = [col for col in DS.columns if col not in no_features]
    groups = DS["well_id"].astype(str).values

    classes = np.unique(DS["y"])
    balanced = compute_class_weight(class_weight="balanced", classes=classes, y=DS["y"])
    class_weight = {int(label): float(weight) for label, weight in zip(classes, balanced)}

    def nuevo_modelo():
        # Same hyper-parameters for the fold models and the final model
        return HistGradientBoostingClassifier(
            learning_rate=0.08, max_iter=500, min_samples_leaf=20, random_state=seed
        )

    def pesos(labels):
        return labels.map(lambda yy: class_weight[int(yy)]).values

    n_folds = max(2, min(n_splits, DS["well_id"].nunique()))
    splitter = GroupKFold(n_splits=n_folds)
    y_true_all, y_pred_all, y_proba_all = [], [], []

    for train_idx, test_idx in splitter.split(DS[feat_cols], DS["y"], groups):
        train = DS.iloc[train_idx]
        test = DS.iloc[test_idx]

        fold_model = nuevo_modelo()
        fold_model.fit(train[feat_cols], train["y"], sample_weight=pesos(train["y"]))

        proba = fold_model.predict_proba(test[feat_cols])[:, 1]
        y_true_all += test["y"].tolist()
        y_pred_all += (proba >= umbral).astype(int).tolist()
        y_proba_all += proba.tolist()

    print("ROC-AUC:", round(roc_auc_score(y_true_all, y_proba_all), 4))
    print("PR-AUC :", round(average_precision_score(y_true_all, y_proba_all), 4))
    print(classification_report(y_true_all, y_pred_all, digits=3))

    # Final model, used by the caller to export probabilities
    model_full = nuevo_modelo()
    model_full.fit(DS[feat_cols], DS["y"], sample_weight=pesos(DS["y"]))

    # Importance proxy: plain correlation of each median-filled feature with y
    corrs = {}
    with np.errstate(invalid="ignore"):
        for col in feat_cols:
            filled = DS[col].fillna(DS[col].median())
            corrs[col] = np.corrcoef(filled, DS["y"])[0, 1]
    imp = pd.DataFrame({"feature": list(corrs.keys()), "corr_with_y": list(corrs.values())})
    imp = imp.sort_values("corr_with_y", ascending=False)
    imp.to_csv(OUT_IMPORTANCES_CSV, index=False)
    print(f"[OK] Guardé importancias proxy en {OUT_IMPORTANCES_CSV}")

    return model_full, feat_cols


def main():
    """Run the pipeline: load, build features, label, train, export probabilities."""
    path = Path(INPUT_XLSX)
    assert path.exists(), f"No encontré el archivo: {path}"

    vib, ev = cargar_excel_multipozo(str(path))
    n_series = vib["well_id"].nunique()
    n_eventos = 0 if ev is None else len(ev)
    print(f"Cargadas {n_series} series | {len(vib):,} muestras")
    print(f"Eventos detectados en hojas: {n_eventos}")

    hay_eventos = ev is not None and not ev.empty
    if hay_eventos:
        eventos_ordenados = ev.sort_values(["well_id", "failure_time"])
        eventos_ordenados.to_csv(OUT_EVENTOS_DETECTADOS, index=False)
        print(f"[OK] Guardé eventos detectados en {OUT_EVENTOS_DETECTADOS}")

    feats = resamplear_y_features(vib)
    DS = etiquetar(feats, ev, alerta_h=ALERTA_HORAS, cooldown_h=COOLDOWN_HORAS)

    # Nothing to learn from without at least one positive row
    if DS["y"].sum() == 0:
        print("No hay positivos etiquetados. Revisá ALERTA_HORAS o las marcas de FALLA.")
        return

    model, feat_cols = entrenar_y_evaluar(
        DS, umbral=UMBRAL_ALERTA, n_splits=N_SPLITS, seed=SEED
    )

    # Score every row with the final model and export the table
    DS = DS.copy()
    DS["proba"] = model.predict_proba(DS[feat_cols])[:, 1]
    DS.to_csv(OUT_PROBAS_CSV, index=False)
    print(f"[OK] Guardé probabilidades por timestamp en {OUT_PROBAS_CSV}")


if __name__ == "__main__":
    main()


Cargadas 5 series | 9,058 muestras
Eventos detectados en hojas: 5
[OK] Guardé eventos detectados en aib_eventos_detectados.csv


  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  return np.nan if n < 2 else (x[-1] - x[0]) / (n - 1)
  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  return np.nan if n < 2 else (x[-1] - x[0]) / (n - 1)
  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  return np.nan if n < 2 else (x[-1] - x[0]) / (n - 1)
  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  return np.nan if n < 2 else (x[-1] - x[0]) / (n - 1)
  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  return np.nan if n < 2 else (x[-1] - x[0]) / (n - 1)


ROC-AUC: 0.5608
PR-AUC : 0.0716
              precision    recall  f1-score   support

           0      0.942     0.945     0.943     11527
           1      0.068     0.064     0.066       720

    accuracy                          0.893     12247
   macro avg      0.505     0.504     0.505     12247
weighted avg      0.890     0.893     0.892     12247

[OK] Guardé importancias proxy en aib_feature_importances.csv
[OK] Guardé probabilidades por timestamp en aib_pred_probas.csv


In [5]:
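# Works reasonably well: it raises alerts ahead of each failure. NOTE: the event coverage below is measured on the same data the final model was trained on, so it is optimistic.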
#
# -*- coding: utf-8 -*-
"""
AIB - Versión única (corregida, sin 'global') — de Excel multi-hoja a alertas.csv

Qué hace:
1) Lee 'vibraciones_multi.xlsx' (cada hoja = pozo, columnas: Fecha y VIBR).
   - Filas con "FALLA" en VIBR marcan la fecha de la caída y NO se usan como señal.
2) Resamplea a 30 minutos y calcula indicadores rolling 24 h.
3) Etiqueta como "pre-falla" las ventanas dentro de las 48 h previas a cada falla.
4) Entrena y evalúa un modelo (HistGradientBoosting) con validación por pozo.
5) Genera probabilidades por tiempo y arma alertas con: umbral, 2 consecutivas, cooldown.
6) Guarda:
   - alertas.csv
   - aib_pred_probas.csv
   - aib_feature_importances.csv
   - aib_eventos_detectados.csv

Requisitos:
    pip install pandas numpy scikit-learn openpyxl
"""

import argparse
from pathlib import Path
from datetime import timedelta
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

# ===================== DEFAULTS (change them here or through the CLI) =====================
INPUT_XLSX = "vibraciones_multi.xlsx"
DEFAULT_RESAMPLE_RULE = "30min"     # "15min", "1h", etc.
# Lowercase "h": the uppercase "H" alias is deprecated in recent pandas.
DEFAULT_FEATURE_WINDOW = "24h"      # feature window (24 h recommended)
DEFAULT_FEATURE_MIN_SAMPLES = 6     # minimum valid points inside the window
DEFAULT_ALERTA_HORAS = 48           # pre-failure horizon to detect
COOLDOWN_HORAS = 6                  # post-failure cooldown used when labelling
N_SPLITS = 4                        # validation folds (grouped by well)
SEED = 42

# Alert rule
DEFAULT_UMBRAL_ALERTA = 0.70        # high threshold to reduce false alerts
DEFAULT_CONSECUTIVAS = 2            # require 2 consecutive readings >= threshold
DEFAULT_COOLDOWN_ALERTA_H = 6       # no second alert for the same well within X hours

# Output files
OUT_ALERTAS = "alertas.csv"
OUT_PROBAS_CSV = "aib_pred_probas.csv"
OUT_IMPORTANCES_CSV = "aib_feature_importances.csv"
OUT_EVENTOS_DETECTADOS = "aib_eventos_detectados.csv"
# =====================================================================================


def cargar_excel_multipozo(path_xlsx: str):
    """Load every sheet of the workbook; the sheet name is the ``well_id``.

    Each sheet needs a time column and a vibration column. When no header is
    recognized at all, the first two columns are taken as time and vibration.
    Rows whose vibration cell contains ``FALLA`` become failure events and are
    left out of the signal.

    Returns:
        (vib, ev): numeric samples (``well_id``, ``timestamp``, ``vibr``) and
        failure events (``well_id``, ``failure_time``).

    Raises:
        ValueError: if a sheet has no usable columns or no sheet has numeric data.
    """
    nombres_tiempo = ("fecha", "timestamp", "datetime")
    nombres_vibr = ("vibr", "vibraciones", "vibracion", "valor", "value", "vibr.")

    hojas = pd.read_excel(path_xlsx, sheet_name=None, dtype=str)
    frames, eventos = [], []

    for sheet_name, hoja in hojas.items():
        if hoja is None or hoja.empty:
            continue

        hoja = hoja.copy()
        hoja.columns = [str(col).strip() for col in hoja.columns]

        # Map the recognized headers onto the canonical names
        renombres = {}
        for col in hoja.columns:
            nombre = col.lower()
            if nombre in nombres_tiempo:
                renombres[col] = "timestamp"
            elif nombre in nombres_vibr or nombre.startswith("vibr"):
                renombres[col] = "vibr"

        if renombres:
            hoja = hoja.rename(columns=renombres)
        elif len(hoja.columns) >= 2:
            # Fallback: first column = time, second = vibration
            hoja = hoja.rename(columns={hoja.columns[0]: "timestamp", hoja.columns[1]: "vibr"})

        if not {"timestamp", "vibr"}.issubset(hoja.columns):
            raise ValueError(f"La hoja '{sheet_name}' no tiene columnas reconocibles de Fecha/VIBR.")

        # Day-first parsing (dd/mm/yyyy hh:mm); bad cells become NaT
        hoja["timestamp"] = pd.to_datetime(hoja["timestamp"], dayfirst=True, errors="coerce")

        # Failure markers with a valid timestamp become events
        es_falla = hoja["vibr"].astype(str).str.contains("FALLA", case=False, na=False)
        eventos.extend(
            {"well_id": sheet_name, "failure_time": momento}
            for momento in hoja.loc[es_falla, "timestamp"]
            if pd.notna(momento)
        )

        # Numeric rows only
        numericas = hoja.loc[~es_falla].copy()
        numericas["vibr"] = pd.to_numeric(numericas["vibr"], errors="coerce")
        numericas = numericas.dropna(subset=["timestamp", "vibr"])
        if numericas.empty:
            continue

        numericas.insert(0, "well_id", sheet_name)
        frames.append(numericas[["well_id", "timestamp", "vibr"]])

    if len(frames) == 0:
        raise ValueError("No se encontraron datos numéricos de vibración en el Excel.")

    vib = pd.concat(frames, ignore_index=True)
    vib = vib.sort_values(["well_id", "timestamp"])
    ev = pd.DataFrame(eventos)
    if not ev.empty:
        ev["failure_time"] = pd.to_datetime(ev["failure_time"], errors="coerce")
    return vib, ev


def features_por_pozo(g: pd.DataFrame,
                      rule: str,
                      win: str,
                      min_samp: int) -> pd.DataFrame:
    """Resample one well and compute its rolling features.

    Args:
        g: samples of a single well with columns ``timestamp`` and ``vibr``.
        rule: resampling frequency.
        win: time-based rolling window.
        min_samp: minimum number of observations per window.

    Returns:
        One row per resampled timestamp (``t_end``); rows where mean, std, rms
        or peak-to-peak are undefined are dropped.
    """
    serie = g.sort_values("timestamp").set_index("timestamp")["vibr"]
    serie = serie.resample(rule).median()
    serie = serie.interpolate(method="time", limit_direction="both")

    def ventana(s):
        # Same window settings for every derived series
        return s.rolling(win, min_periods=min_samp)

    base = ventana(serie)
    media = base.mean()
    desvio = base.std()
    delta = serie.diff()

    columnas = {
        "t_end": serie.index,
        "vib_last": serie.values,
        "vib_mean": media.values,
        "vib_std": desvio.values,
        "vib_rms": (ventana(serie.pow(2)).mean() ** 0.5).values,
        "vib_p2p": (base.max() - base.min()).values,
        "vib_diff_mean": ventana(delta.abs()).mean().values,
        "vib_slope": ventana(delta).mean().values,
        "vib_cv": (desvio / (media.abs() + 1e-9)).values,
    }
    tabla = pd.DataFrame(columnas)
    return tabla.dropna(subset=["vib_mean", "vib_std", "vib_rms", "vib_p2p"])


def construir_dataset(vib: pd.DataFrame,
                      rule: str,
                      feature_window: str,
                      min_samp: int) -> pd.DataFrame:
    """Build the feature table for every well.

    Calls ``features_por_pozo`` once per ``well_id`` group, tags each
    non-empty result with its well and stacks them, sorted by well and time.

    Raises:
        ValueError: if no well yields any feature row.
    """
    por_pozo = []
    for pozo, grupo in vib.groupby("well_id", sort=False):
        tabla = features_por_pozo(grupo, rule=rule, win=feature_window, min_samp=min_samp)
        if tabla.empty:
            continue
        tabla.insert(0, "well_id", pozo)
        por_pozo.append(tabla)

    if len(por_pozo) == 0:
        raise ValueError("No se pudieron construir features (¿pocos datos o NaNs?)")

    dataset = pd.concat(por_pozo, ignore_index=True)
    return dataset.sort_values(["well_id", "t_end"])


def etiquetar(F: pd.DataFrame, eventos: pd.DataFrame,
              alerta_h: int, cooldown_h: int) -> pd.DataFrame:
    """Label pre-failure windows and drop the post-failure cooldown.

    ``y = 1`` when ``t_end`` lies in ``[failure_time - alerta_h, failure_time]``
    (both ends included) for any failure of the same well. Rows in
    ``(failure_time, failure_time + cooldown_h]`` are removed. The cooldown
    always wins, even when a later failure's positive window overlaps it.

    Args:
        F: feature table with at least ``well_id`` and ``t_end``.
        eventos: failures with columns ``well_id`` and ``failure_time``.
        alerta_h: pre-failure horizon in hours.
        cooldown_h: post-failure exclusion in hours.

    Returns:
        A copy of ``F`` with an integer ``y`` column and ``min_dt_h``, the
        absolute distance in hours to the nearest failure of the well
        (``inf`` when the well has none).
    """
    F = F.copy()
    F["y"] = 0
    F["min_dt_h"] = np.inf

    if eventos is None or eventos.empty:
        print("Aviso: no hay eventos de falla detectados.")
        return F.dropna(subset=["vib_mean"])

    # Positional arrays: no NaN is written into the integer label column, and
    # exclusions are applied once at the end so another event's positive
    # window cannot overwrite them.
    n = len(F)
    y = np.zeros(n, dtype=int)
    excluir = np.zeros(n, dtype=bool)
    min_dt = np.full(n, np.inf)

    eventos = eventos.sort_values(["well_id", "failure_time"])
    for well, ge in eventos.groupby("well_id"):
        filas = np.flatnonzero((F["well_id"] == well).to_numpy())
        if filas.size == 0:
            continue
        t_end = F["t_end"].iloc[filas]

        for ft in ge["failure_time"]:
            if pd.isna(ft):
                continue
            # Hours from each window end to the failure (negative = after it)
            dt_h = ((ft - t_end).dt.total_seconds() / 3600.0).to_numpy()

            # Positive (pre-failure) windows
            y[filas[(dt_h >= 0) & (dt_h <= alerta_h)]] = 1
            # Post-failure cooldown (excluded)
            excluir[filas[(dt_h < 0) & (dt_h >= -cooldown_h)]] = True
            # Minimum time distance to any failure (informational)
            min_dt[filas] = np.minimum(min_dt[filas], np.abs(dt_h))

    F["y"] = y
    F["min_dt_h"] = min_dt
    return F.loc[~excluir].copy()


def entrenar_y_evaluar(DS: pd.DataFrame,
                       n_splits: int,
                       seed: int):
    """Cross-validate grouped by well, print metrics and fit a final model.

    Every column except ``well_id``, ``t_end``, ``y`` and ``min_dt_h`` is a
    feature. Balanced class weights are passed as sample weights. The printed
    classification report uses a fixed 0.50 reference threshold. After
    validation a final model is fitted on all rows and a correlation-based
    importance proxy is written to ``OUT_IMPORTANCES_CSV``.

    Returns:
        (model_full, feat_cols)
    """
    no_features = {"well_id", "t_end", "y", "min_dt_h"}
    feat_cols = [col for col in DS.columns if col not in no_features]

    groups = DS["well_id"].astype(str).values
    classes = np.unique(DS["y"])
    balanced = compute_class_weight(class_weight="balanced", classes=classes, y=DS["y"])
    class_weight = {int(label): float(weight) for label, weight in zip(classes, balanced)}

    def nuevo_modelo():
        # Same hyper-parameters for the fold models and the final model
        return HistGradientBoostingClassifier(
            learning_rate=0.08, max_iter=500, min_samples_leaf=20, random_state=seed
        )

    def pesos(labels):
        return labels.map(lambda yy: class_weight[int(yy)]).values

    # Reference threshold used only for the printed report (not for operation)
    umbral_ref = 0.50

    n_folds = max(2, min(n_splits, DS["well_id"].nunique()))
    splitter = GroupKFold(n_splits=n_folds)
    y_true_all, y_pred_all, y_proba_all = [], [], []

    for train_idx, test_idx in splitter.split(DS[feat_cols], DS["y"], groups):
        train = DS.iloc[train_idx]
        test = DS.iloc[test_idx]

        fold_model = nuevo_modelo()
        fold_model.fit(train[feat_cols], train["y"], sample_weight=pesos(train["y"]))

        proba = fold_model.predict_proba(test[feat_cols])[:, 1]
        y_proba_all += proba.tolist()
        y_true_all += test["y"].tolist()
        y_pred_all += (proba >= umbral_ref).astype(int).tolist()

    print("\n=== MÉTRICAS (validación por pozo) ===")
    try:
        print("ROC-AUC:", round(roc_auc_score(y_true_all, y_proba_all), 4))
        print("PR-AUC :", round(average_precision_score(y_true_all, y_proba_all), 4))
    except Exception as e:
        print("No se pudieron calcular AUCs:", e)
    print("Reporte con umbral de referencia 0.50 (solo informativo):")
    print(classification_report(y_true_all, y_pred_all, digits=3))

    # Final model trained on the WHOLE dataset
    model_full = nuevo_modelo()
    model_full.fit(DS[feat_cols], DS["y"], sample_weight=pesos(DS["y"]))

    # Importance proxy: plain correlation of each median-filled feature with y
    corrs = {}
    with np.errstate(invalid="ignore"):
        for col in feat_cols:
            filled = DS[col].fillna(DS[col].median())
            corrs[col] = np.corrcoef(filled, DS["y"])[0, 1]
    imp = pd.DataFrame({"feature": list(corrs.keys()), "corr_with_y": list(corrs.values())})
    imp = imp.sort_values("corr_with_y", ascending=False)
    imp.to_csv(OUT_IMPORTANCES_CSV, index=False)
    print(f"[OK] Guardé importancias proxy en {OUT_IMPORTANCES_CSV}")

    return model_full, feat_cols


def generar_alertas(df_probas: pd.DataFrame,
                    umbral: float,
                    consecutivas: int,
                    cooldown_h: int) -> pd.DataFrame:
    """Turn per-timestamp probabilities into alerts.

    Rule, applied per well in time order: an alert is emitted once
    ``consecutivas`` readings in a row have ``proba >= umbral``, provided at
    least ``cooldown_h`` hours have passed since the previous alert of that
    well. The consecutive counter restarts whenever the run length is reached,
    whether or not the alert was emitted.

    Args:
        df_probas: needs ``well_id``, ``t_end`` and ``proba``; ``y`` and
            ``min_dt_h`` are copied into the alert when present.
        umbral: probability threshold.
        consecutivas: number of consecutive readings required.
        cooldown_h: minimum hours between two alerts of the same well.

    Returns:
        Alerts sorted by well and time with columns ``well_id``, ``t_alerta``,
        ``proba``, ``y_en_esa_ventana`` and ``min_dt_h``. When nothing fires
        the result is an empty frame that still has those columns.
    """
    columnas = ["well_id", "t_alerta", "proba", "y_en_esa_ventana", "min_dt_h"]
    alertas = []

    for well, g in df_probas.groupby("well_id", sort=False):
        g = g.sort_values("t_end")
        tiempos = g["t_end"].tolist()
        probas = g["proba"].tolist()
        # Optional columns fall back to the same defaults as a missing key
        etiquetas = g["y"].tolist() if "y" in g.columns else [0] * len(g)
        distancias = g["min_dt_h"].tolist() if "min_dt_h" in g.columns else [np.nan] * len(g)

        run = 0
        last_alert = None
        for t, proba, etiqueta, distancia in zip(tiempos, probas, etiquetas, distancias):
            run = run + 1 if proba >= umbral else 0
            if run < consecutivas:
                continue
            if last_alert is None or (t - last_alert).total_seconds() >= cooldown_h * 3600:
                alertas.append({
                    "well_id": well,
                    "t_alerta": t,
                    "proba": float(proba),
                    "y_en_esa_ventana": int(etiqueta),
                    "min_dt_h": float(distancia),
                })
                last_alert = t
            run = 0

    # Sorting a column-less frame would raise KeyError, so handle "no alerts".
    if not alertas:
        return pd.DataFrame(columns=columnas)
    return pd.DataFrame(alertas, columns=columnas).sort_values(["well_id", "t_alerta"])


def cobertura_por_evento(alertas_df: pd.DataFrame, eventos_df: pd.DataFrame,
                         alerta_horas: int):
    """Count failures preceded by at least one alert and the mean lead time.

    A failure is covered when its well has an alert in
    ``[failure_time - alerta_horas, failure_time]``. Its lead time is measured
    from the earliest such alert.

    Args:
        alertas_df: alerts with columns ``well_id`` and ``t_alerta``.
        eventos_df: failures with columns ``well_id`` and ``failure_time``.
        alerta_horas: look-back window in hours.

    Returns:
        (cubiertas, total, lead_prom): covered failures, total failures and
        mean lead time in hours (NaN when nothing is covered). ``total`` is
        the number of failures even when there are no alerts at all.
    """
    if eventos_df is None or eventos_df.empty:
        return 0, 0, np.nan

    total = len(eventos_df)
    if alertas_df is None or alertas_df.empty:
        # No alerts means zero coverage, not zero failures.
        return 0, total, np.nan

    ventana = pd.Timedelta(hours=alerta_horas)
    leadtimes = []
    for _, e in eventos_df.iterrows():
        ft = e["failure_time"]
        del_pozo = alertas_df[alertas_df["well_id"] == e["well_id"]]
        previas = del_pozo[(del_pozo["t_alerta"] <= ft) & (del_pozo["t_alerta"] >= ft - ventana)]
        if not previas.empty:
            # Earliest alert inside the window
            leadtimes.append((ft - previas["t_alerta"].min()).total_seconds() / 3600.0)

    lead_prom = float(np.mean(leadtimes)) if leadtimes else np.nan
    return len(leadtimes), total, lead_prom


def main():
    """Command-line entry point: from the Excel workbook to ``alertas.csv``.

    Parses the options, loads the workbook, builds and labels the features,
    trains and evaluates the model, exports the probabilities, derives the
    alerts and prints the per-event coverage.
    """
    cli = argparse.ArgumentParser(description="AIB - versión única (de Excel a alertas.csv)")
    cli.add_argument("--input", default=INPUT_XLSX,
                     help="Ruta del Excel multi-hoja")
    cli.add_argument("--umbral", type=float, default=DEFAULT_UMBRAL_ALERTA,
                     help="Umbral de alerta (proba)")
    cli.add_argument("--consec", type=int, default=DEFAULT_CONSECUTIVAS,
                     help="Lecturas consecutivas requeridas")
    cli.add_argument("--cooldown_h", type=int, default=DEFAULT_COOLDOWN_ALERTA_H,
                     help="Cooldown entre alertas (horas)")
    cli.add_argument("--alerta_h", type=int, default=DEFAULT_ALERTA_HORAS,
                     help="Ventana pre-falla (horas)")
    cli.add_argument("--feature_win", default=DEFAULT_FEATURE_WINDOW,
                     help="Ventana de features (ej. '24H')")
    cli.add_argument("--resample", default=DEFAULT_RESAMPLE_RULE,
                     help="Frecuencia de resampleo (ej. '30min')")
    cli.add_argument("--min_samp", type=int, default=DEFAULT_FEATURE_MIN_SAMPLES,
                     help="Mínimo de muestras en ventana")
    # parse_known_args ignores arguments this parser does not define
    opts, _ = cli.parse_known_args()

    # Banner
    print("=======================================")
    print("   AIB - Pipeline completo a alertas   ")
    print("=======================================")
    print(f"Excel: {opts.input}")
    print(f"Params: ALERTA_H={opts.alerta_h}h | FEAT_WIN={opts.feature_win} | RESAMPLE={opts.resample} | MIN_SAMP={opts.min_samp}")
    print(f"Regla: UMBRAL={opts.umbral} | CONSEC={opts.consec} | COOLDOWN={opts.cooldown_h}h")

    # Data loading
    path = Path(opts.input)
    assert path.exists(), f"No encontré el archivo: {path}"

    vib, ev = cargar_excel_multipozo(str(path))
    n_pozos = vib["well_id"].nunique()
    n_fallas = 0 if ev is None else len(ev)
    hay_eventos = ev is not None and not ev.empty
    print(f"\nPozos: {n_pozos} | Muestras: {len(vib):,}")
    print(f"Fallas detectadas: {n_fallas}")
    if hay_eventos:
        ev.sort_values(["well_id", "failure_time"]).to_csv(OUT_EVENTOS_DETECTADOS, index=False)
        print(f"[OK] Guardé eventos detectados en {OUT_EVENTOS_DETECTADOS}")

    # Feature construction and labelling
    feats = construir_dataset(
        vib,
        rule=opts.resample,
        feature_window=opts.feature_win,
        min_samp=opts.min_samp,
    )
    DS = etiquetar(feats, ev, alerta_h=opts.alerta_h, cooldown_h=COOLDOWN_HORAS)

    if DS["y"].sum() == 0:
        print("No hay positivos etiquetados. Revisá ALERTA_HORAS o las marcas de FALLA en el Excel.")
        return

    # Train and evaluate
    model, feat_cols = entrenar_y_evaluar(DS, n_splits=N_SPLITS, seed=SEED)

    # Probabilities per timestamp, scored with the final model
    DS = DS.copy()
    DS["proba"] = model.predict_proba(DS[feat_cols])[:, 1]
    DS.to_csv(OUT_PROBAS_CSV, index=False)
    print(f"[OK] Guardé probabilidades por timestamp en {OUT_PROBAS_CSV}")

    # Operational alerts
    alertas_df = generar_alertas(
        DS,
        umbral=opts.umbral,
        consecutivas=opts.consec,
        cooldown_h=opts.cooldown_h,
    )
    alertas_df.to_csv(OUT_ALERTAS, index=False)
    print(f"[OK] Guardé alertas en {OUT_ALERTAS} (total: {len(alertas_df)})")

    # Per-event coverage
    cubiertas, total, lead_prom = cobertura_por_evento(alertas_df, ev, alerta_horas=opts.alerta_h)
    print("\n=== COBERTURA POR EVENTO ===")
    print(f"Fallas cubiertas con >=1 alerta previa: {cubiertas}/{total}")
    if total:
        print(f"Cobertura evento: {100.0*cubiertas/total:.1f}%")
    if not np.isnan(lead_prom):
        print(f"Anticipo medio (horas) de la PRIMERA alerta por evento: {lead_prom:.1f}h")

    print("\nListo. Archivos generados:")
    for nombre in (OUT_ALERTAS, OUT_PROBAS_CSV, OUT_IMPORTANCES_CSV):
        print(f" - {nombre}")
    if hay_eventos:
        print(f" - {OUT_EVENTOS_DETECTADOS}")


if __name__ == "__main__":
    main()

   AIB - Pipeline completo a alertas   
Excel: vibraciones_multi.xlsx
Params: ALERTA_H=48h | FEAT_WIN=24H | RESAMPLE=30min | MIN_SAMP=6
Regla: UMBRAL=0.7 | CONSEC=2 | COOLDOWN=6h

Pozos: 5 | Muestras: 9,058
Fallas detectadas: 5
[OK] Guardé eventos detectados en aib_eventos_detectados.csv


  r = gs.rolling(win, min_periods=min_samp)
  rms = (gs.pow(2).rolling(win, min_periods=min_samp).mean()) ** 0.5
  diff_abs_mean = gs.diff().abs().rolling(win, min_periods=min_samp).mean()
  slope = gs.diff().rolling(win, min_periods=min_samp).mean()
  r = gs.rolling(win, min_periods=min_samp)
  rms = (gs.pow(2).rolling(win, min_periods=min_samp).mean()) ** 0.5
  diff_abs_mean = gs.diff().abs().rolling(win, min_periods=min_samp).mean()
  slope = gs.diff().rolling(win, min_periods=min_samp).mean()
  r = gs.rolling(win, min_periods=min_samp)
  rms = (gs.pow(2).rolling(win, min_periods=min_samp).mean()) ** 0.5
  diff_abs_mean = gs.diff().abs().rolling(win, min_periods=min_samp).mean()
  slope = gs.diff().rolling(win, min_periods=min_samp).mean()
  r = gs.rolling(win, min_periods=min_samp)
  rms = (gs.pow(2).rolling(win, min_periods=min_samp).mean()) ** 0.5
  diff_abs_mean = gs.diff().abs().rolling(win, min_periods=min_samp).mean()
  slope = gs.diff().rolling(win, min_periods=min_samp).mea


=== MÉTRICAS (validación por pozo) ===
ROC-AUC: 0.5535
PR-AUC : 0.0465
Reporte con umbral de referencia 0.50 (solo informativo):
              precision    recall  f1-score   support

           0      0.959     0.958     0.959     11767
           1      0.006     0.006     0.006       480

    accuracy                          0.921     12247
   macro avg      0.483     0.482     0.483     12247
weighted avg      0.922     0.921     0.922     12247

[OK] Guardé importancias proxy en aib_feature_importances.csv
[OK] Guardé probabilidades por timestamp en aib_pred_probas.csv
[OK] Guardé alertas en alertas.csv (total: 40)

=== COBERTURA POR EVENTO ===
Fallas cubiertas con >=1 alerta previa: 5/5
Cobertura evento: 100.0%
Anticipo medio (horas) de la PRIMERA alerta por evento: 47.3h

Listo. Archivos generados:
 - alertas.csv
 - aib_pred_probas.csv
 - aib_feature_importances.csv
 - aib_eventos_detectados.csv


In [6]:
# Builds the script that trains the model and saves it as a .pkl file for later prediction.

# -*- coding: utf-8 -*-
"""
AIB - ENTRENAR Y GUARDAR MODELO (simple y directo)
- Lee 'vibraciones_multi.xlsx' (cada hoja = pozo; columnas: Fecha, VIBR)
- Calcula features 24h, etiqueta 48h pre-falla y entrena
- Guarda: model_aib.pkl (modelo) y model_aib_config.json (parámetros de preprocesamiento)

Uso:
    python aib_train_save.py

Requisitos:
    pip install pandas numpy scikit-learn openpyxl
"""
import json, pickle
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

# ===== Base parameters (leave as they are unless you need to change them) =====
INPUT_XLSX = "vibraciones_multi.xlsx"
RESAMPLE_RULE = "30min"
# Lowercase "h": the uppercase "H" alias is deprecated in recent pandas.
FEATURE_WINDOW = "24h"
FEATURE_MIN_SAMPLES = 6
ALERTA_HORAS = 48
COOLDOWN_HORAS = 6
N_SPLITS = 4
SEED = 42
# =======================================================

def cargar_excel_multipozo(path_xlsx: str):
    """Load every sheet of the workbook; the sheet name is the ``well_id``.

    Sheets that are empty, lack a usable time/vibration column pair, or
    contain no numeric sample are skipped. Rows whose vibration cell contains
    ``FALLA`` become failure events and are left out of the signal.

    Returns:
        (vib, ev): numeric samples (``well_id``, ``timestamp``, ``vibr``) and
        failure events (``well_id``, ``failure_time``).

    Raises:
        ValueError: if no sheet provides numeric vibration data.
    """
    nombres_tiempo = ("fecha", "timestamp", "datetime")
    nombres_vibr = ("vibr", "vibraciones", "vibracion", "valor", "value", "vibr.")

    hojas = pd.read_excel(path_xlsx, sheet_name=None, dtype=str)
    frames = []
    eventos = []

    for sheet_name, hoja in hojas.items():
        if hoja is None or hoja.empty:
            continue

        hoja = hoja.copy()
        hoja.columns = [str(col).strip() for col in hoja.columns]

        # Map the recognized headers onto the canonical names
        renombres = {}
        for col in hoja.columns:
            nombre = col.lower()
            if nombre in nombres_tiempo:
                renombres[col] = "timestamp"
            elif nombre in nombres_vibr or nombre.startswith("vibr"):
                renombres[col] = "vibr"

        if renombres:
            hoja = hoja.rename(columns=renombres)
        elif len(hoja.columns) >= 2:
            # Fallback: first column = time, second = vibration
            hoja = hoja.rename(columns={hoja.columns[0]: "timestamp", hoja.columns[1]: "vibr"})

        # Sheets without both columns are skipped, not treated as an error
        if not {"timestamp", "vibr"}.issubset(hoja.columns):
            continue

        hoja["timestamp"] = pd.to_datetime(hoja["timestamp"], dayfirst=True, errors="coerce")

        es_falla = hoja["vibr"].astype(str).str.contains("FALLA", case=False, na=False)
        eventos.extend(
            {"well_id": sheet_name, "failure_time": momento}
            for momento in hoja.loc[es_falla, "timestamp"]
            if pd.notna(momento)
        )

        numericas = hoja.loc[~es_falla].copy()
        numericas["vibr"] = pd.to_numeric(numericas["vibr"], errors="coerce")
        numericas = numericas.dropna(subset=["timestamp", "vibr"])
        if numericas.empty:
            continue

        numericas.insert(0, "well_id", sheet_name)
        frames.append(numericas[["well_id", "timestamp", "vibr"]])

    if len(frames) == 0:
        raise ValueError("No se encontraron datos numéricos de vibración.")

    vib = pd.concat(frames, ignore_index=True)
    vib = vib.sort_values(["well_id", "timestamp"])
    ev = pd.DataFrame(eventos)
    if not ev.empty:
        ev["failure_time"] = pd.to_datetime(ev["failure_time"])
    return vib, ev

def build_features(vib: pd.DataFrame) -> pd.DataFrame:
    """Compute rolling vibration features per well on a regular time grid.

    For each ``well_id`` the signal is resampled with ``RESAMPLE_RULE`` (median
    per bin), gaps are filled by time interpolation in both directions, and
    rolling statistics over ``FEATURE_WINDOW`` (requiring at least
    ``FEATURE_MIN_SAMPLES`` observations) are derived. Rows where any feature
    is still undefined are dropped.

    Args:
        vib: Frame with the columns ``well_id``, ``timestamp`` and ``vibr``.

    Returns:
        One row per well and grid timestamp (``t_end``) with the columns
        ``vib_last``, ``vib_mean``, ``vib_std``, ``vib_rms``, ``vib_p2p``,
        ``vib_diff_mean``, ``vib_slope`` and ``vib_cv``, sorted by well and
        time. An empty frame with those columns is returned when ``vib`` has
        no rows.
    """
    columnas = [
        "well_id", "t_end", "vib_last", "vib_mean", "vib_std", "vib_rms",
        "vib_p2p", "vib_diff_mean", "vib_slope", "vib_cv",
    ]
    feats = []
    for well, g in vib.groupby("well_id", sort=False):
        serie = g.sort_values("timestamp").set_index("timestamp")["vibr"]
        gs = (
            serie.resample(RESAMPLE_RULE)
            .median()
            .interpolate(method="time", limit_direction="both")
        )

        r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
        mean = r.mean()
        std = r.std()
        p2p = r.max() - r.min()
        rms = gs.pow(2).rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean() ** 0.5

        # The first difference feeds two features; compute it once.
        delta = gs.diff()
        diff_abs_mean = delta.abs().rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean()
        # "Slope" here is the rolling mean of the signed step between samples.
        slope = delta.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean()
        # Small epsilon avoids division by zero when the rolling mean is 0.
        cv = std / (mean.abs() + 1e-9)

        feats.append(
            pd.DataFrame({
                "well_id": well,
                "t_end": gs.index,
                "vib_last": gs.values,
                "vib_mean": mean.values,
                "vib_std": std.values,
                "vib_rms": rms.values,
                "vib_p2p": p2p.values,
                "vib_diff_mean": diff_abs_mean.values,
                "vib_slope": slope.values,
                "vib_cv": cv.values,
            }).dropna()
        )

    if not feats:
        # pd.concat([]) raises; give callers an empty frame with the schema.
        return pd.DataFrame(columns=columnas)
    return pd.concat(feats, ignore_index=True).sort_values(["well_id", "t_end"])

def etiquetar(F: pd.DataFrame, eventos: pd.DataFrame) -> pd.DataFrame:
    """Attach the binary pre-failure label ``y`` to a feature table.

    For every failure event of a well, rows of that well whose ``t_end`` lies
    within ``ALERTA_HORAS`` hours before the failure (inclusive at both ends)
    are labelled 1. Rows up to ``COOLDOWN_HORAS`` hours after the failure are
    removed from the result. Events are applied in the order given, and a
    later event overwrites what an earlier one wrote on the same row.

    Args:
        F: Feature table with at least ``well_id`` and ``t_end``. It is not
            modified; a copy is returned.
        eventos: Frame with ``well_id`` and ``failure_time``; may be ``None``
            or empty, in which case every row gets ``y = 0``.

    Returns:
        A copy of ``F`` with an integer ``y`` column and cooldown rows dropped.
    """
    F = F.copy()
    if eventos is None or eventos.empty or F.empty:
        F["y"] = 0
        return F

    # Work on a float array: NaN marks cooldown rows, and writing NaN into an
    # integer column relies on silent upcasting that pandas has deprecated.
    y = np.zeros(len(F), dtype=float)
    t_end = F["t_end"]
    for well, ge in eventos.groupby("well_id"):
        in_well = (F["well_id"] == well).to_numpy()
        if not in_well.any():
            continue
        for ft in ge["failure_time"]:
            # Positive = hours remaining until the failure; negative = after it.
            dt_h = ((ft - t_end).dt.total_seconds() / 3600.0).to_numpy()
            pos = in_well & (dt_h >= 0) & (dt_h <= ALERTA_HORAS)
            cool = in_well & (dt_h < 0) & (dt_h >= -COOLDOWN_HORAS)
            y[pos] = 1.0
            y[cool] = np.nan

    F["y"] = y
    F = F.dropna(subset=["y"]).copy()
    F["y"] = F["y"].astype(int)
    return F

def _nuevo_modelo():
    """Build an unfitted classifier with the hyper-parameters used everywhere in main()."""
    return HistGradientBoostingClassifier(
        learning_rate=0.08, max_iter=500, min_samples_leaf=20, random_state=SEED
    )


def main():
    """Train, cross-validate and persist the pre-failure classifier.

    Steps: load the workbook at ``INPUT_XLSX``, build rolling features, label
    them, evaluate with well-grouped cross-validation (pooled ROC-AUC, PR-AUC
    and a classification report at a 0.5 threshold), then fit a final model on
    all rows and write ``model_aib.pkl`` and ``model_aib_config.json`` to the
    current directory.

    Raises:
        FileNotFoundError: If ``INPUT_XLSX`` does not exist.
        ValueError: If the labelled data does not contain both classes.
    """
    path = Path(INPUT_XLSX)
    if not path.exists():
        # Explicit raise instead of assert: asserts vanish under ``python -O``.
        raise FileNotFoundError(f"No encontré {path}")

    vib, ev = cargar_excel_multipozo(str(path))
    DS = etiquetar(build_features(vib), ev)
    feat_cols = [c for c in DS.columns if c not in {"well_id", "t_end", "y"}]

    classes = np.unique(DS["y"])
    if len(classes) < 2:
        # Without both classes neither the model nor ROC/PR metrics are defined.
        raise ValueError(
            "Se necesitan ejemplos de ambas clases (pre-falla y normal) para entrenar."
        )
    cw = compute_class_weight(class_weight="balanced", classes=classes, y=DS["y"])
    class_weight = {int(k): float(v) for k, v in zip(classes, cw)}

    n_pozos = DS["well_id"].nunique()
    if n_pozos < 2:
        # GroupKFold needs at least as many groups as splits (minimum 2).
        print("[WARN] Se necesitan al menos 2 pozos para validar por grupos; omito la validación cruzada.")
    else:
        y_true, y_pred, y_proba = [], [], []
        grupos = DS["well_id"].astype(str).values
        gkf = GroupKFold(n_splits=max(2, min(N_SPLITS, n_pozos)))
        # Grouping by well keeps all rows of a well on one side of each split.
        for tr, te in gkf.split(DS[feat_cols], DS["y"], grupos):
            entrenamiento, prueba = DS.iloc[tr], DS.iloc[te]
            pesos = entrenamiento["y"].map(class_weight).values
            modelo = _nuevo_modelo()
            modelo.fit(entrenamiento[feat_cols], entrenamiento["y"], sample_weight=pesos)
            p = modelo.predict_proba(prueba[feat_cols])[:, 1]
            y_proba.extend(p.tolist())
            y_true.extend(prueba["y"].tolist())
            y_pred.extend((p >= 0.5).astype(int).tolist())
        print("ROC-AUC:", round(roc_auc_score(y_true, y_proba), 4))
        print("PR-AUC :", round(average_precision_score(y_true, y_proba), 4))
        print(classification_report(y_true, y_pred, digits=3))

    # Final model: fit on every labelled row and persist it with the
    # preprocessing settings needed to rebuild the same features later.
    m = _nuevo_modelo()
    sw_full = DS["y"].map(class_weight).values
    m.fit(DS[feat_cols], DS["y"], sample_weight=sw_full)
    with open("model_aib.pkl", "wb") as f:
        pickle.dump({"model": m, "features": feat_cols}, f)
    with open("model_aib_config.json", "w", encoding="utf-8") as f:
        json.dump({
            "RESAMPLE_RULE": RESAMPLE_RULE,
            "FEATURE_WINDOW": FEATURE_WINDOW,
            "FEATURE_MIN_SAMPLES": FEATURE_MIN_SAMPLES
        }, f, indent=2)
    print("[OK] Guardé model_aib.pkl y model_aib_config.json")

# Run the training pipeline only when this module is the entry point, not when it is imported.
if __name__=="__main__":
    main()

  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  rms = (gs.pow(2).rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean())**0.5
  diff_abs_mean = gs.diff().abs().rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean()
  slope = gs.diff().rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean()
  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  rms = (gs.pow(2).rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean())**0.5
  diff_abs_mean = gs.diff().abs().rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean()
  slope = gs.diff().rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean()
  r = gs.rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES)
  rms = (gs.pow(2).rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean())**0.5
  diff_abs_mean = gs.diff().abs().rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPLES).mean()
  slope = gs.diff().rolling(FEATURE_WINDOW, min_periods=FEATURE_MIN_SAMPL

ROC-AUC: 0.5607
PR-AUC : 0.0464
              precision    recall  f1-score   support

           0      0.959     0.957     0.958     11762
           1      0.006     0.006     0.006       480

    accuracy                          0.920     12242
   macro avg      0.483     0.482     0.482     12242
weighted avg      0.922     0.920     0.921     12242

[OK] Guardé model_aib.pkl y model_aib_config.json
