In [4]:
# Bayesian Network Baseline (Naive Bayes BN) for Ordinal Severity (3-class merged)

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from pgmpy.models import BayesianNetwork, DiscreteBayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination
from pathlib import Path

BASE_DIR = Path().resolve()

CSV_PATH = BASE_DIR / "storms_data.csv"
storms_data = pd.read_csv(CSV_PATH)

In [5]:
# Columns
OUTAGE_COL = "max_outage_after_24h"
BASELINE_COL = "baseline_outage_median"
HU_COL = "housing_units"

feature_cols = [
    "era_i10fg_max_total_48h",
    "era_tp_max_total_48h",
    "era_crr_max_total_48h",
    'housing_units_by_area',
    "overhead_circuits",
    "n_points",
    "n_urban",
    "season_code",
    "cbp_emp_total",
]


RANDOM_STATE = 42
TEST_SIZE = 0.2
USE_EXCESS = True
CUTS = [0.005, 0.02, 0.05]   # 4-level cuts, then merge
N_BINS = 5                  # discretization bins per numeric feature (tune 4–8)


required_cols = list(set(feature_cols + [OUTAGE_COL, BASELINE_COL, HU_COL]))
missing = [c for c in required_cols if c not in storms_data.columns]
if missing:
    raise KeyError(f"Missing required columns in CSV: {missing}")


df0 = storms_data.copy()

max_out = pd.to_numeric(df0[OUTAGE_COL], errors="coerce")
base = pd.to_numeric(df0[BASELINE_COL], errors="coerce")
hu = pd.to_numeric(df0[HU_COL], errors="coerce").replace(0, np.nan)

z = (max_out - base) if USE_EXCESS else max_out
df0["sev_ratio"] = (z / hu).clip(lower=0)

# numeric coercion
for c in feature_cols:
    df0[c] = pd.to_numeric(df0[c], errors="coerce")

df = df0.dropna(subset=feature_cols + ["sev_ratio"]).copy()

def ratio_to_level_4(x: float, cuts=CUTS) -> int:
    if x < cuts[0]:
        return 0
    elif x < cuts[1]:
        return 1
    elif x < cuts[2]:
        return 2
    else:
        return 3

df["y4"] = df["sev_ratio"].apply(ratio_to_level_4).astype(int)
df["y"] = df["y4"].replace({2: 1, 3: 2}).astype(int)  # merged into {0,1,2}

levels = sorted(df["y"].unique().tolist())
if levels != [0, 1, 2]:
    raise ValueError(f"Expected merged y levels [0,1,2], got {levels}")

print("\n[Check] merged y class proportions (%):")
print((df["y"].value_counts(normalize=True).sort_index().mul(100).round(2)).to_string())



X = df[feature_cols].copy()
y = df["y"].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

train = X_train.copy()
train["y"] = y_train.values

test = X_test.copy()
test["y"] = y_test.values


def make_quantile_edges(s: pd.Series, n_bins: int):
    """Return bin edges based on train quantiles, robust to ties."""
    s = s.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return None
    qs = np.linspace(0, 1, n_bins + 1)
    edges = np.quantile(s.values, qs)
    edges = np.unique(edges)
    if len(edges) < 3:
        # fallback: use min/max with small epsilon
        mn, mx = float(s.min()), float(s.max())
        if mn == mx:
            return np.array([mn - 1e-9, mn + 1e-9])
        return np.array([mn - 1e-9, mx + 1e-9])
    edges[0] = edges[0] - 1e-9
    edges[-1] = edges[-1] + 1e-9
    return edges

def discretize_with_edges(df_in: pd.DataFrame, edges_map: dict):
    df_out = df_in.copy()
    for col, edges in edges_map.items():
        if edges is None:
            df_out[col] = 0
        else:
            df_out[col] = pd.cut(df_out[col], bins=edges, labels=False, include_lowest=True)
            df_out[col] = df_out[col].astype("float").fillna(0).astype(int)
    return df_out

# build edges on train only
edges_map = {}
for col in feature_cols:
    edges_map[col] = make_quantile_edges(train[col], N_BINS)

train_disc = discretize_with_edges(train, edges_map)
test_disc  = discretize_with_edges(test, edges_map)

# ensure y is int
train_disc["y"] = train_disc["y"].astype(int)
test_disc["y"]  = test_disc["y"].astype(int)


edges = [("y", f) for f in feature_cols]
model = DiscreteBayesianNetwork(edges)

# Fit CPDs with Bayesian (Dirichlet) smoothing
model.fit(
    train_disc,
    estimator=BayesianEstimator,
    prior_type="BDeu",
    equivalent_sample_size=10,  # smoothing strength (5–50 reasonable)
)

infer = VariableElimination(model)


def predict_proba_bn(infer, df_disc: pd.DataFrame, feature_cols, y_states=(0,1,2)):
    probas = np.zeros((len(df_disc), len(y_states)), dtype=float)
    for i in range(len(df_disc)):
        evidence = {c: int(df_disc.iloc[i][c]) for c in feature_cols}
        q = infer.query(variables=["y"], evidence=evidence, show_progress=False)
        # q.values aligned with state order 0..K-1
        vals = q.values
        # guard: sometimes pgmpy returns float64
        probas[i, :] = vals[:len(y_states)]
    return probas

proba = predict_proba_bn(infer, test_disc, feature_cols)
y_pred = proba.argmax(axis=1)


print("\n===== BN Naive Bayes (3-class) : Classification Report =====\n")
print(classification_report(y_test, y_pred, digits=4))

print("\n===== BN Naive Bayes (3-class) : Confusion Matrix =====\n")
print(confusion_matrix(y_test, y_pred))

# Threshold-level AUCs (ordinal/cumulative): P(y>=1), P(y>=2)
p_ge_1 = proba[:, 1] + proba[:, 2]
p_ge_2 = proba[:, 2]

y_ge_1 = (y_test.values >= 1).astype(int)
y_ge_2 = (y_test.values >= 2).astype(int)

print("\n===== Threshold-level ROC AUC (BN) =====")
print(f"AUC P(y>=1): {roc_auc_score(y_ge_1, p_ge_1):.4f}")
print(f"AUC P(y>=2): {roc_auc_score(y_ge_2, p_ge_2):.4f}")


INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'era_i10fg_max_total_48h': 'N', 'era_tp_max_total_48h': 'N', 'era_crr_max_total_48h': 'N', 'housing_units_by_area': 'N', 'overhead_circuits': 'N', 'n_points': 'N', 'n_urban': 'N', 'season_code': 'N', 'cbp_emp_total': 'N', 'y': 'N'}



[Check] merged y class proportions (%):
y
0    44.01
1    46.50
2     9.50

===== BN Naive Bayes (3-class) : Classification Report =====

              precision    recall  f1-score   support

           0     0.5547    0.5006    0.5263       851
           1     0.5456    0.6051    0.5738       899
           2     0.1834    0.1685    0.1756       184

    accuracy                         0.5176      1934
   macro avg     0.4279    0.4247    0.4252      1934
weighted avg     0.5152    0.5176    0.5150      1934


===== BN Naive Bayes (3-class) : Confusion Matrix =====

[[426 362  63]
 [280 544  75]
 [ 62  91  31]]

===== Threshold-level ROC AUC (BN) =====
AUC P(y>=1): 0.6408
AUC P(y>=2): 0.7348
