In [2]:
import pandas as pd
import numpy as np

# Read the raw storm records, then work on a copy so `storms_data` keeps the
# values exactly as they were loaded.
CSV_PATH = r"F:\Storm Outage Modeling\storms_data.csv"
storms_data = pd.read_csv(CSV_PATH)
df = storms_data.copy(deep=True)

# Columns expected in the dataset (not every one is used in the lines below).
# Fail fast and report all absent columns at once, in the order listed here.
need = [
    "full_fips",
    "CZ_NAME",
    "EVENT_ID",
    "BEGIN_DATE_TIME",
    "urban_ratio",
    "cbp_emp_total",
    "housing_units",
    "overhead_circuits",
    "max_outage_after_24h",
    "duration_hours",
]
missing = [col for col in need if col not in df.columns]
if missing:
    raise KeyError(f"Missing columns: {missing}")

# Parse the event start timestamps; values that cannot be parsed become NaT.
df["BEGIN_DATE_TIME"] = pd.to_datetime(df["BEGIN_DATE_TIME"], errors="coerce")

# Build a rough composite "exposure" score. It does not need to be precise: it
# is only used to pick representative counties. Read it as
#     size * overhead-line exposure * urban complexity.
# Coerce the inputs to numeric first; non-numeric entries become NaN.
numeric_cols = ["urban_ratio", "cbp_emp_total", "housing_units", "overhead_circuits"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Negative counts are clipped to 0 before log1p so the log terms stay >= 0.
log_housing = np.log1p(df["housing_units"].clip(lower=0))
log_overhead = np.log1p(df["overhead_circuits"].clip(lower=0))
# A missing urban ratio is treated as 0, i.e. a neutral multiplier of 1.
urban_factor = 1 + df["urban_ratio"].fillna(0)

df["exposure_index"] = log_housing * log_overhead * urban_factor

# County-level summary, one row per (full_fips, CZ_NAME) pair, used to check
# that each county has enough samples.
#   n_obs             number of rows in the group
#   n_storms          number of distinct EVENT_ID values
#   urban_ratio_med   median urban_ratio
#   cbp_emp_med       median cbp_emp_total
#   exposure_idx_med  median exposure_index
#   duration_pos      number of rows whose duration_hours is > 0
# NOTE: pandas' groupby drops rows whose key is NaN by default, so records
# without a full_fips or CZ_NAME are not counted here.
#
# Flag positive durations once for the whole frame instead of re-running
# pd.to_numeric inside a per-group lambda; non-numeric/NaN durations compare
# as False and therefore count as 0. The helper column lives only on the
# temporary frame produced by assign(), so `df` itself is not modified.
duration_is_pos = pd.to_numeric(df["duration_hours"], errors="coerce").gt(0)

g = (
    df.assign(_duration_pos=duration_is_pos.astype("int64"))
      .groupby(["full_fips", "CZ_NAME"], as_index=False)
      .agg(
          n_obs=("EVENT_ID", "size"),
          n_storms=("EVENT_ID", "nunique"),
          urban_ratio_med=("urban_ratio", "median"),
          cbp_emp_med=("cbp_emp_total", "median"),
          exposure_idx_med=("exposure_index", "median"),
          duration_pos=("_duration_pos", "sum"),
      )
)

# Keep only counties with enough distinct storms to be worth modelling.
# Tune MIN_STORMS (e.g. 50 or 100) to trade sample size against coverage.
MIN_STORMS = 80
g2 = g[g["n_storms"] >= MIN_STORMS].copy()

# A contrasting pair needs at least two candidates. Without this check an
# empty table would be printed, or the same county would be recommended as
# both A and B. Fail fast instead, like the missing-column check does.
if len(g2) < 2:
    raise ValueError(
        f"Need at least 2 counties with n_storms >= {MIN_STORMS} to pick a "
        f"contrasting pair; found {len(g2)}. Lower MIN_STORMS or check the data."
    )

# Pick the most urban county (ties broken by highest exposure) and the least
# urban county (ties broken by lowest exposure). NaN medians sort last in both
# directions, so they are only chosen if every candidate is NaN.
rank_cols = ["urban_ratio_med", "exposure_idx_med"]
top_urban = g2.sort_values(rank_cols, ascending=False).head(1)
low_urban = g2.sort_values(rank_cols, ascending=True).head(1)

# Report the two recommended counties, each as a header line followed by its
# one-row summary table (index column suppressed).
print(
    "=== Recommended County A (High-urban/high-exposure) ===",
    top_urban.to_string(index=False),
    sep="\n",
)

# The leading "\n" in the header leaves a blank line between the two reports.
print(
    "\n=== Recommended County B (Low-urban/low-exposure) ===",
    low_urban.to_string(index=False),
    sep="\n",
)


=== Recommended County A (High-urban/high-exposure) ===
 full_fips    CZ_NAME  n_obs  n_storms  urban_ratio_med  cbp_emp_med  exposure_idx_med  duration_pos
     25001 BARNSTABLE    264       124              1.0     445981.0         64.967598           227

=== Recommended County B (Low-urban/low-exposure) ===
 full_fips CZ_NAME  n_obs  n_storms  urban_ratio_med  cbp_emp_med  exposure_idx_med  duration_pos
     23031    YORK    120        96             0.25     326126.0         22.549094           110
