| Event | Variables & Percentiles | Duration | Label Code |
|-------|------------------------|----------|------------|
| **Heatwave** | TMAX ≥ P95 **and** TMIN ≥ P90 | ≥ 3 consec. days | 1 |
| **Coldwave** | TMIN ≤ P5 | ≥ 2 consec. days | 2 |
| **Drought**  | 30‑day PRCP ≤ P10 **and** TAVG ≥ P80 | rolling 30 d | 3 |
| **Flood**    | PRCP ≥ P99 **or** (SNOW ≥ P90 & TAVG > 0 °C within 3 d) | 1 day | 4 |
| **Blizzard** | SNOW ≥ P90 **and** AWND ≥ P90 **and** TMAX ≤ P10 | 1 day | 5 |
| **Extreme Wind** | AWND ≥ P95 | 1 day | 6 |
| **Thunderstorm** | WT03 flag **or** (PRCP ≥ P80 & AWND ≥ P80) | 1 day | 7 |
| **Compound** | ≥ 2 different events in 14‑day window | 14 d window | 99 |

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load processed daily data
# DATE is parsed to datetime on read so that later sorting on DATE is
# chronological rather than lexicographic.
DF_PATH = "data/merged_pca.csv"
df = pd.read_csv(DF_PATH, parse_dates=["DATE"])
# Sanity check: print the (rows, columns) shape of the loaded table.
print("Loaded, shape:", df.shape)

Loaded, shape: (383565, 17)


In [2]:
# Core variable list (must exist in df)
VARS = ["TMAX", "TMIN", "TAVG", "PRCP", "SNOW", "SNWD", "AWND"]

# Compute station-wise percentiles.
# After unstack, `percs` has one row per STATION and a (variable, quantile)
# column MultiIndex, e.g. percs.loc[stn, ("TMAX", 0.95)].
pct_levels = [0.05, 0.10, 0.80, 0.90, 0.95, 0.99]
percs = (
    df.groupby("STATION")[VARS]
    .quantile(pct_levels)
    .unstack(level=-1)
)

# Attach useful daily rolling sums (30-day PRCP).
# Sorting by DATE makes every station's rows chronological, so the row-based
# window below is a trailing window.
# NOTE(review): the window counts rows, not calendar days -- this assumes one
# row per station per day with no gaps; confirm against the input data.
df.sort_values("DATE", inplace=True)
PRCP_WINDOW = 30
# Require a full window of observations. With min_periods=1 the first 29 rows
# of each station (and any window containing missing PRCP) would hold a
# partial sum, which looks artificially dry and produces false drought flags.
# Incomplete windows now yield NaN, which never satisfies a "<=" threshold.
df["PRCP_30d"] = (
    df.groupby("STATION")["PRCP"]
    .transform(lambda x: x.rolling(PRCP_WINDOW, min_periods=PRCP_WINDOW).sum())
)

# Helper: fetch percentile value
def p(station, var, q):
    """Return the `q`-quantile of `var` for `station`.

    Looks the value up in the module-level `percs` table, whose rows are
    stations and whose columns are (variable, quantile) pairs.
    """
    column_key = (var, q)
    return percs.loc[station, column_key]

# Allocate event flag columns
# One integer column per event type, initialised to 0 (no event) for every
# row; the labelling step overwrites these station by station.
event_cols = [
    "heatwave", "coldwave", "drought", "flood",
    "blizzard", "wind_extreme", "thunderstorm",
]
for flag_name in event_cols:
    df[flag_name] = 0

In [3]:
# Iterate station-by-station for percentile thresholds.
# For every station, build one boolean Series per event type from the
# station's own percentile thresholds, then write them back into the 0/1
# event columns of `df`.
# NOTE(review): all windows below count rows, not calendar days; this assumes
# each station has one chronologically ordered row per day -- confirm.
# NOTE(review): if most days have zero SNOW/PRCP, P80/P90 of those variables
# may be 0 and the ">=" tests become true on every day -- verify thresholds.
FLOOD_MELT_DAYS = 3  # look-ahead for "TAVG > 0 degC within 3 d" after heavy snow

for stn, g in df.groupby("STATION"):
    idx = g.index

    # Individual day flags ---------------------------------------------------
    heat_cond = (g.TMAX >= p(stn, "TMAX", 0.95)) & (g.TMIN >= p(stn, "TMIN", 0.90))
    cold_cond = g.TMIN <= p(stn, "TMIN", 0.05)

    # Drought: the 30-day total must be compared with the P10 of the station's
    # 30-day totals. The P10 of *daily* PRCP is a different distribution and
    # is not a meaningful threshold for a 30-day sum.
    prcp_30d_p10 = g.PRCP_30d.quantile(0.10)
    drought_cond = (g.PRCP_30d <= prcp_30d_p10) & (g.TAVG >= p(stn, "TAVG", 0.80))

    # Flood: extreme rain, or heavy snow followed by above-freezing TAVG on
    # any of days t .. t+3. Rolling over the reversed series gives a
    # forward-looking window; reversing back restores the original order.
    # (shift(-3) alone would only inspect the single day exactly 3 days ahead.)
    tavg_ahead_max = (
        g.TAVG.iloc[::-1]
        .rolling(FLOOD_MELT_DAYS + 1, min_periods=1)
        .max()
        .iloc[::-1]
    )
    flood_cond = (g.PRCP >= p(stn, "PRCP", 0.99)) | (
        (g.SNOW >= p(stn, "SNOW", 0.90)) & (tavg_ahead_max > 0)
    )

    blizzard_cond = (
        (g.SNOW >= p(stn, "SNOW", 0.90))
        & (g.AWND >= p(stn, "AWND", 0.90))
        & (g.TMAX <= p(stn, "TMAX", 0.10))
    )
    wind_cond = g.AWND >= p(stn, "AWND", 0.95)

    # Thunderstorm: WT03 flag OR (wet and windy day). The WT03 column is
    # optional, so it is only used when the input actually provides it.
    # NOTE(review): presumably WT03 is 1 when thunder was reported and
    # missing/0 otherwise -- confirm the encoding in the merged file.
    thund_cond = (g.PRCP >= p(stn, "PRCP", 0.80)) & (g.AWND >= p(stn, "AWND", 0.80))
    if "WT03" in g.columns:
        wt03_flag = pd.to_numeric(g["WT03"], errors="coerce").fillna(0) > 0
        thund_cond = thund_cond | wt03_flag

    # Apply min-duration rolling conditions ----------------------------------
    # Trailing windows: a day is flagged once the run of qualifying days
    # ending on it reaches the minimum length (the earlier days of the run
    # are not back-filled). Incomplete windows give NaN, which compares False.
    heatwave = heat_cond.rolling(3).sum() >= 3
    coldwave = cold_cond.rolling(2).sum() >= 2

    # Write back, aligned on the station's row index -------------------------
    station_flags = {
        "heatwave": heatwave,
        "coldwave": coldwave,
        "drought": drought_cond,
        "flood": flood_cond,
        "blizzard": blizzard_cond,
        "wind_extreme": wind_cond,
        "thunderstorm": thund_cond,
    }
    for col, flag in station_flags.items():
        df.loc[idx, col] = flag.astype(int)

In [4]:
# Save -----------------------------------------------------------------------
# Intermediate export of the per-event flags (the directory is created if it
# does not exist yet).
out_dir = Path("data")
out_dir.mkdir(exist_ok=True)
df.to_csv("data/merged_labeled.csv", index=False)
n_rows, n_cols = df.shape
print("Saved data/merged_labeled.csv with", n_rows, "rows and", n_cols, "columns")

Saved data/merged_labeled.csv with 383565 rows and 25 columns


In [5]:
# Compound label -------------------------------------------------------------
# A day is "compound" when at least two DIFFERENT event types were flagged
# within the trailing window of rows for the same station.
# NOTE(review): the window counts rows, not calendar days; this assumes each
# station has one chronologically ordered row per day -- confirm.
window = 14
# Per event type: 1 if that event occurred on any row of the trailing window.
# min_periods=1 lets the first rows of each station use a shorter window
# instead of yielding NaN (which would silently never be flagged).
event_in_window = (
    df[event_cols]
    .astype(int)
    .groupby(df["STATION"])
    .rolling(window, min_periods=1)
    .max()
    .reset_index(level=0, drop=True)  # drop the STATION level -> original row index
)
# Count distinct event types present in the window. Summing the flagged days
# of a single type (as ">= 2 in any column" did) would label a two-day run of
# one event as compound and miss two different one-day events.
distinct_events = event_in_window.sum(axis=1)
df["compound"] = (distinct_events >= 2).astype(int)

In [6]:
# Save -----------------------------------------------------------------------
# Final export: remove the PRCP_30d helper column and overwrite the labelled
# CSV (the directory is created if it does not exist yet).
output_dir = Path("data")
output_dir.mkdir(exist_ok=True)
df = df.drop("PRCP_30d", axis=1)
df.to_csv("data/merged_labeled.csv", index=False)
row_count, col_count = df.shape
print("Saved data/merged_labeled.csv with", row_count, "rows and", col_count, "columns")

Saved data/merged_labeled.csv with 383565 rows and 25 columns
