# Weekly Re‑engineering & Compound Tagging
1. Drop old COMPOUND_* columns.
2. Aggregate the daily features into Monday–Sunday weeks (the first week starts on Monday 1995-01-02).
3. Compute `EXT_*` weekly counts by summing.
4. Tag new compound events using the `PREDEF` condition lists (a tag is set when every listed column is > 0 in that week).

In [5]:
import pandas as pd, numpy as np, re, os, math
from pathlib import Path

# File locations: the daily input CSV and the weekly CSV written by the
# save step.
RAW_PATH = "data/merged_pca_11_cmp_7d.csv"
OUT_PATH = "data/weekly_dataset.csv"

# Read the daily table; DATE is parsed to datetime so it can later serve as
# the time index for weekly resampling.
df = pd.read_csv(filepath_or_buffer=RAW_PATH, parse_dates=["DATE"])

In [6]:
# 1. Keep observations from Monday 1995-01-02 onward and index by DATE
#    (a datetime index is needed for the weekly resample).
df = df.loc[df["DATE"] >= "1995-01-02"].set_index("DATE")

# Exclude stations considered too noisy to use.
BAD_STATIONS = ["AUBURN", "ESCANABA"]
df = df.loc[~df["STATION"].isin(BAD_STATIONS)]
print("Stations after drop →", df["STATION"].nunique())

Stations after drop → 33


In [7]:
# 2. Weekly aggregation spec

# Station metadata column names.
# NOTE(review): META is not part of the aggregation spec below, so these
# columns do not appear in `weekly` — confirm that is intended.
META = ["STATION", "NAME", "LATITUDE", "LONGITUDE", "ELEVATION"]

# Column -> reducer mapping. Key order is preserved and determines the column
# order of the aggregated frame.
agg = {
    # weekly mean
    **dict.fromkeys(
        ["ACSH", "AWND", "SNWD", "TAVG", "TSUN", "WDF5", "WDFG"], "mean"
    ),
    # weekly total
    **dict.fromkeys(["PRCP", "SNOW"], "sum"),
    # weekly extremes
    "TMAX": "max",
    "TMIN": "min",
}

# Every EXT_* column is summed over the week.
ext_cols = [name for name in df.columns if name.startswith("EXT_")]
agg.update(dict.fromkeys(ext_cols, "sum"))

# Per-station weekly bins: "W-MON" with closed="left"/label="left" gives
# [Monday, next Monday) windows labelled by their starting Monday.
weekly = (
    df.groupby("STATION")
    .resample("W-MON", label="left", closed="left")
    .agg(agg)
    .reset_index()
    .rename(columns={"DATE": "WEEK_START"})
)
print("Weekly shape", weekly.shape)

Weekly shape (51678, 30)


In [8]:
# 3. Compound definitions
import warnings

# Each tag maps to the weekly columns that must ALL be > 0 in a given week
# for the compound event to be flagged (COMPOUND_<tag> = 1, otherwise 0).
# NOTE(review): HEAT_DRY and BACK2BACK_RAIN list a single condition, so they
# reduce to "that one column > 0" — confirm this is the intended definition.
PREDEF = {
    "HEAT_DRY":["EXT_TMAX_HOT"],
    "HEAT_WIND":["EXT_TMAX_HOT","EXT_AWND_GALE"],
    "HOT_CLEAR_SKY":["EXT_TMAX_HOT","EXT_ACSH_CLEAR"],
    "STORM_RAIN_WIND":["EXT_PRCP_P95","EXT_WSFG_DAMG"],
    "BLIZZARD":["EXT_SNOW_P95","EXT_AWND_GALE"],
    "RAIN_ON_SNOW":["EXT_PRCP_P95","SNWD_P90_LAG1","TMAX_ABOVE0"],
    "THAW_FREEZE":["TMAX_HOT_T","TMIN_FROST_TPLUS1"],
    "BACK2BACK_RAIN":["EXT_PRCP_P95"],
}

# Snapshot of the columns that really exist before any placeholder is added,
# so every tag relying on a missing column can be reported (not only the
# first tag that triggers the zero-fill).
present_before = set(weekly.columns)
unfireable = {}

for tag, conds in PREDEF.items():
    absent = [c for c in conds if c not in present_before]
    if absent:
        unfireable[tag] = absent
    # Missing condition columns are zero-filled so the output schema stays
    # the same; a tag depending on such a column is therefore always 0.
    for c in conds:
        if c not in weekly.columns:
            weekly[c] = 0
    weekly[f"COMPOUND_{tag}"] = (weekly[conds] > 0).all(axis=1).astype(int)

# Surface the otherwise silent degradation: these tags are constant zero.
if unfireable:
    warnings.warn(
        "Compound tags that can never fire because condition columns were "
        f"missing from the weekly data and zero-filled: {unfireable}"
    )

In [9]:
# 4. Shift compound tags to next week & drop last week rows
#
# For every COMPOUND_* column a "<col>_next" column holds the same station's
# value from the following row (rows are ordered by WEEK_START first).
comp_cols = [name for name in weekly.columns if name.startswith("COMPOUND_")]
weekly.sort_values(by=["STATION", "WEEK_START"], inplace=True)

next_cols = []
for name in comp_cols:
    target = f"{name}_next"
    weekly[target] = weekly.groupby("STATION")[name].shift(periods=-1)
    next_cols.append(target)

# Each station's final row has no following week, so its *_next values are
# NaN; those rows are removed here.
weekly.dropna(subset=next_cols, inplace=True)

# mark splits: 20% of stations (rounded up) are held out as validation,
# chosen without replacement using a fixed seed.
sts = weekly["STATION"].unique()
n_val = math.ceil(0.2 * len(sts))
rng = np.random.RandomState(1)
val_sts = rng.choice(sts, size=n_val, replace=False)
is_val = weekly["STATION"].isin(val_sts)
weekly["SPLIT"] = np.where(is_val, "val", "train")

In [None]:
# 5. Save
# Create the directory OUT_PATH actually points into (instead of a hard-coded
# "data"), so the write still succeeds if OUT_PATH is changed.
out_file = Path(OUT_PATH)
out_file.parent.mkdir(parents=True, exist_ok=True)

# Write the weekly table without the positional index.
weekly.to_csv(out_file, index=False)
print("Saved", OUT_PATH)

Saved data/weekly_dataset.csv


In [11]:
# Sanity check: re-read the saved weekly file, report the total number of
# missing cells, and list the stations assigned to the validation split.

import pandas as pd

w = pd.read_csv("data/weekly_dataset.csv")
print(w.isna().to_numpy().sum(), "total NaNs")
print(w.loc[w["SPLIT"] == "val", "STATION"].unique())

0 total NaNs
['BUFFALO' 'HOUGHTON' 'MARINETTE' 'MARQUETTE' 'SAULT STE MARIE'
 'SOUTH BEND' 'TRAVERSE CITY']
