# Wind DT — Model Comparison Notebook

This notebook loads one or more CSV files produced by the pipeline's sinks (e.g., `preds_lgbm.csv`, `preds_svr.csv`), computes comparable metrics, and makes a few diagnostic plots.

**What you need:**
- CSVs with at least the columns `ts`, `y`, `y_hat` (optionally also `model`, `v`, `pi`).
- Put them in one folder and point `DATA_DIR` (below) at it.
- Set `RATED_POWER_KW` to your turbine's rated power for normalized errors.

> Charts use **matplotlib** (no seaborn), one chart per figure, and avoid explicit colors to match your environment rules.

In [None]:
# --- Configuration ---
from pathlib import Path
import pandas as pd
import numpy as np

# Directory that holds the prediction CSVs (change as needed).
# The metrics summary (model_metrics.csv) is written into this same directory.
DATA_DIR = Path("out")  # e.g., Path(r"C:/Users/.../wind-dt/out")

# Glob pattern, resolved inside DATA_DIR, selecting the CSVs to load.
# "*.csv" matches every CSV in the folder; narrow it (e.g. "preds_*.csv")
# if the folder also contains files that are not prediction outputs.
CSV_GLOB = "*.csv"  # e.g., "preds_*.csv"

# Turbine rated power (kW). Used as the denominator of the normalized MAE
# (NMAE, reported as a percentage of rated power). Adjust to your turbine.
RATED_POWER_KW = 2050.0

# Optional time window (None means use the full range).
# The bounds are applied when the CSVs are loaded, so they restrict the
# metrics as well as the plots. Both bounds are inclusive and interpreted as UTC.
TIME_START = None  # e.g., "2020-06-01"
TIME_END   = None  # e.g., "2020-06-15"

# Downsample factor for the time-series plot: every Nth row is drawn to keep
# the figure light. Values <= 1 draw every row.
PLOT_EVERY = 10

In [None]:
# --- Load CSVs ---
import glob

# The metrics cell of this notebook saves its summary as "model_metrics.csv"
# inside DATA_DIR. With a broad CSV_GLOB (the default is "*.csv") a second
# Run All would pick that file up and fail the required-column check below,
# so it is excluded by name.
NON_PREDICTION_FILES = {"model_metrics.csv"}

files = sorted(
    Path(p)
    for p in glob.glob(str(DATA_DIR / CSV_GLOB))
    if Path(p).name not in NON_PREDICTION_FILES
)
if not files:
    raise FileNotFoundError(f"No CSV files match pattern {CSV_GLOB} in {DATA_DIR.resolve()}")

# Parse the optional window bounds once instead of once per file.
# to_datetime(..., utc=True) treats naive values as UTC and converts
# tz-aware values, so both kinds of bound are accepted.
window_start = None if TIME_START is None else pd.to_datetime(TIME_START, utc=True)
window_end = None if TIME_END is None else pd.to_datetime(TIME_END, utc=True)

dfs = []       # one cleaned frame per usable CSV
skipped = []   # names of CSVs that ended up with no rows
for p in files:
    df = pd.read_csv(p)
    if "ts" not in df.columns or "y" not in df.columns or "y_hat" not in df.columns:
        raise ValueError(f"{p.name} must contain at least columns: ts, y, y_hat. Has: {list(df.columns)[:10]}")

    # Unparseable timestamps become NaT and are dropped.
    df["ts"] = pd.to_datetime(df["ts"], utc=True, errors="coerce")
    # .copy() makes the filtered frame independent before columns are assigned.
    df = df.dropna(subset=["ts"]).copy()

    # Fall back to the file name (minus the "preds_" prefix) as the model label.
    if "model" not in df.columns or df["model"].isna().all():
        df["model"] = p.stem.replace("preds_", "")

    # Non-numeric entries become NaN; the metric cells drop them later.
    for col in ["y","y_hat","v","pi"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Inclusive time window.
    if window_start is not None:
        df = df[df["ts"] >= window_start]
    if window_end is not None:
        df = df[df["ts"] <= window_end]

    # Later cells read the label from the first row (df["model"].iloc[0]),
    # which raises IndexError on an empty frame, so empty frames are left out.
    if df.empty:
        skipped.append(p.name)
        continue
    dfs.append(df)

if skipped:
    print(f"Skipped files with no usable rows (after timestamp parsing / time window): {skipped}")
if not dfs:
    raise ValueError("None of the matched CSV files contains any usable rows")

len(files), [f.name for f in files]

In [None]:
# --- Helper: infer sampling interval in hours per model ---
def infer_delta_hours(ts_series: pd.Series) -> float:
    """Estimate the sampling interval of a timestamp series, in hours.

    The interval is the median of the strictly positive differences between
    consecutive sorted timestamps. Zero differences (repeated timestamps) are
    ignored so that duplicated rows cannot pull the estimate down to 0.

    Parameters
    ----------
    ts_series : pd.Series
        Datetime-like values; order does not matter and missing values (NaT)
        are ignored.

    Returns
    -------
    float
        Median step in hours, or NaN when fewer than two timestamps are
        available or all timestamps are identical.
    """
    ts = pd.Series(ts_series).dropna()
    if ts.size < 2:
        return np.nan
    deltas = ts.sort_values().diff().dropna()
    positive = deltas[deltas > pd.Timedelta(0)]
    if positive.empty:
        return np.nan
    return float(positive.median().total_seconds()) / 3600.0

# Inferred sampling interval (hours) keyed by model label.
sampling_by_model = {}
for df in dfs:
    if df.empty:
        # No rows means no label to read (.iloc[0] would raise IndexError)
        # and no interval to infer.
        continue
    # The label is taken from the first row of the frame.
    model = df["model"].iloc[0]
    sampling_by_model[model] = infer_delta_hours(df["ts"])
sampling_by_model

In [None]:
# --- Compute per-model metrics ---
def safe_mape(y_true, y_pred):
    """Mean absolute percentage error (in %) over rows with a usable actual value.

    Only pairs whose actual value is finite and greater than 1e-6 contribute,
    which avoids dividing by zero or near-zero actuals. Pairs with a
    non-finite prediction are ignored as well, so one NaN cannot turn the
    whole result into NaN.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length
        Actual and predicted values.

    Returns
    -------
    float
        MAPE in percent, or NaN when no pair qualifies.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # NaN > 1e-6 is False, so missing actuals already drop out here.
    mask = (y_true > 1e-6) & np.isfinite(y_true) & np.isfinite(y_pred)
    if not mask.any():
        return np.nan
    rel_err = np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])
    return float(np.mean(rel_err) * 100.0)

# Fixed column order. Passing it to the DataFrame constructor also keeps the
# table well-formed (and sortable) when no model yields any usable rows.
METRIC_COLUMNS = [
    "model", "rows", "sampling_h", "MAE_kW", "RMSE_kW", "NMAE_%_of_rated",
    "MAPE_%", "EnergyBias_kWh", "PI_mean", "PI_median", "PI_p10", "PI_p90",
]

rows = []
for df in dfs:
    # Error metrics need both the actual and the predicted value.
    d = df.dropna(subset=["y","y_hat"]).copy()
    if d.empty:
        # Checked before reading the label: on a frame with no rows at all
        # df["model"].iloc[0] would raise IndexError.
        continue
    model = df["model"].iloc[0]

    err = d["y_hat"] - d["y"]            # signed error; positive = over-prediction
    mae = float(np.mean(np.abs(err)))
    rmse = float(np.sqrt(np.mean(err**2)))
    nmae = mae / RATED_POWER_KW * 100.0  # MAE as a percentage of rated power

    # Signed energy error: each row's error (kW) times the sampling step (h).
    # When the step is unknown the bias is reported as NaN; a 0.0 would be
    # indistinguishable from a genuinely unbiased model.
    dh = sampling_by_model.get(model, np.nan)
    dh_known = bool(np.isfinite(dh))
    energy_bias_kwh = float(np.nansum(err * dh)) if dh_known else np.nan

    # Summary statistics of the optional "pi" column.
    if "pi" in d.columns:
        pi = d["pi"]
        pi_mean = float(pi.mean())
        pi_median = float(pi.median())
        pi_p10 = float(pi.quantile(0.1))
        pi_p90 = float(pi.quantile(0.9))
    else:
        pi_mean = pi_median = pi_p10 = pi_p90 = np.nan

    rows.append({
        "model": model,
        "rows": int(len(d)),
        "sampling_h": float(dh) if dh_known else np.nan,
        "MAE_kW": mae,
        "RMSE_kW": rmse,
        "NMAE_%_of_rated": nmae,
        "MAPE_%": safe_mape(d["y"], d["y_hat"]),
        "EnergyBias_kWh": energy_bias_kwh,
        "PI_mean": pi_mean,
        "PI_median": pi_median,
        "PI_p10": pi_p10,
        "PI_p90": pi_p90
    })

# Best model first: lowest normalized MAE, ties broken by RMSE.
metrics = pd.DataFrame(rows, columns=METRIC_COLUMNS).sort_values(["NMAE_%_of_rated","RMSE_kW"])
metrics

In [None]:
# Persist the metrics table as a CSV next to the prediction files,
# report where it went, then display the table again.
metrics_path = DATA_DIR.joinpath("model_metrics.csv")
metrics.to_csv(path_or_buf=metrics_path, index=False)
print(f"Saved metrics to: {metrics_path.resolve()}")
metrics

In [None]:
# --- Merge multiple models for head-to-head plots ---
from functools import reduce

def prepare_for_merge(df):
    """Reduce a per-model frame to the columns needed for a side-by-side merge.

    Keeps ``ts`` and ``y`` and renames ``y_hat`` to ``y_hat_<model>``, where
    ``<model>`` is the value of the ``model`` column in the first row. The
    input frame is left unmodified; a new frame is returned.
    """
    label = df["model"].iloc[0]
    slim = df.loc[:, ["ts", "y", "y_hat"]].copy()
    slim.columns = ["ts", "y", f"y_hat_{label}"]
    return slim

# prepare_for_merge reads the model label from the first row, so frames
# without rows are left out (they would raise IndexError and add no data).
frames_to_merge = [prepare_for_merge(df) for df in dfs if not df.empty]
if not frames_to_merge:
    raise ValueError("No non-empty prediction frames available to merge")

# Outer-join on (ts, y): every model contributes its own y_hat_<model> column,
# and rows present for only some models are kept with NaN for the others.
merged = reduce(lambda left,right: pd.merge(left, right, on=["ts","y"], how="outer"),
                frames_to_merge)

merged = merged.sort_values("ts")
merged.head()

In [None]:
# --- Time-series plot: actual vs. predictions of up to three models ---
import matplotlib.pyplot as plt

# At most three prediction columns, in the order they appear in `merged`.
pred_cols = [name for name in merged.columns if name.startswith("y_hat_")][:3]

fig, ax = plt.subplots(figsize=(12, 4))
if not pred_cols:
    ax.text(0.5, 0.5, "No prediction columns found", ha="center")
else:
    # Thin the series to every PLOT_EVERY-th row; values <= 1 keep all rows.
    if PLOT_EVERY <= 1:
        s = merged
    else:
        s = merged.iloc[::PLOT_EVERY, :]
    ax.plot(s["ts"], s["y"], label="Actual (y)")
    for c in pred_cols:
        ax.plot(s["ts"], s[c], label=c)
    ax.set_xlabel("Time")
    ax.set_ylabel("Power (kW)")
    ax.set_title("Actual vs Predicted (subset)")
    ax.legend()
plt.show()

In [None]:
# --- Scatter plots: y_hat vs y per model ---
for df in dfs:
    d = df.dropna(subset=["y","y_hat"])
    if d.empty:
        # Checked before reading the label: df["model"].iloc[0] raises
        # IndexError when the frame has no rows at all.
        continue
    mdl = df["model"].iloc[0]

    fig, ax = plt.subplots(figsize=(4.5, 4.5))
    ax.scatter(d["y"], d["y_hat"], s=5)
    ax.set_xlabel("Actual y (kW)")
    ax.set_ylabel("Predicted y_hat (kW)")
    ax.set_title(f"y_hat vs y — {mdl}")
    # Identity line (y_hat == y) across the observed range of y, as a reference
    # for perfect predictions.
    y_lo, y_hi = d["y"].min(), d["y"].max()
    ax.plot([y_lo, y_hi], [y_lo, y_hi])
    plt.show()
    # Release the figure so a long list of models does not accumulate open figures.
    plt.close(fig)

In [None]:
# --- Residual histograms per model ---
for df in dfs:
    d = df.dropna(subset=["y","y_hat"])
    if d.empty:
        # Checked before reading the label: df["model"].iloc[0] raises
        # IndexError when the frame has no rows at all.
        continue
    mdl = df["model"].iloc[0]

    # Signed residuals; positive values are over-predictions.
    err = (d["y_hat"] - d["y"]).values
    fig, ax = plt.subplots(figsize=(5, 3.5))
    ax.hist(err, bins=50)
    ax.set_xlabel("Residual (y_hat - y) kW")
    ax.set_ylabel("Count")
    ax.set_title(f"Residuals — {mdl}")
    plt.show()
    # Release the figure so a long list of models does not accumulate open figures.
    plt.close(fig)