improved multi-pipe model

- Enforces a 0–60/60–80/80–100 % train/val/test split
- Uses early stopping on each horizon
- Increases your TS splits to 5 for any future CV
- Drops the MultiOutputRegressor in favor of one XGBRegressor per horizon

In [3]:
# 1. Imports & Config
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import joblib

DATA_PATH    = r"C:\Users\Linds\Repos\East_River\data\training\east_river_training-v2.h5"
HORIZONS     = [24, 48, 72]
TS_CV        = TimeSeriesSplit(n_splits=5)             # increased folds
MODEL_PARAMS = dict(
    tree_method='hist',
    random_state=0,
    verbosity=0,
    eval_metric='rmse',
    early_stopping_rounds=20                         # early stopping
)

# 2. Load preprocessed data (unchanged)
def load_data(path):
    return pd.read_hdf(path, key='df')

# 3. Prepare X & multi‑output y (unchanged)
def prepare(df):
    drop_cols = [
        'local_time','last_control_time',
        'OnLine_Load_MW','Load_Control_MW','Control_Threshold_MW'
    ] + [f'y_plus_{h}h' for h in HORIZONS]
    feats = [c for c in df.columns if c not in drop_cols]
    X = df[feats].drop(columns=['location'], errors='ignore')
    y = df[[f'y_plus_{h}h' for h in HORIZONS]]
    return X, y

# 4. Train & evaluate per‑horizon XGB with clear chronology + early‑stop
def run_multi():
    df = load_data(DATA_PATH)
    X, y = prepare(df)

    n = len(X)
    train_end = int(0.6 * n)
    val_end   = int(0.8 * n)

    X_tr,  X_val, X_ho = X.iloc[:train_end], X.iloc[train_end:val_end], X.iloc[val_end:]
    y_tr,  y_val, y_ho = y.iloc[:train_end], y.iloc[train_end:val_end], y.iloc[val_end:]

    models = {}
    y_pred  = np.zeros_like(y_ho.values)

    for idx, h in enumerate(HORIZONS):
        print(f"\nTraining {h}h model with early stopping…")
        m = XGBRegressor(**MODEL_PARAMS)
        m.fit(
            X_tr, y_tr.iloc[:, idx],
            eval_set=[(X_val, y_val.iloc[:, idx])]
        )
        models[h]     = m
        y_pred[:, idx] = m.predict(X_ho)

        mae  = mean_absolute_error(y_ho.iloc[:,idx], y_pred[:,idx])
        rmse = mean_squared_error(y_ho.iloc[:,idx], y_pred[:,idx], squared=False)
        print(f" ⇒ {h}h HO MAE: {mae:.2f}, RMSE: {rmse:.2f}")

    joblib.dump(models, "xgb_multi_horizon_models.pkl")
    return models

if __name__ == "__main__":
    models = run_multi()


Training 24h model with early stopping…
[0]	validation_0-rmse:58.81391
[1]	validation_0-rmse:44.11107
[2]	validation_0-rmse:33.84137
[3]	validation_0-rmse:26.89883
[4]	validation_0-rmse:22.90484
[5]	validation_0-rmse:20.10334
[6]	validation_0-rmse:18.13600
[7]	validation_0-rmse:16.88642
[8]	validation_0-rmse:16.33261
[9]	validation_0-rmse:15.89845
[10]	validation_0-rmse:15.44428
[11]	validation_0-rmse:15.20904
[12]	validation_0-rmse:15.04896
[13]	validation_0-rmse:14.88466
[14]	validation_0-rmse:14.72364
[15]	validation_0-rmse:14.67339
[16]	validation_0-rmse:14.59535
[17]	validation_0-rmse:14.53109
[18]	validation_0-rmse:14.50058
[19]	validation_0-rmse:14.52294
[20]	validation_0-rmse:14.47883
[21]	validation_0-rmse:14.40865
[22]	validation_0-rmse:14.41199
[23]	validation_0-rmse:14.37252
[24]	validation_0-rmse:14.36456
[25]	validation_0-rmse:14.32017
[26]	validation_0-rmse:14.26101
[27]	validation_0-rmse:14.22164
[28]	validation_0-rmse:14.14699
[29]	validation_0-rmse:14.11098
[30]	vali



 ⇒ 24h HO MAE: 10.72, RMSE: 14.77

Training 48h model with early stopping…
[0]	validation_0-rmse:59.00020
[1]	validation_0-rmse:44.26427
[2]	validation_0-rmse:34.18180
[3]	validation_0-rmse:27.37007
[4]	validation_0-rmse:22.82072
[5]	validation_0-rmse:20.03945
[6]	validation_0-rmse:17.91239
[7]	validation_0-rmse:16.84052
[8]	validation_0-rmse:16.14281
[9]	validation_0-rmse:15.76717
[10]	validation_0-rmse:15.40760
[11]	validation_0-rmse:15.20719
[12]	validation_0-rmse:15.04820
[13]	validation_0-rmse:14.86596
[14]	validation_0-rmse:14.71354
[15]	validation_0-rmse:14.63077
[16]	validation_0-rmse:14.54311
[17]	validation_0-rmse:14.48943
[18]	validation_0-rmse:14.44174
[19]	validation_0-rmse:14.42697
[20]	validation_0-rmse:14.38588
[21]	validation_0-rmse:14.38443
[22]	validation_0-rmse:14.38848
[23]	validation_0-rmse:14.35986
[24]	validation_0-rmse:14.36405
[25]	validation_0-rmse:14.38544
[26]	validation_0-rmse:14.37333
[27]	validation_0-rmse:14.37477
[28]	validation_0-rmse:14.38941
[29]	va



 ⇒ 48h HO MAE: 11.25, RMSE: 15.27

Training 72h model with early stopping…
[0]	validation_0-rmse:59.04976
[1]	validation_0-rmse:44.54137
[2]	validation_0-rmse:34.38589
[3]	validation_0-rmse:27.73975
[4]	validation_0-rmse:23.50458
[5]	validation_0-rmse:20.47273
[6]	validation_0-rmse:18.63962
[7]	validation_0-rmse:17.55136
[8]	validation_0-rmse:16.67464
[9]	validation_0-rmse:16.34665
[10]	validation_0-rmse:15.86827
[11]	validation_0-rmse:15.47255
[12]	validation_0-rmse:15.35867
[13]	validation_0-rmse:15.21923
[14]	validation_0-rmse:15.14526
[15]	validation_0-rmse:15.03896
[16]	validation_0-rmse:14.99063
[17]	validation_0-rmse:14.94299
[18]	validation_0-rmse:14.92332
[19]	validation_0-rmse:14.90095
[20]	validation_0-rmse:14.86686
[21]	validation_0-rmse:14.82677
[22]	validation_0-rmse:14.71610
[23]	validation_0-rmse:14.75014
[24]	validation_0-rmse:14.72893
[25]	validation_0-rmse:14.69404
[26]	validation_0-rmse:14.68469
[27]	validation_0-rmse:14.68273
[28]	validation_0-rmse:14.67756
[29]	va



 ⇒ 72h HO MAE: 11.72, RMSE: 16.05


In [5]:
# reload data & split
#
X, y = prepare(df)
split = int(0.8 * len(X))
X_ho = X.iloc[split:]
y_ho = y.iloc[split:]

# load the dict of horizon→model
models = joblib.load("xgb_multi_horizon_models.pkl")

fig, axes = plt.subplots(len(HORIZONS), 3, figsize=(15, 5 * len(HORIZONS)))

for i, h in enumerate(HORIZONS):
    model = models[h]
    y_true = y_ho.iloc[:, i]
    y_pred = model.predict(X_ho)
    resid = y_true - y_pred

    # 1) Residual histogram
    ax = axes[i, 0]
    ax.hist(resid, bins=50, edgecolor='k')
    ax.set_title(f"{h}h Residuals")
    ax.set_xlabel("Error (True – Pred)")

    # 2) Actual vs Pred scatter
    ax = axes[i, 1]
    ax.scatter(y_true, y_pred, alpha=0.3)
    mn, mx = y_true.min(), y_true.max()
    ax.plot([mn, mx], [mn, mx], 'r--')
    ax.set_title(f"{h}h Actual vs Pred")
    ax.set_xlabel("True")
    ax.set_ylabel("Pred")

    # 3) Top‑10 feature importance
    ax = axes[i, 2]
    fi = pd.Series(model.feature_importances_, index=X_ho.columns)
    top10 = fi.nlargest(10).sort_values()
    ax.barh(top10.index, top10.values, edgecolor='k')
    ax.set_title(f"{h}h Top‑10 Features")

plt.tight_layout()
plt.show()

NameError: name 'prepare' is not defined

Inital multi-pipe model

In [1]:
# 1. Imports & Config
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_validate
import joblib

DATA_PATH    = r"C:\Users\Linds\Repos\East_River\data\training\east_river_training-v2.h5"
HORIZONS     = [24, 48, 72]
TS_CV        = TimeSeriesSplit(n_splits=3)
MODEL_PARAMS = dict(tree_method='hist', random_state=0)

# 2. Load preprocessed data
def load_data(path):
    return pd.read_hdf(path, key='df')

# 3. Prepare X & multi‑output y
def prepare(df):
    drop_cols = [
        'local_time','last_control_time',
        'OnLine_Load_MW','Load_Control_MW','Control_Threshold_MW'
    ] + [f'y_plus_{h}h' for h in HORIZONS]
    feats = [c for c in df.columns if c not in drop_cols]
    X = df[feats].drop(columns=['location'], errors='ignore')
    y = df[[f'y_plus_{h}h' for h in HORIZONS]]
    return X, y

# 4. Train & evaluate multi‑output model
def run_multi():
    df = load_data(DATA_PATH)
    X, y = prepare(df)
    split = int(0.8 * len(X))
    X_tr, X_ho = X.iloc[:split], X.iloc[split:]
    y_tr, y_ho = y.iloc[:split], y.iloc[split:]

    mor = MultiOutputRegressor(XGBRegressor(**MODEL_PARAMS), n_jobs=1)
    # cross‑validate each output
    cv = cross_validate(mor, X_tr, y_tr, cv=TS_CV,
                        scoring=['neg_mean_absolute_error','neg_root_mean_squared_error'],
                        n_jobs=1)
    mor.fit(X_tr, y_tr)
    y_pred = mor.predict(X_ho)

    # per‑horizon metrics
    for idx, h in enumerate(HORIZONS):
        mae  = mean_absolute_error(y_ho.iloc[:,idx], y_pred[:,idx])
        rmse = mean_squared_error(y_ho.iloc[:,idx], y_pred[:,idx], squared=False)
        print(f"{h}h — HO MAE: {mae:.2f}, RMSE: {rmse:.2f}")

    joblib.dump(mor, "xgb_multi_horizon.pkl")
    return mor, cv

if __name__ == "__main__":
    model, cv_res = run_multi()
    print("CV MAE:", -cv_res['test_neg_mean_absolute_error'].mean())



24h — HO MAE: 9.54, RMSE: 13.03
48h — HO MAE: 10.01, RMSE: 13.59
72h — HO MAE: 10.36, RMSE: 13.97
CV MAE: 12.017614023724285


In [2]:
# print both CV MAE and CV RMSE
print("CV MAE: ", -cv_res['test_neg_mean_absolute_error'].mean())
print("CV RMSE:", -cv_res['test_neg_root_mean_squared_error'].mean())

CV MAE:  12.017614023724285
CV RMSE: 17.014191543957047


In [None]:
# 5. Compare against existing Control_Threshold logic
import joblib
df_full = pd.read_hdf(DATA_PATH, key='df')  # raw v2 includes actual & threshold
print("\n=== Hold‑out: Forecast vs Threshold ===")
for h in HORIZONS:
    # align actual & threshold for H‑hour ahead
    actual = df_full['OnLine_Load_MW'].shift(-h).dropna().iloc[split:]
    threshold = df_full['Control_Threshold_MW'].shift(-h).dropna().iloc[split:]
    # load your saved model and predict on the same hold‑out X
    model = joblib.load(f"xgb_v2_{h}h.pkl")
    y_pred = model.predict(X.iloc[split:])
    # compute MAEs
    mae_f = mean_absolute_error(actual, y_pred)
    mae_t = mean_absolute_error(actual, threshold)
    imp  = (mae_t - mae_f) / mae_t * 100
    print(f"{h}h — MAE(threshold)={mae_t:.2f}, MAE(forecast)={mae_f:.2f}, Δ={imp:.1f}%")