In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from xgboost import XGBRegressor

# 1) Read the training table from HDF5 and order it by timestamp
df = pd.read_hdf(
    r'c:\Users\Linds\Repos\East_River\data\training\east_river_training-v1.h5',
    key='df',
)
df = df.sort_values(by='local_time')

# 2) Horizons (used to build the y_plus_<H>h target column names) and the
#    columns that must not be fed to the model as features
horizons = [24, 48, 72]
leak = [
    'local_time',
    'last_control_time',
    'OnLine_Load_MW',
    'Load_Control_MW',
    'Control_Threshold_MW',
]
# Every horizon's target column is excluded too, whichever one is being fit.
leak += [f'y_plus_{h}h' for h in horizons]
feature_cols = [col for col in df.columns if col not in leak]

# 3) Time-ordered CV splitter and the tuning subset (the first 12,101,628
#    rows of the sorted frame)
tscv = TimeSeriesSplit(n_splits=2)
tune = df.iloc[0:12_101_628]

# 4) Build pipeline
def _drop_location(X: pd.DataFrame) -> pd.DataFrame:
    """Return ``X`` without its 'location' column.

    This is a named module-level function rather than a lambda because a
    FunctionTransformer built around a lambda cannot be pickled with the
    standard pickler, which would block persisting the fitted pipeline or
    search object.
    """
    return X.drop(columns=['location'])


# Step 1 removes the 'location' column; step 2 is the gradient-boosted
# regressor. validate=False hands the DataFrame to the function unchanged.
pipe = Pipeline([
    ('drop_loc', FunctionTransformer(_drop_location, validate=False)),
    ('xgb', XGBRegressor(tree_method='hist', random_state=0)),
])

# Candidate values sampled by RandomizedSearchCV; the 'xgb__' prefix routes
# each parameter to the 'xgb' step of the pipeline.
param_grid = {
    'xgb__n_estimators':     [100, 200, 300],
    'xgb__max_depth':        [3, 5, 7],
    'xgb__learning_rate':    [0.01, 0.05, 0.1],
    'xgb__subsample':        [0.7, 0.8, 1.0],
    'xgb__colsample_bytree': [0.7, 0.8, 1.0],
}

# 5) Loop horizons & RandomizedSearchCV
# The feature matrix and the 80/20 positional split are the same for every
# horizon, so build them once instead of re-copying the tuning frame on each
# iteration. The frame was sorted by 'local_time' earlier, so the last 20% of
# rows are the latest ones.
X = tune[feature_cols]
split = int(len(X) * 0.8)
Xtr, Xte = X.iloc[:split], X.iloc[split:]

# NOTE(review): the y_plus_<H>h targets presumably look H hours ahead, so rows
# just before the split (and before each CV fold boundary) may carry targets
# that fall inside the evaluation window. Consider a gap/purge between train
# and evaluation rows -- TODO confirm against how the targets were built.

results = []
for H in horizons:
    # Only the target column changes from one horizon to the next.
    y = tune[f'y_plus_{H}h']
    ytr, yte = y.iloc[:split], y.iloc[split:]

    # A fresh search per horizon: 5 sampled parameter sets scored by MAE on
    # the time-ordered folds. random_state=0 makes every horizon sample the
    # same candidates.
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_grid,
        n_iter=5,
        cv=tscv,
        scoring='neg_mean_absolute_error',
        n_jobs=1,
        random_state=0,
        verbose=1
    )
    search.fit(Xtr, ytr)

    # predict() uses the best candidate refit on all of Xtr (refit defaults
    # to True), evaluated here on the held-out final 20%.
    ypred = search.predict(Xte)
    results.append({
        'horizon_h':  H,
        'best_params':search.best_params_,
        # best_score_ is negated MAE, so flip the sign back.
        'cv_MAE':      -search.best_score_,
        'MAE':          mean_absolute_error(yte, ypred),
        'RMSE':         root_mean_squared_error(yte, ypred)
    })

# Last expression of the cell: one summary row per horizon.
pd.DataFrame(results).set_index('horizon_h')