In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Setup & preprocessing data

In [2]:
import os, re, math, warnings
import numpy as np
import pandas as pd
import inspect

from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import math

warnings.filterwarnings("ignore")

In [3]:
# df = pd.read_csv('/content/drive/MyDrive/duong/STLF/Data/merge_weather_energy_hanoi_20202025_norm.csv')
df = pd.read_csv('/content/drive/MyDrive/STLF/Data/merge_weather_energy_hanoi_20202025_norm.csv')
df

Unnamed: 0,Temperature,Weather,Precipitation,Chance of snow,Humidity,Wind,Wind Gust,Wind Degree,Wind Direction,Cloud Cover,Visibility,timestamp,is_weekend,season,is_holiday,total_consumption_mw
0,25.4,Patchy rain possible,0.6,0.0,89.0,2.194444,4.388889,295.0,WNW,89.0,9.0,2020-01-01 00:00:00,0,winter,False,1790.10
1,25.1,Partly cloudy,0.0,0.0,90.0,2.611111,5.111111,297.0,WNW,34.0,10.0,2020-01-01 01:00:00,0,winter,False,1452.26
2,24.7,Patchy rain possible,0.0,0.0,91.0,2.805556,5.500000,309.0,NW,87.0,10.0,2020-01-01 02:00:00,0,winter,False,1483.75
3,24.5,Cloudy,0.0,0.0,92.0,2.611111,4.888889,325.0,NW,71.0,10.0,2020-01-01 03:00:00,0,winter,False,1890.07
4,24.1,Patchy rain possible,0.0,0.0,93.0,2.305556,4.000000,326.0,NNW,100.0,10.0,2020-01-01 04:00:00,0,winter,False,1371.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47444,26.8,Partly cloudy,0.0,0.0,84.0,1.888889,3.388889,109.0,ESE,56.0,10.0,2025-05-30 20:00:00,0,spring,False,2449.28
47445,26.5,Partly cloudy,0.0,0.0,86.0,2.388889,4.194444,126.0,SE,52.0,10.0,2025-05-30 21:00:00,0,spring,False,2554.05
47446,26.1,Patchy rain possible,0.0,0.0,88.0,2.388889,4.194444,149.0,SSE,84.0,10.0,2025-05-30 22:00:00,0,spring,False,1895.41
47447,25.6,Patchy rain possible,0.5,0.0,91.0,2.305556,4.111111,152.0,SSE,76.0,9.0,2025-05-30 23:00:00,0,spring,False,1558.67


In [6]:
df['Chance of snow'].value_counts()

Unnamed: 0_level_0,count
Chance of snow,Unnamed: 1_level_1
0.0,47449


In [4]:
CAT_COLS   = ['Weather','Wind Direction','season','is_holiday']
EXOG_NUM   = ['Temperature','Precipitation','Humidity',
              'Wind','Wind Gust','Wind Degree','Cloud Cover','Visibility']
HORIZON    = 24                       # forecast t+1…t+24
TARGET_LAGS   = range(1,49)           # lags of target (1…48)
ROLL_WINDOWS  = [3,6,12,24,48]        # rolling windows
EXOG_LAGS     = [0,1,3,6,12,24]       # lags for exogenous vars
TARGET_COL = 'total_consumption_mw'
TIME_COL   = 'timestamp'

In [5]:
df[TIME_COL] = pd.to_datetime(df[TIME_COL])
df = df.sort_values(TIME_COL).reset_index(drop=True)

In [6]:
# make sure categoricals are strings
for col in CAT_COLS:
    df[col] = df[col].astype(str)

In [7]:
# add calendar/time features
df['hour']      = df[TIME_COL].dt.hour
df['dow']       = df[TIME_COL].dt.dayofweek
df['dom']       = df[TIME_COL].dt.day
df['month']     = df[TIME_COL].dt.month
df['doy']       = df[TIME_COL].dt.dayofyear
df['hour_sin']  = np.sin(2*np.pi*df['hour']/24)
df['hour_cos']  = np.cos(2*np.pi*df['hour']/24)
df['dow_sin']   = np.sin(2*np.pi*df['dow']/7)
df['dow_cos']   = np.cos(2*np.pi*df['dow']/7)
df['doy_sin']   = np.sin(2*np.pi*df['doy']/365.25)
df['doy_cos']   = np.cos(2*np.pi*df['doy']/365.25)

In [None]:
# ✓ 0) Sanity: is a GPU attached?
!nvidia-smi

# ✓ 1) Version check
import xgboost as xgb
print("XGBoost version:", xgb.__version__)

# ✓ 2) Minimal GPU training — if this runs, GPU is working
from sklearn.datasets import make_regression
from xgboost import XGBRegressor
import numpy as np

X, y = make_regression(n_samples=5000, n_features=50, noise=0.1, random_state=42)

xgb_gpu = XGBRegressor(
    tree_method="gpu_hist",       # <-- GPU trainer
    predictor="gpu_predictor",    # <-- GPU predictor
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)
xgb_gpu.fit(X, y)
print("Trained successfully with GPU!")


Sat Aug 30 12:33:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   57C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Feature engineering

In [8]:
# 2. feature engineering
# create lagged target features
for L in TARGET_LAGS:
    df[f'y_lag_{L}'] = df[TARGET_COL].shift(L)

# rolling statistics on the target, with one‑step shift to avoid leakage
for W in ROLL_WINDOWS:
    df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
    df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()

# lagged exogenous features
for col in EXOG_NUM:
    for L in EXOG_LAGS:
        df[f'{col}_lag{L}'] = df[col].shift(L)

# multi‑horizon labels
for h in range(1, HORIZON+1):
    df[f'y_t+{h}'] = df[TARGET_COL].shift(-h)

# drop rows with NaNs (from lagging & shifting)no
df_feat = df.dropna().reset_index(drop=True)
df_feat.shape

(47377, 157)

# Data pre-processing

In [16]:
np.max(df_feat['timestamp'])

Timestamp('2025-05-30 00:00:00')

In [18]:
df_feat.shape[0] - 8*7*24

46033

In [9]:
# 3. train/valid/test split by time
label_cols   = [f'y_t+{h}' for h in range(1, HORIZON+1)]
feature_cols = [c for c in df_feat.columns if c not in label_cols + [TIME_COL, TARGET_COL]]

X = df_feat[feature_cols]
Y = df_feat[label_cols].values

N        = len(X)
train_end= int(N - 2*8*7*24)
valid_end= int(N - 8*7*24)

X_train, Y_train = X.iloc[:train_end],    Y[:train_end]
X_valid, Y_valid = X.iloc[train_end:valid_end], Y[train_end:valid_end]
X_test,  Y_test  = X.iloc[valid_end:],    Y[valid_end:]

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)
print(X_test.shape, Y_test.shape)

(44689, 131) (44689, 24)
(1344, 131) (1344, 24)
(1344, 131) (1344, 24)


In [17]:
np.max(df_feat[:valid_end ]['timestamp'])

Timestamp('2025-04-04 00:00:00')

In [10]:
# 4. preprocessor: one‑hot encode the 4 categorical columns, pass others through
preprocess = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CAT_COLS),
        ('num', 'passthrough', [c for c in feature_cols if c not in CAT_COLS])
    ],
    remainder='drop'
)


In [11]:
# Scorer: average RMSE across 24 horizons (negated for maximization)
# =========================
def avg_neg_rmse(y_true, y_pred):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    rmse_per_h = np.sqrt(np.mean((y_true - y_pred)**2, axis=0))
    return -float(np.mean(rmse_per_h))

In [12]:
scorer = make_scorer(avg_neg_rmse, greater_is_better=True)
scorer

make_scorer(avg_neg_rmse, response_method='predict')

In [13]:
# train+valid for tuning
X_trv = pd.concat([X_train, X_valid]); Y_trv = np.vstack([Y_train, Y_valid])
tscv = TimeSeriesSplit(n_splits=5)

In [14]:
tscv = TimeSeriesSplit(n_splits=3)

# XGBoost tuning

In [None]:
# =========================
# 3) XGBoost tuning
# =========================
xgb_base = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1, tree_method='gpu_hist')
xgb_pipe = Pipeline([('prep', preprocess),
                     ('model', MultiOutputRegressor(xgb_base, n_jobs=1))])

xgb_space = {
    'model__estimator__n_estimators':  [600, 800, 1000, 1200],
    'model__estimator__max_depth':     [6, 8, 10],
    'model__estimator__learning_rate': [0.03, 0.05, 0.08],
    'model__estimator__subsample':     [0.7, 0.8, 1.0],
    'model__estimator__colsample_bytree': [0.7, 0.8, 1.0],
    'model__estimator__min_child_weight': [1, 3, 5],
    'model__estimator__gamma':         [0, 0.1, 0.3],
    'model__estimator__reg_alpha':     [0.0, 0.1, 0.5],
    'model__estimator__reg_lambda':    [0.5, 1.0, 2.0],
}

xgb_tuner = RandomizedSearchCV(
    xgb_pipe, xgb_space, n_iter=12, cv=tscv, scoring=scorer,
    random_state=42, n_jobs=-1, verbose=1, refit=True
)
xgb_tuner.fit(X_trv, Y_trv)
print("Best XGB params:", xgb_tuner.best_params_)
print("Best XGB CV avg-RMSE:", -xgb_tuner.best_score_)

# evaluate on held-out test
Y_hat_xgb = xgb_tuner.predict(X_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_metho

In [16]:
# =========================
# 3) XGBoost tuning
# =========================

xgb_space = {
    'model__estimator__n_estimators':  [600, 800, 1000],
    'model__estimator__max_depth':     [6, 8],
    'model__estimator__learning_rate': [0.03, 0.05, 0.08],
    'model__estimator__subsample':     [0.7, 0.9],
    'model__estimator__colsample_bytree': [0.7, 0.9],
    'model__estimator__min_child_weight': [1, 3],
    'model__estimator__gamma':         [0, 0.1],
    'model__estimator__reg_alpha':     [0.0, 0.1],
    'model__estimator__reg_lambda':    [0.5, 1.0],
}

xgb_pipe = Pipeline([
    ('prep', preprocess),
    ('model', MultiOutputRegressor(XGBRegressor(objective='reg:squarederror',
                                                tree_method='hist', # 'gpu_hist' if GPU
                                                random_state=42, n_jobs=-1),
                                   n_jobs=1))
])

xgb_tuner = RandomizedSearchCV(
    xgb_pipe, xgb_space, n_iter=12, cv=tscv, scoring=scorer,
    random_state=42, n_jobs=-1, verbose=1, refit=True
)

xgb_tuner.fit(X_trv, Y_trv)
print("Best XGB params:", xgb_tuner.best_params_)
print("Best XGB CV avg-RMSE:", -xgb_tuner.best_score_)

# evaluate on held-out test
Y_hat_xgb = xgb_tuner.predict(X_test)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best XGB params: {'model__estimator__subsample': 0.7, 'model__estimator__reg_lambda': 0.5, 'model__estimator__reg_alpha': 0.0, 'model__estimator__n_estimators': 1000, 'model__estimator__min_child_weight': 3, 'model__estimator__max_depth': 6, 'model__estimator__learning_rate': 0.03, 'model__estimator__gamma': 0.1, 'model__estimator__colsample_bytree': 0.7}
Best XGB CV avg-RMSE: 320.7161990859629


In [17]:
print("Best XGB params:", xgb_tuner.best_params_)
print("Best XGB CV avg-RMSE:", -xgb_tuner.best_score_)

Best XGB params: {'model__estimator__subsample': 0.7, 'model__estimator__reg_lambda': 0.5, 'model__estimator__reg_alpha': 0.0, 'model__estimator__n_estimators': 1000, 'model__estimator__min_child_weight': 3, 'model__estimator__max_depth': 6, 'model__estimator__learning_rate': 0.03, 'model__estimator__gamma': 0.1, 'model__estimator__colsample_bytree': 0.7}
Best XGB CV avg-RMSE: 320.7161990859629


In [18]:
import joblib

# Save the lgb_tuner object
joblib.dump(xgb_tuner, 'xgb_tuner.pkl')

print("xgb_tuner saved to xgb_tuner.pkl")

xgb_tuner saved to xgb_tuner.pkl


In [None]:
import joblib
xgb_tuner = joblib.load('/content/drive/MyDrive/duong/STLF/Saved_models/xgb_tuner.pkl')

AttributeError: Can't get attribute '_PredictScorer' on <module 'sklearn.metrics._scorer' from '/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py'>

In [19]:
xgb_tuner.best_params_

{'model__estimator__subsample': 0.7,
 'model__estimator__reg_lambda': 0.5,
 'model__estimator__reg_alpha': 0.0,
 'model__estimator__n_estimators': 1000,
 'model__estimator__min_child_weight': 3,
 'model__estimator__max_depth': 6,
 'model__estimator__learning_rate': 0.03,
 'model__estimator__gamma': 0.1,
 'model__estimator__colsample_bytree': 0.7}

In [None]:
import xgboost as xgb
# 1) Extract best params from your tuner and map to xgb.train names
bp = xgb_tuner.best_params_
def g(k, default=None): return bp.get(f"model__estimator__{k}", default)

xgb_params = {
    "max_depth":         g("max_depth"),
    "eta":               g("learning_rate"),
    "subsample":         g("subsample"),
    "colsample_bytree":  g("colsample_bytree"),
    "min_child_weight":  g("min_child_weight"),
    "gamma":             g("gamma"),
    "lambda":            g("reg_lambda"),
    "alpha":             g("reg_alpha"),
    "objective":         "reg:squarederror",
    "eval_metric":       "rmse",
    "verbosity":         0,
    #"tree_method":       "gpu_hist",   # set to 'gpu_hist' if you have a GPU
}
num_boost_round = g("n_estimators", 4000)

# 2) Fit encoder on train+valid once; transform arrays for fast training
X_trv_enc  = preprocess.fit_transform(X_trv)
X_test_enc = preprocess.transform(X_test)

# 3) Early-stopping split: last 15% of TRV as eval (keeps time order)
cut = int(0.85 * X_trv_enc.shape[0])
X_tr, X_ev = X_trv_enc[:cut], X_trv_enc[cut:]
Y_tr, Y_ev = Y_trv[:cut],     Y_trv[cut:]

# 4) Train 24 boosters with early stopping
models_xgb = []
for h in range(HORIZON):  # 0..23 for t+1..t+24
    dtrain = xgb.DMatrix(X_tr, label=Y_tr[:, h])
    dvalid = xgb.DMatrix(X_ev, label=Y_ev[:, h])
    booster = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dvalid, "valid")],
        early_stopping_rounds=100,
        verbose_eval=False,
    )
    models_xgb.append(booster)

# 5) Predict on test and report metrics
dtest = xgb.DMatrix(X_test_enc)
Y_hat_xgb = np.column_stack([
    m.predict(dtest, iteration_range=(0, m.best_iteration + 1)) for m in models_xgb
])

def safe_mape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true)
    return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0

rmse_h_xgb = [math.sqrt(mean_squared_error(Y_test[:, h], Y_hat_xgb[:, h])) for h in range(HORIZON)]
nrmse_h_xgb = [rm / (np.mean(Y_test[:, h]) + 1e-6) * 100 for h, rm in enumerate(rmse_h_xgb)]
mape_h_xgb  = [safe_mape(Y_test[:, h], Y_hat_xgb[:, h]) for h in range(HORIZON)]

print("Per-h RMSE:", [round(x,2) for x in rmse_h_xgb])
print("Avg RMSE:  {:.3f}".format(np.mean(rmse_h_xgb)))
print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_h_xgb)))
print("Avg MAPE:  {:.2f}%".format(np.mean(mape_h_xgb)))

Per-h RMSE: [296.59, 325.64, 270.0, 270.46, 277.15, 265.78, 267.23, 259.59, 263.36, 274.65, 272.44, 275.65, 289.8, 283.15, 271.81, 291.29, 274.61, 271.92, 265.46, 284.73, 268.49, 266.76, 286.56, 301.62]
Avg RMSE:  278.114
Avg nRMSE: 11.85%
Avg MAPE:  9.00%


In [20]:
import xgboost as xgb
# 1) Extract best params from your tuner and map to xgb.train names
bp = xgb_tuner.best_params_
def g(k, default=None): return bp.get(f"model__estimator__{k}", default)

xgb_params = {
    "max_depth":         g("max_depth"),
    "eta":               g("learning_rate"),
    "subsample":         g("subsample"),
    "colsample_bytree":  g("colsample_bytree"),
    "min_child_weight":  g("min_child_weight"),
    "gamma":             g("gamma"),
    "lambda":            g("reg_lambda"),
    "alpha":             g("reg_alpha"),
    "objective":         "reg:squarederror",
    "eval_metric":       "rmse",
    "verbosity":         0,
    #"tree_method":       "gpu_hist",   # set to 'gpu_hist' if you have a GPU
}
num_boost_round = g("n_estimators", 4000)

# 2) Fit encoder on train+valid once; transform arrays for fast training
X_trv_enc  = preprocess.fit_transform(X_trv)
X_test_enc = preprocess.transform(X_test)

# 3) Early-stopping split: last 15% of TRV as eval (keeps time order)
cut = int(0.85 * X_trv_enc.shape[0])
X_tr, X_ev = X_trv_enc[:cut], X_trv_enc[cut:]
Y_tr, Y_ev = Y_trv[:cut],     Y_trv[cut:]

# 4) Train 24 boosters with early stopping
models_xgb = []
for h in range(HORIZON):  # 0..23 for t+1..t+24
    dtrain = xgb.DMatrix(X_tr, label=Y_tr[:, h])
    dvalid = xgb.DMatrix(X_ev, label=Y_ev[:, h])
    booster = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dvalid, "valid")],
        early_stopping_rounds=100,
        verbose_eval=False,
    )
    models_xgb.append(booster)

# 5) Predict on test and report metrics
dtest = xgb.DMatrix(X_test_enc)
Y_hat_xgb = np.column_stack([
    m.predict(dtest, iteration_range=(0, m.best_iteration + 1)) for m in models_xgb
])

def safe_mape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true)
    return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0

rmse_h_xgb = [math.sqrt(mean_squared_error(Y_test[:, h], Y_hat_xgb[:, h])) for h in range(HORIZON)]
nrmse_h_xgb = [rm / (np.mean(Y_test[:, h]) + 1e-6) * 100 for h, rm in enumerate(rmse_h_xgb)]
mape_h_xgb  = [safe_mape(Y_test[:, h], Y_hat_xgb[:, h]) for h in range(HORIZON)]

print("Per-h RMSE:", [round(x,2) for x in rmse_h_xgb])
print("Avg RMSE:  {:.3f}".format(np.mean(rmse_h_xgb)))
print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_h_xgb)))
print("Avg MAPE:  {:.2f}%".format(np.mean(mape_h_xgb)))

Per-h RMSE: [300.37, 330.13, 292.41, 301.24, 302.94, 292.55, 290.24, 277.7, 290.68, 316.07, 310.6, 313.78, 313.31, 320.77, 311.67, 323.22, 329.68, 309.68, 294.54, 328.63, 311.57, 291.77, 321.41, 326.5]
Avg RMSE:  308.393
Avg nRMSE: 12.46%
Avg MAPE:  9.66%


# LightGBM tuning

In [None]:
import numpy as np, lightgbm as lgb
X = np.random.rand(1000, 100); y = np.random.rand(1000)
d = lgb.Dataset(X, label=y)
lgb.train({"objective":"regression","device_type":"gpu"}, d, num_boost_round=1)
print("GPU works")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 100
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 100 dense feature groups (0.10 MB) transferred to GPU in 0.003225 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.474785
GPU works


In [15]:
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV

# Base estimator
lgb_base = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1,          # let the trees parallelize
    boosting_type="gbdt",
    verbose=-1,
    max_bin=127,
    #device_type="gpu"
)

# Pipeline (avoid double over-parallelization)
lgb_pipe = Pipeline([
    ("prep", preprocess),
    ("model", MultiOutputRegressor(lgb_base, n_jobs=1))  # keep this at 1 if RandomizedSearchCV uses n_jobs=-1
])

# Search space (lean but effective)
lgb_space = {
  "model__estimator__n_estimators": [300, 500, 700],
  "model__estimator__learning_rate":[0.05, 0.08],
  "model__estimator__num_leaves":  [63, 127],
  "model__estimator__max_depth":   [-1, 8],
  "model__estimator__subsample":   [0.7, 0.9],
  "model__estimator__colsample_bytree":[0.7, 0.9],
  "model__estimator__min_child_samples":[20, 40],
  "model__estimator__reg_alpha":   [0.0, 0.1],
  "model__estimator__reg_lambda":  [0.1, 1.0],
}

# Randomized search (fast pass; bump n_iter if you want wider exploration)
lgb_tuner = RandomizedSearchCV(
    lgb_pipe,
    lgb_space,
    n_iter=8,
    cv=tscv,            # your TimeSeriesSplit
    scoring=scorer,     # your avg-neg-RMSE scorer
    random_state=42,
    n_jobs=-1,          # parallelize candidates × folds
    verbose=1,
    refit=True
)

lgb_tuner.fit(X_trv, Y_trv)

print("Best LGB params:", lgb_tuner.best_params_)
print("Best LGB CV avg-RMSE:", -lgb_tuner.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


KeyboardInterrupt: 

In [None]:
# ===== LGB final refit with early stopping (continue from your state) =====
import numpy as np, math, lightgbm as lgb
from sklearn.metrics import mean_squared_error

# 1) map best params from tuner to lgb.train
bp = lgb_tuner.best_params_
g  = lambda k, d=None: bp.get(f"model__estimator__{k}", d)

lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": g("learning_rate", 0.05),
    "num_leaves": g("num_leaves", 127),
    "max_depth": g("max_depth", -1),
    "feature_fraction": g("colsample_bytree", 0.8),   # alias
    "bagging_fraction": g("subsample", 0.8),          # alias
    "bagging_freq": bp.get("model__estimator__subsample_freq", 0),
    "min_data_in_leaf": g("min_child_samples", 20),
    "lambda_l1": g("reg_alpha", 0.0),
    "lambda_l2": g("reg_lambda", 0.1),
    "verbosity": -1,
    "num_threads": -1,
    # Optional GPU:
    "device_type": "gpu",
    # Optional speed knob during refit:
    "max_bin": 255,
}
# let ES find the best iteration (set bigger than tuned n_estimators)
num_boost_round = 4000
early_stopping_rounds = 100

# 2) encode once
X_trv_enc  = preprocess.fit_transform(X_trv)
X_test_enc = preprocess.transform(X_test)

# 3) carve last 15% of TRV as eval set (keep chronological order)
cut = int(0.85 * X_trv_enc.shape[0])
X_tr, X_ev = X_trv_enc[:cut], X_trv_enc[cut:]
Y_tr, Y_ev = Y_trv[:cut],     Y_trv[cut:]

# 4) train 24 boosters with ES
models_lgb = []
for h in range(HORIZON):
    dtr = lgb.Dataset(X_tr, label=Y_tr[:, h])
    dev = lgb.Dataset(X_ev, label=Y_ev[:, h], reference=dtr)
    booster = lgb.train(
        params=lgb_params,
        train_set=dtr,
        num_boost_round=num_boost_round,
        valid_sets=[dev],
        callbacks=[lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(-1)]
    )
    models_lgb.append(booster)

# 5) predict & metrics
Y_hat_lgb = np.column_stack([m.predict(X_test_enc, num_iteration=m.best_iteration) for m in models_lgb])

def safe_mape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true)
    return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0

rmse_h_lgb = [math.sqrt(mean_squared_error(Y_test[:, h], Y_hat_lgb[:, h])) for h in range(HORIZON)]
nrmse_h_lgb = [rm / (np.mean(Y_test[:, h]) + 1e-6) * 100 for h, rm in enumerate(rmse_h_lgb)]
mape_h_lgb  = [safe_mape(Y_test[:, h], Y_hat_lgb[:, h]) for h in range(HORIZON)]

print("Per-horizon RMSE:", [round(x,2) for x in rmse_h_lgb])
print("Avg RMSE:  {:.3f}".format(np.mean(rmse_h_lgb)))
print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_h_lgb)))
print("Avg MAPE:  {:.2f}%".format(np.mean(mape_h_lgb)))


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[497]	valid_0's rmse: 262.679
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[549]	valid_0's rmse: 264.763
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[578]	valid_0's rmse: 271.19
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[520]	valid_0's rmse: 273.567
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[805]	valid_0's rmse: 282.022
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[930]	valid_0's rmse: 291.961
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[697]	valid_0's rmse: 299.357
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[916]	valid_0's

## DucAn drive

In [15]:
import os
BASE_DIR = "/content/drive/MyDrive/STLF/Saved_models/lgbm_newsplit"  # <- change if you like
os.makedirs(BASE_DIR, exist_ok=True)

In [17]:
import os, json, joblib, numpy as np, gc
from sklearn.model_selection import ParameterSampler
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor

In [19]:
# ==== config for Drive paths ====
N_ITER = 8  # same as your RandomizedSearchCV
PROGRESS_JSON = os.path.join(BASE_DIR, "lgbm_tune_progress.json")
PARAMS_JSON   = os.path.join(BASE_DIR, "lgbm_param_list.json")  # to keep the same sequence on resume
TRIAL_CKP_FMT = os.path.join(BASE_DIR, "lgbm_checkpoint_trial_{:03d}.pkl")
BEST_MODEL_PKL= os.path.join(BASE_DIR, "lgbm_best_model.pkl")

In [16]:
# Base estimator
lgb_base = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1,          # let the trees parallelize
    boosting_type="gbdt",
    verbose=-1,
    max_bin=127,
    #device_type="gpu"
)

# Pipeline (avoid double over-parallelization)
lgb_pipe = Pipeline([
    ("prep", preprocess),
    ("model", MultiOutputRegressor(lgb_base, n_jobs=1))  # keep this at 1 if RandomizedSearchCV uses n_jobs=-1
])

# Search space (lean but effective)
lgb_space = {
  "model__estimator__n_estimators": [300, 500, 700],
  "model__estimator__learning_rate":[0.05, 0.08],
  "model__estimator__num_leaves":  [63, 127],
  "model__estimator__max_depth":   [-1, 8],
  "model__estimator__subsample":   [0.7, 0.9],
  "model__estimator__colsample_bytree":[0.7, 0.9],
  "model__estimator__min_child_samples":[20, 40],
  "model__estimator__reg_alpha":   [0.0, 0.1],
  "model__estimator__reg_lambda":  [0.1, 1.0],
}

# ==== build / load param list deterministically ====
if os.path.exists(PARAMS_JSON):
    with open(PARAMS_JSON, "r") as f:
        param_list = json.load(f)
else:
    from sklearn.model_selection import ParameterSampler
    param_list = list(ParameterSampler(lgb_space, n_iter=N_ITER, random_state=42))
    with open(PARAMS_JSON, "w") as f:
        json.dump(param_list, f)

# ==== load progress if exists ====
if os.path.exists(PROGRESS_JSON):
    with open(PROGRESS_JSON, "r") as f:
        prog = json.load(f)
    completed = set(prog.get("completed_indices", []))
    best_score = prog.get("best_score", -np.inf)
    best_params = prog.get("best_params", None)
else:
    completed, best_score, best_params = set(), -np.inf, None
    prog = {"completed_indices": [], "best_score": best_score, "best_params": best_params}

print(f"Resuming: {len(completed)}/{len(param_list)} trials completed.")
print("Checkpoints folder:", BASE_DIR)

# ==== helper: evaluate one param set with your tscv + scorer ====
def cv_score_for_params(params):
    lgb_base = LGBMRegressor(
        objective="regression",
        random_state=42,
        n_jobs=-1,
        boosting_type="gbdt",
        verbose=-1,
        max_bin=127,
        # GPU if available, else comment out:
        #device_type="gpu"
    )
    pipe = Pipeline([
        ("prep", preprocess),
        ("model", MultiOutputRegressor(lgb_base, n_jobs=1))
    ])
    pipe.set_params(**params)

    fold_scores = []
    for fold_idx, (tr_idx, va_idx) in enumerate(tscv.split(X_trv)):
        X_tr, X_va = X_trv.iloc[tr_idx], X_trv.iloc[va_idx]
        Y_tr, Y_va = Y_trv[tr_idx],      Y_trv[va_idx]

        pipe_fold = clone(pipe)
        pipe_fold.fit(X_tr, Y_tr)
        s = scorer(pipe_fold, X_va, Y_va)   # your avg-neg-RMSE scorer
        fold_scores.append(s)
        # free memory between folds
        del pipe_fold; gc.collect()
    return float(np.mean(fold_scores)), pipe  # return mean score and last-fitted pipe

# ==== main loop with Drive checkpoints ====
for i, params in enumerate(param_list):
    if i in completed:
        print(f"Skipping trial {i+1}: already completed.")
        continue

    print(f"\n--- Trial {i+1}/{len(param_list)} ---")
    print("Params:", params)
    try:
        score, fitted_pipe = cv_score_for_params(params)
        print("CV score (avg-neg-RMSE):", score)

        # save trial checkpoint to Drive
        joblib.dump(fitted_pipe, TRIAL_CKP_FMT.format(i+1), compress=3)

        # update best
        if score > best_score:
            best_score = score
            best_params = params
            joblib.dump(fitted_pipe, BEST_MODEL_PKL, compress=3)
            print(">> New best! Saved:", BEST_MODEL_PKL)

        # update progress JSON on Drive
        completed.add(i)
        prog["completed_indices"] = sorted(list(completed))
        prog["best_score"] = best_score
        prog["best_params"] = best_params
        with open(PROGRESS_JSON, "w") as f:
            json.dump(prog, f)
        print("Progress saved to:", PROGRESS_JSON)

        # free memory
        del fitted_pipe; gc.collect()

    except Exception as e:
        print(f"Trial {i+1} failed: {e}")
        # still save progress before stopping/continuing
        prog["completed_indices"] = sorted(list(completed))
        prog["best_score"] = best_score
        prog["best_params"] = best_params
        with open(PROGRESS_JSON, "w") as f:
            json.dump(prog, f)
        break  # or 'continue' if you want to try the next candidate

# summary
if best_params is not None:
    print("\nBest params so far:", best_params)
    print("Best CV avg-RMSE:", -best_score)
    # Optional: reload best model later
    # best_model = joblib.load(BEST_MODEL_PKL)
else:
    print("No successful trials yet.")

print("Best LGB params:", lgb_tuner.best_params_)
print("Best LGB CV avg-RMSE:", -lgb_tuner.best_score_)

Resuming: 0/8 trials completed.
Checkpoints folder: /content/drive/MyDrive/STLF/Saved_models/lgbm_newsplit

--- Trial 1/8 ---
Params: {'model__estimator__subsample': 0.7, 'model__estimator__reg_lambda': 0.1, 'model__estimator__reg_alpha': 0.1, 'model__estimator__num_leaves': 127, 'model__estimator__n_estimators': 700, 'model__estimator__min_child_samples': 40, 'model__estimator__max_depth': -1, 'model__estimator__learning_rate': 0.08, 'model__estimator__colsample_bytree': 0.9}
CV score (avg-neg-RMSE): -336.73368487022674
>> New best! Saved: /content/drive/MyDrive/STLF/Saved_models/lgbm_newsplit/lgbm_best_model.pkl
Progress saved to: /content/drive/MyDrive/STLF/Saved_models/lgbm_newsplit/lgbm_tune_progress.json

--- Trial 2/8 ---
Params: {'model__estimator__subsample': 0.7, 'model__estimator__reg_lambda': 0.1, 'model__estimator__reg_alpha': 0.1, 'model__estimator__num_leaves': 63, 'model__estimator__n_estimators': 700, 'model__estimator__min_child_samples': 20, 'model__estimator__max_de

NameError: name 'lgb_tuner' is not defined

In [20]:
lgb_tuner = joblib.load(BEST_MODEL_PKL)
lgb_tuner

In [23]:
with open(os.path.join(BASE_DIR, PROGRESS_JSON), "r") as f:
    prog = json.load(f)
best_params = prog.get("best_params", None)
best_params

{'model__estimator__subsample': 0.9,
 'model__estimator__reg_lambda': 0.1,
 'model__estimator__reg_alpha': 0.0,
 'model__estimator__num_leaves': 63,
 'model__estimator__n_estimators': 300,
 'model__estimator__min_child_samples': 20,
 'model__estimator__max_depth': 8,
 'model__estimator__learning_rate': 0.05,
 'model__estimator__colsample_bytree': 0.7}

In [24]:
# Save best params dictionary to Drive
with open(os.path.join(BASE_DIR, "lgbm_best_params.json"), "w") as f:
    json.dump(best_params, f)
print(">> Saved best params to:", os.path.join(BASE_DIR, "lgbm_best_params.json"))

>> Saved best params to: /content/drive/MyDrive/STLF/Saved_models/lgbm_newsplit/lgbm_best_params.json


In [25]:
# ===== LGB final refit with early stopping (continue from your state) =====
import numpy as np, math, lightgbm as lgb
from sklearn.metrics import mean_squared_error

# 1) map best params from tuner to lgb.train
bp = best_params
g  = lambda k, d=None: bp.get(f"model__estimator__{k}", d)

lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": g("learning_rate", 0.05),
    "num_leaves": g("num_leaves", 127),
    "max_depth": g("max_depth", -1),
    "feature_fraction": g("colsample_bytree", 0.8),   # alias
    "bagging_fraction": g("subsample", 0.8),          # alias
    "bagging_freq": bp.get("model__estimator__subsample_freq", 0),
    "min_data_in_leaf": g("min_child_samples", 20),
    "lambda_l1": g("reg_alpha", 0.0),
    "lambda_l2": g("reg_lambda", 0.1),
    "verbosity": -1,
    "num_threads": -1,
    # Optional GPU:
    #"device_type": "gpu",
    # Optional speed knob during refit:
    "max_bin": 255,
}
# let ES find the best iteration (set bigger than tuned n_estimators)
num_boost_round = 4000
early_stopping_rounds = 100

# 2) encode once
X_trv_enc  = preprocess.fit_transform(X_trv)
X_test_enc = preprocess.transform(X_test)

# 3) carve last 15% of TRV as eval set (keep chronological order)
cut = int(0.85 * X_trv_enc.shape[0])
X_tr, X_ev = X_trv_enc[:cut], X_trv_enc[cut:]
Y_tr, Y_ev = Y_trv[:cut],     Y_trv[cut:]

# 4) train 24 boosters with ES
models_lgb = []
for h in range(HORIZON):
    dtr = lgb.Dataset(X_tr, label=Y_tr[:, h])
    dev = lgb.Dataset(X_ev, label=Y_ev[:, h], reference=dtr)
    booster = lgb.train(
        params=lgb_params,
        train_set=dtr,
        num_boost_round=num_boost_round,
        valid_sets=[dev],
        callbacks=[lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(-1)]
    )
    models_lgb.append(booster)

# 5) predict & metrics
Y_hat_lgb = np.column_stack([m.predict(X_test_enc, num_iteration=m.best_iteration) for m in models_lgb])

def safe_mape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true)
    return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0

rmse_h_lgb = [math.sqrt(mean_squared_error(Y_test[:, h], Y_hat_lgb[:, h])) for h in range(HORIZON)]
nrmse_h_lgb = [rm / (np.mean(Y_test[:, h]) + 1e-6) * 100 for h, rm in enumerate(rmse_h_lgb)]
mape_h_lgb  = [safe_mape(Y_test[:, h], Y_hat_lgb[:, h]) for h in range(HORIZON)]

print("Per-horizon RMSE:", [round(x,2) for x in rmse_h_lgb])
print("Avg RMSE:  {:.3f}".format(np.mean(rmse_h_lgb)))
print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_h_lgb)))
print("Avg MAPE:  {:.2f}%".format(np.mean(mape_h_lgb)))


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[318]	valid_0's rmse: 271.614
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[157]	valid_0's rmse: 275.902
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[252]	valid_0's rmse: 272.06
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[298]	valid_0's rmse: 279.215
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[260]	valid_0's rmse: 279.115
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[221]	valid_0's rmse: 276.162
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[232]	valid_0's rmse: 278.915
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[393]	valid_0's

# Results

In [None]:
print("XGBoost Results:")
print("Per-h RMSE:", [round(x,2) for x in rmse_h_xgb])
print("Avg RMSE:  {:.3f}".format(np.mean(rmse_h_xgb)))
print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_h_xgb)))
print("Avg MAPE:  {:.2f}%".format(np.mean(mape_h_xgb)))

XGBoost Results:
Per-h RMSE: [296.59, 325.64, 270.0, 270.46, 277.15, 265.78, 267.23, 259.59, 263.36, 274.65, 272.44, 275.65, 289.8, 283.15, 271.81, 291.29, 274.61, 271.92, 265.46, 284.73, 268.49, 266.76, 286.56, 301.62]
Avg RMSE:  278.114
Avg nRMSE: 11.85%
Avg MAPE:  9.00%


In [None]:
print("LightGB results:")
print("Per-horizon RMSE:", [round(x,2) for x in rmse_h_lgb])
print("Avg RMSE:  {:.3f}".format(np.mean(rmse_h_lgb)))
print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_h_lgb)))
print("Avg MAPE:  {:.2f}%".format(np.mean(mape_h_lgb)))

LightGB results:
Per-horizon RMSE: [274.6, 285.04, 273.35, 276.21, 281.96, 273.42, 270.18, 261.52, 271.67, 277.64, 276.35, 277.89, 280.05, 284.98, 276.67, 274.92, 275.68, 273.08, 265.77, 281.67, 272.1, 267.45, 286.04, 295.74]
Avg RMSE:  276.415
Avg nRMSE: 11.78%
Avg MAPE:  8.74%


In [None]:
Y_hat_xgb.shape, Y_hat_lgb.shape, Y_test.shape

((7107, 24), (7107, 24), (7107, 24))

In [None]:
for h in range(HORIZON):
    print(f"xgb_t+{h+1}:", Y_hat_xgb[:, h][0])
    print(f"lgb_t+{h+1}:", Y_hat_lgb[:, h][0])
    print(f"y_true_t+{h+1}:", Y_test[:, h][0])

xgb_t+1: 1988.4346
lgb_t+1: 2005.6319082389916
y_true_t+1: 1866.44
xgb_t+2: 1563.6069
lgb_t+2: 1556.2699649141357
y_true_t+2: 1549.87
xgb_t+3: 1424.0342
lgb_t+3: 1399.2779444772386
y_true_t+3: 1372.76
xgb_t+4: 1391.7781
lgb_t+4: 1400.8607798082196
y_true_t+4: 1304.0
xgb_t+5: 1439.0226
lgb_t+5: 1467.0696642781
y_true_t+5: 1304.0
xgb_t+6: 1655.1913
lgb_t+6: 1595.92168264926
y_true_t+6: 1516.86
xgb_t+7: 1915.626
lgb_t+7: 1966.0344576286554
y_true_t+7: 2003.4
xgb_t+8: 2836.6973
lgb_t+8: 2878.060269996576
y_true_t+8: 2938.18
xgb_t+9: 3604.7205
lgb_t+9: 3593.8982417774014
y_true_t+9: 4034.65
xgb_t+10: 4490.1543
lgb_t+10: 4495.7502563464195
y_true_t+10: 4437.05
xgb_t+11: 4637.315
lgb_t+11: 4612.101847856309
y_true_t+11: 4694.4
xgb_t+12: 4743.7603
lgb_t+12: 4703.490315495516
y_true_t+12: 4694.4
xgb_t+13: 4709.4863
lgb_t+13: 4699.345637924606
y_true_t+13: 4694.4
xgb_t+14: 4733.4927
lgb_t+14: 4706.422566885368
y_true_t+14: 4694.4
xgb_t+15: 4724.5723
lgb_t+15: 4704.18478239122
y_true_t+15: 4694.4

In [None]:
Y_hat_ens = 0.5 * (Y_hat_xgb + Y_hat_lgb)
print("Ensemble Avg RMSE:", np.mean([math.sqrt(mean_squared_error(Y_test[:,h], Y_hat_ens[:,h])) for h in range(24)]))

Ensemble Avg RMSE: 273.89086614940715


## 8 weeks test set split approach

- Per-h RMSE: [300.37, 330.13, 292.41, 301.24, 302.94, 292.55, 290.24, 277.7, 290.68, 316.07, 310.6, 313.78, 313.31, 320.77, 311.67, 323.22, 329.68, 309.68, 294.54, 328.63, 311.57, 291.77, 321.41, 326.5]
- Avg RMSE:  308.393
- Avg nRMSE: 12.46%
- Avg MAPE:  9.66%

In [26]:
print("Per-horizon RMSE:", [round(x,2) for x in rmse_h_lgb])
print("Avg RMSE:  {:.3f}".format(np.mean(rmse_h_lgb)))
print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_h_lgb)))
print("Avg MAPE:  {:.2f}%".format(np.mean(mape_h_lgb)))

Per-horizon RMSE: [314.31, 319.83, 302.94, 316.46, 312.66, 294.53, 294.67, 274.53, 296.89, 305.23, 325.58, 310.88, 319.59, 314.39, 309.43, 296.83, 332.86, 311.84, 292.17, 327.73, 302.6, 306.86, 325.9, 329.24]
Avg RMSE:  309.914
Avg nRMSE: 12.52%
Avg MAPE:  9.64%


# Old work

In [None]:
# Numeric predictors = all engineered/normalized predictors (except label, timestamp)
X_cols = [
    c for c in df_fe.columns
    if c not in ["timestamp", TARGET, "y_tgt"] + cat_cols
]
num_cols = [c for c in X_cols if pd.api.types.is_numeric_dtype(df_fe[c])]

X_train = train_df[num_cols + cat_cols]
X_val   = val_df[num_cols + cat_cols]
X_test  = test_df[num_cols + cat_cols]

X_train.shape, X_val.shape, X_test.shape

((35064, 63), (8784, 63), (3600, 63))

In [None]:
y_train.shape, y_val.shape, y_test.shape

((35064,), (8784,), (3600,))

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer

def rmse(y_true, y_pred):
    return float(np.sqrt(np.mean((y_true - y_pred)**2)))

def mape(y_true, y_pred, eps=1e-6):
    return float(np.mean(np.abs((y_true - y_pred)/np.maximum(np.abs(y_true), eps))) * 100.0)

def smape(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred))
    out = np.where(denom==0, 0, 2.0*np.abs(y_pred - y_true)/denom)
    return float(np.mean(out) * 100.0)

rmse_scorer = make_scorer(lambda yt, yp: -rmse(yt, yp))  # negate for "higher is better"


In [None]:
trval_df = pd.concat([train_df, val_df], axis=0).reset_index(drop=True)

In [None]:
# Categorical = any object/category columns (exclude timestamp & label)
cat_cols = [c for c in trval_df.columns
            if c not in ["timestamp", "y_tgt"]
            and (str(trval_df[c].dtype) in ["object", "category"])]
cat_cols

['Weather', 'season', 'is_holiday']

In [None]:
# Numeric = everything else except timestamp & label & cats
num_cols = [c for c in trval_df.columns
            if c not in ["timestamp", TARGET, "y_tgt"] + cat_cols
            and pd.api.types.is_numeric_dtype(trval_df[c])]

X_trval = trval_df[num_cols + cat_cols]
y_trval = trval_df["y_tgt"].values

X_test  = test_df[num_cols + cat_cols]
y_test  = test_df["y_tgt"].values

In [None]:
X_trval.shape, y_trval.shape, X_test.shape, y_test.shape

((43848, 63), (43848,), (3600, 63), (3600,))

In [None]:
# Check for missing values before fitting
print("Missing values in X_trval before fitting:")
print(X_trval.isnull().sum().sum())

Missing values in X_trval before fitting:
759


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),   # turns NaN into a token
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pre = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop"
)

In [None]:
print("NaNs in y_trval:", trval_df["y_tgt"].isna().sum())   # must be 0


NaNs in y_trval: 0


In [None]:
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from xgboost import XGBRegressor

tscv_fast = TimeSeriesSplit(n_splits=5)

xgb_pipe = Pipeline([
    ("pre", pre),
    ("model", XGBRegressor(
        random_state=42,
        n_estimators=1200,
        tree_method="gpu_hist",    # or "gpu_hist" if you installed a GPU build
        n_jobs=-1
    ))
])

xgb_param_dist = {
    "model__learning_rate":  [0.03, 0.05, 0.07, 0.1],
    "model__max_depth":      [4, 6, 8, 10],
    "model__min_child_weight":[1, 3, 5, 8],
    "model__subsample":      [0.7, 0.85, 1.0],
    "model__colsample_bytree":[0.7, 0.85, 1.0],
    "model__reg_lambda":     [0.0, 1.0, 3.0],
}

xgb_search = RandomizedSearchCV(
    estimator=xgb_pipe,
    param_distributions=xgb_param_dist,
    n_iter=24,
    scoring=rmse_scorer,      # your negative-RMSE scorer
    cv=tscv_fast,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    error_score="raise"       # helpful while debugging
)

xgb_search.fit(X_trval, y_trval)
print("XGB best CV RMSE:", -xgb_search.best_score_)
print("XGB best params:", xgb_search.best_params_)

# Evaluate on the held-out test
yhat_test_xgb = xgb_search.predict(X_test)
print({
    "XGB_Test_RMSE": rmse(y_test, yhat_test_xgb),
    "XGB_Test_MAE":  mean_absolute_error(y_test, yhat_test_xgb),
    "XGB_Test_MAPE%": mape(y_test, yhat_test_xgb),
    "XGB_Test_sMAPE%": smape(y_test, yhat_test_xgb),
    "XGB_Test_R2":   r2_score(y_test, yhat_test_xgb),
})


Fitting 5 folds for each of 24 candidates, totalling 120 fits
XGB best CV RMSE: 274.734398322277
XGB best params: {'model__subsample': 0.85, 'model__reg_lambda': 3.0, 'model__min_child_weight': 3, 'model__max_depth': 6, 'model__learning_rate': 0.03, 'model__colsample_bytree': 0.85}
{'XGB_Test_RMSE': 399.6059230619843, 'XGB_Test_MAE': np.float64(279.4452435194228), 'XGB_Test_MAPE%': 12.45668378150411, 'XGB_Test_sMAPE%': 12.476697200281645, 'XGB_Test_R2': 0.7583594032487943}


In [None]:
import joblib

# Save the best trained pipeline
joblib.dump(xgb_search.best_estimator_, 'xgb_pipeline.pkl')

print("Trained XGBoost pipeline saved to xgb_pipeline.pkl")

Trained XGBoost pipeline saved to xgb_pipeline.pkl


In [None]:
import joblib
import pandas as pd

# Load the saved pipeline
loaded_xgb_pipeline = joblib.load('xgb_pipeline.pkl')

# Assume you have new data for prediction in a pandas DataFrame called new_data_df
# new_data_df should have the same columns as the data used for training (excluding the target)
# For demonstration purposes, let's use a sample from the test data.
# In a real scenario, you would load your new data here.
new_data_df = X_test.head() # Replace with your actual new data

# Make predictions using the loaded model
predictions = loaded_xgb_pipeline.predict(new_data_df)

print("Predictions for the new data:")
print(predictions)

In [None]:
import json

# Get the best score and best parameters
best_score = xgb_search.best_score_
best_params = xgb_search.best_params_

# Create a dictionary to store the results
results = {
    "best_cv_rmse": -best_score, # Assuming best_score is negative RMSE
    "best_params": best_params
}

# Save the results to a JSON file
with open('xgb_best_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Best CV RMSE and parameters saved to xgb_best_results.json")

Best CV RMSE and parameters saved to xgb_best_results.json


In [None]:
X_train = train_df[num_cols + cat_cols]
y_train = train_df["y_tgt"].values

X_val   = val_df[num_cols + cat_cols]
y_val   = val_df["y_tgt"].values

X_test  = test_df[num_cols + cat_cols]
y_test  = test_df["y_tgt"].values

pre.fit(X_train)
Xt_train = pre.transform(X_train)
Xt_val   = pre.transform(X_val)
Xt_test  = pre.transform(X_test)

In [None]:
def strip_prefix(d, prefix="model__"):
    return {k[len(prefix):]: v for k, v in d.items() if k.startswith(prefix)}

xgb_params = strip_prefix(xgb_search.best_params_) if 'xgb_search' in globals() else {}

xgb_params

{'subsample': 0.85,
 'reg_lambda': 3.0,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.03,
 'colsample_bytree': 0.85}

In [None]:
# Reasonable defaults in case some keys were not present in the grid
xgb_params.setdefault("random_state", 42)
xgb_params.setdefault("tree_method", "hist")
xgb_params.setdefault("n_jobs", -1)

-1

In [None]:
xgb_best_es = XGBRegressor(
    **xgb_params,
    n_estimators=20000,            # large cap; ES will stop much earlier
)
xgb_best_es.fit(
    Xt_train, y_train,
    eval_set=[(Xt_val, y_val)],
    early_stopping_rounds=300,
    verbose=False
)

yhat_tr_x = xgb_best_es.predict(Xt_train)
yhat_v_x  = xgb_best_es.predict(Xt_val)
yhat_te_x = xgb_best_es.predict(Xt_test)

print("\n[XGBoost Early-Stopped]")
print("  best_iteration_:", getattr(xgb_best_es, "best_iteration", None))
print("  Train:", {"RMSE": rmse(y_train,yhat_tr_x), "MAE": mean_absolute_error(y_train,yhat_tr_x),
                 "MAPE%": mape(y_train,yhat_tr_x), "sMAPE%": smape(y_train,yhat_tr_x),
                 "R2": r2_score(y_train,yhat_tr_x)})
print("  Val:  ", {"RMSE": rmse(y_val,yhat_v_x), "MAE": mean_absolute_error(y_val,yhat_v_x),
                 "MAPE%": mape(y_val,yhat_v_x), "sMAPE%": smape(y_val,yhat_v_x),
                 "R2": r2_score(y_val,yhat_v_x)})
print("  Test: ", {"RMSE": rmse(y_test,yhat_te_x), "MAE": mean_absolute_error(y_test,yhat_te_x),
                 "MAPE%": mape(y_test,yhat_te_x), "sMAPE%": smape(y_test,yhat_te_x),
                 "R2": r2_score(y_test,yhat_te_x)})


[XGBoost Early-Stopped]
  best_iteration_: 1461
  Train: {'RMSE': 170.1558028259626, 'MAE': np.float64(113.47134239166584), 'MAPE%': 4.353415817604535, 'sMAPE%': 4.346512702338945, 'R2': 0.978891777250954}
  Val:   {'RMSE': 256.9816001540845, 'MAE': np.float64(168.46312122827547), 'MAPE%': 6.690556285073275, 'sMAPE%': 6.560981069369697, 'R2': 0.9504244529331255}
  Test:  {'RMSE': 413.01272804314146, 'MAE': np.float64(294.06047038981114), 'MAPE%': 13.600469723390294, 'sMAPE%': 13.300370997949779, 'R2': 0.7418732951197451}


In [None]:
from lightgbm import LGBMRegressor
# Lightweight LGBM pipeline
lgb_pipe_fast = Pipeline([
    ("pre", pre),
    ("model", LGBMRegressor(
        boosting_type="gbdt",
        n_estimators=600,          # fewer rounds (faster)
        learning_rate=0.1,         # higher lr pairs with fewer rounds
        num_leaves=63,             # smaller trees → faster
        max_bin=127,               # fewer histogram bins → faster
        subsample_freq=1,          # enable bagging
        random_state=42,
        n_jobs=-1
        # device="gpu"  # only if you compiled GPU LGBM
    ))
])

# Lean param distributions (all fast-friendly)
lgb_param_dist_fast = {
    "model__learning_rate":     [0.07, 0.1, 0.15, 0.2],
    "model__num_leaves":        [31, 63, 95],           # small–medium
    "model__min_child_samples": [50, 100, 200],         # larger = faster/pruned
    "model__subsample":         [0.6, 0.75, 0.9],
    "model__colsample_bytree":  [0.6, 0.75, 0.9],
    "model__reg_lambda":        [0.0, 1.0, 3.0],
    "model__max_bin":           [63, 127],              # fewer bins = faster
}

lgb_search_fast = RandomizedSearchCV(
    estimator=lgb_pipe_fast,
    param_distributions=lgb_param_dist_fast,
    n_iter=12,                 # ↓ try 8 if you need even faster
    scoring=rmse_scorer,       # your negative-RMSE scorer
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
    random_state=42,
    verbose=1
)

lgb_search_fast.fit(X_trval, y_trval)
print("LGBM fast best CV RMSE:", -lgb_search_fast.best_score_)
print("LGBM fast best params:", lgb_search_fast.best_params_)

# Evaluate on held-out test
yhat_test_lgb_fast = lgb_search_fast.predict(X_test)
print({
    "LGB_Fast_Test_RMSE": rmse(y_test, yhat_test_lgb_fast),
    "LGB_Fast_Test_MAE":  mean_absolute_error(y_test, yhat_test_lgb_fast),
    "LGB_Fast_Test_MAPE%": mape(y_test, yhat_test_lgb_fast),
    "LGB_Fast_Test_sMAPE%": smape(y_test, yhat_test_lgb_fast),
    "LGB_Fast_Test_R2":   r2_score(y_test, yhat_test_lgb_fast),
})

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 43848, number of used features: 66
[LightGBM] [Info] Start training from score 2707.070033
LGBM fast best CV RMSE: 280.4836838034441
LGBM fast best params: {'model__subsample': 0.75, 'model__reg_lambda': 0.0, 'model__num_leaves': 31, 'model__min_child_samples': 50, 'model__max_bin': 63, 'model__learning_rate': 0.1, 'model__colsample_bytree': 0.9}
{'LGB_Fast_Test_RMSE': 423.09493407586257, 'LGB_Fast_Test_MAE': np.float64(293.0123920833149), 'LGB_Fast_Test_MAPE%': 12.744403403770152, 'LGB_Fast_Test_sMAPE%': 13.045547320044196, 'LGB_Fast_Test_R2': 0.7291170211714155}


In [None]:
# Save the best trained pipeline
joblib.dump(lgb_search_fast.best_estimator_, 'lgb_pipeline.pkl')

print("Trained LightGBM pipeline saved to lgb_pipeline.pkl")

Trained LightGBM pipeline saved to lgb_pipeline.pkl


In [None]:
# Get the best score and best parameters
best_score = lgb_search_fast.best_score_
best_params = lgb_search_fast.best_params_

# Create a dictionary to store the results
results = {
    "best_cv_rmse": -best_score, # Assuming best_score is negative RMSE
    "best_params": best_params
}

# Save the results to a JSON file
with open('lgb_best_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Best CV RMSE and parameters saved to lgb_best_results.json")

Best CV RMSE and parameters saved to lgb_best_results.json


In [None]:
lgb_params = strip_prefix(lgb_search_fast.best_params_) if 'lgb_search' in globals() else {}

In [None]:
lgb_params.setdefault("random_state", 42)
lgb_params.setdefault("n_jobs", -1)

-1

In [None]:
import lightgbm as lgb
lgb_best_es = LGBMRegressor(
    **lgb_params,
    n_estimators=20000,           # large cap
)

lgb_best_es.fit(
    Xt_train, y_train,
    eval_set=[(Xt_val, y_val)],
    eval_metric="l2",             # RMSE^2; you can also use "rmse"
    callbacks=[
        lgb.early_stopping(stopping_rounds=300),
        lgb.log_evaluation(period=0)
    ],
)

yhat_tr_l = lgb_best_es.predict(Xt_train, num_iteration=lgb_best_es.best_iteration_)
yhat_v_l  = lgb_best_es.predict(Xt_val,   num_iteration=lgb_best_es.best_iteration_)
yhat_te_l = lgb_best_es.predict(Xt_test,  num_iteration=lgb_best_es.best_iteration_)

print("\n[LightGBM Early-Stopped]")
print("  best_iteration_:", getattr(lgb_best_es, "best_iteration_", None))
print("  Train:", {"RMSE": rmse(y_train,yhat_tr_l), "MAE": mean_absolute_error(y_train,yhat_tr_l),
                 "MAPE%": mape(y_train,yhat_tr_l), "sMAPE%": smape(y_train,yhat_tr_l),
                 "R2": r2_score(y_train,yhat_tr_l)})
print("  Val:  ", {"RMSE": rmse(y_val,yhat_v_l), "MAE": mean_absolute_error(y_val,yhat_v_l),
                 "MAPE%": mape(y_val,yhat_v_l), "sMAPE%": smape(y_val,yhat_v_l),
                 "R2": r2_score(y_val,yhat_v_l)})
print("  Test: ", {"RMSE": rmse(y_test,yhat_te_l), "MAE": mean_absolute_error(y_test,yhat_te_l),
                 "MAPE%": mape(y_test,yhat_te_l), "sMAPE%": smape(y_test,yhat_te_l),
                 "R2": r2_score(y_test,yhat_te_l)})

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 35064, number of used features: 66
[LightGBM] [Info] Start training from score 2733.604354
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[485]	valid_0's l2: 67569

[LightGBM Early-Stopped]
  best_iteration_: 485
  Train: {'RMSE': 198.9107650338299, 'MAE': np.float64(129.76571278433772), 'MAPE%': 4.914029097020292, 'sMAPE%': 4.908542914031913, 'R2': 0.9711547229136783}
  Val:   {'RMSE': 259.9403172006489, 'MAE': np.float64(168.63302953172428), 'MAPE%': 6.666095174394254, 'sMAPE%': 6.545776282701627, 'R2': 0.9492763208901658}
  Test:  {'RMSE': 405.2209140084837, 'MAE': np.float64(281.97700411972846), 'MAPE%': 12.460508785992609, 'sMAPE%': 12.583837141716526, 'R2': 0.7515209548239283}


In [None]:
pred_es = test_df[["timestamp"]].copy()
pred_es["y_true"]   = y_test
pred_es["yhat_xgb"] = yhat_te_x
pred_es["yhat_lgb"] = yhat_te_l
pred_es.to_csv("test_predictions.csv", index=False)
print("\nSaved")


Saved


In [None]:
import numpy as np

def avg_neg_rmse(y_true, y_pred):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    rmse_per_h = np.sqrt(np.mean((y_true - y_pred)**2, axis=0))
    return -float(np.mean(rmse_per_h))