# Setup

In [None]:
# Colab / Kaggle cell
!pip install -q --upgrade "scikit-learn==1.6.1" "xgboost==3.0.5" "joblib==1.5.2"

# force a clean restart so the new wheels are used
import os, sys; os.kill(os.getpid(), 9)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.4/308.4 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.6.1 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.9.1 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [1]:
import sklearn, xgboost, joblib
print(sklearn.__version__, xgboost.__version__, joblib.__version__)
# should be 1.6.1, 3.0.5, 1.5.2


1.6.1 3.0.5 1.5.2


In [2]:
# --- Strict reproducibility setup ---
import os, re, math, warnings
import numpy as np
import pandas as pd
import inspect
import random
import torch
SEED = 42

os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"   # or ":16:8" (needed for deterministic matmul)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(False)            # may fall back to slower kernels
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("/kaggle/input/weather-hanoi-2020-2025-normed/merge_weather_energy_hanoi_20202025_norm.csv")
df.head()

Unnamed: 0,Temperature,Weather,Precipitation,Chance of snow,Humidity,Wind,Wind Gust,Wind Degree,Wind Direction,Cloud Cover,Visibility,timestamp,is_weekend,season,is_holiday,total_consumption_mw
0,25.4,Patchy rain possible,0.6,0.0,89.0,2.194444,4.388889,295.0,WNW,89.0,9.0,2020-01-01 00:00:00,0,winter,False,1790.1
1,25.1,Partly cloudy,0.0,0.0,90.0,2.611111,5.111111,297.0,WNW,34.0,10.0,2020-01-01 01:00:00,0,winter,False,1452.26
2,24.7,Patchy rain possible,0.0,0.0,91.0,2.805556,5.5,309.0,NW,87.0,10.0,2020-01-01 02:00:00,0,winter,False,1483.75
3,24.5,Cloudy,0.0,0.0,92.0,2.611111,4.888889,325.0,NW,71.0,10.0,2020-01-01 03:00:00,0,winter,False,1890.07
4,24.1,Patchy rain possible,0.0,0.0,93.0,2.305556,4.0,326.0,NNW,100.0,10.0,2020-01-01 04:00:00,0,winter,False,1371.23


In [4]:
df.shape

(47449, 16)

In [4]:
CAT_COLS   = ['Weather','Wind Direction','season','is_holiday']
EXOG_NUM   = ['Temperature','Precipitation','Humidity',
              'Wind','Wind Gust','Wind Degree','Cloud Cover','Visibility']
HORIZON    = 24                       # forecast t+1…t+24
TARGET_LAGS   = range(1,49)           # lags of target (1…48)
ROLL_WINDOWS  = [3,6,12,24,48]        # rolling windows
EXOG_LAGS     = [0,1,3,6,12,24]       # lags for exogenous vars
TARGET_COL = 'total_consumption_mw'
TIME_COL   = 'timestamp'

In [5]:
df[TIME_COL] = pd.to_datetime(df[TIME_COL])
df = df.sort_values(TIME_COL).reset_index(drop=True)

In [6]:
# make sure categoricals are strings
for col in CAT_COLS:
    df[col] = df[col].astype(str)

# Feature enginerring & data preprocessing

In [7]:
# add calendar/time features
df['hour']      = df[TIME_COL].dt.hour
df['dow']       = df[TIME_COL].dt.dayofweek
df['dom']       = df[TIME_COL].dt.day
df['month']     = df[TIME_COL].dt.month
df['doy']       = df[TIME_COL].dt.dayofyear
df['hour_sin']  = np.sin(2*np.pi*df['hour']/24)
df['hour_cos']  = np.cos(2*np.pi*df['hour']/24)
df['dow_sin']   = np.sin(2*np.pi*df['dow']/7)
df['dow_cos']   = np.cos(2*np.pi*df['dow']/7)
df['doy_sin']   = np.sin(2*np.pi*df['doy']/365.25)
df['doy_cos']   = np.cos(2*np.pi*df['doy']/365.25)

In [8]:
# 2. feature engineering
# create lagged target features
for L in TARGET_LAGS:
    df[f'y_lag_{L}'] = df[TARGET_COL].shift(L)

# rolling statistics on the target, with one‑step shift to avoid leakage
for W in ROLL_WINDOWS:
    df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
    df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()

# lagged exogenous features
for col in EXOG_NUM:
    for L in EXOG_LAGS:
        df[f'{col}_lag{L}'] = df[col].shift(L)

# multi‑horizon labels
for h in range(1, HORIZON+1):
    df[f'y_t+{h}'] = df[TARGET_COL].shift(-h)

# drop rows with NaNs (from lagging & shifting)no
df_feat = df.dropna().reset_index(drop=True)
df_feat.shape

(47377, 157)

In [10]:
df_feat.shape

(47377, 157)

In [9]:
np.max(df_feat['timestamp'])

Timestamp('2025-05-30 00:00:00')

In [9]:
# 3. train/valid/test split by time
label_cols   = [f'y_t+{h}' for h in range(1, HORIZON+1)]
feature_cols = [c for c in df_feat.columns if c not in label_cols + [TIME_COL, TARGET_COL]]

X = df_feat[feature_cols]
Y = df_feat[label_cols].values

N        = len(X)
train_end= int(N - 2*8*7*24)
valid_end= int(N - 8*7*24)

X_train, Y_train = X.iloc[:train_end],    Y[:train_end]
X_valid, Y_valid = X.iloc[train_end:valid_end], Y[train_end:valid_end]
X_test,  Y_test  = X.iloc[valid_end:],    Y[valid_end:]

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)
print(X_test.shape, Y_test.shape)

(44689, 131) (44689, 24)
(1344, 131) (1344, 24)
(1344, 131) (1344, 24)


In [13]:
np.max(df_feat['timestamp'].iloc[:valid_end])

Timestamp('2025-04-04 00:00:00')

In [14]:
df

Unnamed: 0,y_t+1,y_t+2,y_t+3,y_t+4,y_t+5,y_t+6,y_t+7,y_t+8,y_t+9,y_t+10,...,y_t+15,y_t+16,y_t+17,y_t+18,y_t+19,y_t+20,y_t+21,y_t+22,y_t+23,y_t+24
0,1555.56,1505.56,1304.00,1819.42,1383.52,1546.93,1740.87,1746.23,1897.63,2062.55,...,1823.04,1791.94,1878.24,2037.24,2352.93,3209.41,3103.21,2531.59,2117.06,1725.51
1,1505.56,1304.00,1819.42,1383.52,1546.93,1740.87,1746.23,1897.63,2062.55,2296.52,...,1791.94,1878.24,2037.24,2352.93,3209.41,3103.21,2531.59,2117.06,1725.51,1700.35
2,1304.00,1819.42,1383.52,1546.93,1740.87,1746.23,1897.63,2062.55,2296.52,2345.26,...,1878.24,2037.24,2352.93,3209.41,3103.21,2531.59,2117.06,1725.51,1700.35,1681.38
3,1819.42,1383.52,1546.93,1740.87,1746.23,1897.63,2062.55,2296.52,2345.26,2187.29,...,2037.24,2352.93,3209.41,3103.21,2531.59,2117.06,1725.51,1700.35,1681.38,1533.64
4,1383.52,1546.93,1740.87,1746.23,1897.63,2062.55,2296.52,2345.26,2187.29,1976.74,...,2352.93,3209.41,3103.21,2531.59,2117.06,1725.51,1700.35,1681.38,1533.64,1633.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44684,3097.82,2915.25,2112.98,1837.54,1595.33,1308.12,1556.42,1325.77,1568.16,1706.35,...,1304.00,1317.09,1304.00,1304.00,1304.00,1304.00,2137.69,2368.26,2629.19,2980.56
44685,2915.25,2112.98,1837.54,1595.33,1308.12,1556.42,1325.77,1568.16,1706.35,2008.87,...,1317.09,1304.00,1304.00,1304.00,1304.00,2137.69,2368.26,2629.19,2980.56,2936.49
44686,2112.98,1837.54,1595.33,1308.12,1556.42,1325.77,1568.16,1706.35,2008.87,1304.00,...,1304.00,1304.00,1304.00,1304.00,2137.69,2368.26,2629.19,2980.56,2936.49,2670.39
44687,1837.54,1595.33,1308.12,1556.42,1325.77,1568.16,1706.35,2008.87,1304.00,1304.00,...,1304.00,1304.00,1304.00,2137.69,2368.26,2629.19,2980.56,2936.49,2670.39,2340.46


In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
HORIZON = 24
L = max(TARGET_LAGS)  # 48  (history window)
DTYPE = np.float32

In [12]:
# 1) Decide which columns are allowed inside the SEQUENCE
#    (NO future labels; NO GBM-style lag columns; we keep "current/past" variables
#     and time/categorical signals; the model will see the last L rows as a sequence)
base_cols = [TARGET_COL] + EXOG_NUM + CAT_COLS + [
    'hour','dow','dom','month','doy','hour_sin','hour_cos','dow_sin','dow_cos','doy_sin','doy_cos'
]
use_cols = [c for c in base_cols if c in df.columns]

df_seq = df[use_cols + [TIME_COL]].copy()

# 2) One-hot the categoricals (strings) for NN input
df_seq = pd.get_dummies(df_seq, columns=[c for c in CAT_COLS if c in df_seq.columns], drop_first=False, dtype=np.int8)
df_seq

Unnamed: 0,total_consumption_mw,Temperature,Precipitation,Humidity,Wind,Wind Gust,Wind Degree,Cloud Cover,Visibility,hour,...,Wind Direction_SW,Wind Direction_W,Wind Direction_WNW,season_autumn,season_spring,season_summer,season_winter,is_holiday_False,is_holiday_national,is_holiday_tet
0,1790.10,25.4,0.6,89.0,2.194444,4.388889,295.0,89.0,9.0,0,...,0,0,1,0,0,0,1,1,0,0
1,1452.26,25.1,0.0,90.0,2.611111,5.111111,297.0,34.0,10.0,1,...,0,0,1,0,0,0,1,1,0,0
2,1483.75,24.7,0.0,91.0,2.805556,5.500000,309.0,87.0,10.0,2,...,0,0,0,0,0,0,1,1,0,0
3,1890.07,24.5,0.0,92.0,2.611111,4.888889,325.0,71.0,10.0,3,...,0,0,0,0,0,0,1,1,0,0
4,1371.23,24.1,0.0,93.0,2.305556,4.000000,326.0,100.0,10.0,4,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47444,2449.28,26.8,0.0,84.0,1.888889,3.388889,109.0,56.0,10.0,20,...,0,0,0,0,1,0,0,1,0,0
47445,2554.05,26.5,0.0,86.0,2.388889,4.194444,126.0,52.0,10.0,21,...,0,0,0,0,1,0,0,1,0,0
47446,1895.41,26.1,0.0,88.0,2.388889,4.194444,149.0,84.0,10.0,22,...,0,0,0,0,1,0,0,1,0,0
47447,1558.67,25.6,0.5,91.0,2.305556,4.111111,152.0,76.0,9.0,23,...,0,0,0,0,1,0,0,1,0,0


In [13]:
# 3) Make sure "is_weekend" exists (optional but useful)
if 'is_weekend' not in df_seq.columns:
    df_seq['is_weekend'] = (df_seq['dow'] >= 5).astype(np.int8)

# 4) Drop the first L-1 rows and last H rows to allow full windows and labels later
#    We'll create windows on the fly; indices will map to the "window end" time t.
df_seq = df_seq.reset_index(drop=True)
df_seq

Unnamed: 0,total_consumption_mw,Temperature,Precipitation,Humidity,Wind,Wind Gust,Wind Degree,Cloud Cover,Visibility,hour,...,Wind Direction_W,Wind Direction_WNW,season_autumn,season_spring,season_summer,season_winter,is_holiday_False,is_holiday_national,is_holiday_tet,is_weekend
0,1790.10,25.4,0.6,89.0,2.194444,4.388889,295.0,89.0,9.0,0,...,0,1,0,0,0,1,1,0,0,0
1,1452.26,25.1,0.0,90.0,2.611111,5.111111,297.0,34.0,10.0,1,...,0,1,0,0,0,1,1,0,0,0
2,1483.75,24.7,0.0,91.0,2.805556,5.500000,309.0,87.0,10.0,2,...,0,0,0,0,0,1,1,0,0,0
3,1890.07,24.5,0.0,92.0,2.611111,4.888889,325.0,71.0,10.0,3,...,0,0,0,0,0,1,1,0,0,0
4,1371.23,24.1,0.0,93.0,2.305556,4.000000,326.0,100.0,10.0,4,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47444,2449.28,26.8,0.0,84.0,1.888889,3.388889,109.0,56.0,10.0,20,...,0,0,0,1,0,0,1,0,0,0
47445,2554.05,26.5,0.0,86.0,2.388889,4.194444,126.0,52.0,10.0,21,...,0,0,0,1,0,0,1,0,0,0
47446,1895.41,26.1,0.0,88.0,2.388889,4.194444,149.0,84.0,10.0,22,...,0,0,0,1,0,0,1,0,0,0
47447,1558.67,25.6,0.5,91.0,2.305556,4.111111,152.0,76.0,9.0,23,...,0,0,0,1,0,0,1,0,0,0


In [14]:
# 5) Build the feature matrix F and target vector y from the same df (no leakage)
feature_cols = [c for c in df_seq.columns if c not in [TIME_COL, TARGET_COL]]
F_all = df_seq[feature_cols].astype(DTYPE).values                 # (T, D)
y_all = df_seq[[TARGET_COL]].astype(DTYPE).values                 # (T, 1)
t_all = df_seq[TIME_COL].to_numpy()     

In [15]:
# 6) Sliding windows: [t-L+1..t] -> [t+1..t+H]
def make_windows(F, y, times, L, H):
    T, D = F.shape
    N = T - L - H + 1
    X = np.empty((N, L, D), dtype=DTYPE)
    Y = np.empty((N, H), dtype=DTYPE)
    end_ts = np.empty(N, dtype='datetime64[ns]')  # timestamp at t
    for i in range(N):
        X[i] = F[i:i+L]
        Y[i] = y[i+L:i+L+H, 0]
        end_ts[i] = times[i+L-1]
    return X, Y, end_ts

X_raw, Y_raw, end_times = make_windows(F_all, y_all, t_all, L, HORIZON)

In [62]:
X_raw.shape, end_times

((47378, 48, 48),
 array(['2020-01-02T23:00:00.000000000', '2020-01-03T00:00:00.000000000',
        '2020-01-03T01:00:00.000000000', ...,
        '2025-05-29T22:00:00.000000000', '2025-05-29T23:00:00.000000000',
        '2025-05-30T00:00:00.000000000'], dtype='datetime64[ns]'))

In [16]:
# 7) 
cut_train_ts = df_feat.iloc[train_end-1][TIME_COL]
cut_valid_ts = df_feat.iloc[valid_end-1][TIME_COL]

train_mask = (end_times <= np.datetime64(cut_train_ts))
valid_mask = (end_times >  np.datetime64(cut_train_ts)) & (end_times <= np.datetime64(cut_valid_ts))
test_mask  = (end_times >  np.datetime64(cut_valid_ts))

X_train, Y_train = X_raw[train_mask], Y_raw[train_mask]
X_val,   Y_val   = X_raw[valid_mask], Y_raw[valid_mask]
X_test,  Y_test  = X_raw[test_mask],  Y_raw[test_mask]

print("Sequence shapes:", X_train.shape, Y_train.shape, "|", X_val.shape, Y_val.shape, "|", X_test.shape, Y_test.shape)


Sequence shapes: (44690, 48, 48) (44690, 24) | (1344, 48, 48) (1344, 24) | (1344, 48, 48) (1344, 24)


In [17]:
test_times = end_times[test_mask]
print("First Transformer test timestamp:", test_times[0])
print("Last Transformer test timestamp:", test_times[-1])
print("First GBM test timestamp:", df_feat.iloc[valid_end][TIME_COL])
print("Last GBM test timestamp:", df_feat.iloc[-1][TIME_COL])


First Transformer test timestamp: 2025-04-04T01:00:00.000000000
Last Transformer test timestamp: 2025-05-30T00:00:00.000000000
First GBM test timestamp: 2025-04-04 01:00:00
Last GBM test timestamp: 2025-05-30 00:00:00


In [18]:
# 8) Scale features and targets using TRAIN ONLY (good practice for NNs)
x_scaler = StandardScaler().fit(X_train.reshape(-1, X_train.shape[-1]))
X_train = x_scaler.transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape).astype(DTYPE)
X_val   = x_scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape).astype(DTYPE)
X_test  = x_scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape).astype(DTYPE)

y_scaler = StandardScaler().fit(Y_train.reshape(-1,1))
Y_train_s = y_scaler.transform(Y_train.reshape(-1,1)).reshape(Y_train.shape).astype(DTYPE)
Y_val_s   = y_scaler.transform(Y_val.reshape(-1,1)).reshape(Y_val.shape).astype(DTYPE)
Y_test_s  = y_scaler.transform(Y_test.reshape(-1,1)).reshape(Y_test.shape).astype(DTYPE)

print("Scaled shapes:", X_train.shape, Y_train_s.shape)

Scaled shapes: (44690, 48, 48) (44690, 24)


In [22]:
print("Scaled shapes:", X_val.shape, X_test.shape)

Scaled shapes: (1344, 48, 48) (1344, 48, 48)


In [19]:
# 9) (Optional but recommended) Build known-ahead future covariates Z for each horizon h=1..H
#    Use ONLY features known at t+h: calendar signals + holiday/season (not observed weather).
def build_future_covariates(times, H):
    # times here should be the "window end" timestamps: we’ll advance by +h hours
    # Features: hour/dow/doy sin/cos, is_weekend, and one-hots for is_holiday and season if present
    # We'll reuse df to look up is_holiday/season at t+h
    df_key = df[[TIME_COL, 'is_holiday', 'season']].copy()
    df_key[TIME_COL] = pd.to_datetime(df_key[TIME_COL])
    df_key = df_key.set_index(TIME_COL).sort_index()

    Z_list = []
    for ts in times:
        row = []
        for h in range(1, H+1):
            t_h = pd.Timestamp(ts) + pd.Timedelta(hours=h)
            hour = t_h.hour; dow = t_h.dayofweek; doy = t_h.dayofyear
            v = [
                np.sin(2*np.pi*hour/24), np.cos(2*np.pi*hour/24),
                np.sin(2*np.pi*dow/7),  np.cos(2*np.pi*dow/7),
                np.sin(2*np.pi*doy/365.25), np.cos(2*np.pi*doy/365.25),
                1 if dow>=5 else 0,  # is_weekend
            ]
            # holiday + season one-hots from the calendar row if present
            if t_h in df_key.index:
                hol = str(df_key.loc[t_h, 'is_holiday'])
                sea = str(df_key.loc[t_h, 'season'])
            else:
                hol, sea = "False", None
            # map to fixed one-hot order
            hol_vec = [int(hol=="False"), int(hol=="tet"), int(hol=="national")]
            sea_vec = [int(sea==s) for s in ["winter","spring","summer","autumn"]] if sea is not None else [0,0,0,0]
            v += hol_vec + sea_vec
            row.append(v)
        Z_list.append(row)
    Z = np.array(Z_list, dtype=DTYPE)  # (N, H, Ff)
    return Z

Z_train = build_future_covariates(end_times[train_mask], HORIZON)
Z_val   = build_future_covariates(end_times[valid_mask], HORIZON)
Z_test  = build_future_covariates(end_times[test_mask],  HORIZON)

print("Future covariates Z:", Z_train.shape, Z_val.shape, Z_test.shape)

Future covariates Z: (44690, 24, 14) (1344, 24, 14) (1344, 24, 14)


In [20]:
# 10) Bundle for later inverse-transform
class Bundle: pass
bundle = Bundle()
bundle.y_scaler = y_scaler
bundle.x_scaler = x_scaler
bundle.L = L
bundle.H = HORIZON
bundle.feature_cols = feature_cols

# Train LSTM + Attention

## Baseline LSTM + attention

In [38]:
# train_lstm_attention.py
# Baseline LSTM + Attention for 24-step (multi-horizon) regression.
# Expects: X_train, Y_train_s, X_val, Y_val_s, X_test, Y_test_s, bundle (with y_scaler)

import math, os, random, numpy as np, torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

# ---------- Repro ----------
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.benchmark = True  # speed for fixed shapes

# ---------- If your arrays are not in memory -----------
# import numpy as np, joblib
# X_train = np.load("X_train.npy");  Y_train_s = np.load("Y_train_s.npy")
# X_val   = np.load("X_val.npy");    Y_val_s   = np.load("Y_val_s.npy")
# X_test  = np.load("X_test.npy");   Y_test_s  = np.load("Y_test_s.npy")
# bundle  = joblib.load("preproc_bundle.joblib")  # with y_scaler

# ---------- Tiny dataset wrapper ----------
class SeqDataset(Dataset):
    def __init__(self, X, Y):
        self.X = torch.from_numpy(X)        # (N, L, D)
        self.Y = torch.from_numpy(Y)        # (N, H)
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i): return self.X[i], self.Y[i]

# ---------- LSTM + simple dot-product attention ----------
class LSTMAttn(nn.Module):
    def __init__(self, input_dim, hidden=128, layers=1, dropout=0.2, horizon=24):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.head = nn.Sequential(
            nn.Linear(2*hidden, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, horizon),  # direct multi-output
        )

    def forward(self, x):                     # x: (B, L, D)
        seq, (h_n, _) = self.lstm(x)          # seq: (B, L, H), h_n[-1]: (B, H)
        q = h_n[-1]                            # query = last hidden state
        # temporal (across L) scaled dot-product attention
        scores = torch.sum(seq * q.unsqueeze(1), dim=-1) / math.sqrt(seq.size(-1))  # (B, L)
        w = torch.softmax(scores, dim=1)       # (B, L)
        ctx = torch.sum(seq * w.unsqueeze(-1), dim=1)                                # (B, H)
        out = self.head(self.dropout(torch.cat([ctx, q], dim=-1)))                   # (B, HORIZON)
        return out, w

# ---------- Training / eval helpers ----------
def make_loaders(Xtr, Ytr, Xva, Yva, batch=128, workers=0):
    return (
        DataLoader(SeqDataset(Xtr, Ytr), batch_size=batch, shuffle=True,  num_workers=workers, pin_memory=True),
        DataLoader(SeqDataset(Xva, Yva), batch_size=batch, shuffle=False, num_workers=workers, pin_memory=True),
    )

@torch.no_grad()
def eval_epoch(model, loader, device):
    model.eval()
    mse, n = 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        yhat, _ = model(x)
        mse += torch.mean((yhat - y)**2).item() * x.size(0)
        n += x.size(0)
    return mse / n

def train_model(
    X_train, Y_train_s, X_val, Y_val_s, *,
    hidden=128, layers=1, dropout=0.2, batch=128,
    epochs=40, lr=1e-3, weight_decay=1e-5, grad_clip=1.0,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    L, D = X_train.shape[1], X_train.shape[2]
    H = Y_train_s.shape[1]

    train_loader, val_loader = make_loaders(X_train, Y_train_s, X_val, Y_val_s, batch=batch)

    model = LSTMAttn(input_dim=D, hidden=hidden, layers=layers, dropout=dropout, horizon=H).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scaler = GradScaler(enabled=(device.type == "cuda"))
    best_state, best_val = None, float("inf")
    patience, patience_left = 5, 5  # early stopping

    for epoch in range(1, epochs+1):
        model.train()
        running, nitems = 0.0, 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            opt.zero_grad(set_to_none=True)
            with autocast(enabled=(device.type == "cuda")):
                yhat, _ = model(x)
                loss = nn.functional.mse_loss(yhat, y)
            scaler.scale(loss).backward()
            if grad_clip is not None:
                scaler.unscale_(opt)
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt); scaler.update()
            running += loss.item() * x.size(0); nitems += x.size(0)

        train_mse = running / nitems
        val_mse = eval_epoch(model, val_loader, device)
        print(f"Epoch {epoch:02d} | train MSE {train_mse:.4f} | val MSE {val_mse:.4f}")

        # early stopping on val MSE
        if val_mse + 1e-6 < best_val:
            best_val = val_mse
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_left = patience
        else:
            patience_left -= 1
            if patience_left == 0:
                print("Early stopping.")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# ---------- Metrics on original MW ----------
@torch.no_grad()
def evaluate_on_test(model, X_test, Y_test_s, y_scaler, batch=256):
    device = next(model.parameters()).device
    loader = DataLoader(SeqDataset(X_test, Y_test_s), batch_size=batch, shuffle=False)
    model.eval()
    preds_s, trues_s = [], []
    for x, y in loader:
        x = x.to(device)
        yhat, _ = model(x)
        preds_s.append(yhat.cpu().numpy())
        trues_s.append(y.numpy())
    preds_s = np.concatenate(preds_s, axis=0)
    trues_s = np.concatenate(trues_s, axis=0)

    # invert scaling to MW
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)

    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    # horizon-wise MAE (optional insight)
    mae_h = np.mean(np.abs(preds - trues), axis=0)   # shape (24,)
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h, "preds": preds, "trues": trues}

# ---------- Train ----------
if __name__ == "__main__":
    # If you ran the preprocessing script in the same session, the variables already exist.
    # Otherwise, see the np.load section near the top.

    # Fast baseline hyperparams (tweak later):
    model = train_model(
        X_train, Y_train_s, X_val, Y_val_s,
        hidden=128, layers=1, dropout=0.2,
        batch=128, epochs=40, lr=1e-3, weight_decay=1e-5, grad_clip=1.0
    )

    # Evaluate on test set (in MW)
    metrics = evaluate_on_test(model, X_test, Y_test_s, bundle.y_scaler)
    print(f"\nTest MAE (MW):  {metrics['MAE']:.2f}")
    print(f"Test RMSE (MW): {metrics['RMSE']:.2f}")
    print("Horizon-wise MAE (MW):", np.round(metrics["MAE_by_h"], 2))

  scaler = GradScaler(enabled=(device.type == "cuda"))
  with autocast(enabled=(device.type == "cuda")):


Epoch 01 | train MSE 0.2423 | val MSE 0.1407
Epoch 02 | train MSE 0.1526 | val MSE 0.1114
Epoch 03 | train MSE 0.1444 | val MSE 0.1039
Epoch 04 | train MSE 0.1399 | val MSE 0.0909
Epoch 05 | train MSE 0.1371 | val MSE 0.0940
Epoch 06 | train MSE 0.1351 | val MSE 0.0781
Epoch 07 | train MSE 0.1328 | val MSE 0.0789
Epoch 08 | train MSE 0.1324 | val MSE 0.0752
Epoch 09 | train MSE 0.1303 | val MSE 0.0753
Epoch 10 | train MSE 0.1305 | val MSE 0.0800
Epoch 11 | train MSE 0.1290 | val MSE 0.0726
Epoch 12 | train MSE 0.1276 | val MSE 0.0916
Epoch 13 | train MSE 0.1274 | val MSE 0.0701
Epoch 14 | train MSE 0.1238 | val MSE 0.0661
Epoch 15 | train MSE 0.1231 | val MSE 0.0693
Epoch 16 | train MSE 0.1218 | val MSE 0.0690
Epoch 17 | train MSE 0.1208 | val MSE 0.0643
Epoch 18 | train MSE 0.1182 | val MSE 0.0749
Epoch 19 | train MSE 0.1186 | val MSE 0.0698
Epoch 20 | train MSE 0.1173 | val MSE 0.0747
Epoch 21 | train MSE 0.1154 | val MSE 0.0712
Epoch 22 | train MSE 0.1148 | val MSE 0.0538
Epoch 23 |

In [None]:
# Save model (optional)
torch.save(model.state_dict(), "lstm_attention_baseline.pt")

### with future know ahead covs

In [31]:
# train_lstm_attention.py  (minimal Z-aware version)

import math, os, random, numpy as np, torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.benchmark = True

# ---------- Dataset: optionally carry known-ahead Z ----------
class SeqDataset(Dataset):
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)        # (N, L, D)
        self.Y = torch.from_numpy(Y)        # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))  # (N, H, Fz)
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future: return self.X[i], self.Y[i], self.Z[i]
        else:                return self.X[i], self.Y[i]

def make_loaders(Xtr, Ytr, Xva, Yva, Ztr=None, Zva=None, batch=128, workers=0):
    return (
        DataLoader(SeqDataset(Xtr, Ytr, Ztr), batch_size=batch, shuffle=True,  num_workers=workers, pin_memory=True, drop_last=True),
        DataLoader(SeqDataset(Xva, Yva, Zva), batch_size=batch, shuffle=False, num_workers=workers, pin_memory=True),
    )

# ---------- LSTM + attention, with optional Z fusion ----------
class LSTMAttn(nn.Module):
    def __init__(self, input_dim, hidden=128, layers=1, dropout=0.2, horizon=24, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.future_proj = nn.Linear(future_feat_dim, hidden) if (future_feat_dim and future_feat_dim > 0) else None
        head_in = 2*hidden + (hidden if self.future_proj is not None else 0)
        # per-horizon shared head applied across H (B,H,head_in) -> (B,H,1)
        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, x, z_future=None):           # x: (B,L,D), z_future: (B,H,Fz) or None
        seq, (h_n, _) = self.lstm(x)               # seq: (B,L,Hd), h_n[-1]: (B,Hd)
        q = h_n[-1]                                # (B,Hd)   — query = last hidden
        # temporal attention over L
        scores = torch.sum(seq * q.unsqueeze(1), dim=-1) / math.sqrt(seq.size(-1))  # (B,L)
        w = torch.softmax(scores, dim=1)           # (B,L)
        ctx = torch.sum(seq * w.unsqueeze(-1), dim=1)                               # (B,Hd)

        B, Hd, H = x.size(0), seq.size(-1), self.horizon
        base = torch.cat([ctx, q], dim=-1).unsqueeze(1).expand(B, H, -1)            # (B,H,2Hd)

        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)                                      # (B,H,Hd)
            fused = torch.cat([base, z_proj], dim=-1)                                # (B,H,2Hd+Hd)
        else:
            fused = base                                                             # (B,H,2Hd)

        yhat = self.head(self.dropout(fused)).squeeze(-1)                            # (B,H)
        return yhat, w

# ---------- Eval (MSE on scaled) ----------
@torch.no_grad()
def eval_epoch(model, loader, device):
    model.eval()
    mse, n = 0.0, 0
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x, y = x.to(device), y.to(device)
        z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        mse += torch.mean((yhat - y)**2).item() * x.size(0)
        n += x.size(0)
    return mse / n

# ---------- Train ----------
def train_model(
    X_train, Y_train_s, X_val, Y_val_s, *,
    Z_train=None, Z_val=None,
    hidden=128, layers=1, dropout=0.2, batch=128,
    epochs=40, lr=1e-3, weight_decay=1e-5, grad_clip=1.0,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    D = X_train.shape[2]; H = Y_train_s.shape[1]
    Fz = 0 if Z_train is None else Z_train.shape[2]

    train_loader, val_loader = make_loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val, batch=batch)

    model = LSTMAttn(input_dim=D, hidden=hidden, layers=layers, dropout=dropout,
                     horizon=H, future_feat_dim=Fz).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scaler = GradScaler(enabled=(device.type == "cuda"))
    best_state, best_val = None, float("inf")
    patience, patience_left = 6, 6

    for epoch in range(1, epochs+1):
        model.train()
        running, nitems = 0.0, 0
        for batch in train_loader:
            if len(batch) == 2: x, y = batch; z = None
            else:               x, y, z = batch
            x, y = x.to(device), y.to(device)
            z = None if z is None else z.to(device)

            opt.zero_grad(set_to_none=True)
            with autocast(enabled=(device.type == "cuda")):
                yhat, _ = model(x, z_future=z)
                loss = nn.functional.mse_loss(yhat, y)
            scaler.scale(loss).backward()
            if grad_clip is not None:
                scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt); scaler.update()
            running += loss.item() * x.size(0); nitems += x.size(0)

        val_mse = eval_epoch(model, val_loader, device)
        print(f"Epoch {epoch:02d} | train MSE {running/nitems:.4f} | val MSE {val_mse:.4f}")

        if val_mse + 1e-6 < best_val:
            best_val = val_mse
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            patience_left = patience
        else:
            patience_left -= 1
            if patience_left == 0:
                print("Early stopping."); break

    if best_state is not None: model.load_state_dict(best_state)
    return model

# ---------- Metrics on original MW ----------
@torch.no_grad()
def evaluate_on_test(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    device = next(model.parameters()).device
    loader = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    model.eval()
    preds_s, trues_s = [], []
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x = x.to(device); z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        preds_s.append(yhat.cpu().numpy()); trues_s.append(y.numpy())
    preds_s = np.concatenate(preds_s, axis=0)
    trues_s = np.concatenate(trues_s, axis=0)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h, "preds": preds, "trues": trues}

# ---------- Train ----------
if __name__ == "__main__":
    model = train_model(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        hidden=128, layers=1, dropout=0.2,
        batch=128, epochs=60, lr=8e-4, weight_decay=2e-5, grad_clip=1.0
    )
    metrics = evaluate_on_test(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)
    print(f"\nTest MAE (MW):  {metrics['MAE']:.2f}")
    print(f"Test RMSE (MW): {metrics['RMSE']:.2f}")
    print("Horizon-wise MAE (MW):", np.round(metrics["MAE_by_h"], 2))


  scaler = GradScaler(enabled=(device.type == "cuda"))
  with autocast(enabled=(device.type == "cuda")):


Epoch 01 | train MSE 0.1689 | val MSE 0.0514
Epoch 02 | train MSE 0.1274 | val MSE 0.0452
Epoch 03 | train MSE 0.1219 | val MSE 0.0330
Epoch 04 | train MSE 0.1181 | val MSE 0.0439
Epoch 05 | train MSE 0.1159 | val MSE 0.0399
Epoch 06 | train MSE 0.1133 | val MSE 0.0350
Epoch 07 | train MSE 0.1121 | val MSE 0.0358
Epoch 08 | train MSE 0.1126 | val MSE 0.0364
Epoch 09 | train MSE 0.1113 | val MSE 0.0403
Early stopping.

Test MAE (MW):  227.78
Test RMSE (MW): 315.11
Horizon-wise MAE (MW): [230.06 226.65 219.69 208.23 203.62 206.95 216.19 215.42 210.67 211.08
 222.24 233.35 234.99 237.77 238.69 242.11 242.98 242.34 239.09 239.89
 239.44 237.92 235.79 231.56]


In [32]:
# train_lstm_attention.py  (minimal Z-aware version)

import math, os, random, numpy as np, torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.benchmark = True

# ---------- Dataset: optionally carry known-ahead Z ----------
class SeqDataset(Dataset):
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)        # (N, L, D)
        self.Y = torch.from_numpy(Y)        # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))  # (N, H, Fz)
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future: return self.X[i], self.Y[i], self.Z[i]
        else:                return self.X[i], self.Y[i]

def make_loaders(Xtr, Ytr, Xva, Yva, Ztr=None, Zva=None, batch=128, workers=0):
    return (
        DataLoader(SeqDataset(Xtr, Ytr, Ztr), batch_size=batch, shuffle=True,  num_workers=workers, pin_memory=True, drop_last=True),
        DataLoader(SeqDataset(Xva, Yva, Zva), batch_size=batch, shuffle=False, num_workers=workers, pin_memory=True),
    )

# ---------- LSTM + attention, with optional Z fusion ----------
class LSTMAttn(nn.Module):
    def __init__(self, input_dim, hidden=128, layers=1, dropout=0.2, horizon=24, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.future_proj = nn.Linear(future_feat_dim, hidden) if (future_feat_dim and future_feat_dim > 0) else None
        head_in = 2*hidden + (hidden if self.future_proj is not None else 0)
        # per-horizon shared head applied across H (B,H,head_in) -> (B,H,1)
        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, x, z_future=None):           # x: (B,L,D), z_future: (B,H,Fz) or None
        seq, (h_n, _) = self.lstm(x)               # seq: (B,L,Hd), h_n[-1]: (B,Hd)
        q = h_n[-1]                                # (B,Hd)   — query = last hidden
        # temporal attention over L
        scores = torch.sum(seq * q.unsqueeze(1), dim=-1) / math.sqrt(seq.size(-1))  # (B,L)
        w = torch.softmax(scores, dim=1)           # (B,L)
        ctx = torch.sum(seq * w.unsqueeze(-1), dim=1)                               # (B,Hd)

        B, Hd, H = x.size(0), seq.size(-1), self.horizon
        base = torch.cat([ctx, q], dim=-1).unsqueeze(1).expand(B, H, -1)            # (B,H,2Hd)

        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)                                      # (B,H,Hd)
            fused = torch.cat([base, z_proj], dim=-1)                                # (B,H,2Hd+Hd)
        else:
            fused = base                                                             # (B,H,2Hd)

        yhat = self.head(self.dropout(fused)).squeeze(-1)                            # (B,H)
        return yhat, w

# ---------- Eval (MSE on scaled) ----------
@torch.no_grad()
def eval_epoch(model, loader, device):
    model.eval()
    mse, n = 0.0, 0
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x, y = x.to(device), y.to(device)
        z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        mse += torch.mean((yhat - y)**2).item() * x.size(0)
        n += x.size(0)
    return mse / n

# ---------- Train ----------
def train_model(
    X_train, Y_train_s, X_val, Y_val_s, *,
    Z_train=None, Z_val=None,
    hidden=128, layers=1, dropout=0.2, batch=128,
    epochs=40, lr=1e-3, weight_decay=1e-5, grad_clip=1.0,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    D = X_train.shape[2]; H = Y_train_s.shape[1]
    Fz = 0 if Z_train is None else Z_train.shape[2]

    train_loader, val_loader = make_loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val, batch=batch)

    model = LSTMAttn(input_dim=D, hidden=hidden, layers=layers, dropout=dropout,
                     horizon=H, future_feat_dim=Fz).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scaler = GradScaler(enabled=(device.type == "cuda"))
    best_state, best_val = None, float("inf")
    patience, patience_left = 6, 6

    for epoch in range(1, epochs+1):
        model.train()
        running, nitems = 0.0, 0
        for batch in train_loader:
            if len(batch) == 2: x, y = batch; z = None
            else:               x, y, z = batch
            x, y = x.to(device), y.to(device)
            z = None if z is None else z.to(device)

            opt.zero_grad(set_to_none=True)
            with autocast(enabled=(device.type == "cuda")):
                yhat, _ = model(x, z_future=z)
                loss = nn.functional.mse_loss(yhat, y)
            scaler.scale(loss).backward()
            if grad_clip is not None:
                scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt); scaler.update()
            running += loss.item() * x.size(0); nitems += x.size(0)

        val_mse = eval_epoch(model, val_loader, device)
        print(f"Epoch {epoch:02d} | train MSE {running/nitems:.4f} | val MSE {val_mse:.4f}")

        if val_mse + 1e-6 < best_val:
            best_val = val_mse
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            patience_left = patience
        else:
            patience_left -= 1
            if patience_left == 0:
                print("Early stopping."); break

    if best_state is not None: model.load_state_dict(best_state)
    return model

# ---------- Metrics on original MW ----------
@torch.no_grad()
def evaluate_on_test(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    device = next(model.parameters()).device
    loader = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    model.eval()
    preds_s, trues_s = [], []
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x = x.to(device); z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        preds_s.append(yhat.cpu().numpy()); trues_s.append(y.numpy())
    preds_s = np.concatenate(preds_s, axis=0)
    trues_s = np.concatenate(trues_s, axis=0)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h, "preds": preds, "trues": trues}

# ---------- Train ----------
if __name__ == "__main__":
    model = train_model(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        hidden=128, layers=1, dropout=0.2,
        batch=128, epochs=40, lr=1e-3, weight_decay=1e-5, grad_clip=1.0
    )
    metrics = evaluate_on_test(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)
    print(f"\nTest MAE (MW):  {metrics['MAE']:.2f}")
    print(f"Test RMSE (MW): {metrics['RMSE']:.2f}")
    print("Horizon-wise MAE (MW):", np.round(metrics["MAE_by_h"], 2))


  scaler = GradScaler(enabled=(device.type == "cuda"))
  with autocast(enabled=(device.type == "cuda")):


Epoch 01 | train MSE 0.1636 | val MSE 0.0478
Epoch 02 | train MSE 0.1256 | val MSE 0.0495
Epoch 03 | train MSE 0.1197 | val MSE 0.0347
Epoch 04 | train MSE 0.1171 | val MSE 0.0431
Epoch 05 | train MSE 0.1142 | val MSE 0.0436
Epoch 06 | train MSE 0.1123 | val MSE 0.0387
Epoch 07 | train MSE 0.1126 | val MSE 0.0296
Epoch 08 | train MSE 0.1077 | val MSE 0.0400
Epoch 09 | train MSE 0.1048 | val MSE 0.0395
Epoch 10 | train MSE 0.1038 | val MSE 0.0366
Epoch 11 | train MSE 0.1007 | val MSE 0.0380
Epoch 12 | train MSE 0.0993 | val MSE 0.0316
Epoch 13 | train MSE 0.0981 | val MSE 0.0348
Early stopping.

Test MAE (MW):  207.98
Test RMSE (MW): 286.86
Horizon-wise MAE (MW): [207.66 206.95 202.71 197.52 192.69 191.64 195.67 198.64 199.53 201.79
 205.65 207.73 210.86 212.61 213.83 215.62 216.32 217.51 217.81 219.92
 218.8  217.73 213.98 208.3 ]


In [35]:
# train_lstm_attention.py  (minimal Z-aware version)

import math, os, random, numpy as np, torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.benchmark = True

# ---------- Dataset: optionally carry known-ahead Z ----------
class SeqDataset(Dataset):
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)        # (N, L, D)
        self.Y = torch.from_numpy(Y)        # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))  # (N, H, Fz)
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future: return self.X[i], self.Y[i], self.Z[i]
        else:                return self.X[i], self.Y[i]

def make_loaders(Xtr, Ytr, Xva, Yva, Ztr=None, Zva=None, batch=128, workers=0):
    return (
        DataLoader(SeqDataset(Xtr, Ytr, Ztr), batch_size=batch, shuffle=True,  num_workers=workers, pin_memory=True, drop_last=True),
        DataLoader(SeqDataset(Xva, Yva, Zva), batch_size=batch, shuffle=False, num_workers=workers, pin_memory=True),
    )

# ---------- LSTM + attention, with optional Z fusion ----------
class LSTMAttn(nn.Module):
    def __init__(self, input_dim, hidden=128, layers=1, dropout=0.2, horizon=24, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.future_proj = nn.Linear(future_feat_dim, hidden) if (future_feat_dim and future_feat_dim > 0) else None
        head_in = 2*hidden + (hidden if self.future_proj is not None else 0)
        # per-horizon shared head applied across H (B,H,head_in) -> (B,H,1)
        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, x, z_future=None):           # x: (B,L,D), z_future: (B,H,Fz) or None
        seq, (h_n, _) = self.lstm(x)               # seq: (B,L,Hd), h_n[-1]: (B,Hd)
        q = h_n[-1]                                # (B,Hd)   — query = last hidden
        # temporal attention over L
        scores = torch.sum(seq * q.unsqueeze(1), dim=-1) / math.sqrt(seq.size(-1))  # (B,L)
        w = torch.softmax(scores, dim=1)           # (B,L)
        ctx = torch.sum(seq * w.unsqueeze(-1), dim=1)                               # (B,Hd)

        B, Hd, H = x.size(0), seq.size(-1), self.horizon
        base = torch.cat([ctx, q], dim=-1).unsqueeze(1).expand(B, H, -1)            # (B,H,2Hd)

        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)                                      # (B,H,Hd)
            fused = torch.cat([base, z_proj], dim=-1)                                # (B,H,2Hd+Hd)
        else:
            fused = base                                                             # (B,H,2Hd)

        yhat = self.head(self.dropout(fused)).squeeze(-1)                            # (B,H)
        return yhat, w

# ---------- Eval (MSE on scaled) ----------
@torch.no_grad()
def eval_epoch(model, loader, device):
    model.eval()
    mse, n = 0.0, 0
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x, y = x.to(device), y.to(device)
        z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        mse += torch.mean((yhat - y)**2).item() * x.size(0)
        n += x.size(0)
    return mse / n

# ---------- Train ----------
def train_model(
    X_train, Y_train_s, X_val, Y_val_s, *,
    Z_train=None, Z_val=None,
    hidden=128, layers=1, dropout=0.2, batch=128,
    epochs=40, lr=1e-3, weight_decay=1e-5, grad_clip=1.0,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    D = X_train.shape[2]; H = Y_train_s.shape[1]
    Fz = 0 if Z_train is None else Z_train.shape[2]

    train_loader, val_loader = make_loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val, batch=batch)

    model = LSTMAttn(input_dim=D, hidden=hidden, layers=layers, dropout=dropout,
                     horizon=H, future_feat_dim=Fz).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(device.type == "cuda"))
    best_state, best_val = None, float("inf")
    patience, patience_left = 6, 6

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.15, 0.95, Y_train_s.shape[1], device=device).view(1, -1)
    for epoch in range(1, epochs+1):
        model.train()
        running, nitems = 0.0, 0
        for batch in train_loader:
            if len(batch) == 2: x, y = batch; z = None
            else:               x, y, z = batch
            x, y = x.to(device), y.to(device)
            z = None if z is None else z.to(device)

            opt.zero_grad(set_to_none=True)
            with autocast(enabled=(device.type == "cuda")):
                yhat, _ = model(x, z_future=z)
                L = loss_fn(yhat, y)   # shape (B,H)
                L = (L * w).mean()
            scaler.scale(L).backward()
            if grad_clip is not None:
                scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt); scaler.update()
            running += L.item() * x.size(0); nitems += x.size(0)

        val_mse = eval_epoch(model, val_loader, device)
        sched.step(val_mse)
        print(f"Epoch {epoch:02d} | train MSE {running/nitems:.4f} | val MSE {val_mse:.4f}")
        
        if val_mse + 1e-6 < best_val:
            best_val = val_mse
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            patience_left = patience
        else:
            patience_left -= 1
            if patience_left == 0:
                print("Early stopping."); break

    if best_state is not None: model.load_state_dict(best_state)
    return model

# ---------- Metrics on original MW ----------
@torch.no_grad()
def evaluate_on_test(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    device = next(model.parameters()).device
    loader = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    model.eval()
    preds_s, trues_s = [], []
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x = x.to(device); z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        preds_s.append(yhat.cpu().numpy()); trues_s.append(y.numpy())
    preds_s = np.concatenate(preds_s, axis=0)
    trues_s = np.concatenate(trues_s, axis=0)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h, "preds": preds, "trues": trues}

# ---------- Train ----------
if __name__ == "__main__":
    model = train_model(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        hidden=160, layers=2, dropout=0.15,
        batch=128, epochs=90, lr=7.5e-4, weight_decay=3e-5, grad_clip=1.0
    )
    metrics = evaluate_on_test(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)
    print(f"\nTest MAE (MW):  {metrics['MAE']:.2f}")
    print(f"Test RMSE (MW): {metrics['RMSE']:.2f}")
    print("Horizon-wise MAE (MW):", np.round(metrics["MAE_by_h"], 2))


  scaler = GradScaler(enabled=(device.type == "cuda"))
  with autocast(enabled=(device.type == "cuda")):


Epoch 01 | train MSE 0.0746 | val MSE 0.0528
Epoch 02 | train MSE 0.0567 | val MSE 0.0374
Epoch 03 | train MSE 0.0543 | val MSE 0.0334
Epoch 04 | train MSE 0.0527 | val MSE 0.0287
Epoch 05 | train MSE 0.0516 | val MSE 0.0310
Epoch 06 | train MSE 0.0509 | val MSE 0.0277
Epoch 07 | train MSE 0.0502 | val MSE 0.0512
Epoch 08 | train MSE 0.0497 | val MSE 0.0442
Epoch 09 | train MSE 0.0495 | val MSE 0.0316
Epoch 10 | train MSE 0.0481 | val MSE 0.0342
Epoch 11 | train MSE 0.0482 | val MSE 0.0377
Epoch 12 | train MSE 0.0475 | val MSE 0.0352
Early stopping.

Test MAE (MW):  209.00
Test RMSE (MW): 300.62
Horizon-wise MAE (MW): [198.5  193.88 192.61 193.08 195.92 198.16 202.07 203.41 205.04 208.45
 213.71 217.04 220.16 221.28 222.77 222.73 223.58 220.69 217.9  213.71
 210.52 209.2  206.92 204.65]


Use this below

In [21]:
# train_lstm_attention.py  (minimal Z-aware version)

import math, os, random, numpy as np, torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.benchmark = True

# ---------- Dataset: optionally carry known-ahead Z ----------
class SeqDataset(Dataset):
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)        # (N, L, D)
        self.Y = torch.from_numpy(Y)        # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))  # (N, H, Fz)
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future: return self.X[i], self.Y[i], self.Z[i]
        else:                return self.X[i], self.Y[i]

def make_loaders(Xtr, Ytr, Xva, Yva, Ztr=None, Zva=None, batch=128, workers=0):
    return (
        DataLoader(SeqDataset(Xtr, Ytr, Ztr), batch_size=batch, shuffle=True,  num_workers=workers, pin_memory=True, drop_last=True),
        DataLoader(SeqDataset(Xva, Yva, Zva), batch_size=batch, shuffle=False, num_workers=workers, pin_memory=True),
    )

# ---------- LSTM + attention, with optional Z fusion ----------
class LSTMAttn(nn.Module):
    def __init__(self, input_dim, hidden=128, layers=1, dropout=0.2, horizon=24, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.future_proj = nn.Linear(future_feat_dim, hidden) if (future_feat_dim and future_feat_dim > 0) else None
        head_in = 2*hidden + (hidden if self.future_proj is not None else 0)
        # per-horizon shared head applied across H (B,H,head_in) -> (B,H,1)
        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, x, z_future=None):           # x: (B,L,D), z_future: (B,H,Fz) or None
        seq, (h_n, _) = self.lstm(x)               # seq: (B,L,Hd), h_n[-1]: (B,Hd)
        q = h_n[-1]                                # (B,Hd)   — query = last hidden
        # temporal attention over L
        scores = torch.sum(seq * q.unsqueeze(1), dim=-1) / math.sqrt(seq.size(-1))  # (B,L)
        w = torch.softmax(scores, dim=1)           # (B,L)
        ctx = torch.sum(seq * w.unsqueeze(-1), dim=1)                               # (B,Hd)

        B, Hd, H = x.size(0), seq.size(-1), self.horizon
        base = torch.cat([ctx, q], dim=-1).unsqueeze(1).expand(B, H, -1)            # (B,H,2Hd)

        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)                                      # (B,H,Hd)
            fused = torch.cat([base, z_proj], dim=-1)                                # (B,H,2Hd+Hd)
        else:
            fused = base                                                             # (B,H,2Hd)

        yhat = self.head(self.dropout(fused)).squeeze(-1)                            # (B,H)
        return yhat, w

# ---------- Eval (MSE on scaled) ----------
@torch.no_grad()
def eval_epoch(model, loader, device):
    model.eval()
    mse, n = 0.0, 0
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x, y = x.to(device), y.to(device)
        z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        mse += torch.mean((yhat - y)**2).item() * x.size(0)
        n += x.size(0)
    return mse / n

# ---------- Train ----------
def train_model(
    X_train, Y_train_s, X_val, Y_val_s, *,
    Z_train=None, Z_val=None,
    hidden=128, layers=1, dropout=0.2, batch=128,
    epochs=40, lr=1e-3, weight_decay=1e-5, grad_clip=1.0,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    D = X_train.shape[2]; H = Y_train_s.shape[1]
    Fz = 0 if Z_train is None else Z_train.shape[2]

    train_loader, val_loader = make_loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val, batch=batch)

    model = LSTMAttn(input_dim=D, hidden=hidden, layers=layers, dropout=dropout,
                     horizon=H, future_feat_dim=Fz).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scaler = GradScaler(enabled=(device.type == "cuda"))
    best_state, best_val = None, float("inf")
    patience, patience_left = 6, 6

    for epoch in range(1, epochs+1):
        model.train()
        running, nitems = 0.0, 0
        for batch in train_loader:
            if len(batch) == 2: x, y = batch; z = None
            else:               x, y, z = batch
            x, y = x.to(device), y.to(device)
            z = None if z is None else z.to(device)

            opt.zero_grad(set_to_none=True)
            with autocast(enabled=(device.type == "cuda")):
                yhat, _ = model(x, z_future=z)
                loss = nn.functional.mse_loss(yhat, y)
            scaler.scale(loss).backward()
            if grad_clip is not None:
                scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt); scaler.update()
            running += loss.item() * x.size(0); nitems += x.size(0)

        val_mse = eval_epoch(model, val_loader, device)
        print(f"Epoch {epoch:02d} | train MSE {running/nitems:.4f} | val MSE {val_mse:.4f}")
        
        if val_mse + 1e-6 < best_val:
            best_val = val_mse
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            patience_left = patience
        else:
            patience_left -= 1
            if patience_left == 0:
                print("Early stopping."); break

    if best_state is not None: model.load_state_dict(best_state)
    return model

# ---------- Metrics on original MW ----------
@torch.no_grad()
def evaluate_on_test(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    device = next(model.parameters()).device
    loader = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    model.eval()
    preds_s, trues_s = [], []
    for batch in loader:
        if len(batch) == 2: x, y = batch; z = None
        else:               x, y, z = batch
        x = x.to(device); z = None if z is None else z.to(device)
        yhat, _ = model(x, z_future=z)
        preds_s.append(yhat.cpu().numpy()); trues_s.append(y.numpy())
    preds_s = np.concatenate(preds_s, axis=0)
    trues_s = np.concatenate(trues_s, axis=0)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    def safe_mape(y_true, y_pred, eps=1e-6):
        y_true = np.asarray(y_true)
        return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0
    
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    mape = safe_mape(trues, preds)
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h, "MAPE": mape, "preds": preds, "trues": trues}

# ---------- Train ----------
if __name__ == "__main__":
    model_lstmattention = train_model(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        hidden=112, layers=2, dropout=0.12,
        batch=160, epochs=100, lr=9e-4, weight_decay=2e-5, grad_clip=1.0
    )
    metrics_lstmattention = evaluate_on_test(model_lstmattention, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)
    print(f"\nTest MAE (MW):  {metrics_lstmattention['MAE']:.2f}")
    print(f"Test RMSE (MW): {metrics_lstmattention['RMSE']:.2f}")
    print(f"Test MAPE (MW): {metrics_lstmattention['MAPE']:.2f}")
    print("Horizon-wise MAE (MW):", np.round(metrics_lstmattention["MAE_by_h"], 2))


Epoch 01 | train MSE 0.1622 | val MSE 0.0328
Epoch 02 | train MSE 0.1215 | val MSE 0.0302
Epoch 03 | train MSE 0.1163 | val MSE 0.0369
Epoch 04 | train MSE 0.1128 | val MSE 0.0322
Epoch 05 | train MSE 0.1116 | val MSE 0.0336
Epoch 06 | train MSE 0.1106 | val MSE 0.0282
Epoch 07 | train MSE 0.1077 | val MSE 0.0325
Epoch 08 | train MSE 0.1088 | val MSE 0.0351
Epoch 09 | train MSE 0.1071 | val MSE 0.0282
Epoch 10 | train MSE 0.1083 | val MSE 0.0334
Epoch 11 | train MSE 0.1049 | val MSE 0.0289
Epoch 12 | train MSE 0.1031 | val MSE 0.0291
Epoch 13 | train MSE 0.0993 | val MSE 0.0270
Epoch 14 | train MSE 0.0927 | val MSE 0.0287
Epoch 15 | train MSE 0.0893 | val MSE 0.0289
Epoch 16 | train MSE 0.0916 | val MSE 0.0286
Epoch 17 | train MSE 0.0861 | val MSE 0.0301
Epoch 18 | train MSE 0.0828 | val MSE 0.0291
Epoch 19 | train MSE 0.0831 | val MSE 0.0277
Early stopping.

Test MAE (MW):  183.55
Test RMSE (MW): 256.13
Test MAPE (MW): 7.60
Horizon-wise MAE (MW): [182.14 181.66 181.04 180.34 181.51 18

In [22]:
torch.save(model_lstmattention.state_dict(), "lstmattention_forecaster.pth")

## Light bigger LSTM + attention

In [23]:
# train_lstm_attention_mae.py
import math, numpy as np, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

# expects: X_train, Y_train_s, X_val, Y_val_s, X_test, Y_test_s, bundle

# ---------- Data ----------
class SeqDataset(Dataset):
    def __init__(self, X, Y): self.X = torch.from_numpy(X); self.Y = torch.from_numpy(Y)
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i): return self.X[i], self.Y[i]

def loaders(Xtr, Ytr, Xva, Yva, batch=128, workers=0):
    return (
        DataLoader(SeqDataset(Xtr,Ytr), batch_size=batch, shuffle=True,  num_workers=workers, pin_memory=True),
        DataLoader(SeqDataset(Xva,Yva), batch_size=batch, shuffle=False, num_workers=workers, pin_memory=True),
    )

# ---------- Model (2-layer bi-LSTM + multi-head temporal attention) ----------
class MHAttnPool(nn.Module):
    def __init__(self, d_model, n_heads=4, p=0.15):
        super().__init__()
        assert d_model % n_heads == 0
        self.nh, self.dk = n_heads, d_model // n_heads
        self.q = nn.Linear(d_model, d_model, bias=False)
        self.k = nn.Linear(d_model, d_model, bias=False)
        self.v = nn.Linear(d_model, d_model, bias=False)
        self.drop = nn.Dropout(p)
    def forward(self, seq, q):  # seq: (B,L,D), q: (B,D)
        B,L,D = seq.shape; H,d = self.nh, self.dk
        Q = self.q(q).view(B,H,d)
        K = self.k(seq).view(B,L,H,d)
        V = self.v(seq).view(B,L,H,d)
        score = torch.einsum("bhd,bLhd->bhL", Q, K) / math.sqrt(d)
        w = torch.softmax(score, dim=-1)
        w = self.drop(w)
        ctx = torch.einsum("bhL,bLhd->bhd", w, V).reshape(B,D)
        return ctx, w  # (B,D), (B,H,L)

class BigLSTMAttn(nn.Module):
    def __init__(self, input_dim, horizon=24, hidden=256, layers=2, heads=4,
                 dropout=0.30, bidirectional=True):
        super().__init__()
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden, num_layers=layers,
                            bidirectional=bidirectional, batch_first=True,
                            dropout=dropout if layers>1 else 0.0)
        D = hidden * (2 if bidirectional else 1)
        self.attn = MHAttnPool(D, n_heads=heads, p=dropout*0.5)
        self.norm = nn.LayerNorm(D)
        self.mlp  = nn.Sequential(
            nn.Linear(D*2, 256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, horizon)
        )
    def _last(self, h_n):  # (layers*dirs,B,H)
        layers_dirs, B, H = h_n.shape
        dirs = 2 if self.bidirectional else 1
        last = h_n.view(-1, dirs, B, H)[-1]              # (dirs,B,H)
        return torch.cat([last[d] for d in range(dirs)], dim=1) if dirs==2 else last[0]
    def forward(self, x):
        seq, (h_n, _) = self.lstm(x)
        q = self._last(h_n)
        ctx, attn = self.attn(seq, q)
        fused = torch.cat([self.norm(ctx), q], dim=-1)
        yhat = self.mlp(fused)
        return yhat, attn

# ---------- Metrics ----------
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    mse_s, n = 0.0, 0
    preds_s, trues_s = [], []
    for x, y in loader:
        x, y = x.to(next(model.parameters()).device), y.to(next(model.parameters()).device)
        yhat_s, _ = model(x)
        mse_s += torch.mean((yhat_s - y)**2).item() * x.size(0); n += x.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(y.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, mse_s/n  # (MW), (scaled MSE)

# ---------- Train with early-stop on val MAE (MW) ----------
def train(
    X_train, Y_train_s, X_val, Y_val_s,
    hidden=256, layers=2, heads=4, dropout=0.30, bidirectional=True,
    batch=128, epochs=80, lr=1e-3, weight_decay=2e-4, grad_clip=1.0,
    horizon_weighted=True
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    D, H = X_train.shape[2], Y_train_s.shape[1]
    tr_loader = DataLoader(SeqDataset(X_train, Y_train_s), batch_size=batch, shuffle=True,  pin_memory=True)
    va_loader = DataLoader(SeqDataset(X_val,   Y_val_s),   batch_size=batch, shuffle=False, pin_memory=True)

    model = BigLSTMAttn(D, horizon=H, hidden=hidden, layers=layers,
                        heads=heads, dropout=dropout, bidirectional=bidirectional).to(device)
    opt   = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # ↓ remove verbose (older torch doesn't support it)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(device.type == "cuda"))

    loss_fn = nn.SmoothL1Loss(beta=1.0, reduction="none")
    w = None
    if horizon_weighted:
        # weight early horizons a bit more
        w = torch.linspace(1.25, 0.85, H, device=device).view(1, H)

    best_mae, best_state = float("inf"), None
    patience, left = 8, 8

    for epoch in range(1, epochs + 1):
        model.train()
        run, nitems = 0.0, 0
        for xb, yb in tr_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad(set_to_none=True)
            with autocast(enabled=(device.type == "cuda")):
                yhat_s, _ = model(xb)
                L = loss_fn(yhat_s, yb)         # (B,H)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            if grad_clip:
                scaler.unscale_(opt)
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        # ---- validation: early-stop on MAE (MW)
        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)

        # ---- LR scheduler (log when it reduces LR)
        prev_lr = opt.param_groups[0]["lr"]
        sched.step(val_mae_mw)
        new_lr  = opt.param_groups[0]["lr"]
        if new_lr < prev_lr:
            print(f"[scheduler] LR reduced: {prev_lr:.2e} → {new_lr:.2e}")

        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} "
              f"| val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if val_mae_mw + 1e-6 < best_mae:
            best_mae = val_mae_mw
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            left = patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# ---------- Evaluate on test (MW)
@torch.no_grad()
def test_report(model, X_test, Y_test_s, y_scaler, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for xb, yb in dl:
        xb = xb.to(device)
        yhat_s, _ = model(xb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW):  {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h, "preds": preds, "trues": trues}

# ---------- Run ----------
if __name__ == "__main__":
    model = train(
        X_train, Y_train_s, X_val, Y_val_s,
        hidden=256, layers=2, heads=4, dropout=0.30, bidirectional=True,
        batch=128, epochs=80, lr=1e-3, weight_decay=2e-4, grad_clip=1.0,
        horizon_weighted=True
    )
    _ = test_report(model, X_test, Y_test_s, bundle.y_scaler)

  scaler = GradScaler(enabled=(device.type == "cuda"))
  with autocast(enabled=(device.type == "cuda")):


Epoch 01 | train loss 0.1028 | val MAE(MW) 214.86 | val MSE(scaled) 0.0623


  with autocast(enabled=(device.type == "cuda")):


Epoch 02 | train loss 0.0679 | val MAE(MW) 194.68 | val MSE(scaled) 0.0572


  with autocast(enabled=(device.type == "cuda")):


Epoch 03 | train loss 0.0648 | val MAE(MW) 182.88 | val MSE(scaled) 0.0434


  with autocast(enabled=(device.type == "cuda")):


Epoch 04 | train loss 0.0632 | val MAE(MW) 191.77 | val MSE(scaled) 0.0532


  with autocast(enabled=(device.type == "cuda")):


Epoch 05 | train loss 0.0618 | val MAE(MW) 197.32 | val MSE(scaled) 0.0611


  with autocast(enabled=(device.type == "cuda")):


[scheduler] LR reduced: 1.00e-03 → 5.00e-04
Epoch 06 | train loss 0.0613 | val MAE(MW) 195.51 | val MSE(scaled) 0.0586


  with autocast(enabled=(device.type == "cuda")):


Epoch 07 | train loss 0.0581 | val MAE(MW) 198.03 | val MSE(scaled) 0.0535


  with autocast(enabled=(device.type == "cuda")):


Epoch 08 | train loss 0.0576 | val MAE(MW) 207.59 | val MSE(scaled) 0.0646


  with autocast(enabled=(device.type == "cuda")):


[scheduler] LR reduced: 5.00e-04 → 2.50e-04
Epoch 09 | train loss 0.0571 | val MAE(MW) 197.29 | val MSE(scaled) 0.0578


  with autocast(enabled=(device.type == "cuda")):


Epoch 10 | train loss 0.0557 | val MAE(MW) 182.44 | val MSE(scaled) 0.0485


  with autocast(enabled=(device.type == "cuda")):


Epoch 11 | train loss 0.0547 | val MAE(MW) 200.66 | val MSE(scaled) 0.0613


  with autocast(enabled=(device.type == "cuda")):


Epoch 12 | train loss 0.0548 | val MAE(MW) 195.16 | val MSE(scaled) 0.0575


  with autocast(enabled=(device.type == "cuda")):


Epoch 13 | train loss 0.0528 | val MAE(MW) 179.04 | val MSE(scaled) 0.0491


  with autocast(enabled=(device.type == "cuda")):


Epoch 14 | train loss 0.0522 | val MAE(MW) 193.16 | val MSE(scaled) 0.0585


  with autocast(enabled=(device.type == "cuda")):


Epoch 15 | train loss 0.0497 | val MAE(MW) 193.02 | val MSE(scaled) 0.0570


  with autocast(enabled=(device.type == "cuda")):


[scheduler] LR reduced: 2.50e-04 → 1.25e-04
Epoch 16 | train loss 0.0487 | val MAE(MW) 184.98 | val MSE(scaled) 0.0521


  with autocast(enabled=(device.type == "cuda")):


Epoch 17 | train loss 0.0460 | val MAE(MW) 189.19 | val MSE(scaled) 0.0559


  with autocast(enabled=(device.type == "cuda")):


Epoch 18 | train loss 0.0453 | val MAE(MW) 186.58 | val MSE(scaled) 0.0528


  with autocast(enabled=(device.type == "cuda")):


[scheduler] LR reduced: 1.25e-04 → 6.25e-05
Epoch 19 | train loss 0.0445 | val MAE(MW) 188.23 | val MSE(scaled) 0.0555


  with autocast(enabled=(device.type == "cuda")):


Epoch 20 | train loss 0.0437 | val MAE(MW) 186.39 | val MSE(scaled) 0.0545


  with autocast(enabled=(device.type == "cuda")):


Epoch 21 | train loss 0.0432 | val MAE(MW) 186.22 | val MSE(scaled) 0.0529
Early stopping.

Test  MAE (MW):  268.98
Test RMSE (MW):  380.96
Horizon-wise MAE (MW): [216.22 222.28 238.67 241.88 246.47 243.83 254.75 283.58 289.61 296.28
 308.41 309.31 311.46 313.38 320.23 312.05 298.55 283.03 275.32 261.39
 249.72 237.57 221.82 219.67]


In [None]:
  torch.save(model.state_dict(), "lstm_attn_L48_earlystop_mae.pt")

# Transformer (baseline)

In [None]:
torch.save(model.state_dict(), "transformer_forecaster.pt")

In [23]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=None, Z_val=None,
        d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
        batch=128, epochs=60, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed=SEED
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=None)




Epoch 01 | train loss 0.2592 | val MAE(MW) 362.17 | val MSE(scaled) 0.1532
Epoch 02 | train loss 0.0881 | val MAE(MW) 258.74 | val MSE(scaled) 0.0898
Epoch 03 | train loss 0.0683 | val MAE(MW) 218.81 | val MSE(scaled) 0.0710
Epoch 04 | train loss 0.0625 | val MAE(MW) 207.39 | val MSE(scaled) 0.0610
Epoch 05 | train loss 0.0599 | val MAE(MW) 204.47 | val MSE(scaled) 0.0608
Epoch 06 | train loss 0.0580 | val MAE(MW) 197.91 | val MSE(scaled) 0.0607
Epoch 07 | train loss 0.0571 | val MAE(MW) 187.93 | val MSE(scaled) 0.0527
Epoch 08 | train loss 0.0560 | val MAE(MW) 187.41 | val MSE(scaled) 0.0498
Epoch 09 | train loss 0.0553 | val MAE(MW) 179.84 | val MSE(scaled) 0.0470
Epoch 10 | train loss 0.0545 | val MAE(MW) 196.45 | val MSE(scaled) 0.0542
Epoch 11 | train loss 0.0541 | val MAE(MW) 179.93 | val MSE(scaled) 0.0474
Epoch 12 | train loss 0.0537 | val MAE(MW) 193.80 | val MSE(scaled) 0.0546
Epoch 13 | train loss 0.0520 | val MAE(MW) 191.28 | val MSE(scaled) 0.0524
Epoch 14 | train loss 0.0

In [25]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=320, nhead=8, layers=4, d_ff=1024, dropout=0.18,
        batch=128, epochs=80, lr=2e-4, weight_decay=6e-4, horizon_weighted=True, seed=42
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)




Epoch 01 | train loss 0.0892 | val MAE(MW) 156.84 | val MSE(scaled) 0.0341
Epoch 02 | train loss 0.0548 | val MAE(MW) 158.52 | val MSE(scaled) 0.0347
Epoch 03 | train loss 0.0528 | val MAE(MW) 146.32 | val MSE(scaled) 0.0299
Epoch 04 | train loss 0.0516 | val MAE(MW) 149.31 | val MSE(scaled) 0.0316
Epoch 05 | train loss 0.0507 | val MAE(MW) 149.00 | val MSE(scaled) 0.0324
Epoch 06 | train loss 0.0500 | val MAE(MW) 138.64 | val MSE(scaled) 0.0284
Epoch 07 | train loss 0.0498 | val MAE(MW) 137.99 | val MSE(scaled) 0.0287
Epoch 08 | train loss 0.0491 | val MAE(MW) 144.71 | val MSE(scaled) 0.0302
Epoch 09 | train loss 0.0487 | val MAE(MW) 138.13 | val MSE(scaled) 0.0289
Epoch 10 | train loss 0.0479 | val MAE(MW) 136.31 | val MSE(scaled) 0.0276
Epoch 11 | train loss 0.0480 | val MAE(MW) 138.54 | val MSE(scaled) 0.0290
Epoch 12 | train loss 0.0472 | val MAE(MW) 134.81 | val MSE(scaled) 0.0271
Epoch 13 | train loss 0.0470 | val MAE(MW) 134.70 | val MSE(scaled) 0.0269
Epoch 14 | train loss 0.0

In [31]:
torch.save(model.state_dict(), "transformer_forecaster_baseline.pt")

# Transformer (bigger)

In [None]:
torch.save(model.state_dict(), "transformer_horizon_queries.pt")

In [25]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    # "Large" 
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=512, nhead=8, layers=6, d_ff=2048, dropout=0.20,
        batch=128, epochs=120, lr=1.8e-4, weight_decay=8e-4,
        horizon_weighted=True, seed=42
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)




Epoch 01 | train loss 0.1072 | val MAE(MW) 163.71 | val MSE(scaled) 0.0365
Epoch 02 | train loss 0.0560 | val MAE(MW) 146.05 | val MSE(scaled) 0.0294
Epoch 03 | train loss 0.0538 | val MAE(MW) 155.62 | val MSE(scaled) 0.0311
Epoch 04 | train loss 0.0525 | val MAE(MW) 151.90 | val MSE(scaled) 0.0309
Epoch 05 | train loss 0.0517 | val MAE(MW) 146.79 | val MSE(scaled) 0.0296
Epoch 06 | train loss 0.0500 | val MAE(MW) 142.88 | val MSE(scaled) 0.0291
Epoch 07 | train loss 0.0497 | val MAE(MW) 143.38 | val MSE(scaled) 0.0289
Epoch 08 | train loss 0.0494 | val MAE(MW) 150.46 | val MSE(scaled) 0.0321
Epoch 09 | train loss 0.0492 | val MAE(MW) 157.57 | val MSE(scaled) 0.0339
Epoch 10 | train loss 0.0478 | val MAE(MW) 155.62 | val MSE(scaled) 0.0338
Epoch 11 | train loss 0.0472 | val MAE(MW) 143.70 | val MSE(scaled) 0.0293
Epoch 12 | train loss 0.0466 | val MAE(MW) 147.42 | val MSE(scaled) 0.0308
Epoch 13 | train loss 0.0464 | val MAE(MW) 147.16 | val MSE(scaled) 0.0296
Epoch 14 | train loss 0.0

In [None]:
print(df.)

# Transformer (XL)

In [None]:
torch.save(model.state_dict(), "transformer_horizon_queries_xl.pt")

In [26]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    # "XL"
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=640, nhead=10, layers=8, d_ff=2560, dropout=0.22,
        batch=96, epochs=140, lr=1.6e-4, weight_decay=1.0e-3,
        horizon_weighted=True, seed=42
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)



Epoch 01 | train loss 0.1127 | val MAE(MW) 176.08 | val MSE(scaled) 0.0389
Epoch 02 | train loss 0.0568 | val MAE(MW) 149.13 | val MSE(scaled) 0.0299
Epoch 03 | train loss 0.0544 | val MAE(MW) 160.79 | val MSE(scaled) 0.0327
Epoch 04 | train loss 0.0535 | val MAE(MW) 162.87 | val MSE(scaled) 0.0343
Epoch 05 | train loss 0.0524 | val MAE(MW) 162.02 | val MSE(scaled) 0.0364
Epoch 06 | train loss 0.0504 | val MAE(MW) 142.93 | val MSE(scaled) 0.0287
Epoch 07 | train loss 0.0505 | val MAE(MW) 149.64 | val MSE(scaled) 0.0302
Epoch 08 | train loss 0.0497 | val MAE(MW) 149.16 | val MSE(scaled) 0.0303
Epoch 09 | train loss 0.0494 | val MAE(MW) 151.89 | val MSE(scaled) 0.0318
Epoch 10 | train loss 0.0473 | val MAE(MW) 155.92 | val MSE(scaled) 0.0333
Epoch 11 | train loss 0.0462 | val MAE(MW) 156.65 | val MSE(scaled) 0.0330
Epoch 12 | train loss 0.0454 | val MAE(MW) 159.36 | val MSE(scaled) 0.0336
Epoch 13 | train loss 0.0389 | val MAE(MW) 157.71 | val MSE(scaled) 0.0325
Epoch 14 | train loss 0.0

# Tranformer (tiny)

In [28]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=128,    # much smaller hidden size
        nhead=4,        # fewer attention heads
        layers=2,       # shallower stack
        d_ff=256,       # reduced feedforward
        dropout=0.1,    # lighter regularization
        batch=128,
        epochs=80,
        lr=3e-4,        # slightly higher LR works better with small nets
        weight_decay=5e-4,
        horizon_weighted=True,
        seed=42
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)




Epoch 01 | train loss 0.0800 | val MAE(MW) 154.40 | val MSE(scaled) 0.0334
Epoch 02 | train loss 0.0528 | val MAE(MW) 146.51 | val MSE(scaled) 0.0300
Epoch 03 | train loss 0.0508 | val MAE(MW) 137.59 | val MSE(scaled) 0.0278
Epoch 04 | train loss 0.0497 | val MAE(MW) 143.23 | val MSE(scaled) 0.0289
Epoch 05 | train loss 0.0487 | val MAE(MW) 144.89 | val MSE(scaled) 0.0292
Epoch 06 | train loss 0.0483 | val MAE(MW) 143.43 | val MSE(scaled) 0.0301
Epoch 07 | train loss 0.0472 | val MAE(MW) 140.38 | val MSE(scaled) 0.0288
Epoch 08 | train loss 0.0469 | val MAE(MW) 144.01 | val MSE(scaled) 0.0303
Epoch 09 | train loss 0.0466 | val MAE(MW) 135.81 | val MSE(scaled) 0.0272
Epoch 10 | train loss 0.0463 | val MAE(MW) 146.70 | val MSE(scaled) 0.0317
Epoch 11 | train loss 0.0460 | val MAE(MW) 134.38 | val MSE(scaled) 0.0274
Epoch 12 | train loss 0.0457 | val MAE(MW) 136.54 | val MSE(scaled) 0.0284
Epoch 13 | train loss 0.0455 | val MAE(MW) 136.83 | val MSE(scaled) 0.0272
Epoch 14 | train loss 0.0

In [25]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=160,    # middle ground, not too tiny
        nhead=4,        # keeps per-head dim = 40
        layers=3,       # slightly deeper than 2
        d_ff=320,       # ~2× d_model (lightweight FFN)
        dropout=0.12,   # modest regularization
        batch=128,
        epochs=100,     # let it converge longer
        lr=2.5e-4,      # tuned for stability
        weight_decay=5e-4,
        horizon_weighted=True,
        seed=42
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)


Epoch 01 | train loss 0.0821 | val MAE(MW) 152.35 | val MSE(scaled) 0.0329
Epoch 02 | train loss 0.0533 | val MAE(MW) 181.14 | val MSE(scaled) 0.0406
Epoch 03 | train loss 0.0512 | val MAE(MW) 151.35 | val MSE(scaled) 0.0322
Epoch 04 | train loss 0.0500 | val MAE(MW) 151.32 | val MSE(scaled) 0.0324
Epoch 05 | train loss 0.0491 | val MAE(MW) 146.47 | val MSE(scaled) 0.0312
Epoch 06 | train loss 0.0488 | val MAE(MW) 146.16 | val MSE(scaled) 0.0312
Epoch 07 | train loss 0.0483 | val MAE(MW) 144.22 | val MSE(scaled) 0.0318
Epoch 08 | train loss 0.0478 | val MAE(MW) 147.73 | val MSE(scaled) 0.0312
Epoch 09 | train loss 0.0474 | val MAE(MW) 138.10 | val MSE(scaled) 0.0291
Epoch 10 | train loss 0.0470 | val MAE(MW) 137.60 | val MSE(scaled) 0.0288
Epoch 11 | train loss 0.0470 | val MAE(MW) 139.35 | val MSE(scaled) 0.0297
Epoch 12 | train loss 0.0464 | val MAE(MW) 132.37 | val MSE(scaled) 0.0270
Epoch 13 | train loss 0.0465 | val MAE(MW) 137.84 | val MSE(scaled) 0.0287
Epoch 14 | train loss 0.0

In [26]:
# --- AMP that works on both old/new PyTorch ---
USE_AMP = True
try:
    # New API (PyTorch ≥ 2.0)
    from torch.amp import autocast as _autocast_new, GradScaler as _GradScaler_new
    def amp_ctx():
        if not (USE_AMP and torch.cuda.is_available()):
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.get_device_capability()[0] >= 8)
        return _autocast_new(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
    GradScaler = _GradScaler_new
except Exception:
    # Old API fallback
    from torch.cuda.amp import autocast as _autocast_old, GradScaler as _GradScaler_old
    def amp_ctx():
        return _autocast_old(enabled=(USE_AMP and torch.cuda.is_available()))
    GradScaler = _GradScaler_old

# ======== Auto-tune with Optuna (tiny, safe search) ========
# pip install optuna==3.*  (once)

import math
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

def _valid_heads(d_model):
    # prefer heads with >=16 dims/head (stability) and common divisors
    cands = [2, 3, 4, 5, 6, 8, 10]
    return [h for h in cands if d_model % h == 0 and (d_model // h) >= 16]

def objective(trial: optuna.Trial):
    d_model = trial.suggest_categorical("d_model", [96, 128, 160, 192, 224])

    # pick heads with unique names per d_model (so distributions never change)
    if d_model == 96:
        nhead = trial.suggest_categorical("nhead_96", [3, 4, 6, 8])      # 96 / h is integer
    elif d_model == 128:
        nhead = trial.suggest_categorical("nhead_128", [4, 8, 16])
    elif d_model == 160:
        nhead = trial.suggest_categorical("nhead_160", [4, 5, 10])
    elif d_model == 192:
        nhead = trial.suggest_categorical("nhead_192", [3, 4, 6, 8, 12])
    else:  # 224
        nhead = trial.suggest_categorical("nhead_224", [4, 7, 8, 14])

    layers     = trial.suggest_categorical("layers", [2, 3, 4])
    dff_mult   = trial.suggest_categorical("d_ff_mult", [2, 3, 4])
    d_ff       = d_model * dff_mult
    dropout    = trial.suggest_float("dropout", 0.08, 0.18)
    batch      = trial.suggest_categorical("batch", [96, 128, 160])
    lr         = trial.suggest_float("lr", 1.5e-4, 4.0e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 2e-5, 1e-3, log=True)

    try:
        model = train_transformer(
            X_train, Y_train_s, X_val, Y_val_s,
            Z_train=Z_train, Z_val=Z_val,
            d_model=d_model, nhead=nhead, layers=layers, d_ff=d_ff,
            dropout=dropout, batch=batch, epochs=100 if d_model <= 160 else 90,
            lr=lr, weight_decay=weight_decay, horizon_weighted=True, seed=SEED
        )
        # reuse your MW-scale validation metric for selection
        va_loader = DataLoader(SeqDataset(X_val, Y_val_s, Z_val),
                               batch_size=256, shuffle=False)
        val_mae_mw, _ = val_metrics_MW(model, va_loader, bundle.y_scaler)
        # report so pruners/visualizers can see it
        trial.set_user_attr("val_mae_mw", float(val_mae_mw))
        return float(val_mae_mw)

    except RuntimeError as e:
        # Handle occasional CUDA OOM / numerical hiccups gracefully
        if "CUDA" in str(e) or "out of memory" in str(e).lower():
            raise optuna.TrialPruned()  # prune the bad combo
        raise

def run_study(n_trials=24, study_name="tf_tiny_sweep", seed=42):
    study = optuna.create_study(
        direction="minimize",
        study_name=study_name,
        sampler=TPESampler(seed=seed, multivariate=True, n_startup_trials=8),
        pruner=MedianPruner(n_startup_trials=6),
        storage="sqlite:///optuna_tf.db",
        load_if_exists=True
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print("\nBest trial:")
    bt = study.best_trial
    print(f"  val_MAE(MW): {bt.value:.3f}")
    for k, v in bt.params.items():
        print(f"  {k}: {v}")

    # ------- retrain on train+val, then test --------
    import numpy as np
    X_all = np.concatenate([X_train, X_val], axis=0)
    Y_all = np.concatenate([Y_train_s, Y_val_s], axis=0)
    Z_all = None if Z_train is None else np.concatenate([Z_train, Z_val], axis=0)

    best = bt.params
    nhead_key = next(k for k in best_params if k.startswith("nhead"))
    model = train_transformer(
        X_all, Y_all, X_val, Y_val_s,   # dummy val to drive early-stopping; you can pass a small holdout, too
        Z_train=Z_all, Z_val=Z_val,     # keep Z for shape; early-stop will trigger quickly
        d_model=best["d_model"],
        nhead=best[nhead_key],
        layers=best["layers"],
        d_ff=best["d_model"] * best["d_ff_mult"],
        dropout=best["dropout"],
        batch=best["batch"],
        epochs=110 if best["d_model"] <= 160 else 100,
        lr=best["lr"],
        weight_decay=best["weight_decay"],
        horizon_weighted=True,
        seed=SEED
    )

    print("\n=== Final Test Report with best params ===")
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)
    return study

# ----- entry point for tuning -----
if __name__ == "__main__":
    # Comment out your manual single-run block and run the tuner:
    study = run_study(n_trials=24, study_name="tf_tiny_sweep_v1", seed=42)


[I 2025-09-24 20:14:39,498] A new study created in RDB with name: tf_tiny_sweep_v1


  0%|          | 0/24 [00:00<?, ?it/s]

Epoch 01 | train loss 0.0965 | val MAE(MW) 162.07 | val MSE(scaled) 0.0400
Epoch 02 | train loss 0.0555 | val MAE(MW) 160.96 | val MSE(scaled) 0.0362
Epoch 03 | train loss 0.0525 | val MAE(MW) 147.98 | val MSE(scaled) 0.0322
Epoch 04 | train loss 0.0510 | val MAE(MW) 145.68 | val MSE(scaled) 0.0318
Epoch 05 | train loss 0.0501 | val MAE(MW) 160.27 | val MSE(scaled) 0.0356
Epoch 06 | train loss 0.0493 | val MAE(MW) 138.20 | val MSE(scaled) 0.0297
Epoch 07 | train loss 0.0487 | val MAE(MW) 140.45 | val MSE(scaled) 0.0297
Epoch 08 | train loss 0.0481 | val MAE(MW) 135.19 | val MSE(scaled) 0.0289
Epoch 09 | train loss 0.0478 | val MAE(MW) 136.43 | val MSE(scaled) 0.0286
Epoch 10 | train loss 0.0474 | val MAE(MW) 153.78 | val MSE(scaled) 0.0356
Epoch 11 | train loss 0.0469 | val MAE(MW) 144.18 | val MSE(scaled) 0.0321
Epoch 12 | train loss 0.0459 | val MAE(MW) 138.70 | val MSE(scaled) 0.0301
Epoch 13 | train loss 0.0456 | val MAE(MW) 146.88 | val MSE(scaled) 0.0332
Epoch 14 | train loss 0.0

KeyError: 'nhead'

In [27]:
def canonicalize_best(best_params):
    d_model = best_params["d_model"]
    # find the only key that startswith "nhead"
    nhead_key = next(k for k in best_params if k.startswith("nhead"))
    nhead = best_params[nhead_key]

    layers   = best_params["layers"]
    d_ff     = d_model * best_params["d_ff_mult"]
    dropout  = best_params["dropout"]
    batch    = best_params["batch"]
    lr       = best_params["lr"]
    wd       = best_params.get("weight_decay", best_params.get("wd", 0.0))

    return dict(
        d_model=d_model, nhead=nhead, layers=layers, d_ff=d_ff,
        dropout=dropout, batch=batch, lr=lr, weight_decay=wd
    )


In [28]:
bt = study.best_trial
print("Best trial:\n ", bt.value, bt.params)

best = canonicalize_best(bt.params)
best

NameError: name 'study' is not defined

In [None]:


# (optional) retrain on train+val and evaluate on test
import numpy as np
X_all = np.concatenate([X_train, X_val], axis=0)
Y_all = np.concatenate([Y_train_s, Y_val_s], axis=0)
Z_all = None if Z_train is None else np.concatenate([Z_train, Z_val], axis=0)

model = train_transformer(
    X_all, Y_all, X_val, Y_val_s,                       # dummy val to drive early stop
    Z_train=Z_all, Z_val=Z_val,
    d_model=best["d_model"], nhead=best["nhead"],
    layers=best["layers"], d_ff=best["d_ff"],
    dropout=best["dropout"], batch=best["batch"],
    epochs=100 if best["d_model"] <= 160 else 90,
    lr=best["lr"], weight_decay=best["weight_decay"],
    horizon_weighted=True, seed=SEED
)

print("\n=== Final Test Report (best params) ===")
_ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)

In [49]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=160,    # middle ground, not too tiny
        nhead=4,        # keeps per-head dim = 40
        layers=4,       # slightly deeper than 2
        d_ff=640,       # ~2× d_model (lightweight FFN)
        dropout=0.13600591019653185,   # modest regularization
        batch=128,
        epochs=100,     # let it converge longer
        lr=0.00037664214231482003,      # tuned for stability
        weight_decay=0.0004831779595293628,
        horizon_weighted=True,
        seed=42
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)




Epoch 01 | train loss 0.0849 | val MAE(MW) 166.08 | val MSE(scaled) 0.0361
Epoch 02 | train loss 0.0538 | val MAE(MW) 155.65 | val MSE(scaled) 0.0335
Epoch 03 | train loss 0.0519 | val MAE(MW) 144.63 | val MSE(scaled) 0.0294
Epoch 04 | train loss 0.0507 | val MAE(MW) 142.90 | val MSE(scaled) 0.0294
Epoch 05 | train loss 0.0497 | val MAE(MW) 150.81 | val MSE(scaled) 0.0321
Epoch 06 | train loss 0.0491 | val MAE(MW) 137.15 | val MSE(scaled) 0.0278
Epoch 07 | train loss 0.0489 | val MAE(MW) 138.29 | val MSE(scaled) 0.0287
Epoch 08 | train loss 0.0483 | val MAE(MW) 133.04 | val MSE(scaled) 0.0270
Epoch 09 | train loss 0.0479 | val MAE(MW) 143.96 | val MSE(scaled) 0.0300
Epoch 10 | train loss 0.0482 | val MAE(MW) 144.10 | val MSE(scaled) 0.0292
Epoch 11 | train loss 0.0474 | val MAE(MW) 140.22 | val MSE(scaled) 0.0288
Epoch 12 | train loss 0.0456 | val MAE(MW) 135.72 | val MSE(scaled) 0.0277
Epoch 13 | train loss 0.0447 | val MAE(MW) 138.43 | val MSE(scaled) 0.0275
Epoch 14 | train loss 0.0

In [50]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n)

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    print(f"\nTest  MAE (MW):  {mae:.2f}")
    print(f"Test RMSE (MW): {rmse:.2f}")
    print("Horizon-wise MAE (MW):", np.round(mae_h, 2))
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h}

# ---------- Run ----------
if __name__ == "__main__":
    model = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=160,    # middle ground, not too tiny
        nhead=4,        # keeps per-head dim = 40
        layers=2,       # slightly deeper than 2
        d_ff=640,       # ~2× d_model (lightweight FFN)
        dropout=0.1450990521919241,   # modest regularization
        batch=96,
        epochs=100,     # let it converge longer
        lr=0.0002934709738664784,      # tuned for stability
        weight_decay=0.0009177960560335439,
        horizon_weighted=True,
        seed=42
    )
    _ = test_report_tf(model, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)




Epoch 01 | train loss 0.0746 | val MAE(MW) 161.62 | val MSE(scaled) 0.0346
Epoch 02 | train loss 0.0532 | val MAE(MW) 163.89 | val MSE(scaled) 0.0364
Epoch 03 | train loss 0.0515 | val MAE(MW) 148.26 | val MSE(scaled) 0.0319
Epoch 04 | train loss 0.0502 | val MAE(MW) 143.16 | val MSE(scaled) 0.0305
Epoch 05 | train loss 0.0492 | val MAE(MW) 145.67 | val MSE(scaled) 0.0307
Epoch 06 | train loss 0.0486 | val MAE(MW) 135.78 | val MSE(scaled) 0.0279
Epoch 07 | train loss 0.0483 | val MAE(MW) 132.26 | val MSE(scaled) 0.0265
Epoch 08 | train loss 0.0479 | val MAE(MW) 135.29 | val MSE(scaled) 0.0291
Epoch 09 | train loss 0.0474 | val MAE(MW) 137.29 | val MSE(scaled) 0.0295
Epoch 10 | train loss 0.0469 | val MAE(MW) 136.06 | val MSE(scaled) 0.0276
Epoch 11 | train loss 0.0454 | val MAE(MW) 137.51 | val MSE(scaled) 0.0298
Epoch 12 | train loss 0.0447 | val MAE(MW) 133.59 | val MSE(scaled) 0.0277
Epoch 13 | train loss 0.0446 | val MAE(MW) 142.58 | val MSE(scaled) 0.0292
Epoch 14 | train loss 0.0

In [None]:
{'d_model': 160, 'nhead_160': 4, 'layers': 2, 'd_ff_mult': 4, 'dropout': 0.1450990521919241, 'batch': 96, 'lr': 0.0002934709738664784, 'weight_decay': 0.0009177960560335439}

In [37]:
@torch.no_grad()
def predict_tf(model, X, Y_s, y_scaler, Z=None, batch=256):
    """
    Returns:
      P (MW):  predictions, shape (N, H)
      Y (MW):  ground-truth, shape (N, H)  (inverse-transformed from *_s)
    """
    model.eval()
    dl = DataLoader(SeqDataset(X, Y_s, Z), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device

    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch) == 2:
            xb, yb = batch; zb = None
        else:
            xb, yb, zb = batch
        xb = xb.to(device)
        zb = None if zb is None else zb.to(device)

        yhat_s = model(xb, zb)                     # (B, H) in *scaled* space
        preds_s.append(yhat_s.cpu().numpy())
        trues_s.append(yb.cpu().numpy())

    preds_s = np.concatenate(preds_s, axis=0)      # (N, H), scaled
    trues_s = np.concatenate(trues_s, axis=0)      # (N, H), scaled

    # inverse-transform per element (your scaler expects 2D)
    P = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)  # (N,H) MW
    Y = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)  # (N,H) MW
    return P, Y

# 

In [23]:
# train_transformer_baseline_fixed.py  (minimal deterministic-friendly patch)

import os, random, math, numpy as np

# ---- Toggle: set True for strict determinism (requires kernel restart) ----
STRICT_DETERMINISM = False

# If strict, cuBLAS needs this BEFORE importing torch (and a fresh kernel)
if STRICT_DETERMINISM:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- seeding (simple & safe) ---
SEED = 42
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    # keep deterministic cuDNN kernels, but don't force torch.use_deterministic_algorithms
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # TF32 off helps reproducibility
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

seed_everything(SEED)

# --- AMP (new API with fallback); set USE_AMP=False for tighter reproducibility ---
USE_AMP = True
try:
    from torch.amp import autocast, GradScaler
    def amp_ctx():
        if not USE_AMP:
            class _NoOp:
                def __enter__(self): return None
                def __exit__(self, *a): return False
            return _NoOp()
        use_bf16 = (torch.cuda.is_available()
                    and torch.cuda.get_device_capability()[0] >= 8)
        return autocast(device_type="cuda", dtype=torch.bfloat16 if use_bf16 else torch.float16)
except Exception:
    from torch.cuda.amp import autocast, GradScaler
    def amp_ctx():
        return autocast(enabled=(USE_AMP and torch.cuda.is_available()))

# ========== Data ==========
class SeqDataset(Dataset):
    """Returns (X, Y) if Z is None; otherwise (X, Y, Z). Avoids collating None."""
    def __init__(self, X, Y, Z_future=None):
        self.X = torch.from_numpy(X)              # (N, L, D)
        self.Y = torch.from_numpy(Y)              # (N, H)
        self.Z = None if Z_future is None else torch.from_numpy(Z_future.astype(np.float32))
        self.has_future = self.Z is not None
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        if self.has_future:
            return self.X[i], self.Y[i], self.Z[i]
        else:
            return self.X[i], self.Y[i]

def _seed_worker(worker_id: int):
    wseed = torch.initial_seed() % 2**32
    np.random.seed(wseed); random.seed(wseed)

def loaders(Xtr,Ytr,Xva,Yva, Ztr=None, Zva=None, batch=128, workers=0, seed: int = SEED):
    g = torch.Generator().manual_seed(seed)  # deterministic shuffle
    return (
        DataLoader(SeqDataset(Xtr,Ytr,Ztr), batch_size=batch, shuffle=True,  drop_last=True,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
        DataLoader(SeqDataset(Xva,Yva,Zva), batch_size=batch, shuffle=False, drop_last=False,
                   num_workers=workers, pin_memory=True, worker_init_fn=_seed_worker, generator=g),
    )

# ========== Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerForecaster(nn.Module):
    def __init__(self, input_dim, horizon=24, d_model=256, nhead=8, num_layers=3,
                 d_ff=512, dropout=0.1, future_feat_dim=0):
        super().__init__()
        self.horizon = horizon
        self.input_proj = nn.Linear(input_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.pos = PositionalEncoding(d_model)
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.horz_emb = nn.Parameter(torch.randn(horizon, d_model) * 0.02)
        self.future_proj = nn.Linear(future_feat_dim, d_model) if future_feat_dim and future_feat_dim>0 else None
        head_in = d_model + d_model + (d_model if self.future_proj else 0)
        self.head = nn.Sequential(nn.LayerNorm(head_in), nn.Linear(head_in, 256),
                                  nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 1))

    def forward(self, x, z_future=None):
        B, L, _ = x.shape
        x = self.pos(self.input_proj(x))             # (B,L,d)
        cls = self.cls.expand(B, 1, -1)
        enc = self.encoder(torch.cat([cls, x], dim=1))
        context = enc[:, 0, :]                       # (B,d)
        H = self.horizon
        he = self.horz_emb.unsqueeze(0).expand(B, H, -1)  # (B,H,d)
        ctx = context.unsqueeze(1).expand(B, H, -1)       # (B,H,d)
        if self.future_proj is not None and z_future is not None:
            z_proj = self.future_proj(z_future)           # (B,H,d)
            fuse = torch.cat([ctx, he, z_proj], dim=-1)
        else:
            fuse = torch.cat([ctx, he], dim=-1)
        return self.head(fuse).squeeze(-1)                # (B,H)

# ========== Metrics & training ==========
@torch.no_grad()
def val_metrics_MW(model, loader, y_scaler):
    model.eval()
    preds_s, trues_s, n, mse_s = [], [], 0, 0.0
    for batch in loader:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(next(model.parameters()).device)
        yb = yb.to(next(model.parameters()).device)
        zb = None if zb is None else zb.to(xb.device)
        yhat_s = model(xb, zb)
        mse_s += torch.mean((yhat_s - yb)**2).item() * xb.size(0); n += xb.size(0)
        preds_s.append(yhat_s.detach().cpu().numpy()); trues_s.append(yb.detach().cpu().numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = bundle.y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = bundle.y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    mae = np.mean(np.abs(preds - trues))
    return mae, (mse_s / n), preds

def train_transformer(
    X_train, Y_train_s, X_val, Y_val_s, Z_train=None, Z_val=None,
    d_model=256, nhead=8, layers=3, d_ff=512, dropout=0.1,
    batch=128, epochs=80, lr=2e-4, weight_decay=2e-4, horizon_weighted=True, seed: int = SEED
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Din, H = X_train.shape[2], Y_train_s.shape[1]
    Ff = 0 if Z_train is None else Z_train.shape[2]
    tr_loader, va_loader = loaders(X_train, Y_train_s, X_val, Y_val_s, Z_train, Z_val,
                                   batch=batch, workers=0, seed=seed)

    model = TransformerForecaster(Din, horizon=H, d_model=d_model, nhead=nhead,
                                  num_layers=layers, d_ff=d_ff, dropout=dropout,
                                  future_feat_dim=Ff).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)
    scaler = GradScaler(enabled=(USE_AMP and torch.cuda.is_available()))

    loss_fn = nn.SmoothL1Loss(reduction="none")
    w = torch.linspace(1.20, 0.90, H, device=device).view(1, H) if horizon_weighted else None

    best_mae, best = float("inf"), None
    patience, left = 8, 8
    MIN_DELTA = 0.0

    for epoch in range(1, epochs+1):
        model.train()
        run, nitems = 0.0, 0
        for batch in tr_loader:
            if len(batch)==2: xb, yb = batch; zb = None
            else: xb, yb, zb = batch
            xb = xb.to(device); yb = yb.to(device); zb = None if zb is None else zb.to(device)
            opt.zero_grad(set_to_none=True)
            with amp_ctx():
                yhat_s = model(xb, zb)
                L = loss_fn(yhat_s, yb)
                if w is not None: L = L * w
                L = L.mean()
            scaler.scale(L).backward()
            scaler.unscale_(opt); nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            run += L.item() * xb.size(0); nitems += xb.size(0)

        val_mae_mw, val_mse_s, yhat_val = val_metrics_MW(model, va_loader, bundle.y_scaler)
        sched.step(val_mae_mw)
        print(f"Epoch {epoch:02d} | train loss {run/nitems:.4f} | val MAE(MW) {val_mae_mw:.2f} | val MSE(scaled) {val_mse_s:.4f}")

        if best_mae - val_mae_mw > MIN_DELTA:
            best_mae, best, left = val_mae_mw, {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}, patience
        else:
            left -= 1
            if left == 0:
                print("Early stopping."); break

    if best is not None: model.load_state_dict(best)
    return model, yhat_val

@torch.no_grad()
def test_report_tf(model, X_test, Y_test_s, y_scaler, Z_test=None, batch=256):
    dl = DataLoader(SeqDataset(X_test, Y_test_s, Z_test), batch_size=batch, shuffle=False)
    device = next(model.parameters()).device
    preds_s, trues_s = [], []
    for batch in dl:
        if len(batch)==2: xb, yb = batch; zb = None
        else: xb, yb, zb = batch
        xb = xb.to(device); zb = None if zb is None else zb.to(device)
        yhat_s = model(xb, zb)
        preds_s.append(yhat_s.cpu().numpy()); trues_s.append(yb.numpy())
    preds_s, trues_s = np.concatenate(preds_s), np.concatenate(trues_s)
    preds = y_scaler.inverse_transform(preds_s.reshape(-1,1)).reshape(preds_s.shape)
    trues = y_scaler.inverse_transform(trues_s.reshape(-1,1)).reshape(trues_s.shape)
    
    mae = np.mean(np.abs(preds - trues))
    rmse = np.sqrt(np.mean((preds - trues)**2))
    mae_h = np.mean(np.abs(preds - trues), axis=0)
    def safe_mape(y_true, y_pred, eps=1e-6):
        y_true = np.asarray(y_true)
        return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0
    mape = safe_mape(trues, preds)
    return {"MAE": mae, "RMSE": rmse, "MAE_by_h": mae_h, "MAPE": mape}, preds

# ---------- Run ----------
if __name__ == "__main__":
    model_transformer, P_trf_val = train_transformer(
        X_train, Y_train_s, X_val, Y_val_s,
        Z_train=Z_train, Z_val=Z_val,
        d_model=160,    # middle ground, not too tiny
        nhead=4,        # keeps per-head dim = 40
        layers=2,       # slightly deeper than 2
        d_ff=640,       # ~2× d_model (lightweight FFN)
        dropout=0.1450990521919241,   # modest regularization
        batch=96,
        epochs=100,     # let it converge longer
        lr=0.0002934709738664784,      # tuned for stability
        weight_decay=0.0009177960560335439,
        horizon_weighted=True,
        seed=42
    )
    metrics_transformer, P_trf_test = test_report_tf(model_transformer, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)
    print(f"\nTest MAE (MW):  {metrics_transformer['MAE']:.2f}")
    print(f"Test RMSE (MW): {metrics_transformer['RMSE']:.2f}")
    print(f"Test MAPE (MW): {metrics_transformer['MAPE']:.2f}")
    print("Horizon-wise MAE (MW):", np.round(metrics_transformer["MAE_by_h"], 2))

Epoch 01 | train loss 0.0746 | val MAE(MW) 161.62 | val MSE(scaled) 0.0346
Epoch 02 | train loss 0.0532 | val MAE(MW) 163.89 | val MSE(scaled) 0.0364
Epoch 03 | train loss 0.0515 | val MAE(MW) 148.26 | val MSE(scaled) 0.0319
Epoch 04 | train loss 0.0502 | val MAE(MW) 143.16 | val MSE(scaled) 0.0305
Epoch 05 | train loss 0.0492 | val MAE(MW) 145.67 | val MSE(scaled) 0.0307
Epoch 06 | train loss 0.0486 | val MAE(MW) 135.78 | val MSE(scaled) 0.0279
Epoch 07 | train loss 0.0483 | val MAE(MW) 132.26 | val MSE(scaled) 0.0265
Epoch 08 | train loss 0.0479 | val MAE(MW) 135.29 | val MSE(scaled) 0.0291
Epoch 09 | train loss 0.0474 | val MAE(MW) 137.29 | val MSE(scaled) 0.0295
Epoch 10 | train loss 0.0469 | val MAE(MW) 136.06 | val MSE(scaled) 0.0276
Epoch 11 | train loss 0.0454 | val MAE(MW) 137.51 | val MSE(scaled) 0.0298
Epoch 12 | train loss 0.0447 | val MAE(MW) 133.59 | val MSE(scaled) 0.0277
Epoch 13 | train loss 0.0446 | val MAE(MW) 142.58 | val MSE(scaled) 0.0292
Epoch 14 | train loss 0.0

In [24]:
print(P_trf_val.shape, P_trf_test.shape)   # both -> (N, 24)

(1344, 24) (1344, 24)


In [25]:
torch.save(model_transformer.state_dict(), "transformer_forecaster.pth")

# XGBoost/LightGBM combining with Transformer

In [26]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

## XGBoost

In [27]:
import xgboost as xgb

In [28]:
def avg_neg_rmse(y_true, y_pred):
        y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
        rmse_per_h = np.sqrt(np.mean((y_true - y_pred)**2, axis=0))
        return -float(np.mean(rmse_per_h))

In [29]:
def runXGBoost(df_feat):
    # 3. train/valid/test split by time
    label_cols   = [f'y_t+{h}' for h in range(1, HORIZON+1)]
    feature_cols = [c for c in df_feat.columns if c not in label_cols + [TIME_COL, TARGET_COL]]
    
    X = df_feat[feature_cols]
    Y = df_feat[label_cols].values
    
    N        = len(X)
    train_end= int(N - 2*8*7*24)
    valid_end= int(N - 8*7*24)
    
    X_train, Y_train = X.iloc[:train_end],    Y[:train_end]
    X_valid, Y_valid = X.iloc[train_end:valid_end], Y[train_end:valid_end]
    X_test,  Y_test  = X.iloc[valid_end:],    Y[valid_end:]
    
    print(X_train.shape, Y_train.shape)
    print(X_valid.shape, Y_valid.shape)
    print(X_test.shape, Y_test.shape)
    # 4. preprocessor: one‑hot encode the 4 categorical columns, pass others through
    preprocess = ColumnTransformer(
        [
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CAT_COLS),
            ('num', 'passthrough', [c for c in feature_cols if c not in CAT_COLS])
        ],
        remainder='drop'
    )
    # Scorer: average RMSE across 24 horizons (negated for maximization)
    # =========================
    scorer = make_scorer(avg_neg_rmse, greater_is_better=True)
    import joblib
    xgb_tuner = joblib.load('/kaggle/input/xgboostlightgbm/scikitlearn/default/1/xgb_tuner_1.pkl')
    print(xgb_tuner.best_params_)
    # =========================
    # XGBoost + known-ahead Z
    # Train on TRAIN only; predict on VALID and TEST
    # =========================
    DTYPE = np.float32
    
    # 1) Build Z for EXACT rows in df_feat, then split into train/val/test
    def build_future_covariates(times, H, df, TIME_COL):
        df_key = df[[TIME_COL, 'is_holiday', 'season']].copy()
        df_key[TIME_COL] = pd.to_datetime(df_key[TIME_COL])
        df_key = df_key.set_index(TIME_COL).sort_index()
    
        Z_list = []
        for ts in times:
            row = []
            for h in range(1, H+1):
                t_h = pd.Timestamp(ts) + pd.Timedelta(hours=h)
                hour = t_h.hour; dow = t_h.dayofweek; doy = t_h.dayofyear
                v = [
                    np.sin(2*np.pi*hour/24), np.cos(2*np.pi*hour/24),
                    np.sin(2*np.pi*dow/7),  np.cos(2*np.pi*dow/7),
                    np.sin(2*np.pi*doy/365.25), np.cos(2*np.pi*doy/365.25),
                    1 if dow>=5 else 0,  # is_weekend
                ]
                if t_h in df_key.index:
                    hol = str(df_key.loc[t_h, 'is_holiday'])
                    sea = str(df_key.loc[t_h, 'season'])
                else:
                    hol, sea = "False", None
                hol_vec = [int(hol=="False"), int(hol=="tet"), int(hol=="national")]
                sea_vec = [int(sea==s) for s in ["winter","spring","summer","autumn"]] if sea is not None else [0,0,0,0]
                v += hol_vec + sea_vec
                row.append(v)
            Z_list.append(row)
        return np.array(Z_list, dtype=DTYPE)  # (N, H, Ff)
    
    # -- Build & split Z to align with X/Y splits defined above --
    end_times_all = df_feat[TIME_COL].values
    Z_all = build_future_covariates(end_times_all, HORIZON, df=df, TIME_COL=TIME_COL)
    
    Z_train = Z_all[:train_end]
    Z_val   = Z_all[train_end:valid_end]
    Z_test  = Z_all[valid_end:]
    
    # 2) Fit preprocessor on TRAIN only; transform VALID & TEST
    X_train_enc = preprocess.fit_transform(X_train)
    X_val_enc   = preprocess.transform(X_valid)
    X_test_enc  = preprocess.transform(X_test)
    
    # 3) Map tuned params to xgb.train
    bp = xgb_tuner.best_params_  # already loaded from your joblib
    def g(k, default=None): return bp.get(f"model__estimator__{k}", default)
    
    xgb_params = {
        "max_depth":         g("max_depth"),
        "eta":               g("learning_rate"),
        "subsample":         g("subsample"),
        "colsample_bytree":  g("colsample_bytree"),
        "min_child_weight":  g("min_child_weight"),
        "gamma":             g("gamma"),
        "lambda":            g("reg_lambda"),
        "alpha":             g("reg_alpha"),
        "objective":         "reg:squarederror",
        "eval_metric":       "rmse",
        "verbosity":         0,
        "tree_method":       "gpu_hist",   # switch to 'hist' if no GPU
    }
    num_boost_round = g("n_estimators", 1500)
    early_stopping_rounds = 100
    
    # 4) Train per-horizon on TRAIN, early-stop on VALID; predict both VALID and TEST
    models_xgb = []
    y_hat_val_list  = []
    y_hat_test_list = []
    
    for h in range(HORIZON):  # 0..23 => t+1..t+24
        Xtr_h = np.hstack([X_train_enc, Z_train[:, h, :]]).astype(DTYPE)
        Xva_h = np.hstack([X_val_enc,   Z_val[:,   h, :]]).astype(DTYPE)
        Xte_h = np.hstack([X_test_enc,  Z_test[:,  h, :]]).astype(DTYPE)
    
        dtrain = xgb.DMatrix(Xtr_h, label=Y_train[:, h])
        dvalid = xgb.DMatrix(Xva_h, label=Y_valid[:, h])
    
        booster = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, "valid")],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )
        models_xgb.append(booster)
    
        # predictions for VAL & TEST under the SAME horizon-specific features
        y_hat_val_list.append(
            booster.predict(xgb.DMatrix(Xva_h), iteration_range=(0, booster.best_iteration + 1))
        )
        y_hat_test_list.append(
            booster.predict(xgb.DMatrix(Xte_h), iteration_range=(0, booster.best_iteration + 1))
        )
    
    Y_hat_val  = np.column_stack(y_hat_val_list)   # shape (N_val,  H)
    Y_hat_test = np.column_stack(y_hat_test_list)  # shape (N_test, H)
    
    # 5) Optional: metrics for quick inspection
    def safe_mape(y_true, y_pred, eps=1e-6):
        y_true = np.asarray(y_true)
        return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0
    
    rmse_val = [math.sqrt(mean_squared_error(Y_valid[:, h], Y_hat_val[:, h])) for h in range(HORIZON)]
    rmse_tst = [math.sqrt(mean_squared_error(Y_test[:,  h], Y_hat_test[:, h])) for h in range(HORIZON)]
    print("VAL  Avg RMSE:", round(np.mean(rmse_val), 3), "  TEST Avg RMSE:", round(np.mean(rmse_tst), 3))

    return Y_hat_val, Y_hat_test

P_xgb_val, P_xgb_test = runXGBoost(df_feat)

(44689, 131) (44689, 24)
(1344, 131) (1344, 24)
(1344, 131) (1344, 24)
{'model__estimator__subsample': 0.7, 'model__estimator__reg_lambda': 0.5, 'model__estimator__reg_alpha': 0.0, 'model__estimator__n_estimators': 1000, 'model__estimator__min_child_weight': 3, 'model__estimator__max_depth': 6, 'model__estimator__learning_rate': 0.03, 'model__estimator__gamma': 0.1, 'model__estimator__colsample_bytree': 0.7}
VAL  Avg RMSE: 213.258   TEST Avg RMSE: 285.286


In [61]:
models_xgb

NameError: name 'models_xgb' is not defined

In [30]:
print(P_xgb_val.shape, P_xgb_test.shape)

(1344, 24) (1344, 24)


In [53]:
import numpy as np

# ---- 1) Simple average
P_avg_val_tfxgb  = 0.5*P_trf_val + 0.5*P_xgb_val
P_avg_test_tfxgb = 0.5*P_trf_test + 0.5*P_xgb_test

# ---- 2) Per-horizon optimal weight α_h via closed-form least squares (clip to [0,1])
def optimal_alphas_per_horizon(Y, P1, P2):
    # Y, P1, P2: (n, H)
    H = Y.shape[1]
    alpha = np.zeros(H)
    D = P1 - P2                       # (n, H)
    num = ((Y - P2) * D).sum(axis=0)  # covariance-like term
    den = (D * D).sum(axis=0) + 1e-12 # variance-like term (avoid 0-division)
    alpha = num / den
    return np.clip(alpha, 0.0, 1.0)   # enforce convex combo

alph_tfxgb = optimal_alphas_per_horizon(Y_val, P_trf_val, P_xgb_val)  # (24,)
# Broadcast to (n,H)
P_wavg_val_tfxgb  = alph_tfxgb * P_trf_val  + (1 - alph_tfxgb) * P_xgb_val
P_wavg_test_tfxgb = alph_tfxgb * P_trf_test + (1 - alph_tfxgb) * P_xgb_test
print("Per-horizon alphas:", np.round(alph_tfxgb, 3))


Per-horizon alphas: [0.656 0.678 0.626 0.629 0.597 0.62  0.596 0.576 0.6   0.628 0.643 0.651
 0.598 0.672 0.609 0.616 0.609 0.573 0.643 0.681 0.634 0.653 0.712 0.696]


In [54]:
from sklearn.metrics import mean_squared_error

def rmse(y, yhat): return np.sqrt(mean_squared_error(y, yhat))
def nrmse(y, yhat): return rmse(y, yhat) / (np.mean(y) + 1e-12)
def mape(y, yhat):  return np.mean(np.abs((y - yhat) / (y + 1e-12)))
def mae(y, yhat): return np.mean(np.abs(y - yhat))
def report(name, Y, P):
    print(f"{name:12s} | MAE:{mae(Y,P):8.2f}  RMSE:{rmse(Y,P):8.2f}  nRMSE:{100*nrmse(Y,P):6.2f}%  MAPE:{100*mape(Y,P):6.2f}%")


print("On validation:")
report("XGBoost",     Y_val, P_xgb_val)
report("Transformer", Y_val, P_trf_val)
report("Avg(50/50)",  Y_val, P_avg_val_tfxgb)
report("Weighted",    Y_val, P_wavg_val_tfxgb)

print("On test (final results):")
report("XGBoost",     Y_test, P_xgb_test)
report("Transformer", Y_test, P_trf_test)
report("Avg(50/50)",  Y_test, P_avg_test_tfxgb)
report("Weighted",    Y_test, P_wavg_test_tfxgb)


On validation:
XGBoost      | MAE:  149.58  RMSE:  213.38  nRMSE: 10.88%  MAPE:  7.45%
Transformer  | MAE:  134.50  RMSE:  193.76  nRMSE:  9.88%  MAPE:  6.66%
Avg(50/50)   | MAE:  124.69  RMSE:  184.98  nRMSE:  9.44%  MAPE:  6.06%
Weighted     | MAE:  123.87  RMSE:  183.40  nRMSE:  9.36%  MAPE:  6.02%
On test (final results):
XGBoost      | MAE:  203.20  RMSE:  285.64  nRMSE: 11.54%  MAPE:  8.13%
Transformer  | MAE:  177.82  RMSE:  247.16  nRMSE:  9.99%  MAPE:  7.17%
Avg(50/50)   | MAE:  177.35  RMSE:  250.49  nRMSE: 10.12%  MAPE:  7.06%
Weighted     | MAE:  174.71  RMSE:  245.96  nRMSE:  9.94%  MAPE:  6.97%


In [47]:
np.savetxt("ensemble_weights_xgb_trf.csv", alph, delimiter=",")

## LightGBM

In [33]:
import os, json, joblib, numpy as np, gc
from sklearn.model_selection import ParameterSampler
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [34]:
lgb_tuner = joblib.load('/kaggle/input/lightgbm/scikitlearn/default/1/lgb_tuner_1.pkl')
with open('/kaggle/input/lightgbm/scikitlearn/default/1/lgbm_best_params.json', "r") as f:
    best_params_lgb = json.load(f)


print(lgb_tuner)
print(best_params_lgb)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['Weather', 'Wind Direction',
                                                   'season', 'is_holiday']),
                                                 ('num', 'passthrough',
                                                  ['Temperature',
                                                   'Precipitation',
                                                   'Chance of snow', 'Humidity',
                                                   'Wind', 'Wind Gust',
                                                   'Wind Degree', 'Cloud Cover',
                                                   'Visibility', 'is_weekend',
                                                   'hour', 'dow', '

In [35]:
def avg_neg_rmse(y_true, y_pred):
        y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
        rmse_per_h = np.sqrt(np.mean((y_true - y_pred)**2, axis=0))
        return -float(np.mean(rmse_per_h))
    
def runLightGBM(df_feat):
    # 3. train/valid/test split by time
    label_cols   = [f'y_t+{h}' for h in range(1, HORIZON+1)]
    feature_cols = [c for c in df_feat.columns if c not in label_cols + [TIME_COL, TARGET_COL]]
    
    X = df_feat[feature_cols]
    Y = df_feat[label_cols].values
    
    N        = len(X)
    train_end= int(N - 2*8*7*24)
    valid_end= int(N - 8*7*24)
    
    X_train, Y_train = X.iloc[:train_end],    Y[:train_end]
    X_valid, Y_valid = X.iloc[train_end:valid_end], Y[train_end:valid_end]
    X_test,  Y_test  = X.iloc[valid_end:],    Y[valid_end:]
    
    print(X_train.shape, Y_train.shape)
    print(X_valid.shape, Y_valid.shape)
    print(X_test.shape, Y_test.shape)
    # 4. preprocessor: one‑hot encode the 4 categorical columns, pass others through
    preprocess = ColumnTransformer(
        [
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CAT_COLS),
            ('num', 'passthrough', [c for c in feature_cols if c not in CAT_COLS])
        ],
        remainder='drop'
    )
    # Scorer: average RMSE across 24 horizons (negated for maximization)
    # =========================
    scorer = make_scorer(avg_neg_rmse, greater_is_better=True)

    # =========================
    # LightGBM + known-ahead Z
    # Train on TRAIN only; predict on VALID and TEST
    # =========================
    DTYPE = np.float32

    # 1) Build Z for EXACT rows in df_feat, then split into train/val/test
    def build_future_covariates(times, H, df, TIME_COL):
        df_key = df[[TIME_COL, 'is_holiday', 'season']].copy()
        df_key[TIME_COL] = pd.to_datetime(df_key[TIME_COL])
        df_key = df_key.set_index(TIME_COL).sort_index()
    
        Z_list = []
        for ts in times:
            row = []
            for h in range(1, H+1):
                t_h = pd.Timestamp(ts) + pd.Timedelta(hours=h)
                hour = t_h.hour; dow = t_h.dayofweek; doy = t_h.dayofyear
                v = [
                    np.sin(2*np.pi*hour/24), np.cos(2*np.pi*hour/24),
                    np.sin(2*np.pi*dow/7),  np.cos(2*np.pi*dow/7),
                    np.sin(2*np.pi*doy/365.25), np.cos(2*np.pi*doy/365.25),
                    1 if dow>=5 else 0,  # is_weekend
                ]
                if t_h in df_key.index:
                    hol = str(df_key.loc[t_h, 'is_holiday'])
                    sea = str(df_key.loc[t_h, 'season'])
                else:
                    hol, sea = "False", None
                hol_vec = [int(hol=="False"), int(hol=="tet"), int(hol=="national")]
                sea_vec = [int(sea==s) for s in ["winter","spring","summer","autumn"]] if sea is not None else [0,0,0,0]
                v += hol_vec + sea_vec
                row.append(v)
            Z_list.append(row)
        return np.array(Z_list, dtype=DTYPE)  # (N, H, Ff)
    
    # -- Build & split Z to align with X/Y splits defined above --
    end_times_all = df_feat[TIME_COL].values
    Z_all = build_future_covariates(end_times_all, HORIZON, df=df, TIME_COL=TIME_COL)
    
    Z_train = Z_all[:train_end]
    Z_val   = Z_all[train_end:valid_end]
    Z_test  = Z_all[valid_end:]
    
    # Fit preprocessor on TRAIN only; transform VALID & TEST
    X_train_enc = preprocess.fit_transform(X_train)
    X_val_enc   = preprocess.transform(X_valid)
    X_test_enc  = preprocess.transform(X_test)
    
    print(X_train_enc.shape)
    print(X_val_enc.shape)
    print(X_test_enc.shape)
    
    # 1) map best params from tuner to lgb.train
    bp = best_params_lgb
    g  = lambda k, d=None: bp.get(f"model__estimator__{k}", d)
    
    lgb_params = {
        "seed": 42,                       # master seed (alias: random_state)
        "bagging_seed": 42,
        "feature_fraction_seed": 42,
        "data_random_seed": 42,
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": g("learning_rate", 0.05),
        "num_leaves": g("num_leaves", 127),
        "max_depth": g("max_depth", -1),
        "feature_fraction": g("colsample_bytree", 0.8),   # alias
        "bagging_fraction": g("subsample", 0.8),          # alias
        "bagging_freq": bp.get("model__estimator__subsample_freq", 0),
        "min_data_in_leaf": g("min_child_samples", 20),
        "lambda_l1": g("reg_alpha", 0.0),
        "lambda_l2": g("reg_lambda", 0.1),
        "verbosity": -1,
        "num_threads": -1,
        # Optional GPU:
        "device_type": "gpu",
        # Optional speed knob during refit:
        "max_bin": 255,
    }
    num_boost_round = g("n_estimators", 4000)
    early_stopping_rounds = 100
    
    # 4) Train per-horizon on TRAIN, early-stop on VALID; predict both VALID and TEST
    models_lgb = []
    y_hat_val_list  = []
    y_hat_test_list = []

    for h in range(HORIZON):  # 0..23 => t+1..t+24
        Xtr_h = np.hstack([X_train_enc, Z_train[:, h, :]]).astype(DTYPE)
        Xva_h = np.hstack([X_val_enc,   Z_val[:,   h, :]]).astype(DTYPE)
        Xte_h = np.hstack([X_test_enc,  Z_test[:,  h, :]]).astype(DTYPE)
    
        dtrain = lgb.Dataset(Xtr_h, label=Y_train[:, h])
        dvalid = lgb.Dataset(Xva_h, label=Y_valid[:, h], reference=dtrain)
    
        booster = lgb.train(
            params=lgb_params,
            train_set=dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dvalid],
            callbacks=[lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(-1)],
        )
        models_lgb.append((booster, Xte_h))
        
        # predictions for VAL & TEST under the SAME horizon-specific features
        y_hat_val_list.append(
            booster.predict(Xva_h, iteration_range=(0, booster.best_iteration + 1))
        )
        y_hat_test_list.append(
            booster.predict(Xte_h, iteration_range=(0, booster.best_iteration + 1))
        )
    
    Y_hat_val  = np.column_stack(y_hat_val_list)   # shape (N_val,  H)
    Y_hat_test = np.column_stack(y_hat_test_list)  # shape (N_test, H)
    
    # Optional: metrics for quick inspection
    def safe_mape(y_true, y_pred, eps=1e-6):
        y_true = np.asarray(y_true)
        return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), eps, None))) * 100.0

    # Val
    rmse_val_lgb = [math.sqrt(mean_squared_error(Y_valid[:, h], Y_hat_val[:, h])) for h in range(HORIZON)]
    nrmse_val_lgb = [rm / (np.mean(Y_valid[:, h]) + 1e-6) * 100 for h, rm in enumerate(rmse_val_lgb)]
    mape_val_lgb  = [safe_mape(Y_valid[:, h], Y_hat_val[:, h]) for h in range(HORIZON)]
    # Test
    rmse_test_lgb = [math.sqrt(mean_squared_error(Y_test[:, h], Y_hat_test[:, h])) for h in range(HORIZON)]
    nrmse_test_lgb = [rm / (np.mean(Y_test[:, h]) + 1e-6) * 100 for h, rm in enumerate(rmse_test_lgb)]
    mape_test_lgb  = [safe_mape(Y_test[:, h], Y_hat_test[:, h]) for h in range(HORIZON)]
    print("LightGBM val:")
    print("Per-horizon RMSE:", [round(x,2) for x in rmse_val_lgb])
    print("Avg RMSE:  {:.3f}".format(np.mean(rmse_val_lgb)))
    print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_val_lgb)))
    print("Avg MAPE:  {:.2f}%".format(np.mean(mape_val_lgb)))

    print("LightGBM test:")
    print("Per-horizon RMSE:", [round(x,2) for x in rmse_test_lgb])
    print("Avg RMSE:  {:.3f}".format(np.mean(rmse_test_lgb)))
    print("Avg nRMSE: {:.2f}%".format(np.mean(nrmse_test_lgb)))
    print("Avg MAPE:  {:.2f}%".format(np.mean(mape_test_lgb)))
    return Y_hat_val, Y_hat_test

P_lgb_val, P_lgb_test = runLightGBM(df_feat)

(44689, 131) (44689, 24)
(1344, 131) (1344, 24)
(1344, 131) (1344, 24)
(44689, 155)
(1344, 155)
(1344, 155)




Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's rmse: 221.383
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[236]	valid_0's rmse: 214.613
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's rmse: 205.398
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's rmse: 209.231
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[299]	valid_0's rmse: 204.038
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[298]	valid_0's rmse: 207.496
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's rmse: 202.338
Training until valid

In [36]:
print(P_lgb_val.shape, P_lgb_test.shape)

(1344, 24) (1344, 24)


In [51]:
# ---- 1) Simple average
P_avg_val_tflgb  = 0.5*P_trf_val + 0.5*P_lgb_val
P_avg_test_tflgb = 0.5*P_trf_test + 0.5*P_lgb_test

# ---- 2) Per-horizon optimal weight α_h via closed-form least squares (clip to [0,1])
def optimal_alphas_per_horizon(Y, P1, P2):
    # Y, P1, P2: (n, H)
    H = Y.shape[1]
    alpha = np.zeros(H)
    D = P1 - P2                       # (n, H)
    num = ((Y - P2) * D).sum(axis=0)  # covariance-like term
    den = (D * D).sum(axis=0) + 1e-12 # variance-like term (avoid 0-division)
    alpha = num / den
    return np.clip(alpha, 0.0, 1.0)   # enforce convex combo

alph_tflgb = optimal_alphas_per_horizon(Y_val, P_trf_val, P_lgb_val)  # (24,)
# Broadcast to (n,H)
P_wavg_val_tflgb  = alph_tflgb * P_trf_val  + (1 - alph_tflgb) * P_lgb_val
P_wavg_test_tflgb = alph_tflgb * P_trf_test + (1 - alph_tflgb) * P_lgb_test
print("Per-horizon alphas:", np.round(alph_tflgb, 3))


Per-horizon alphas: [0.704 0.657 0.606 0.63  0.598 0.626 0.584 0.548 0.584 0.607 0.623 0.593
 0.607 0.622 0.628 0.602 0.633 0.591 0.597 0.691 0.655 0.657 0.696 0.744]


In [52]:
from sklearn.metrics import mean_squared_error

def rmse(y, yhat): return np.sqrt(mean_squared_error(y, yhat))
def nrmse(y, yhat): return rmse(y, yhat) / (np.mean(y) + 1e-12)
def mape(y, yhat):  return np.mean(np.abs((y - yhat) / (y + 1e-12)))
def mae(y, yhat): return np.mean(np.abs(y - yhat))

def report(name, Y, P):
    print(f"{name:12s} | MAE:{mae(Y,P):8.2f}  RMSE:{rmse(Y,P):8.2f}  nRMSE:{100*nrmse(Y,P):6.2f}%  MAPE:{100*mape(Y,P):6.2f}%")

print("On validation:")
report("LightGBM",     Y_val, P_lgb_val)
report("Transformer", Y_val, P_trf_val)
report("Avg(50/50)",  Y_val, P_avg_val_tflgb)
report("Weighted",    Y_val, P_wavg_val_tflgb)

print("On test (final results):")
report("LightGBM",     Y_test, P_lgb_test)
report("Transformer", Y_test, P_trf_test)
report("Avg(50/50)",  Y_test, P_avg_test_tflgb)
report("Weighted",    Y_test, P_wavg_test_tflgb)


On validation:
LightGBM     | MAE:  146.83  RMSE:  211.68  nRMSE: 10.80%  MAPE:  7.30%
Transformer  | MAE:  134.50  RMSE:  193.76  nRMSE:  9.88%  MAPE:  6.66%
Avg(50/50)   | MAE:  123.86  RMSE:  185.15  nRMSE:  9.44%  MAPE:  6.00%
Weighted     | MAE:  123.56  RMSE:  183.70  nRMSE:  9.37%  MAPE:  6.00%
On test (final results):
LightGBM     | MAE:  197.19  RMSE:  283.70  nRMSE: 11.47%  MAPE:  7.79%
Transformer  | MAE:  177.82  RMSE:  247.16  nRMSE:  9.99%  MAPE:  7.17%
Avg(50/50)   | MAE:  174.45  RMSE:  249.52  nRMSE: 10.08%  MAPE:  6.89%
Weighted     | MAE:  172.56  RMSE:  245.27  nRMSE:  9.91%  MAPE:  6.85%


In [61]:
np.savetxt("ensemble_weights_lgb_trf.csv", alph, delimiter=",")

# Results
Note: The models training cells above should be executed already.

## LSTM + Attention

In [40]:
metrics_lstmattention = evaluate_on_test(model_lstmattention, X_test, Y_test_s, bundle.y_scaler, Z_test=Z_test)
print(f"\nTest MAE (MW):  {metrics_lstmattention['MAE']:.2f}")
print(f"Test RMSE (MW): {metrics_lstmattention['RMSE']:.2f}")
print(f"Test MAPE (MW): {metrics_lstmattention['MAPE']:.2f}")
print("Horizon-wise MAE (MW):", np.round(metrics_lstmattention["MAE_by_h"], 2))


Test MAE (MW):  183.55
Test RMSE (MW): 256.13
Test MAPE (MW): 7.60
Horizon-wise MAE (MW): [182.14 181.66 181.04 180.34 181.51 182.5  183.43 184.05 184.49 185.49
 186.56 187.31 187.18 186.12 185.39 185.43 185.65 184.05 182.69 181.02
 180.85 181.4  182.23 182.64]


## Transformer

In [43]:
print(f"\nTest MAE (MW):  {metrics_transformer['MAE']:.2f}")
print(f"Test RMSE (MW): {metrics_transformer['RMSE']:.2f}")
print(f"Test MAPE (MW): {metrics_transformer['MAPE']:.2f}")
print("Horizon-wise MAE (MW):", np.round(metrics_transformer["MAE_by_h"], 2))


Test MAE (MW):  177.82
Test RMSE (MW): 247.16
Test MAPE (MW): 7.17
Horizon-wise MAE (MW): [177.03 176.56 176.4  176.69 176.97 177.27 176.72 176.67 176.7  177.
 177.32 177.62 178.27 178.11 178.08 178.69 179.04 179.25 178.54 178.82
 178.77 178.53 178.96 179.72]


## Transformer + XGBoost

In [55]:
print("On validation:")
report("XGBoost",     Y_val, P_xgb_val)
report("Transformer", Y_val, P_trf_val)
report("Avg(50/50)",  Y_val, P_avg_val_tfxgb)
report("Weighted",    Y_val, P_wavg_val_tfxgb)

print("On test (final results):")
report("XGBoost",     Y_test, P_xgb_test)
report("Transformer", Y_test, P_trf_test)
report("Avg(50/50)",  Y_test, P_avg_test_tfxgb)
report("Weighted",    Y_test, P_wavg_test_tfxgb)


On validation:
XGBoost      | MAE:  149.58  RMSE:  213.38  nRMSE: 10.88%  MAPE:  7.45%
Transformer  | MAE:  134.50  RMSE:  193.76  nRMSE:  9.88%  MAPE:  6.66%
Avg(50/50)   | MAE:  124.69  RMSE:  184.98  nRMSE:  9.44%  MAPE:  6.06%
Weighted     | MAE:  123.87  RMSE:  183.40  nRMSE:  9.36%  MAPE:  6.02%
On test (final results):
XGBoost      | MAE:  203.20  RMSE:  285.64  nRMSE: 11.54%  MAPE:  8.13%
Transformer  | MAE:  177.82  RMSE:  247.16  nRMSE:  9.99%  MAPE:  7.17%
Avg(50/50)   | MAE:  177.35  RMSE:  250.49  nRMSE: 10.12%  MAPE:  7.06%
Weighted     | MAE:  174.71  RMSE:  245.96  nRMSE:  9.94%  MAPE:  6.97%


## Transformer + LightGBM

In [57]:
print("On validation:")
report("LightGBM",     Y_val, P_lgb_val)
report("Transformer", Y_val, P_trf_val)
report("Avg(50/50)",  Y_val, P_avg_val_tflgb)
report("Weighted",    Y_val, P_wavg_val_tflgb)

print("On test (final results):")
report("LightGBM",     Y_test, P_lgb_test)
report("Transformer", Y_test, P_trf_test)
report("Avg(50/50)",  Y_test, P_avg_test_tflgb)
report("Weighted",    Y_test, P_wavg_test_tflgb)

On validation:
LightGBM     | MAE:  146.83  RMSE:  211.68  nRMSE: 10.80%  MAPE:  7.30%
Transformer  | MAE:  134.50  RMSE:  193.76  nRMSE:  9.88%  MAPE:  6.66%
Avg(50/50)   | MAE:  123.86  RMSE:  185.15  nRMSE:  9.44%  MAPE:  6.00%
Weighted     | MAE:  123.56  RMSE:  183.70  nRMSE:  9.37%  MAPE:  6.00%
On test (final results):
LightGBM     | MAE:  197.19  RMSE:  283.70  nRMSE: 11.47%  MAPE:  7.79%
Transformer  | MAE:  177.82  RMSE:  247.16  nRMSE:  9.99%  MAPE:  7.17%
Avg(50/50)   | MAE:  174.45  RMSE:  249.52  nRMSE: 10.08%  MAPE:  6.89%
Weighted     | MAE:  172.56  RMSE:  245.27  nRMSE:  9.91%  MAPE:  6.85%


# Inference

In [58]:
import joblib

# load back the fitted model
loaded_model = joblib.load("/kaggle/input/prophet/other/default/1/prophet_model.pkl")


In [65]:
cut_valid_ts

Timestamp('2025-04-04 00:00:00')

In [64]:
end_times[end_times > np.datetime64(cut_valid_ts)]

numpy.datetime64('2025-05-30T00:00:00.000000000')

In [60]:
# You already have:
# X_raw, Y_raw, end_times = make_windows(F_all, y_all, t_all, L, HORIZON)

# Determine the test mask as you did previously:
test_mask = (end_times > np.datetime64(cut_valid_ts))

# 1. Extract the test sequences (shape: (n_test_sequences, L, D))
X_seq_test = X_raw[test_mask]

# 2. Get the corresponding timestamps at the end of each window
times_test = end_times[test_mask]  # these are the same timestamps you used for splitting

# 3. Build the future-known covariates for these timestamps
#    Only pass the two expected arguments: times and horizon length
Z_seq_test = build_future_covariates(times_test, HORIZON)

# 4. Now you can select the final sequence (i.e., the one for the last day)
last_x_seq = X_seq_test[-1]        # shape (L, D)
last_z_seq = Z_seq_test[-1]        # shape (HORIZON, Fz)

print(last_x_seq.shape)  # (history_window, num_features)
print(last_z_seq.shape)  # (24, num_future_features)


(48, 48)
(24, 14)


In [None]:
# Use the first column of Y_all for sliding windows (it's y_t+1)
y_single = Y_all[:, 0:1]
X_raw, Y_raw, end_times = make_windows(F_all, y_single, df['timestamp'].to_numpy(), L_win, HORIZON)

# Align sequence masks using end_times
train_mask_seq = (end_times <= df_feat.iloc[train_end-1]['timestamp'])
valid_mask_seq = (end_times >  df_feat.iloc[train_end-1]['timestamp']) & (end_times <= df_feat.iloc[valid_end-1]['timestamp'])
test_mask_seq  = (end_times >  df_feat.iloc[valid_end-1]['timestamp'])

X_seq_train, X_seq_test  = X_raw[train_mask_seq], X_raw[test_mask_seq]
Z_seq_train, Z_seq_test  = Z_train, Z_test   # Already built above

In [66]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with torch.no_grad():
    X_test_t = torch.tensor(X_seq_test, dtype=torch.float32).to(device)
    Z_test_t = torch.tensor(Z_seq_test, dtype=torch.float32).to(device)
    lstmattention_pred       = model_lstmattention(X_test_t, Z_test_t).cpu().numpy()[0]
    transformer_pred         = model_transformer(X_test_t, Z_test_t).cpu().numpy()[0]

print('LSTM+Attention predictions:', lstmattention_pred)
print('Transformer predictions:', transformer_pred)

NameError: name 'device' is not defined

In [None]:


# -------------------------------------------------------------------------
# 7. (Optional) Evaluate against ground truth of the last day
# -------------------------------------------------------------------------
actual = df_feat[[f'y_t+{h}' for h in range(1,HORIZON+1)]].iloc[-1].values.astype(float)
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), 1e-6, None))) * 100

from sklearn.metrics import mean_absolute_error, mean_squared_error
for name, pred in {'LSTM': lstm_pred, 'Transformer': transformer_pred}.items():
    mae  = mean_absolute_error(actual, pred)
    rmse = math.sqrt(mean_squared_error(actual, pred))
    mape_val = mape(actual, pred)
    print(f'{name}: MAE={mae:.2f}, RMSE={rmse:.2f}, MAPE={mape_val:.2f}%')

In [67]:
preprocess = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CAT_COLS),
        ('num', 'passthrough', [c for c in feature_cols if c not in CAT_COLS])
    ],
    remainder='drop'
)
# Scorer: average RMSE across 24 horizons (negated for maximization)
# =========================
scorer = make_scorer(avg_neg_rmse, greater_is_better=True)
import joblib
xgb_tuner = joblib.load('/kaggle/input/xgboostlightgbm/scikitlearn/default/1/xgb_tuner_1.pkl')
print(xgb_tuner.best_params_)
print(xgb_tuner)

{'model__estimator__subsample': 0.7, 'model__estimator__reg_lambda': 0.5, 'model__estimator__reg_alpha': 0.0, 'model__estimator__n_estimators': 1000, 'model__estimator__min_child_weight': 3, 'model__estimator__max_depth': 6, 'model__estimator__learning_rate': 0.03, 'model__estimator__gamma': 0.1, 'model__estimator__colsample_bytree': 0.7}
RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Pipeline(steps=[('prep',
                                              ColumnTransformer(transformers=[('cat',
                                                                               OneHotEncoder(handle_unknown='ignore',
                                                                                             sparse_output=False),
                                                                               ['Weather',
                                                                                'Wind '
            

In [68]:
lgb_tuner = joblib.load('/kaggle/input/lightgbm/scikitlearn/default/1/lgb_tuner_1.pkl')
with open('/kaggle/input/lightgbm/scikitlearn/default/1/lgbm_best_params.json', "r") as f:
    best_params_lgb = json.load(f)


print(lgb_tuner)
print(best_params_lgb)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['Weather', 'Wind Direction',
                                                   'season', 'is_holiday']),
                                                 ('num', 'passthrough',
                                                  ['Temperature',
                                                   'Precipitation',
                                                   'Chance of snow', 'Humidity',
                                                   'Wind', 'Wind Gust',
                                                   'Wind Degree', 'Cloud Cover',
                                                   'Visibility', 'is_weekend',
                                                   'hour', 'dow', '

In [None]:
import numpy as np
import pandas as pd
from datetime import timedelta

from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# -------------------------------------------------------------------------
# 1. Load data and rebuild all features
# -------------------------------------------------------------------------
DATA_PATH = '/kaggle/input/weather-hanoi-2020-2025-normed/merge_weather_energy_hanoi_20202025_norm.csv'
df = pd.read_csv(DATA_PATH)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp').reset_index(drop=True)

# Convert categoricals to strings
for col in ['Weather','Wind Direction','season','is_holiday']:
    df[col] = df[col].astype(str)

# Add calendar signals
df['hour'] = df['timestamp'].dt.hour
df['dow']  = df['timestamp'].dt.dayofweek
df['doy']  = df['timestamp'].dt.dayofyear
df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)
df['dow_sin']  = np.sin(2*np.pi*df['dow']/7)
df['dow_cos']  = np.cos(2*np.pi*df['dow']/7)
df['doy_sin']  = np.sin(2*np.pi*df['doy']/365.25)
df['doy_cos']  = np.cos(2*np.pi*df['doy']/365.25)
df['is_weekend'] = (df['dow'] >= 5).astype(int)

# Define hyperparameters
TARGET_COL = 'total_consumption_mw'
HORIZON = 24
TARGET_LAGS = range(1,49)
ROLL_WINDOWS = [3,6,12,24,48]
EXOG_NUM = ['Temperature','Precipitation','Humidity','Wind',
            'Wind Gust','Wind Degree','Cloud Cover','Visibility']

# Lagged target and rolling stats
for L in TARGET_LAGS:
    df[f'y_lag_{L}'] = df[TARGET_COL].shift(L)
for W in ROLL_WINDOWS:
    df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
    df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()

# Lagged exogenous vars
for col in EXOG_NUM:
    for L in [0,1,3,6,12,24]:
        df[f'{col}_lag{L}'] = df[col].shift(L)

# Multi-horizon labels
for h in range(1, HORIZON+1):
    df[f'y_t+{h}'] = df[TARGET_COL].shift(-h)

# Drop rows with NaNs
df_feat = df.dropna().reset_index(drop=True)

# -------------------------------------------------------------------------
# 2. Prepare feature matrix and future-known Z covariates
# -------------------------------------------------------------------------
def build_future_covariates(times, H, df, TIME_COL='timestamp'):
    df_key = df[[TIME_COL, 'is_holiday','season']].copy()
    df_key[TIME_COL] = pd.to_datetime(df_key[TIME_COL])
    df_key = df_key.set_index(TIME_COL).sort_index()
    Z_list = []
    for ts in times:
        row = []
        for h in range(1, H+1):
            future = pd.Timestamp(ts) + timedelta(hours=h)
            hour  = future.hour; dow = future.dayofweek; doy = future.dayofyear
            v = [
                np.sin(2*np.pi*hour/24), np.cos(2*np.pi*hour/24),
                np.sin(2*np.pi*dow/7),  np.cos(2*np.pi*dow/7),
                np.sin(2*np.pi*doy/365.25), np.cos(2*np.pi*doy/365.25),
                1.0 if dow>=5 else 0.0
            ]
            # holiday & season one-hots
            if future in df_key.index:
                hol = df_key.at[future,'is_holiday']
                sea = df_key.at[future,'season']
            else:
                hol = 'False'
                sea = None
            hol_vec = [1-int(hol=='True'), int(hol=='True')]
            # adjust this if you have more holiday types (e.g. tet vs national)
            season_list = ['winter','spring','summer','autumn']
            sea_vec  = [1 if sea==s else 0 for s in season_list]
            row.append(v + hol_vec + sea_vec)
        Z_list.append(row)
    return np.array(Z_list, dtype=np.float32)

# Build base feature matrix
label_cols   = [f'y_t+{h}' for h in range(1,HORIZON+1)]
feature_cols = [c for c in df_feat.columns if c not in label_cols + ['timestamp', TARGET_COL]]
X_all = df_feat[feature_cols].values
Y_all = df_feat[label_cols].values
times = df_feat['timestamp'].to_numpy()

# Split indices
N = len(df_feat)
train_end = int(N - 2*8*7*24)     # last 8 weeks test, previous 8 weeks val
valid_end = int(N - 8*7*24)

# Use end timestamps for splitting
train_mask = (times[:len(X_all)] <= df_feat.iloc[train_end-1]['timestamp'])
valid_mask = (times[:len(X_all)] >  df_feat.iloc[train_end-1]['timestamp']) & (times[:len(X_all)] <= df_feat.iloc[valid_end-1]['timestamp'])
test_mask  = (times[:len(X_all)] >  df_feat.iloc[valid_end-1]['timestamp'])

X_train, Y_train = X_all[train_mask], Y_all[train_mask]
X_valid, Y_valid = X_all[valid_mask], Y_all[valid_mask]
X_test,  Y_test  = X_all[test_mask],  Y_all[test_mask]
times_train, times_valid, times_test = times[train_mask], times[valid_mask], times[test_mask]

# Build Z covariates per split
Z_train = build_future_covariates(times_train, HORIZON, df_feat)
Z_valid = build_future_covariates(times_valid, HORIZON, df_feat)
Z_test  = build_future_covariates(times_test,  HORIZON, df_feat)

# -------------------------------------------------------------------------
# 3. Train/Load XGBoost and LightGBM multi-output models and predict
# -------------------------------------------------------------------------
# (You can load tuned hyperparameters if you have them; otherwise use defaults)
# Flatten Z covariates (HORIZON * Fz) and append to X for XGB/LGB
X_train_flat = np.hstack([X_train, Z_train.reshape(Z_train.shape[0], -1)])
X_test_flat  = np.hstack([X_test,  Z_test.reshape(Z_test.shape[0],  -1)])
xgb_model.fit(X_train_flat, Y_train)
Y_hat_test_xgb = xgb_model.predict(X_test_flat)

lgb_model = MultiOutputRegressor(LGBMRegressor(
    n_estimators=600, max_depth=4, learning_rate=0.05,
    subsample=0.9, colsample_bytree=0.9, random_state=42
))
lgb_model.fit(X_train_flat, Y_train)
Y_hat_test_lgb = lgb_model.predict(X_test_flat)

# -------------------------------------------------------------------------
# 4. Build sequence windows and load your trained LSTM and Transformer models
# -------------------------------------------------------------------------
# Sequence features for LSTM/Transformer (use the same columns you used for training)
seq_cols = [TARGET_COL] + EXOG_NUM + [
    'hour_sin','hour_cos','dow_sin','dow_cos','doy_sin','doy_cos','is_weekend','is_holiday']
F_all = df[seq_cols].astype(np.float32).values

# Build sliding windows: [t-L+1 .. t] -> [t+1 .. t+H]
L_win = 48
def make_windows(F_all, y_all, times, L, H):
    T, D = F_all.shape
    N = T - L - H + 1
    X = np.empty((N, L, D), dtype=np.float32)
    Y = np.empty((N, H), dtype=np.float32)
    end_ts = np.empty(N, dtype='datetime64[ns]')
    for i in range(N):
        X[i] = F_all[i:i+L]
        Y[i] = y_all[i+L:i+L+H,0]
        end_ts[i] = times[i+L-1]
    return X, Y, end_ts

# Use the first column of Y_all for sliding windows (it's y_t+1)
y_single = Y_all[:, 0:1]
X_raw, Y_raw, end_times = make_windows(F_all, y_single, df['timestamp'].to_numpy(), L_win, HORIZON)

# Align sequence masks using end_times
train_mask_seq = (end_times <= df_feat.iloc[train_end-1]['timestamp'])
valid_mask_seq = (end_times >  df_feat.iloc[train_end-1]['timestamp']) & (end_times <= df_feat.iloc[valid_end-1]['timestamp'])
test_mask_seq  = (end_times >  df_feat.iloc[valid_end-1]['timestamp'])

X_seq_train, X_seq_test  = X_raw[train_mask_seq], X_raw[test_mask_seq]
Z_seq_train, Z_seq_test  = Z_train, Z_test   # Already built above

# Load your trained LSTM_Attention model
class LSTMAttn(nn.Module):
    def __init__(self, input_dim, hidden=128, horizon=24, future_dim=0):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden, batch_first=True)
        self.future_proj = nn.Linear(future_dim, hidden) if future_dim>0 else None
        self.fc = nn.Linear(hidden*2 + (hidden if future_dim>0 else 0), 1)
        self.horizon = horizon
    def forward(self, x, z_future=None):
        B = x.size(0)
        seq_out, (h_n, _) = self.lstm(x)
        q = h_n[-1]
        attn_scores = torch.softmax(torch.sum(seq_out * q.unsqueeze(1), dim=-1), dim=1)
        ctx = torch.sum(seq_out * attn_scores.unsqueeze(-1), dim=1)
        base = torch.cat([ctx, q], dim=-1).unsqueeze(1).expand(-1, self.horizon, -1)
        if z_future is not None:
            z_proj = self.future_proj(z_future)
            fused = torch.cat([base, z_proj], dim=-1)
        else:
            fused = base
        out = self.fc(fused).squeeze(-1)
        return out

# Load your Transformer class (same as training)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        self.register_buffer("pe", pe.unsqueeze(0))
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, d_model=128, nhead=4, num_layers=2, d_ff=256, horizon=24, future_dim=0):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_ff, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.future_proj = nn.Linear(future_dim, d_model) if future_dim > 0 else None
        self.fc = nn.Linear(d_model*2, horizon)
        self.horizon = horizon
    def forward(self, x, z_future=None):
        x = self.input_proj(x)
        enc_out = self.encoder(x)
        pooled = enc_out.mean(dim=1)
        if z_future is not None:
            z_proj = self.future_proj(z_future).mean(dim=1)
            fused = torch.cat([pooled, z_proj], dim=-1)
        else:
            fused = torch.cat([pooled, pooled], dim=-1)
        return self.fc(fused)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load saved models (replace with your actual paths)
lstm_model = LSTMAttn(input_dim=X_seq_train.shape[2], hidden=128, horizon=HORIZON, future_dim=Z_seq_train.shape[2])
lstm_model.load_state_dict(torch.load('/kaggle/input/your-models/lstm_attention_model.pth', map_location=device))
lstm_model.to(device).eval()

transformer_model = SimpleTransformer(input_dim=X_seq_train.shape[2], d_model=128, nhead=4, num_layers=2,
                                      d_ff=256, horizon=HORIZON, future_dim=Z_seq_train.shape[2])
transformer_model.load_state_dict(torch.load('/kaggle/input/your-models/transformer_model.pth', map_location=device))
transformer_model.to(device).eval()

# Predict sequences for the test set
with torch.no_grad():
    X_test_t  = torch.tensor(X_seq_test, dtype=torch.float32).to(device)
    Z_test_t  = torch.tensor(Z_seq_test, dtype=torch.float32).to(device)
    Y_hat_test_lstm = lstm_model(X_test_t, Z_test_t).cpu().numpy()
    Y_hat_test_trf  = transformer_model(X_test_t, Z_test_t).cpu().numpy()

# -------------------------------------------------------------------------
# 5. Hybrid models: average Transformer with XGB/LGB
# -------------------------------------------------------------------------
# Align shapes: Y_hat_test_trf and Y_hat_test_xgb must have same shape (n_test_samples, HORIZON)
# If needed, trim or pad sequences accordingly
min_rows = min(len(Y_hat_test_trf), len(Y_hat_test_xgb))
Y_hat_test_trf  = Y_hat_test_trf[-min_rows:]
Y_hat_test_xgb  = Y_hat_test_xgb[-min_rows:]
Y_hat_test_lgb  = Y_hat_test_lgb[-min_rows:]

# Simple average hybrids (equal weight)
Y_hat_test_hybrid_xgb = 0.5 * Y_hat_test_trf + 0.5 * Y_hat_test_xgb
Y_hat_test_hybrid_lgb = 0.5 * Y_hat_test_trf + 0.5 * Y_hat_test_lgb

# -------------------------------------------------------------------------
# 6. Evaluate and print metrics for the test set
# -------------------------------------------------------------------------
# Use Y_test[-min_rows:] to align with predictions
Y_test_trim = Y_test[-min_rows:]
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-6))) * 100

models_preds = {
    'XGBoost': Y_hat_test_xgb,
    'LightGBM': Y_hat_test_lgb,
    'LSTM+Attn': Y_hat_test_lstm,
    'Transformer': Y_hat_test_trf,
    'Hybrid_XGB': Y_hat_test_hybrid_xgb,
    'Hybrid_LGB': Y_hat_test_hybrid_lgb,
}
for name, pred in models_preds.items():
    mae  = mean_absolute_error(Y_test_trim.ravel(), pred.ravel())
    rmse = np.sqrt(mean_squared_error(Y_test_trim.ravel(), pred.ravel()))
    mp   = mape(Y_test_trim, pred)
    print(f"{name}: MAE={mae:.2f}, RMSE={rmse:.2f}, MAPE={mp:.2f}%")

# -------------------------------------------------------------------------
# The last row of each prediction array gives the 24-hour forecast for the final day:
final_day_pred = {
    model: pred[-1] for model, pred in models_preds.items()
}
print("24-hour forecasts for the last day:")
for model, pred in final_day_pred.items():
    print(model, pred)
