# Bayesian LSTM

In [1]:
import gc
from pathlib import Path
import numpy as np
import pickle

import torch
from torch.utils.data import Dataset, DataLoader
import mlflow

import blitz.modules as blitz_modules
from blitz.utils import variational_estimator

from sklearn.metrics import mean_absolute_error, root_mean_squared_error

import copy
import optuna

In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


DATA_PATH = Path("../../data/processed")

EXPERIMENT_NAME = "fs_06_load_calendar_future_weather"
OUTPUT_PATH = DATA_PATH / "ml_data" / EXPERIMENT_NAME
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
# Possible fix for stability in searches
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

print("cudnn.enabled       =", torch.backends.cudnn.enabled)
print("cudnn.benchmark     =", torch.backends.cudnn.benchmark)
print("cudnn.deterministic =", torch.backends.cudnn.deterministic)

cudnn.enabled       = True
cudnn.benchmark     = False
cudnn.deterministic = True


## First: Start the MLFlow-Server for logging the experiments

Run the following command: 

Powershell:

mlflow server ` 
    --backend-store-uri sqlite:///mlflow.db ` 
    --default-artifact-root ./mlartifacts/  


Bash / Git Bash / WSL / Linux / macOS
mlflow server \
  --backend-store-uri sqlite:///mlflow.db \
  --default-artifact-root ./mlartifacts


In [4]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("load_forecasting_bayesian_lstm")

mlflow.enable_system_metrics_logging()

## Defining the model class

In [5]:
@variational_estimator
class BayesianLSTM(torch.nn.Module):
    def __init__(self, input_size, hidden_size, horizon):
        super().__init__()
        self.lstm = blitz_modules.BayesianLSTM(
            in_features=input_size,
            out_features=hidden_size
        )

        self.fc = torch.nn.Linear(hidden_size, 2 * horizon) # times two, because we need to output a mean and variance for the GausianNLLLoss
    
    def forward(self, x):
        # x is in format (batch_size, seq_len, input_size) but (seq_len, batch_size, input_size) is required
        x = x.transpose(0, 1) # now in the correct format
        out, _ = self.lstm(x) # (seq_len, batch_size, input_size)
        h = out[-1, :, :] # The last hidden state (batch_size, hidden_size)
        return self.fc(h) # (batch, 2 * horizon) each batch having the shape [mu_1, mu_2, ... var_1, var_2 ...]

## Load the data

In [6]:
# Load the scaler
scalers_dir = OUTPUT_PATH / "scalers"

with open(scalers_dir / "y_scaler.pkl", "rb") as f:
    y_scaler = pickle.load(f)

# Define the dataset

In [7]:
class TimeSeriesDataset(Dataset):
    """PyTorch Dataset for time series forecasting."""
    
    def __init__(self, X, y):
        """
        Args:
            X: Input sequences (n_samples, window_size, n_features)
            y: Target values (n_samples, forecast_horizon)
        """
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()
    
    def __len__(self):
        
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


X_train_seq = np.load(OUTPUT_PATH / "X_train.npy")
y_train_seq = np.load(OUTPUT_PATH / "y_train.npy")
train_dataset = TimeSeriesDataset(X_train_seq, y_train_seq)
del X_train_seq, y_train_seq
gc.collect()

X_val_seq = np.load(OUTPUT_PATH / "X_val.npy")
y_val_seq = np.load(OUTPUT_PATH / "y_val.npy")
val_dataset = TimeSeriesDataset(X_val_seq, y_val_seq)
del X_val_seq, y_val_seq
gc.collect()

X_test_seq = np.load(OUTPUT_PATH / "X_test.npy")
y_test_seq = np.load(OUTPUT_PATH / "y_test.npy")
test_dataset = TimeSeriesDataset(X_test_seq, y_test_seq)
del X_test_seq, y_test_seq
gc.collect()

print(f"Train dataset: {len(train_dataset)} samples")
print(f"Val dataset:   {len(val_dataset)} samples")
print(f"Test dataset:  {len(test_dataset)} samples")
print(f"Sample shape: X={train_dataset[0][0].shape}, y={train_dataset[0][1].shape}")

                

Train dataset: 164214 samples
Val dataset:   39863 samples
Test dataset:  39863 samples
Sample shape: X=torch.Size([672, 19]), y=torch.Size([96])


## Hyperparameter tuning

In [None]:
def objective(trial):
    with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True):
        hidden_size = trial.suggest_int("hidden_size", 64, 256, step=32)
        lr = trial.suggest_float("lr", 1e-4, 5e-3, log=True)
        batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
        weight_decay = trial.suggest_float("weight_decay", 1e-7, 1e-3, log=True)
        sample_nbr = trial.suggest_categorical("sample_nbr", [3, 5, 7, 10])
        eval_sample_nbr = 50
        kl_warmup_epochs = trial.suggest_int("kl_warmup_epochs", 5, 20)

        mlflow.log_params(trial.params)
        mlflow.log_param("eval_sample_nbr", eval_sample_nbr)

        model = BayesianLSTM(
            input_size=train_dataset[0][0].shape[-1],
            hidden_size=hidden_size,
            horizon=train_dataset[0][1].shape[-1],
        ).to(DEVICE)

        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=lr,
            weight_decay=weight_decay,
        )

        gauss_nll = torch.nn.GaussianNLLLoss(eps=1e-6, reduction="mean") #torch.nn.MSELoss()

        def nll_criterion(pred_params, y):
            mu, log_var = torch.chunk(pred_params, 2, dim=-1)
            var = torch.nn.functional.softplus(log_var) + 1e-6
            return gauss_nll(mu, y, var)

        train_dataloader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
        )

        val_dataloader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
        )

        base_kl_weight = 1.0 / len(train_dataset)
        best_val = float("inf")
        best_state = None
        best_epoch = -1
        patience = 10
        wait = 0
        num_epochs = 50

        for epoch in range(num_epochs):
            model.train()
            train_elbo = 0.0

            kl_weight = base_kl_weight * min(1.0, (epoch + 1) / kl_warmup_epochs)

            for xb, yb in train_dataloader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)

                optimizer.zero_grad()

                elbo = model.sample_elbo(
                    inputs=xb,
                    labels=yb,
                    criterion=nll_criterion,
                    sample_nbr=sample_nbr,
                    complexity_cost_weight=kl_weight,
                )

                elbo.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

                train_elbo += elbo.item()

            train_elbo /= len(train_dataloader)
            mlflow.log_metric("train_elbo", train_elbo, step=epoch)

            model.eval()
            val_elbo = 0.0
            val_nll = 0.0
            val_mu_preds = []
            val_targets = []

            with torch.no_grad():
                for xb, yb in val_dataloader:
                    xb, yb = xb.to(DEVICE), yb.to(DEVICE)

                    elbo = model.sample_elbo(
                        inputs=xb,
                        labels=yb,
                        criterion=nll_criterion,
                        sample_nbr=sample_nbr,
                        complexity_cost_weight=kl_weight,
                    )

                    pred_samples = torch.stack([model(xb) for _ in range(eval_sample_nbr)], dim=0)
                    mu_s, log_var_s = torch.chunk(pred_samples, 2, dim=-1)
                    var_s = torch.nn.functional.softplus(log_var_s) + 1e-6
                    mu_pred = mu_s.mean(dim=0)
                    var_pred = var_s.mean(dim=0) + mu_s.var(dim=0, unbiased=False)
                    nll = gauss_nll(mu_pred, yb, var_pred)

                    val_mu_preds.append(mu_pred.detach().cpu().numpy())
                    val_targets.append(yb.detach().cpu().numpy())

                    val_elbo += elbo.item()
                    val_nll += nll.item()

            val_elbo /= len(val_dataloader)
            val_nll /= len(val_dataloader)
            y_pred_val = np.concatenate(val_mu_preds, axis=0)
            y_true_val = np.concatenate(val_targets, axis=0)
            val_mae = mean_absolute_error(y_true_val, y_pred_val)
            val_rmse = root_mean_squared_error(y_true_val, y_pred_val)
            val_mse = val_rmse ** 2
            val_rmse_h = [
                root_mean_squared_error(y_true_val[:, h], y_pred_val[:, h])
                for h in [0, 3, 11, 23, 47, 95]
            ]

            mlflow.log_metric("val_elbo", val_elbo, step=epoch)
            mlflow.log_metric("val_nll", val_nll, step=epoch)
            mlflow.log_metric("val_mse", val_mse, step=epoch)
            mlflow.log_metric("val_mae", val_mae, step=epoch)
            mlflow.log_metric("val_rmse", val_rmse, step=epoch)
            for h_idx, rmse_h in enumerate(val_rmse_h):
                mlflow.log_metric(f"val_rmse_h{h_idx+1}", rmse_h, step=epoch)
            mlflow.log_metric("kl_weight", kl_weight, step=epoch)

            trial.report(val_nll, epoch)

            if trial.should_prune():
                del model, optimizer
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                raise optuna.TrialPruned()

            if val_nll < best_val:
                best_val = val_nll
                wait = 0
                best_state = copy.deepcopy(model.state_dict())
                best_epoch = epoch
            else:
                wait += 1

            if wait >= patience:
                break

            print(
                
                f"Epoch {epoch+1}/{num_epochs}, "
                f"Train ELBO: {train_elbo:.4f}, "
                f"Val ELBO: {val_elbo:.4f}, "
                f"Val NLL: {val_nll:.6f}, "
                f"Val MSE: {val_mse:.6f}, "
                f"Val MAE: {val_mae:.6f}, "
                f"KL w: {kl_weight:.2e}"
            )

        if best_state is not None:
            model.load_state_dict(best_state)
            mlflow.log_metric("best_epoch", best_epoch)
            mlflow.pytorch.log_model(model, artifact_path="best_model")

        mlflow.log_metric("best_val_nll", best_val)

        del model, optimizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return best_val



In [9]:
STUDY_NAME = f"bayesian_lstm_nll_{EXPERIMENT_NAME}"
STUDY_STORAGE = "sqlite:///optuna_bayesian_lstm.db"

study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=STUDY_STORAGE,
    load_if_exists=True,
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=1234),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
)

with mlflow.start_run(run_name=f"optuna_search_bayesian_lstm_nll_{EXPERIMENT_NAME}"):
    study.optimize(objective, n_trials=50, show_progress_bar=True, gc_after_trial=True)
    mlflow.log_metric("best_val_nll", study.best_value)
    mlflow.log_params(study.best_params)
    mlflow.log_param("best_trial_number", study.best_trial.number)

best_params = study.best_params
best_params



[I 2026-02-09 14:28:53,697] A new study created in RDB with name: bayesian_lstm_nll_fs_06_load_calendar_future_weather
2026/02/09 14:28:53 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2026/02/09 14:28:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


  0%|          | 0/50 [00:00<?, ?it/s]

2026/02/09 14:28:54 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2026/02/09 14:28:54 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2026/02/09 14:51:05 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2026/02/09 14:51:05 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2026/02/09 14:51:05 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2026/02/09 14:51:05 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


üèÉ View run trial_0 at: http://127.0.0.1:5000/#/experiments/3/runs/80969d7697c04af2b7887bef94ddfd6f
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/3
[W 2026-02-09 14:51:05,498] Trial 0 failed with parameters: {'hidden_size': 96, 'lr': 0.0011401031390814089, 'batch_size': 64, 'weight_decay': 1.2313185468743886e-06, 'sample_nbr': 7, 'kl_warmup_epochs': 10} because of the following error: IndexError('index 96 is out of bounds for axis 1 with size 96').
Traceback (most recent call last):
  File "c:\Users\lhaus\Documents\FH\probabilistic-load-forecast-project\.venv\Lib\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\lhaus\AppData\Local\Temp\ipykernel_13900\3717101500.py", line 120, in objective
    root_mean_squared_error(y_true_val[:, h], y_pred_val[:, h])
                            ~~~~~~~~~~^^^^^^
IndexError: index 96 is out of bounds for axis 1 with size 96
[W 2026-02-

IndexError: index 96 is out of bounds for axis 1 with size 96

## Final training after the hyperparameter tuning

In [None]:
BATCH_SIZE = 64
NUM_EPOCHS = 70
HIDDEN_SIZE = 64
SAMPLE_NBR = 7
VAL_EVAL_SAMPLE_NBR = 50
TEST_SAMPLE_NBR = 90
KL_WARMUP_EPOCHS = 10

optim_kwargs = {
    "lr": 3e-4,
    "weight_decay": 1e-5,
}

model = BayesianLSTM(
    input_size=train_dataset[0][0].shape[-1],
    horizon=train_dataset[0][1].shape[-1],
    hidden_size=HIDDEN_SIZE
).to(DEVICE)

optim = torch.optim.Adam(
    params=model.parameters(),
    **optim_kwargs
)

gauss_nll = torch.nn.GaussianNLLLoss(eps=1e-6, reduction="mean") #torch.nn.MSELoss()

def nll_criterion(pred_params, y):
    mu, log_var = torch.chunk(pred_params, 2, dim=-1)
    var = torch.nn.functional.softplus(log_var) + 1e-3
    return gauss_nll(mu, y, var)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    shuffle=False
)

# Scale KL term by dataset size; warm up to full weight over KL_WARMUP_EPOCHS.
base_kl_weight = 1e-4 #1.0 / len(train_dataset)

best_val = float("inf")
patience = 10
wait = 0
with mlflow.start_run(run_name=f"baysian_lstm_testrun_nll_fs_06_load_calendar_future_weather"):
    mlflow.log_params(optim_kwargs)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("hidden_size", HIDDEN_SIZE)
    mlflow.log_param("sample_nbr", SAMPLE_NBR)
    mlflow.log_param("val_eval_sample_nbr", VAL_EVAL_SAMPLE_NBR)
    mlflow.log_param("kl_warmup_epochs", KL_WARMUP_EPOCHS)
    mlflow.log_param("base_kl_weight", base_kl_weight)

    for epoch in range(NUM_EPOCHS):

        model.train()
        train_elbo = 0.0

        kl_weight = base_kl_weight * min(1.0, (epoch + 1) / KL_WARMUP_EPOCHS)

        for xb, yb in train_dataloader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)

            optim.zero_grad()

            elbo = model.sample_elbo(
                inputs=xb,
                labels=yb,
                criterion=nll_criterion,
                sample_nbr=SAMPLE_NBR,
                complexity_cost_weight=kl_weight
            )

            elbo.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optim.step()

            train_elbo += elbo.item()

        train_elbo /= len(train_dataloader)

        mlflow.log_metric("train_elbo", train_elbo, step=epoch)

        model.eval()
        val_elbo = 0.0
        val_nll = 0.0

        val_var_mean_acc = 0.0
        val_var_min_acc = float("inf")
        val_var_max_acc = -float("inf")

        with torch.no_grad():
            for xb, yb in val_dataloader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)

                elbo = model.sample_elbo(
                    inputs=xb,
                    labels=yb,
                    criterion=nll_criterion,
                    sample_nbr=SAMPLE_NBR,
                    complexity_cost_weight=kl_weight
                )

                pred_samples = torch.stack([model(xb) for _ in range(VAL_EVAL_SAMPLE_NBR)], dim=0)
                mu_s, log_var_s = torch.chunk(pred_samples, 2, dim=-1)
                var_s = torch.nn.functional.softplus(log_var_s) + 1e-6
                mu_pred = mu_s.mean(dim=0)
                var_pred = var_s.mean(dim=0) + mu_s.var(dim=0, unbiased=False)
                nll = gauss_nll(mu_pred, yb, var_pred)

                val_elbo += elbo.item()
                val_nll += nll.item()

                val_var_mean_acc += var_pred.mean().item()
                val_var_min_acc = min(val_var_min_acc, var_pred.min().item())
                val_var_max_acc = max(val_var_max_acc, var_pred.max().item())

            val_elbo /= len(val_dataloader)
            val_nll /= len(val_dataloader)
            val_var_mean = val_var_mean_acc / len(val_dataloader)

            mlflow.log_metric("val_elbo", val_elbo, step=epoch)
            mlflow.log_metric("val_nll", val_nll, step=epoch)
            mlflow.log_metric("kl_weight", kl_weight, step=epoch)
            
            mlflow.log_metric("val_var_mean", val_var_mean, step=epoch)
            mlflow.log_metric("val_var_min", val_var_min_acc, step=epoch)
            mlflow.log_metric("val_var_max", val_var_max_acc, step=epoch)

            if val_nll < best_val:
                best_val = val_nll
                wait = 0
                best_state = copy.deepcopy(model.state_dict())
                best_epoch = epoch

            else:
                wait +=1
            
            if wait >= patience:
                break

            print(
                f"Epoch {epoch+1}/{NUM_EPOCHS}, "
                f"Train ELBO: {train_elbo:.4f}, "
                f"Val ELBO: {val_elbo:.4f}, "
                f"Val NLL: {val_nll:.6f}, "
                f"KL w: {kl_weight:.2e}"
            )

    model.load_state_dict(best_state)
    mlflow.log_metric("best_epoch", best_epoch)
    mlflow.pytorch.log_model(model, artifact_path="best_model")
    best_model = model

    best_model.eval()

    all_preds = []
    all_trues = []

    with torch.no_grad():
        for xb, yb in test_dataloader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)

            samples = torch.stack([best_model(xb) for _ in range(TEST_SAMPLE_NBR)], dim=0)  # [S, N, 2H]
            mu_s, log_var_s = torch.chunk(samples, 2, dim=-1)                                # [S, N, H], [S, N, H]
            var_s = torch.nn.functional.softplus(log_var_s) + 1e-6

            mu_scaled = mu_s.mean(dim=0)  # point forecast in scaled space, [N, H]
            pred_var_scaled = var_s.mean(dim=0) + mu_s.var(dim=0, unbiased=False)  # total predictive var

            N, H = mu_scaled.shape

            y_pred = y_scaler.inverse_transform(mu_scaled.cpu().numpy().reshape(-1, 1)).reshape(N, H)
            y_true = y_scaler.inverse_transform(yb.cpu().numpy().reshape(-1, 1)).reshape(N, H)

            all_preds.append(y_pred)
            all_trues.append(y_true)

    # concat across batches
    y_pred_full = np.concatenate(all_preds, axis=0)
    y_true_full = np.concatenate(all_trues, axis=0)

    mae = mean_absolute_error(y_true_full, y_pred_full)
    rmse = root_mean_squared_error(y_true_full, y_pred_full)

    mlflow.log_metric("test_mae_unscaled", mae)
    mlflow.log_metric("test_rmse_unscaled", rmse)

    for h in [0, 3, 11, 23, 47, 95]:
        mlflow.log_metric(
            f"rmse_h{h+1}",
            root_mean_squared_error(y_true_full[:, h], y_pred_full[:, h])
        )

2026/02/09 19:07:10 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2026/02/09 19:07:10 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 1/70, Train ELBO: -0.0683, Val ELBO: -0.2934, Val NLL: -0.879707, KL w: 1.00e-05
Epoch 2/70, Train ELBO: -0.0655, Val ELBO: -0.0866, Val NLL: -0.956000, KL w: 2.00e-05
Epoch 3/70, Train ELBO: -0.0819, Val ELBO: -0.0886, Val NLL: -0.990061, KL w: 3.00e-05
Epoch 4/70, Train ELBO: -0.2503, Val ELBO: -0.2983, Val NLL: -1.078628, KL w: 4.00e-05
Epoch 5/70, Train ELBO: -0.4686, Val ELBO: -0.4372, Val NLL: -1.072030, KL w: 5.00e-05
Epoch 6/70, Train ELBO: -0.6184, Val ELBO: -0.4761, Val NLL: -1.057159, KL w: 6.00e-05
Epoch 7/70, Train ELBO: -0.6338, Val ELBO: -0.4570, Val NLL: -1.075121, KL w: 7.00e-05
Epoch 8/70, Train ELBO: -0.5670, Val ELBO: -0.3971, Val NLL: -1.089457, KL w: 8.00e-05
Epoch 9/70, Train ELBO: -0.4839, Val ELBO: -0.3220, Val NLL: -1.078291, KL w: 9.00e-05
Epoch 10/70, Train ELBO: -0.4120, Val ELBO: -0.2388, Val NLL: -1.051803, KL w: 1.00e-04
Epoch 11/70, Train ELBO: -0.4167, Val ELBO: -0.2394, Val NLL: -1.055545, KL w: 1.00e-04
Epoch 12/70, Train ELBO: -0.4161, Val ELB

2026/02/10 01:39:28 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2026/02/10 01:39:28 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2026/02/10 01:39:28 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2026/02/10 01:39:28 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


üèÉ View run baysian_lstm_testrun_nll_fs_06_load_calendar_future_weather at: http://127.0.0.1:5000/#/experiments/3/runs/0df404bd6fee454788a60bacb548f643
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/3




## Evaluation on the test set

In [18]:
ranked_checkpoints = mlflow.search_logged_models(
    filter_string=f"source_run_id='0df404bd6fee454788a60bacb548f643'",
    order_by=[{"field_name": "metrics.val_nll", "ascending": True}],
    output_format="list",
)

best_checkpoint = ranked_checkpoints[0]
best_checkpoint

LoggedModel(artifact_location='mlflow-artifacts:/3/models/m-59d7d88d721247bc91c1ced7635a401f/artifacts', creation_timestamp=1770719453301, experiment_id='3', last_updated_timestamp=1770719462357, model_id='m-59d7d88d721247bc91c1ced7635a401f', model_type='', model_uri='models:/m-59d7d88d721247bc91c1ced7635a401f', name='best_model', source_run_id='0df404bd6fee454788a60bacb548f643', status=<LoggedModelStatus.READY: 'READY'>, status_message='')

In [19]:
loaded_model = mlflow.pytorch.load_model(best_checkpoint.model_uri)

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    shuffle=True
)
loaded_model.eval()

all_preds = []
all_trues = []

with torch.no_grad():
    for xb, yb in test_dataloader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)

        samples = torch.stack([loaded_model(xb) for _ in range(TEST_SAMPLE_NBR)], dim=0)  # [S, N, 2H]
        mu_s, log_var_s = torch.chunk(samples, 2, dim=-1)                                # [S, N, H], [S, N, H]
        var_s = torch.nn.functional.softplus(log_var_s) + 1e-6

        mu_scaled = mu_s.mean(dim=0)  # point forecast in scaled space, [N, H]
        pred_var_scaled = var_s.mean(dim=0) + mu_s.var(dim=0, unbiased=False)  # total predictive var

        N, H = mu_scaled.shape

        y_pred = y_scaler.inverse_transform(mu_scaled.cpu().numpy().reshape(-1, 1)).reshape(N, H)
        y_true = y_scaler.inverse_transform(yb.cpu().numpy().reshape(-1, 1)).reshape(N, H)

        all_preds.append(y_pred)
        all_trues.append(y_true)

# concat across batches
y_pred_full = np.concatenate(all_preds, axis=0)
y_true_full = np.concatenate(all_trues, axis=0)

mae = mean_absolute_error(y_true_full, y_pred_full)
rmse = root_mean_squared_error(y_true_full, y_pred_full)

mlflow.log_metric("test_mae_unscaled", mae)
mlflow.log_metric("test_rmse_unscaled", rmse)

for h in [0, 3, 11, 23, 47, 95]:
    mlflow.log_metric(
        f"rmse_h{h+1}",
        root_mean_squared_error(y_true_full[:, h], y_pred_full[:, h])
    )



2026/02/05 12:26:31 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2026/02/05 12:26:31 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
