In [None]:
import os
import pandas as pd
import numpy as np

import optuna
import torch

import plotly.graph_objects as go
import matplotlib.pyplot as plt

import shap

from tqdm.notebook import tqdm_notebook

In [None]:
# Raise Optuna's log threshold to WARNING so that the per-trial messages of the
# optimization runs below do not flood the notebook output.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Single value passed to every `random_state` / `seed` argument in this notebook.
seed = 42

# Also seed the global NumPy and PyTorch generators, so that code which draws
# from them without taking an explicit seed is repeatable across kernel restarts.
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Every location is expressed relative to the directory two levels above the
# notebook's working directory.
base_dir = os.path.join("..", "..")

utils_folder = os.path.join(base_dir, "utils")
data_folder = os.path.join(base_dir, "data")

clean_data_folder, metadata_folder = (
    os.path.join(data_folder, sub_folder)
    for sub_folder in ("Clean Data", "Metadata")
)
sensor_folder = os.path.join(clean_data_folder, "sensors")
plot_folder = os.path.join(data_folder, "Plots", "Feltre")

# Folder that receives the Optuna SQLite databases written by the tuning loops.
feltre_sqlites_folder = 'feltre_sqlites_second'

In [None]:
# Load the 'second_part' workbook from the 'Feltre' sub-folder of the clean data
# directory into a DataFrame; the cells below rename its columns and split it
# into one dataset per target variable.
second_part_df = pd.read_excel(os.path.join(clean_data_folder, 'Feltre', 'second_part.xlsx'))

In [None]:
# Preview only the first rows instead of rendering the whole frame.
second_part_df.head()

In [None]:
# Map the raw spreadsheet headers (unit in square brackets) to the names used
# in the rest of the notebook (unit in parentheses).
target_variables = {
    raw_name: raw_name.replace('[', '(').replace(']', ')')
    for raw_name in (
        'ICC [1/mL]',
        'HNAC [1/mL]',
        'LNAC [1/mL]',
        'HNAP [%]',
    )
}

In [None]:
# Same header normalisation for the input columns: '[unit]' becomes '(unit)'.
# Headers without brackets (e.g. 'pH') map to themselves.
input_variables = {
    raw_name: raw_name.replace('[', '(').replace(']', ')')
    for raw_name in (
        'Pressione [atm]',
        'TOCeq [mg/l]',
        'DOCeq [mg/l]',
        'Turbidity [FTU]',
        'Conductivity [uS/cm]',
        'Temperature [°C]',
        'pH',
        'Free Chlorine [mg/l]',
        'Nitrate [mg/l]',
        'UV254 [1/m]',
    )
}

In [None]:
# Apply both header mappings with a single in-place rename. The two
# dictionaries share no keys, so merging them cannot shadow any entry.
second_part_df.rename(
    columns={**input_variables, **target_variables},
    inplace=True,
)

In [None]:
# One modelling frame per target variable: the target column plus every input
# column, indexed and ordered by 'DateTime', with incomplete rows removed.
feature_columns = list(input_variables.values())

datasets = {}
for target_variable in target_variables.values():
    datasets[target_variable] = (
        second_part_df[['DateTime', target_variable] + feature_columns]
        .copy()
        .set_index('DateTime')
        .sort_index()
        .dropna()
    )

In [None]:
from sklearn.preprocessing import MinMaxScaler 

# Turn every per-target DataFrame in `datasets` into an (X, y) pair:
#   * X: the input columns, min-max scaled;
#   * y: the target column, transformed with log1p.
# The result replaces `datasets`; `lstm_datasets` receives the same pairs.
#
# NOTE(review): this cell rebinds `datasets` from DataFrames to tuples, so
# running it a second time fails on `df[...]`; re-run from the cell that builds
# `datasets` instead.

# NOTE(review): the scaler is re-fitted on the complete series of each target,
# i.e. before the chronological train/test split performed later, so the
# min/max of the held-out rows influence the scaling of the training rows.
scaler = MinMaxScaler()

# Settings for the optional feature extension; they are only referenced by the
# commented-out `extend_features` call below.
lags_in_hours = 3
shifts_in_indexes = int(0.25 * 4 * lags_in_hours)
rolling_window_in_hours = 6
rolling_window = int(0.25 * 4 * rolling_window_in_hours)
polyn_degree = 2

# Work on a shallow copy so that keys can be removed/added while iterating
# over the original dictionary.
ds = datasets.copy()
lstm_datasets = {}

for target_variable, df in datasets.items():
    ds[target_variable] = df[list(input_variables.values())].copy(), df[target_variable].copy()
    
    X = ds[target_variable][0]
    # fit_transform returns a plain array; wrap it to keep column names and index
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    
    # Swap the two lines below to train on the extended feature set instead.
    # `extend_features` is not defined in the visible part of this notebook.
    # X_extended = extend_features(X, lags_in_hours, rolling_window, polyn_degree)
    X_extended = X
    
    y = ds[target_variable][1]
    
    # Model log1p(target) to avoid instability; every RMSE computed downstream
    # is therefore expressed in log1p units.
    y = np.log1p(y)
    
    # Replace '/' in the key: the name is later embedded in SQLite file paths.
    ds.pop(target_variable)
    
    target_variable = target_variable.replace("/", "_")
    
    ds[target_variable] = X_extended, y
    
    # The LSTM always receives the non-extended features (with the extension
    # disabled above, both dictionaries hold the very same objects).
    lstm_datasets[target_variable] = X, y
    
datasets = ds

In [None]:
# Sanity check: report how many missing values remain in the inputs and in the
# target of every dataset.
for target_variable, (X, y) in datasets.items():
    missing_in_X = X.isna().sum().sum()
    missing_in_y = y.isna().sum().sum()

    print(f"Target variable: {target_variable}")
    print(f"Number of nan values in X: {missing_in_X}")
    print(f"Number of nan values in y: {missing_in_y}")
    print("-"*100)
    
    

In [None]:
from itertools import combinations

# Every non-empty subset of the input variables, ordered by subset size and,
# within a size, in the order produced by itertools.combinations.
all_features = list(input_variables.values())

feature_combinations = [
    subset
    for subset_size in range(1, len(all_features) + 1)
    for subset in combinations(all_features, subset_size)
]

In [None]:
# DataFrame column selection needs lists, not tuples.
feature_combinations = list(map(list, feature_combinations))

# Modelling

We are going to train different models:
- Linear Regression
- XGBoost
- LightGBM
- QRNN
- LSTM

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit

In [None]:
# Number of TimeSeriesSplit folds used by the cross-validation loops below.
n_folds = 5

In [None]:
# Chronological 80/20 hold-out per target (shuffle=False keeps the time order).
# Every (X, y) entry of `datasets` is replaced by a 4-tuple
# (X_train, X_test, y_train, y_test).
for target_variable in list(datasets):
    X, y = datasets[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        shuffle=False,
    )

    datasets[target_variable] = X_train, X_test, y_train, y_test

In [None]:
# Same sanity check as before, restricted to the training portion.
for target_variable, (X_train, X_test, y_train, y_test) in datasets.items():
    missing_in_X = X_train.isna().sum().sum()
    missing_in_y = y_train.isna().sum().sum()

    print(f"Target variable: {target_variable}")
    print(f"Number of nan values in X: {missing_in_X}")
    print(f"Number of nan values in y: {missing_in_y}")
    print("-"*100)
    

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Rank the input variables by their mutual information with one target,
# using the training portion only.
target_variable = 'HNAC (1_mL)'
X_train, X_test, y_train, y_test = datasets[target_variable]

# The estimator adds small random noise to continuous features, so the
# random_state is pinned to make the ranking repeatable between runs.
mi_scores = mutual_info_regression(X_train, y_train, random_state=seed)

mi_scores = pd.Series(mi_scores, index=X_train.columns)

ax = mi_scores.sort_values(ascending=False).plot.bar(figsize=(10, 6))
ax.set_title(f"Mutual information with {target_variable}")
ax.set_ylabel("Mutual information")

plt.show()

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Cross-validated RMSE of an ordinary least-squares model for every target and
# every feature subset. Results are keyed by str(feature_combination).
lr_results = {}

cv = TimeSeriesSplit(n_splits=n_folds)

for target_variable, (X_train, X_test, y_train, y_test) in datasets.items():
    X_cv, y_cv = X_train.copy(), y_train.copy()

    scores_by_combination = {}
    lr_results[target_variable] = scores_by_combination

    for feature_combination in feature_combinations:
        X_comb = X_cv[feature_combination]

        # one RMSE per expanding-window fold
        fold_rmse = []
        for train_index, test_index in cv.split(X_comb, y_cv):
            model = LinearRegression()
            model.fit(X_comb.iloc[train_index], y_cv.iloc[train_index])

            y_pred_cv = model.predict(X_comb.iloc[test_index])
            fold_rmse.append(
                np.sqrt(mean_squared_error(y_cv.iloc[test_index], y_pred_cv))
            )

        scores_by_combination[str(feature_combination)] = {
            "mean_cv_rmse": np.mean(fold_rmse),
        }

## XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
def fit_and_validate_xgb_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Train an XGBRegressor on one fold and return its validation RMSE.

    Args:
        X: Feature DataFrame, sliced positionally with ``.iloc``.
        y: Target Series aligned with ``X``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Keyword arguments forwarded to ``XGBRegressor``.

    Returns:
        Root mean squared error of the predictions on the validation rows.
    """
    y_val = y.iloc[val_index]

    model = XGBRegressor(random_state=seed, **params)
    model.fit(X.iloc[train_index, :], y.iloc[train_index])

    y_val_pred = model.predict(X.iloc[val_index, :])

    # Dump both vectors when either holds a NaN, to help diagnose the failure
    # of the metric computed right after.
    found_nan = np.isnan(y_val).any() or np.isnan(y_val_pred).any()
    if found_nan:
        print(f"y_val: {y_val}")
        print(f"y_val_pred: {y_val_pred}")

    return np.sqrt(mean_squared_error(y_val.values, y_val_pred))

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for the XGBoost model with the linear booster.

    Samples one hyper-parameter configuration from ``trial`` and scores it with
    a ``TimeSeriesSplit`` cross-validation over ``n_folds`` folds.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_cv: Feature DataFrame used for cross-validation.
        y_cv: Target Series aligned with ``X_cv``.

    Returns:
        Mean validation RMSE over the folds (to be minimized).
    """
    # Sampling order is kept fixed so the search-space definition is stable.
    # NOTE(review): both "eta" and "learning_rate" are sampled and forwarded;
    # XGBoost documents eta as an alias of learning_rate, so confirm which of
    # the two values actually takes effect.
    sampled = {
        "eta": trial.suggest_float("eta", 1e-5, 1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1, log=True),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-5, 1e-1, log=True
        ),
        "n_estimators": trial.suggest_int("n_estimators", 10, 500),
        "updater": trial.suggest_categorical(
            "updater", ["shotgun", "coord_descent"]
        ),
    }

    params = {
        "objective": "reg:squarederror",
        "booster": "gblinear",
        "eta": sampled["eta"],
        "reg_lambda": sampled["reg_lambda"],
        "reg_alpha": sampled["reg_alpha"],
        "learning_rate": sampled["learning_rate"],
        "updater": sampled["updater"],
        "n_estimators": sampled["n_estimators"],
        "eval_metric": "rmse",
    }

    splitter = TimeSeriesSplit(n_splits=n_folds)
    fold_rmse = [
        fit_and_validate_xgb_model(X_cv, y_cv, fold_train, fold_val, params)
        for fold_train, fold_val in splitter.split(X_cv, y_cv)
    ]

    # The per-fold values could be stored on the trial with
    # trial.set_user_attr("split_rmse", ...); this is currently disabled.
    return np.mean(fold_rmse)

In [None]:
# Tune (or reload) one Optuna study per feature combination for XGBoost.
# Results are stored as xgb_studies[target_variable][str(feature_combination)].
xgb_studies = {}

# SQLite creates a missing database file but not a missing parent directory.
os.makedirs(feltre_sqlites_folder, exist_ok=True)

for target_variable, (X_train, _, y_train, _) in datasets.items():

    # only this target is tuned
    if target_variable != "HNAC (1_mL)":
        continue

    xgb_studies[target_variable] = {}

    for feature_combination in tqdm_notebook(feature_combinations, desc='Feature combination'):

        X_train_comb = X_train[feature_combination]

        # One database file per (target, feature combination); '/' is replaced
        # because it would otherwise be read as a path separator.
        path = f"{feltre_sqlites_folder}/XGBoost - {target_variable}" + str(feature_combination).replace('/', '_') + ".sqlite3"
        storage_path = "sqlite:///" + path
        study_name = "Hyperparameter Tuning - XGBoost - " + target_variable + str(feature_combination)

        if os.path.exists(path):

            # An existing file is treated as a finished study and only reloaded.
            # NOTE(review): a run interrupted mid-optimization also leaves a
            # file behind, and that partial study is reloaded as-is.
            study = optuna.load_study(
                study_name=study_name,
                storage=storage_path,
            )

        else:

            study = optuna.create_study(
                direction="minimize",
                storage=storage_path,
                study_name=study_name,
                load_if_exists=True,
            )

            print(f"Optimizing XGBoost for {target_variable} with {feature_combination}")

            # NOTE(review): `objective` is redefined by later cells for other
            # models; this cell must run right after the XGBoost definition.
            study.optimize(lambda trial: objective(trial, X_train_comb, y_train), n_trials=100, show_progress_bar=False, )

        xgb_studies[target_variable][str(feature_combination)] = study

In [None]:
# Rank, for every cluster except 'cluster_1', the studies of each model family
# by their best objective value (ascending, i.e. lowest value first).
# NOTE(review): `clusters`, `pls_studies` and `svr_studies` are not defined in
# the visible part of this notebook, and `qrnn_studies` is only created in a
# later cell; `xgb_studies` is keyed by target variable, not by cluster name.
# Confirm where these names come from before relying on this cell.
best_studies = {}

for cluster_name in clusters.keys():
    
    if cluster_name == 'cluster_1':
        continue
    
    best_studies[cluster_name] = {}
    
    # Each entry is a list of (key, study) pairs sorted by study.best_value.
    pls_cluster = pls_studies[cluster_name]
    pls_sorted = sorted(pls_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['pls'] = pls_sorted
    
    svr_cluster = svr_studies[cluster_name]
    svr_sorted = sorted(svr_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['svr'] = svr_sorted
    
    qrnn_cluster = qrnn_studies[cluster_name]
    qrnn_sorted = sorted(qrnn_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['qrnn'] = qrnn_sorted
    
    xgb_cluster = xgb_studies[cluster_name]
    xgb_sorted = sorted(xgb_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['xgb'] = xgb_sorted

In [None]:
# Rank, for every cluster except 'cluster_1', the studies of each model family
# by their best objective value (ascending, i.e. lowest value first).
# NOTE(review): `clusters`, `pls_studies` and `svr_studies` are not defined in
# the visible part of this notebook, and `qrnn_studies` is only created in a
# later cell; `xgb_studies` is keyed by target variable, not by cluster name.
# Confirm where these names come from before relying on this cell.
best_studies = {}

for cluster_name in clusters.keys():
    
    if cluster_name == 'cluster_1':
        continue
    
    best_studies[cluster_name] = {}
    
    # Each entry is a list of (key, study) pairs sorted by study.best_value.
    pls_cluster = pls_studies[cluster_name]
    pls_sorted = sorted(pls_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['pls'] = pls_sorted
    
    svr_cluster = svr_studies[cluster_name]
    svr_sorted = sorted(svr_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['svr'] = svr_sorted
    
    qrnn_cluster = qrnn_studies[cluster_name]
    qrnn_sorted = sorted(qrnn_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['qrnn'] = qrnn_sorted
    
    xgb_cluster = xgb_studies[cluster_name]
    xgb_sorted = sorted(xgb_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['xgb'] = xgb_sorted

In [None]:
# Rank, for every cluster except 'cluster_1', the studies of each model family
# by their best objective value (ascending, i.e. lowest value first).
# NOTE(review): `clusters`, `pls_studies` and `svr_studies` are not defined in
# the visible part of this notebook, and `qrnn_studies` is only created in a
# later cell; `xgb_studies` is keyed by target variable, not by cluster name.
# Confirm where these names come from before relying on this cell.
best_studies = {}

for cluster_name in clusters.keys():
    
    if cluster_name == 'cluster_1':
        continue
    
    best_studies[cluster_name] = {}
    
    # Each entry is a list of (key, study) pairs sorted by study.best_value.
    pls_cluster = pls_studies[cluster_name]
    pls_sorted = sorted(pls_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['pls'] = pls_sorted
    
    svr_cluster = svr_studies[cluster_name]
    svr_sorted = sorted(svr_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['svr'] = svr_sorted
    
    qrnn_cluster = qrnn_studies[cluster_name]
    qrnn_sorted = sorted(qrnn_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['qrnn'] = qrnn_sorted
    
    xgb_cluster = xgb_studies[cluster_name]
    xgb_sorted = sorted(xgb_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['xgb'] = xgb_sorted

In [None]:
# Rank, for every cluster except 'cluster_1', the studies of each model family
# by their best objective value (ascending, i.e. lowest value first).
# NOTE(review): `clusters`, `pls_studies` and `svr_studies` are not defined in
# the visible part of this notebook, and `qrnn_studies` is only created in a
# later cell; `xgb_studies` is keyed by target variable, not by cluster name.
# Confirm where these names come from before relying on this cell.
best_studies = {}

for cluster_name in clusters.keys():
    
    if cluster_name == 'cluster_1':
        continue
    
    best_studies[cluster_name] = {}
    
    # Each entry is a list of (key, study) pairs sorted by study.best_value.
    pls_cluster = pls_studies[cluster_name]
    pls_sorted = sorted(pls_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['pls'] = pls_sorted
    
    svr_cluster = svr_studies[cluster_name]
    svr_sorted = sorted(svr_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['svr'] = svr_sorted
    
    qrnn_cluster = qrnn_studies[cluster_name]
    qrnn_sorted = sorted(qrnn_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['qrnn'] = qrnn_sorted
    
    xgb_cluster = xgb_studies[cluster_name]
    xgb_sorted = sorted(xgb_cluster.items(), key=lambda x: x[1].best_value)
    best_studies[cluster_name]['xgb'] = xgb_sorted

In [None]:
# Print, per cluster (skipping 'cluster_1'), the five best-ranked studies of
# every model family, in the order PLS, SVR, XGB, QRNN.
for cluster_name in clusters.keys():

    if cluster_name == 'cluster_1':
        continue

    print(f'Cluster {cluster_name}')
    print('Best models:')

    for section_title, model_key in (
        ('PLS', 'pls'),
        ('SVR', 'svr'),
        ('XGB', 'xgb'),
        ('QRNN', 'qrnn'),
    ):
        print('=='*50)
        print(section_title)
        print('=='*50)

        ranked = best_studies[cluster_name][model_key]
        # the five best configurations of this model
        for i in range(5):
            print('Parameters:', ranked[i][0])
            print('Best Value:', np.round(ranked[i][1].best_value, 3))
            print()

    print('=='*50)

In [None]:
# For cluster 0, the best model is XGBoost with the following variables:
# ['Conductivity (uS/cm)', 'TOC (mg/L)', 'Temperature (°C)']

# For cluster 1, the best model is XGBoost with the following variables:
# ['Color (CU)', 'pH', 'Conductivity (uS/cm)', 'Temperature (°C)']

In [None]:
# Print, per cluster (skipping 'cluster_1'), the five best-ranked studies of
# every model family, in the order PLS, SVR, XGB, QRNN.
for cluster_name in clusters.keys():

    if cluster_name == 'cluster_1':
        continue

    print(f'Cluster {cluster_name}')
    print('Best models:')

    for section_title, model_key in (
        ('PLS', 'pls'),
        ('SVR', 'svr'),
        ('XGB', 'xgb'),
        ('QRNN', 'qrnn'),
    ):
        print('=='*50)
        print(section_title)
        print('=='*50)

        ranked = best_studies[cluster_name][model_key]
        # the five best configurations of this model
        for i in range(5):
            print('Parameters:', ranked[i][0])
            print('Best Value:', np.round(ranked[i][1].best_value, 3))
            print()

    print('=='*50)

In [None]:
# For cluster 0, the best model is XGBoost with the following variables:
# ['Conductivity (uS/cm)', 'TOC (mg/L)', 'Temperature (°C)']

# For cluster 1, the best model is XGBoost with the following variables:
# ['Color (CU)', 'pH', 'Conductivity (uS/cm)', 'Temperature (°C)']

In [None]:
# Print, per cluster (skipping 'cluster_1'), the five best-ranked studies of
# every model family, in the order PLS, SVR, XGB, QRNN.
for cluster_name in clusters.keys():

    if cluster_name == 'cluster_1':
        continue

    print(f'Cluster {cluster_name}')
    print('Best models:')

    for section_title, model_key in (
        ('PLS', 'pls'),
        ('SVR', 'svr'),
        ('XGB', 'xgb'),
        ('QRNN', 'qrnn'),
    ):
        print('=='*50)
        print(section_title)
        print('=='*50)

        ranked = best_studies[cluster_name][model_key]
        # the five best configurations of this model
        for i in range(5):
            print('Parameters:', ranked[i][0])
            print('Best Value:', np.round(ranked[i][1].best_value, 3))
            print()

    print('=='*50)

In [None]:
# For cluster 0, the best model is XGBoost with the following variables:
# ['Conductivity (uS/cm)', 'TOC (mg/L)', 'Temperature (°C)']

# For cluster 1, the best model is XGBoost with the following variables:
# ['Color (CU)', 'pH', 'Conductivity (uS/cm)', 'Temperature (°C)']

In [None]:
# Print, per cluster (skipping 'cluster_1'), the five best-ranked studies of
# every model family, in the order PLS, SVR, XGB, QRNN.
for cluster_name in clusters.keys():

    if cluster_name == 'cluster_1':
        continue

    print(f'Cluster {cluster_name}')
    print('Best models:')

    for section_title, model_key in (
        ('PLS', 'pls'),
        ('SVR', 'svr'),
        ('XGB', 'xgb'),
        ('QRNN', 'qrnn'),
    ):
        print('=='*50)
        print(section_title)
        print('=='*50)

        ranked = best_studies[cluster_name][model_key]
        # the five best configurations of this model
        for i in range(5):
            print('Parameters:', ranked[i][0])
            print('Best Value:', np.round(ranked[i][1].best_value, 3))
            print()

    print('=='*50)

In [None]:
# For cluster 0, the best model is XGBoost with the following variables:
# ['Conductivity (uS/cm)', 'TOC (mg/L)', 'Temperature (°C)']

# For cluster 1, the best model is XGBoost with the following variables:
# ['Color (CU)', 'pH', 'Conductivity (uS/cm)', 'Temperature (°C)']

## LightGBM

In [None]:
from lightgbm import LGBMRegressor

In [None]:
def fit_and_validate_lgbm_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Train an LGBMRegressor on one fold and return its validation RMSE.

    The model is created with ``objective="regression"``, ``linear_tree=True``
    and the notebook seed; ``params`` (when given) is applied on top of that.

    Args:
        X: Feature DataFrame, sliced positionally with ``.iloc``.
        y: Target Series aligned with ``X``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Dict of estimator parameters, or ``None`` to keep the defaults.

    Returns:
        Root mean squared error of the predictions on the validation rows.
    """
    base_settings = dict(
        objective="regression",
        random_state=seed,
        linear_tree=True,
    )
    model = LGBMRegressor(**base_settings)

    # overrides are applied after construction, so they may replace the
    # base settings as well
    if params is not None:
        model.set_params(**params)

    model.fit(X.iloc[train_index, :], y.iloc[train_index])

    y_val = y.iloc[val_index]
    y_val_pred = model.predict(X.iloc[val_index, :])

    return np.sqrt(mean_squared_error(y_val.values, y_val_pred))

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for the LightGBM model.

    Samples one hyper-parameter configuration from ``trial`` and scores it with
    a ``TimeSeriesSplit`` cross-validation over ``n_folds`` folds, the same
    fold count used by the other objectives in this notebook.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_cv: Feature DataFrame used for cross-validation.
        y_cv: Target Series aligned with ``X_cv``.

    Returns:
        Mean validation RMSE over the folds (to be minimized). The per-fold
        values are stored on the trial under the "split_rmse" user attribute.
    """
    # NOTE(review): LightGBM documents "subsample" as an alias of
    # "bagging_fraction" and "min_child_samples" as an alias of
    # "min_data_in_leaf"; both members of each pair are sampled here, so
    # confirm which value actually takes effect.
    config = {
        "n_estimators": trial.suggest_int(
            "n_estimators", 1, 20, step=1
        ),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-5, 1e-1, log=True
        ),
        "max_depth": trial.suggest_int("max_depth", 2, 16, step=1),
        "num_leaves": trial.suggest_int("num_leaves", 2, 20, step=1),
        "min_data_in_leaf": trial.suggest_int(
            "min_data_in_leaf", 2, 50, step=1
        ),
        "lambda_l1": trial.suggest_float(
            "lambda_l1", 1e-3, 10, log=True
        ),
        "lambda_l2": trial.suggest_float(
            "lambda_l2", 1e-3, 10, log=True
        ),
        "min_split_gain": trial.suggest_float(
            "min_split_gain", 0, 15, step=0.5
        ),
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 1e-3, 1, log=True
        ),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 1e-3, 1, log=True
        ),
        "min_child_samples": trial.suggest_int(
            "min_child_samples", 20, 1000, log=True
        ),
        "max_bin": trial.suggest_int("max_bin", 10, 500, step=10),
    }

    # Use the notebook-wide fold count instead of a local hard-coded value.
    cv = TimeSeriesSplit(n_splits=n_folds)
    cv_rmse = [None] * n_folds
    for i, (train_index, test_index) in enumerate(
        cv.split(X_cv, y_cv)
    ):
        cv_rmse[i] = fit_and_validate_lgbm_model(
            X_cv,
            y_cv,
            train_index,
            test_index,
            config,
        )

    # Keep the individual fold metrics with the trial
    # (comment this line out if you do not want them stored).
    trial.set_user_attr("split_rmse", cv_rmse)

    return np.mean(cv_rmse)

In [None]:
# Tune (or reload) one Optuna study per feature combination for LightGBM.
# Results are stored as lgbm_studies[target_variable][str(feature_combination)].
lgbm_studies = {}

# SQLite creates a missing database file but not a missing parent directory.
os.makedirs(feltre_sqlites_folder, exist_ok=True)

for target_variable, (X_train, _, y_train, _) in datasets.items():

    # only this target is tuned
    if target_variable != "HNAC (1_mL)":
        continue

    lgbm_studies[target_variable] = {}

    for feature_combination in tqdm_notebook(feature_combinations, desc='Feature combination'):

        X_train_comb = X_train[feature_combination]

        # One database file per (target, feature combination); '/' is replaced
        # because it would otherwise be read as a path separator.
        path = f"{feltre_sqlites_folder}/LGBM - {target_variable}" + str(feature_combination).replace('/', '_') + ".sqlite3"
        storage_path = "sqlite:///" + path
        study_name = "Hyperparameter Tuning - LGBM - " + target_variable + str(feature_combination)

        if os.path.exists(path):

            # An existing file is treated as a finished study and only reloaded.
            # NOTE(review): a run interrupted mid-optimization also leaves a
            # file behind, and that partial study is reloaded as-is.
            study = optuna.load_study(
                study_name=study_name,
                storage=storage_path,
            )

        else:

            study = optuna.create_study(
                direction="minimize",
                storage=storage_path,
                study_name=study_name,
                load_if_exists=True,
            )

            print(f"Optimizing LGBM for {target_variable} with {feature_combination}")

            # NOTE(review): `objective` is redefined by other cells for other
            # models; this cell must run right after the LightGBM definition.
            study.optimize(lambda trial: objective(trial, X_train_comb, y_train), n_trials=100, show_progress_bar=False, )

        lgbm_studies[target_variable][str(feature_combination)] = study

## QRNN

In [None]:
from quantnn.qrnn import QRNN

In [None]:
# 50 evenly spaced quantile levels between 0.01 and 0.99, passed to every QRNN
# built by fit_and_validate_qrnn_model.
quantiles = np.linspace(0.01, 0.99, 50)

def fit_and_validate_qrnn_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Train a QRNN on one fold and return its validation RMSE.

    Args:
        X: Feature DataFrame, sliced positionally with ``.iloc``.
        y: Target Series aligned with ``X``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Dict with the keys "n_layers", "n_units", "activation" and
            "batch_size".

    Returns:
        Root mean squared error between the validation targets and the mean of
        the predicted quantiles along the last axis.
    """
    X_tr = X.iloc[train_index].to_numpy()
    X_val = X.iloc[val_index].to_numpy()
    y_tr = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # (depth, width, activation) description of the fully connected network
    architecture = (params["n_layers"], params["n_units"], params["activation"])
    model = QRNN(
        n_inputs=X_tr.shape[1],
        quantiles=quantiles,
        model=architecture,
    )

    n_epochs = 50
    optimizer = torch.optim.AdamW(model.model.parameters())
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    model.train(
        training_data=(X_tr, np.array(y_tr)),
        validation_data=(X_val, np.array(y_val)),
        optimizer=optimizer,
        scheduler=scheduler,
        n_epochs=n_epochs,
        device="cpu",
        batch_size=params["batch_size"],
        logger=None,
    )

    with torch.no_grad():
        quantile_predictions = model.predict(X_val).numpy()

    # collapse the quantile axis into a single point prediction per sample
    point_predictions = quantile_predictions.mean(axis=-1)

    return np.sqrt(mean_squared_error(y_val.values, point_predictions))

In [None]:
# Activation names offered to the QRNN hyper-parameter search.
activations = (
    "elu hardshrink hardtanh prelu relu "
    "selu celu sigmoid softplus softmin"
).split()

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for the QRNN model.

    Samples the network depth, width, activation and batch size from ``trial``
    and scores the configuration with a ``TimeSeriesSplit`` cross-validation
    over ``n_folds`` folds.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_cv: Feature DataFrame used for cross-validation.
        y_cv: Target Series aligned with ``X_cv``.

    Returns:
        Mean validation RMSE over the folds (to be minimized).
    """
    config = {
        "n_layers": trial.suggest_int("n_layers", 1, 3),
        "n_units": trial.suggest_int("n_units", 32, 512, log=True),
        "activation": trial.suggest_categorical("activation", activations),
        "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64, 128]),
    }

    splitter = TimeSeriesSplit(n_splits=n_folds)
    fold_rmse = [
        fit_and_validate_qrnn_model(X_cv, y_cv, fold_train, fold_val, config)
        for fold_train, fold_val in splitter.split(X_cv, y_cv)
    ]

    # The per-fold values could be stored on the trial with
    # trial.set_user_attr("split_rmse", ...); this is currently disabled.
    return np.mean(fold_rmse)

In [None]:
# Tune (or reload) one Optuna study per feature combination for the QRNN.
# Results are stored as qrnn_studies[target_variable][str(feature_combination)].
qrnn_studies = {}

# SQLite creates a missing database file but not a missing parent directory.
os.makedirs(feltre_sqlites_folder, exist_ok=True)

for target_variable, (X_train, _, y_train, _) in datasets.items():

    # only this target is tuned
    if target_variable != "HNAC (1_mL)":
        continue

    qrnn_studies[target_variable] = {}

    for feature_combination in tqdm_notebook(feature_combinations, desc='Feature combination'):

        X_train_comb = X_train[feature_combination]

        # One database file per (target, feature combination); '/' is replaced
        # because it would otherwise be read as a path separator.
        path = f"{feltre_sqlites_folder}/QRNN - {target_variable}" + str(feature_combination).replace('/', '_') + ".sqlite3"
        storage_path = "sqlite:///" + path
        study_name = "Hyperparameter Tuning - QRNN - " + target_variable + str(feature_combination)

        if os.path.exists(path):

            # An existing file is treated as a finished study and only reloaded.
            # NOTE(review): a run interrupted mid-optimization also leaves a
            # file behind, and that partial study is reloaded as-is.
            study = optuna.load_study(
                study_name=study_name,
                storage=storage_path,
            )

        else:

            study = optuna.create_study(
                direction="minimize",
                storage=storage_path,
                study_name=study_name,
                load_if_exists=True,
            )

            print(f"Optimizing QRNN for {target_variable} with {feature_combination}")

            # NOTE(review): `objective` is redefined by other cells for other
            # models; this cell must run right after the QRNN definition.
            study.optimize(lambda trial: objective(trial, X_train_comb, y_train), n_trials=100, show_progress_bar=False, )

        # Must stay inside the inner loop: one entry per feature combination.
        # When placed after the loop only the last study was kept.
        qrnn_studies[target_variable][str(feature_combination)] = study

## LSTM

In [None]:
# Set before TensorFlow is imported in the next cell.
# NOTE(review): presumably silences TensorFlow's native INFO and WARNING log
# lines - confirm against the TensorFlow documentation.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, GRU, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping

import tensorboard

In [None]:
# since the LSTM model takes as input a tensor of shape (num_samples, time_steps, n_features)
# we need to convert the pandas dataframe into a numpy array of shape (num_samples, time_steps, n_features)
# each sample is a sequence of window_size time steps, containing the features and the target variable
def create_sequences(X_df, y_df, window_size):
    """Slice a feature frame and its target into overlapping windows.

    Window ``k`` covers rows ``k`` to ``k + window_size - 1`` of ``X_df`` and is
    paired with the target value and the index label of its last row. Windows
    start at rows ``0 .. len(X_df) - window_size - 1``, so the window that
    would end on the very last row is not produced.

    Args:
        X_df: Feature DataFrame; its index provides the timestamps.
        y_df: Target values aligned row by row with ``X_df``.
        window_size: Number of consecutive rows in every window.

    Returns:
        X_seq: Array of shape (len(X_df) - window_size, window_size, n_features).
        y_seq: Array with one target value per window.
        y_timestamps: Array with the index label of each window's last row.
        When no window fits, all three arrays are empty.
    """
    index_labels = X_df.index
    features = X_df.to_numpy()
    targets = y_df.to_numpy()

    window_starts = range(len(features) - window_size)
    # position of the last row of every window
    last_rows = [start + window_size - 1 for start in window_starts]

    X_seq = [features[start : start + window_size] for start in window_starts]
    y_seq = [targets[row] for row in last_rows]
    y_timestamps = [index_labels[row] for row in last_rows]

    return np.array(X_seq), np.array(y_seq), np.array(y_timestamps)


In [None]:
def fit_and_validate_lstm_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Train a single-layer LSTM regressor on one fold and return its RMSE.

    The network is Input -> LSTM -> Dropout -> Dense -> Dense(1), trained with
    Adam on the mean squared error for at most 100 epochs, with early stopping
    on the validation loss (patience 40, best weights restored).

    Args:
        X: Array of input windows, indexed along the first axis.
        y: Array of targets aligned with ``X``.
        train_index: Indices of the training samples.
        val_index: Indices of the validation samples.
        params: Dict with the keys "window_size", "n_units_1", "dropout_1",
            "n_neurons", "learning_rate" and "batch_size".

    Returns:
        Root mean squared error of the predictions on the validation samples.
    """
    X_tr, y_tr = X[train_index], y[train_index]
    X_val, y_val = X[val_index], y[val_index]

    # A second LSTM + Dropout pair ("n_units_2" / "dropout_2") is currently
    # not part of the architecture.
    layers = [
        Input(shape=(params["window_size"], X_tr.shape[-1])),
        LSTM(units=params["n_units_1"], return_sequences=False, seed=seed),
        Dropout(params["dropout_1"], seed=seed),
        Dense(params["n_neurons"]),
        Dense(1),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        optimizer=Adam(learning_rate=params["learning_rate"]),
        loss=MeanSquaredError(),
        metrics=[RootMeanSquaredError()],
    )

    # NOTE(review): the fold's validation data also drives early stopping, so
    # the returned score is measured on data used to pick the weights.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=40,
        restore_best_weights=True,
    )

    model.fit(
        X_tr,
        y_tr,
        epochs=100,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=0,
        batch_size=params["batch_size"],
    )

    # drop the trailing unit axis of the network output
    y_val_pred = np.squeeze(model.predict(X_val))

    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective: mean validation RMSE of the LSTM over time-ordered folds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_cv: Input features (pandas object, indexed by timestamp).
        y_cv: Target values aligned with ``X_cv``.

    Returns:
        Average RMSE over the folds of a 5-split ``TimeSeriesSplit``.
    """
    # Search space. The sampling order is kept fixed: n_units_1, n_neurons,
    # learning_rate, window_size, batch_size, dropout_1.
    hyperparams = {}
    hyperparams["n_units_1"] = trial.suggest_int("n_units_1", low=16, high=64, step=1)
    # hyperparams["n_units_2"] = trial.suggest_int("n_units_2", low=16, high=64, step=1)
    hyperparams["n_neurons"] = trial.suggest_int("n_neurons", low=16, high=64, step=1)
    hyperparams["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 1e-1)
    hyperparams["window_size"] = trial.suggest_int("window_size", 1, 24, step=1)
    hyperparams["batch_size"] = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256])
    hyperparams["dropout_1"] = trial.suggest_float("dropout_1", 0.1, 0.5)
    # hyperparams["dropout_2"] = trial.suggest_float("dropout_2", 0.1, 0.5)

    # Unshuffled split: only the first 80% of the rows take part in the tuning.
    X_dev, _, y_dev, _ = train_test_split(
        X_cv, y_cv, test_size=0.2, shuffle=False, random_state=seed
    )

    X_windows, y_windows, _ = create_sequences(X_dev, y_dev, hyperparams["window_size"])

    splitter = TimeSeriesSplit(n_splits=5)
    fold_rmse = [
        fit_and_validate_lstm_model(X_windows, y_windows, fit_idx, holdout_idx, hyperparams)
        for fit_idx, holdout_idx in splitter.split(X_windows, y_windows)
    ]

    # Optional: keep the per-fold metrics on the trial.
    # trial.set_user_attr("split_rmse", fold_rmse)

    return np.mean(fold_rmse)

In [None]:
# Tune (or reload) one Optuna study per feature combination for the LSTM.
lstm_studies = {}

# The SQLite files are created on demand, so their folder has to exist first.
os.makedirs(feltre_sqlites_folder, exist_ok=True)

for target_variable, (X, y) in lstm_datasets.items():

    # Only this target is tuned here; every other target is skipped.
    if target_variable != 'HNAC (1_mL)':
        continue

    lstm_studies[target_variable] = {}

    for feature_combination in tqdm_notebook(feature_combinations, desc='Feature combination'):

        # Restrict the inputs to the columns of the current combination.
        X_train_comb = X[feature_combination]

        path = f"{feltre_sqlites_folder}/LSTM - {target_variable}" + str(feature_combination).replace('/', '_') + ".sqlite3"
        storage_path = "sqlite:///" + path
        study_name = "Hyperparameter Tuning - LSTM - " + target_variable + str(feature_combination)

        if os.path.exists(path):
            # A study file already exists for this combination: reuse it instead of re-tuning.
            study = optuna.load_study(
                study_name=study_name,
                storage=storage_path,
            )
        else:
            study = optuna.create_study(
                direction="minimize",
                storage=storage_path,
                study_name=study_name,
                load_if_exists=True,
            )

            print(f"Optimizing LSTM for {target_variable} with {feature_combination}")
            # Pass the target of THIS dataset (`y`); the objective performs its own
            # train/hold-out split, so it needs inputs and target of equal length.
            study.optimize(lambda trial: objective(trial, X_train_comb, y), n_trials=100, show_progress_bar=True)

        # Studies are keyed by the string form of the feature combination.
        lstm_studies[target_variable][str(feature_combination)] = study

# Compare studies results

In [None]:
# Rank, for every target and model family, the tuned studies by their best value.
def _rank_by_best_value(studies_by_combination):
    """Return the ``(key, study)`` pairs sorted by ascending ``study.best_value``."""
    return sorted(studies_by_combination.items(), key=lambda pair: pair[1].best_value)


best_studies = {}

for target_variable in datasets.keys():

    # NOTE(review): the LSTM tuning cell only tunes 'HNAC (1_mL)', which is the
    # target skipped here - confirm the direction of this filter.
    if target_variable == 'HNAC (1_mL)':
        continue

    best_studies[target_variable] = {}

    # Same treatment for every family: look up the studies of this target, then sort them.
    for family, family_studies in (
        ('xgb', xgb_studies),
        ('lgbm', lgbm_studies),
        ('qrnn', qrnn_studies),
        ('lstm', lstm_studies),
    ):
        best_studies[target_variable][family] = _rank_by_best_value(family_studies[target_variable])

In [None]:
# Print, per target, the best configurations of every model family.
n_best = 5  # number of configurations shown per family
separator = '==' * 50

for target_variable in datasets.keys():

    if target_variable == 'HNAC (1_mL)':
        continue

    print(f'Target Variable: {target_variable}')
    print('Best models:')

    for label, family in (('XGB', 'xgb'), ('LGBM', 'lgbm'), ('QRNN', 'qrnn'), ('LSTM', 'lstm')):
        print(separator)
        print(label)
        print(separator)
        # Each entry is a (key, study) pair, already sorted by ascending best value.
        # Slicing instead of indexing 0..4 tolerates families with fewer than
        # `n_best` studies, which would otherwise raise an IndexError.
        for configuration, study in best_studies[target_variable][family][:n_best]:
            print('Parameters:', configuration)
            print('Best Value:', np.round(study.best_value, 3))
            print()

    print(separator)

In [None]:
# For HNAC (1_mL), the best model is LSTM with the following variables:


# Prediction

In [None]:
# One (initially empty) result store per model family; filled per target variable.
predictions = {
    model_name: {}
    for model_name in ('LSTM', 'XGBoost', 'LGBM', 'QRNN', 'GRU', 'BI_LSTM')
}

In [None]:
for target_variable, (X, y) in lstm_datasets.items():

    if target_variable != 'HNAC (1_mL)':
        continue

    # ==== LSTM ====

    predictions['LSTM'][target_variable] = {}

    # The tuning cell stores one study per feature combination, keyed by the string
    # form of the combination. Use the study with the lowest objective value together
    # with the input columns it was tuned on. A single Study per target is also accepted.
    target_studies = lstm_studies[target_variable]
    if isinstance(target_studies, dict):
        best_key, best_study = min(target_studies.items(), key=lambda item: item[1].best_value)
        matching = [fc for fc in feature_combinations if str(fc) == best_key]
        if not matching:
            raise KeyError(f"No feature combination matches the study key {best_key}")
        X_model = X[matching[0]]
    else:
        best_study = target_studies
        X_model = X

    best_params = best_study.best_trial.params

    window_size = best_params["window_size"]
    n_units_1 = best_params["n_units_1"]
    n_neurons = best_params["n_neurons"]
    dropout_1 = best_params["dropout_1"]
    learning_rate = best_params["learning_rate"]
    batch_size = best_params["batch_size"]
    # The second recurrent layer is only built when the study actually tuned it,
    # so the fitted network has the same architecture as the tuned one.
    n_units_2 = best_params.get("n_units_2")
    dropout_2 = best_params.get("dropout_2")

    X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size=0.2, shuffle=False, random_state=seed)

    X_train_seq, y_train_seq, _ = create_sequences(X_train, y_train, window_size)

    # One input window per test timestep, ending on that timestep. The first
    # window_size - 1 windows take their history from the end of the training
    # rows, so predictions, y_test and X_test.index all have the same length.
    X_values = X_model.to_numpy()
    n_train = len(X_train)
    assert n_train >= window_size - 1, "not enough training rows to build the first test window"
    X_test_seq = np.array([
        X_values[end - window_size + 1 : end + 1]
        for end in range(n_train, len(X_values))
    ])

    # fit the model 50 times to get a better estimate of the predictions and the uncertainty
    n_iterations = 50

    y_pred_list = []

    for _ in range(n_iterations):

        model = Sequential()
        model.add(Input(shape=(window_size, X_train_seq.shape[-1])))
        # The first LSTM must return the full sequence only when another LSTM follows it.
        model.add(LSTM(units=n_units_1, return_sequences=n_units_2 is not None, seed=seed))
        model.add(Dropout(dropout_1, seed=seed))
        if n_units_2 is not None:
            model.add(LSTM(units=n_units_2, return_sequences=False, seed=seed))
            model.add(Dropout(dropout_2, seed=seed))
        model.add(Dense(n_neurons))
        model.add(Dense(1))
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss=MeanSquaredError(),
            metrics=[RootMeanSquaredError()],
        )

        # No validation data here, so early stopping watches the training loss.
        early_stopping = EarlyStopping(monitor='loss', patience=20, restore_best_weights=True)

        _ = model.fit(X_train_seq, y_train_seq, epochs=50, callbacks=[early_stopping], verbose=0, batch_size=batch_size)

        # reshape(-1) always yields a 1-D vector, even for a single test row.
        y_pred = np.asarray(model.predict(X_test_seq, verbose=0)).reshape(-1)

        y_pred_list.append(y_pred)

    # Timestamps are taken straight from the split indexes so they pair up with y_train / y_test.
    predictions['LSTM'][target_variable]["timesteps_test"] = X_test.index.to_numpy()
    predictions['LSTM'][target_variable]["timesteps_train"] = X_train.index.to_numpy()
    predictions['LSTM'][target_variable]["y_test"] = y_test
    predictions['LSTM'][target_variable]["y_train"] = y_train

    # Mean and spread across the repeated fits, per test timestep.
    mean_pred = np.mean(y_pred_list, axis=0)
    std_pred = np.std(y_pred_list, axis=0)

    predictions['LSTM'][target_variable]["mean_pred"] = mean_pred
    predictions['LSTM'][target_variable]["std_pred"] = std_pred

In [None]:
for target_variable, (X, y) in lstm_datasets.items():

    if target_variable != 'HNAC (1_mL)':
        continue
    # ==== GRU ====

    predictions['GRU'][target_variable] = {}

    # Every hyperparameter, including the window size, comes from the GRU study
    # (the window size must match the one the other GRU parameters were tuned with).
    best_params = gru_studies[target_variable].best_trial.params

    window_size = best_params["window_size"]
    n_units_1 = best_params["n_units_1"]
    n_neurons = best_params["n_neurons"]
    dropout_1 = best_params["dropout_1"]
    learning_rate = best_params["learning_rate"]
    batch_size = best_params["batch_size"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=seed)

    X_train_seq, y_train_seq, _ = create_sequences(X_train, y_train, window_size)

    # One input window per test timestep, ending on that timestep. The first
    # window_size - 1 windows take their history from the end of the training
    # rows, so predictions, y_test and X_test.index all have the same length.
    X_values = X.to_numpy()
    n_train = len(X_train)
    assert n_train >= window_size - 1, "not enough training rows to build the first test window"
    X_test_seq = np.array([
        X_values[end - window_size + 1 : end + 1]
        for end in range(n_train, len(X_values))
    ])

    # fit the model 50 times to get a better estimate of the predictions and the uncertainty
    n_iterations = 50

    y_pred_list = []

    for _ in range(n_iterations):

        model = Sequential()
        model.add(Input(shape=(window_size, X_train_seq.shape[-1])))
        model.add(GRU(units=n_units_1, return_sequences=False, seed=seed))
        model.add(Dropout(dropout_1))
        model.add(Dense(n_neurons))
        model.add(Dense(1))
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss=MeanSquaredError(),
            metrics=[RootMeanSquaredError()],
        )

        # No validation data here, so early stopping watches the training loss.
        early_stopping = EarlyStopping(monitor='loss', patience=20, restore_best_weights=True)

        _ = model.fit(X_train_seq, y_train_seq, epochs=50, callbacks=[early_stopping], verbose=0, batch_size=batch_size)

        # reshape(-1) always yields a 1-D vector, even for a single test row.
        y_pred = np.asarray(model.predict(X_test_seq, verbose=0)).reshape(-1)

        y_pred_list.append(y_pred)

    # Timestamps are taken straight from the split indexes so they pair up with y_train / y_test.
    predictions['GRU'][target_variable]["timesteps_test"] = X_test.index.to_numpy()
    predictions['GRU'][target_variable]["timesteps_train"] = X_train.index.to_numpy()
    predictions['GRU'][target_variable]["y_test"] = y_test
    predictions['GRU'][target_variable]["y_train"] = y_train

    # Mean and spread across the repeated fits, per test timestep.
    mean_pred = np.mean(y_pred_list, axis=0)
    std_pred = np.std(y_pred_list, axis=0)

    predictions['GRU'][target_variable]["mean_pred"] = mean_pred
    predictions['GRU'][target_variable]["std_pred"] = std_pred

In [None]:
for target_variable, (X, y) in lstm_datasets.items():

    if target_variable != 'HNAC (1_mL)':
        continue
    # ==== BIDIRECTIONAL LSTM ====

    predictions['BI_LSTM'][target_variable] = {}

    # Best hyperparameters found for this target by the bidirectional-LSTM study.
    best_params = bi_lstm_studies[target_variable].best_trial.params

    window_size = best_params["window_size"]
    n_units_1 = best_params["n_units_1"]
    n_neurons = best_params["n_neurons"]
    dropout_1 = best_params["dropout_1"]
    learning_rate = best_params["learning_rate"]
    batch_size = best_params["batch_size"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=seed)

    X_train_seq, y_train_seq, _ = create_sequences(X_train, y_train, window_size)

    # One input window per test timestep, ending on that timestep. The first
    # window_size - 1 windows take their history from the end of the training
    # rows, so predictions, y_test and X_test.index all have the same length.
    X_values = X.to_numpy()
    n_train = len(X_train)
    assert n_train >= window_size - 1, "not enough training rows to build the first test window"
    X_test_seq = np.array([
        X_values[end - window_size + 1 : end + 1]
        for end in range(n_train, len(X_values))
    ])

    # fit the model 50 times to get a better estimate of the predictions and the uncertainty
    n_iterations = 50

    y_pred_list = []

    for _ in range(n_iterations):

        model = Sequential()
        model.add(Input(shape=(window_size, X_train_seq.shape[-1])))
        model.add(Bidirectional(LSTM(units=n_units_1, return_sequences=False, seed=seed)))
        model.add(Dropout(dropout_1))
        model.add(Dense(n_neurons))
        model.add(Dense(1))
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss=MeanSquaredError(),
            metrics=[RootMeanSquaredError()],
        )

        # No validation data here, so early stopping watches the training loss.
        early_stopping = EarlyStopping(monitor='loss', patience=20, restore_best_weights=True)

        _ = model.fit(X_train_seq, y_train_seq, epochs=50, callbacks=[early_stopping], verbose=0, batch_size=batch_size)

        # reshape(-1) always yields a 1-D vector, even for a single test row.
        y_pred = np.asarray(model.predict(X_test_seq, verbose=0)).reshape(-1)

        y_pred_list.append(y_pred)

    # Timestamps are taken straight from the split indexes so they pair up with y_train / y_test.
    predictions['BI_LSTM'][target_variable]["timesteps_test"] = X_test.index.to_numpy()
    predictions['BI_LSTM'][target_variable]["timesteps_train"] = X_train.index.to_numpy()
    predictions['BI_LSTM'][target_variable]["y_test"] = y_test
    predictions['BI_LSTM'][target_variable]["y_train"] = y_train

    # Mean and spread across the repeated fits, per test timestep.
    mean_pred = np.mean(y_pred_list, axis=0)
    std_pred = np.std(y_pred_list, axis=0)

    predictions['BI_LSTM'][target_variable]["mean_pred"] = mean_pred
    predictions['BI_LSTM'][target_variable]["std_pred"] = std_pred

In [None]:
# LSTM PLOTS

def _add_prediction_with_band(fig, timesteps, mean, std, name, line_color, band_color):
    """Add a prediction line and its mean +/- 1.96 * std band to ``fig``.

    ``np.expm1`` is applied to the line and to both band edges before plotting.
    The lower edge is filled up to the upper edge ('tonexty'), so the order of
    the three traces matters.
    """
    fig.add_trace(go.Scatter(
        x=timesteps,
        y=np.expm1(mean),
        mode='lines',
        name=name,
        line=dict(color=line_color),
    ))
    fig.add_trace(go.Scatter(
        name='Upper Bound',
        x=timesteps,
        y=np.expm1(mean + 1.96 * std),
        mode='lines',
        line=dict(width=0),
        showlegend=False,
    ))
    fig.add_trace(go.Scatter(
        name='Lower Bound',
        x=timesteps,
        y=np.expm1(mean - 1.96 * std),
        mode='lines',
        line=dict(width=0),
        fillcolor=band_color,
        fill='tonexty',
        showlegend=False,
    ))


for target_variable in lstm_datasets.keys():

    if target_variable != 'HNAC (1_mL)':
        continue

    lstm_result = predictions['LSTM'][target_variable]

    timesteps_test = lstm_result["timesteps_test"]
    timesteps_train = lstm_result["timesteps_train"]
    y_train = lstm_result["y_train"]
    y_test = lstm_result["y_test"]

    y_pred_lstm = lstm_result["mean_pred"]
    std_pred_lstm = lstm_result["std_pred"]

    fig = go.Figure()

    # Observed values: the training part is hidden from the legend so that
    # 'True' appears only once.
    fig.add_trace(go.Scatter(
        name='True',
        x=timesteps_train,
        y=np.expm1(y_train),
        mode='lines',
        line=dict(color='blue'),
        showlegend=False,
    ))
    fig.add_trace(go.Scatter(
        name='True',
        x=timesteps_test,
        y=np.expm1(y_test),
        mode='lines',
        line=dict(color='blue'),
    ))

    # LSTM mean prediction with a light red band.
    _add_prediction_with_band(
        fig, timesteps_test, y_pred_lstm, std_pred_lstm,
        name='LSTM', line_color='red', band_color='rgba(255, 102, 102, 0.3)',
    )

    # Optional overlays, currently disabled:
    # gru_result = predictions['GRU'][target_variable]
    # _add_prediction_with_band(
    #     fig, timesteps_test, gru_result["mean_pred"], gru_result["std_pred"],
    #     name='GRU', line_color='green', band_color='rgba(102, 255, 102, 0.3)',
    # )
    # bi_lstm_result = predictions['BI_LSTM'][target_variable]
    # _add_prediction_with_band(
    #     fig, timesteps_test, bi_lstm_result["mean_pred"], bi_lstm_result["std_pred"],
    #     name='BI LSTM', line_color='orange', band_color='rgba(255, 204, 102, 0.3)',
    # )

    # Dataset keys use '_' where the displayed name has '/'.
    target_variable_name = target_variable.replace('_', '/')

    # fig.update_yaxes(type="log")
    fig.update_layout(
        title=dict(
            text=f"{target_variable_name}",
            y=0.98,
            x=0.5,
            xanchor='center',
            yanchor='top',
        ),
        xaxis_title="Time",
        yaxis_title=target_variable_name,
        margin=dict(l=0, r=10, t=30, b=0),
        font=dict(size=14),
        # put the legend at the top
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
    )

    # fig.write_image(os.path.join(plot_folder, f"LSTM - {target_variable}.png"), scale=3)

    fig.show()
    

In [None]:
# OTHER MODELS

for target_variable, _ in lstm_datasets.items():

    # Pre-split data of this target.
    X_train, X_test, y_train, y_test = datasets[target_variable]

    # ==== XGBoost ====

    predictions['XGBoost'][target_variable] = {}

    # Fixed settings first, then the tuned values copied from the best trial.
    tuned = xgb_studies[target_variable].best_trial.params
    params = {"objective": "reg:squarederror", "booster": "gblinear"}
    for tuned_name in ("eta", "reg_lambda", "reg_alpha", "learning_rate", "updater", "n_estimators"):
        params[tuned_name] = tuned[tuned_name]
    params["eval_metric"] = "rmse"

    model = XGBRegressor(random_state=seed, **params)
    _ = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    predictions['XGBoost'][target_variable].update(y_test=y_test, y_pred=y_pred)

In [None]:
for target_variable, _ in lstm_datasets.items():
    # ==== LGBM ====

    # Load the split of THIS target; otherwise every target would be fitted on
    # whatever X_train / y_train an earlier cell happened to leave behind.
    X_train, X_test, y_train, y_test = datasets[target_variable]

    predictions['LGBM'][target_variable] = {}

    # Copy the tuned hyperparameters from the best trial of this target's study.
    tuned = lgbm_studies[target_variable].best_trial.params
    config = {
        name: tuned[name]
        for name in (
            "n_estimators",
            "learning_rate",
            "max_depth",
            "num_leaves",
            "min_data_in_leaf",
            "lambda_l1",
            "lambda_l2",
            "min_split_gain",
            "subsample",
            "bagging_fraction",
            "feature_fraction",
            "min_child_samples",
            "max_bin",
        )
    }

    model = LGBMRegressor(
        objective="regression",
        random_state=seed,
        linear_tree=True,
    )

    model.set_params(**config)

    _ = model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    predictions['LGBM'][target_variable]["y_test"] = y_test
    predictions['LGBM'][target_variable]["y_pred"] = y_pred

In [None]:
for target_variable, _ in lstm_datasets.items():
    # ==== QRNN ====

    # Load the split of THIS target; otherwise every target would be fitted on
    # whatever X_train / y_train an earlier cell happened to leave behind.
    X_train, X_test, y_train, y_test = datasets[target_variable]

    predictions['QRNN'][target_variable] = {}

    # Tuned hyperparameters from the best trial of this target's study.
    tuned = qrnn_studies[target_variable].best_trial.params
    config = {
        name: tuned[name]
        for name in ("n_layers", "n_units", "activation", "batch_size")
    }

    # Three quantiles: lower bound, median and upper bound (in this order).
    model = QRNN(
        n_inputs=X_train.shape[1],
        quantiles=[0.05, 0.5, 0.95],
        model=(config["n_layers"], config["n_units"], config["activation"]),
    )

    n_epochs = 50
    optimizer = torch.optim.AdamW(model.model.parameters())
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    # NOTE(review): the test split is passed as validation data here - confirm this is intended.
    model.train(
        training_data=(X_train.to_numpy(), np.array(y_train)),
        validation_data=(X_test.to_numpy(), np.array(y_test)),
        optimizer=optimizer,
        scheduler=scheduler,
        n_epochs=n_epochs,
        device="cpu",
        batch_size=config["batch_size"],
        logger=None,
    )

    with torch.no_grad():
        y_pred = model.predict(X_test.to_numpy()).numpy()

    # Columns of y_pred follow the order of `quantiles` above.
    predictions['QRNN'][target_variable]["y_test"] = y_test
    predictions['QRNN'][target_variable]["y_pred_median"] = y_pred[:, 1]
    predictions['QRNN'][target_variable]["y_pred_lower"] = y_pred[:, 0]
    predictions['QRNN'][target_variable]["y_pred_upper"] = y_pred[:, 2]


In [None]:
# ALL MODELS PLOTS

# plot the predictions for each target variable
for target_variable in datasets.keys():

    fig = go.Figure()

    # The observed test values are taken from the LGBM results.
    true_values = predictions['LGBM'][target_variable]["y_test"]
    fig.add_trace(go.Scatter(x=true_values.index, y=true_values, mode='lines', name='True'))

    for model in predictions.keys():

        # Not every model is fitted for every target (the recurrent models are
        # restricted to one target), so skip the ones without results.
        model_result = predictions[model].get(target_variable)
        if model_result is None:
            continue

        if model == 'QRNN':
            # Median prediction plus the two outer quantiles.
            for result_key, trace_name in (
                ("y_pred_median", 'QRNN Predicted'),
                ("y_pred_lower", 'Lower Bound'),
                ("y_pred_upper", 'Upper Bound'),
            ):
                fig.add_trace(go.Scatter(
                    x=model_result["y_test"].index,
                    y=model_result[result_key],
                    mode='lines',
                    name=trace_name,
                ))

        elif model == 'LGBM':
            fig.add_trace(go.Scatter(
                x=model_result["y_test"].index,
                y=model_result["y_pred"],
                mode='lines',
                name='LGBM Predicted',
            ))

        elif model == 'XGBoost':
            fig.add_trace(go.Scatter(
                x=model_result["y_test"].index,
                y=model_result["y_pred"],
                mode='lines',
                name='XGBoost Predicted',
            ))

        elif model == 'LSTM':
            # The LSTM cell stores the average over its repeated fits under "mean_pred".
            # NOTE(review): the LSTM-only plot applies np.expm1 to these values;
            # confirm they are on the same scale as the other models here.
            fig.add_trace(go.Scatter(
                x=model_result["timesteps_test"],
                y=model_result["mean_pred"],
                mode='lines',
                name='LSTM Predicted',
            ))

    fig.update_layout(title=f'{target_variable}', xaxis_title='Date', yaxis_title=target_variable)
    fig.show()
