Modelling of supply points for soft-sensor for NUWEE.

In [None]:
import os
import pandas as pd
import numpy as np

import optuna
import torch

In [None]:
utils_folder = os.path.join("..", "..", "utils")

data_folder = os.path.join("..", "..", "data")
clean_data_folder = os.path.join(data_folder, "Clean Data")
metadata_folder = os.path.join(data_folder, "Metadata")
plot_folder = os.path.join(data_folder, "Plots")

sensor_folder = os.path.join(clean_data_folder, "sensors")

In [None]:
grab_df = pd.read_excel(os.path.join(clean_data_folder, "modelling_grab.xlsx"))

nuwee_site1_df = pd.read_excel(os.path.join(clean_data_folder, 'nuwee', 'Site1_tabular.xlsx'))
nuwee_site2_df = pd.read_excel(os.path.join(clean_data_folder, 'nuwee', 'Site2_tabular.xlsx'))
nuwee_site3_df = pd.read_excel(os.path.join(clean_data_folder, 'nuwee', 'Site3_tabular.xlsx'))

In [None]:
grab_df

In [None]:
grab_df.dropna(inplace=True)

In [None]:
grab_df['DateTime'] = pd.to_datetime(grab_df['DateTime'])

In [None]:
nuwee_site1_df['DateTime'] = pd.to_datetime(nuwee_site1_df['DateTime'])
nuwee_site2_df['DateTime'] = pd.to_datetime(nuwee_site2_df['DateTime'])
nuwee_site3_df['DateTime'] = pd.to_datetime(nuwee_site3_df['DateTime'])

In [None]:
# Map every cluster label to the list of distinct sampling-point codes it
# contains, printing each cluster as it is processed.
codes_dict = {}
for cluster in grab_df['Cluster'].unique():
    in_cluster = grab_df['Cluster'] == cluster
    codes = grab_df.loc[in_cluster, 'Code'].unique().tolist()
    print(f'Cluster {cluster}')
    print(codes)
    codes_dict[cluster] = codes

In [None]:
nuwee_site1_df.columns

In [None]:
common_columns = nuwee_site1_df.columns.difference(['DateTime', 'Sampling Point', 'TTHMs'])

In [None]:
grab_df = grab_df[['DateTime', 'Cluster', 'Code', 'TTHMs'] + common_columns.tolist()]

In [None]:
cluster_0_df = grab_df[grab_df['Cluster'] == 0].copy()
cluster_1_df = grab_df[grab_df['Cluster'] == 1].copy()
cluster_2_df = grab_df[grab_df['Cluster'] == 2].copy()

# NUWEE Data preprocessing

We are going to deal with missing values: rows without usable data are dropped and the remaining gaps are imputed.

In [None]:
nuwee_site1_df.isna().sum()

In [None]:
nuwee_site2_df.isna().sum()


In [None]:
nuwee_site3_df.isna().sum()

In [None]:
# First clean the NUWEE data: drop the rows in which every feature column is
# missing. The gaps left in the remaining rows are imputed afterwards.
for site_df in (nuwee_site1_df, nuwee_site2_df, nuwee_site3_df):
    site_df.dropna(how='all', subset=common_columns, inplace=True)

In [None]:
# Then drop the rows that have no TTHMs measurement.
for site_df in (nuwee_site1_df, nuwee_site2_df, nuwee_site3_df):
    site_df.dropna(how='all', subset=['TTHMs'], inplace=True)

In [None]:
nuwee_site1_df.reset_index(drop=True, inplace=True)
nuwee_site2_df.reset_index(drop=True, inplace=True)
nuwee_site3_df.reset_index(drop=True, inplace=True)

In [None]:
nuwee_site1_df.isna().sum()

In [None]:
nuwee_site2_df.isna().sum()

In [None]:
nuwee_site3_df.isna().sum()

In [None]:
# now we can impute the missing values in the common columns
import miceforest as mf

In [None]:
# Build a miceforest imputation kernel on the site 1 feature columns,
# with a fixed random_state.
kernel = mf.ImputationKernel(
    data=nuwee_site1_df[common_columns],
    variable_schema=common_columns.tolist(),
    random_state=42,
    mean_match_strategy='shap',
)

# Run 5 MICE iterations on the kernel.
kernel.mice(5, verbose=True)

In [None]:
nuwee_site1_df[common_columns] = kernel.complete_data(dataset=0)

In [None]:
# Build a miceforest imputation kernel on the site 2 feature columns,
# with a fixed random_state.
kernel = mf.ImputationKernel(
    data=nuwee_site2_df[common_columns],
    variable_schema=common_columns.tolist(),
    random_state=42,
    mean_match_strategy='shap',
)

# Run 5 MICE iterations on the kernel.
kernel.mice(5, verbose=True)

In [None]:
nuwee_site2_df[common_columns] = kernel.complete_data(dataset=0)

In [None]:
# Build a miceforest imputation kernel on the site 3 feature columns,
# with a fixed random_state.
kernel = mf.ImputationKernel(
    data=nuwee_site3_df[common_columns],
    variable_schema=common_columns.tolist(),
    random_state=42,
    mean_match_strategy='shap',
)

# Run 5 MICE iterations on the kernel.
kernel.mice(5, verbose=True)

In [None]:
nuwee_site3_df[common_columns] = kernel.complete_data(dataset=0)

In [None]:
# final check
nuwee_site1_df.isna().sum()

In [None]:
nuwee_site2_df.isna().sum()

In [None]:
nuwee_site3_df.isna().sum()

# Clustering based on Mahalanobis distance

In [None]:
# Per-cluster centroid (column means) and covariance matrix of the common
# feature columns, computed from the grab samples of each cluster. They are
# the reference distributions for the Mahalanobis-based assignment.
cluster_0_mean = cluster_0_df[common_columns].mean()
cluster_0_cov = cluster_0_df[common_columns].cov()
cluster_1_mean = cluster_1_df[common_columns].mean()
cluster_1_cov = cluster_1_df[common_columns].cov()
cluster_2_mean = cluster_2_df[common_columns].mean()
cluster_2_cov = cluster_2_df[common_columns].cov()

In [None]:
# Collect the statistics as {cluster label: {'mean': ..., 'cov': ...}}.
clusters_stats = {
    label: {'mean': centroid, 'cov': covariance}
    for label, (centroid, covariance) in enumerate(
        [
            (cluster_0_mean, cluster_0_cov),
            (cluster_1_mean, cluster_1_cov),
            (cluster_2_mean, cluster_2_cov),
        ]
    )
}

In [None]:
from scipy.spatial.distance import mahalanobis

def assign_cluster(row, clusters_stats):
    """Return the key of the cluster closest to ``row`` in Mahalanobis distance.

    Parameters
    ----------
    row : pandas.Series
        One sample. It may carry extra entries (timestamps, labels, ...);
        only the entries named in each cluster's ``'mean'`` index are used.
    clusters_stats : dict
        Maps a cluster key to a dict with ``'mean'`` (pandas.Series indexed
        by feature name) and ``'cov'`` (square covariance matrix over the
        same features). An optional ``'inv_cov'`` entry holding the
        precomputed inverse covariance is used when present, which avoids
        inverting the same matrix again for every row.

    Returns
    -------
    The key of ``clusters_stats`` with the smallest distance. On a tie the
    first such key in iteration order is returned.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a covariance matrix has to be inverted and is singular.
    ValueError
        If ``clusters_stats`` is empty.
    """
    distances = {}
    for cluster, stats in clusters_stats.items():
        mean = stats['mean']
        # Select the features through the centroid's own index rather than a
        # module-level column list, so sample and centroid are always aligned.
        # The explicit float cast matters because a row taken from a frame
        # with mixed column types is an object-dtype Series.
        sample = row[mean.index].to_numpy(dtype=float)
        inv_cov = stats.get('inv_cov')
        if inv_cov is None:
            inv_cov = np.linalg.inv(stats['cov'])
        distances[cluster] = mahalanobis(
            sample, mean.to_numpy(dtype=float), inv_cov
        )
    return min(distances, key=distances.get)

In [None]:
# nuwee site 1: label every sample with its nearest grab-sample cluster
nuwee_site1_df['Cluster'] = [
    assign_cluster(sample, clusters_stats)
    for _, sample in nuwee_site1_df.iterrows()
]

In [None]:
nuwee_site1_df['Cluster'].value_counts()

In [None]:
# nuwee site 2: label every sample with its nearest grab-sample cluster
nuwee_site2_df['Cluster'] = [
    assign_cluster(sample, clusters_stats)
    for _, sample in nuwee_site2_df.iterrows()
]

In [None]:
nuwee_site2_df['Cluster'].value_counts()

In [None]:
# nuwee site 3: label every sample with its nearest grab-sample cluster
nuwee_site3_df['Cluster'] = [
    assign_cluster(sample, clusters_stats)
    for _, sample in nuwee_site3_df.iterrows()
]

In [None]:
nuwee_site3_df['Cluster'].value_counts()

TODO: add a PCA plot showing the original cluster, the other clusters (even just their centroids) and the points of the 3 sites.

# Modelling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Prepare the training data.
# All the grab samples of cluster 0 are used for training.

# NOTE: set_index moves 'DateTime' out of the columns in place, so running
# this cell a second time raises a KeyError.
cluster_0_df.set_index('DateTime', inplace=True)
X, y = cluster_0_df[common_columns], cluster_0_df['TTHMs']

# Scale the features with a MinMaxScaler fitted on all cluster 0 samples.
# The fitted scaler is reused later to transform the NUWEE site data.
# NOTE(review): the scaler is fitted before cross-validation, so it also sees
# the held-out samples - confirm this is acceptable.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

## PLS

Partial Least Squares (PLS) regression is a method that reduces the original predictor variables to a smaller set of latent components. These components are then used to perform the regression.

It projects the predictors (independent variables) and the response variable (dependent variable) into a new space that maximizes the covariance between them. The procedure identifies components (latent variables) that explain the most variance in the predictors while also being predictive of the response variable.

In [None]:
from sklearn.cross_decomposition import PLSRegression

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error


In [None]:
def fit_and_validate_pls_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit a PLS regression on one split and return its validation RMSE.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc``
        Features and target.
    train_index, val_index : positional indices
        Rows used for fitting and for validation.
    params : dict
        Must provide ``"n_components"`` and ``"tol"``.

    Returns
    -------
    The root mean squared error on the validation rows.
    """
    X_fit, y_fit = X.iloc[train_index], y.iloc[train_index]
    X_holdout, y_holdout = X.iloc[val_index], y.iloc[val_index]

    # scale=False: the model is given the features exactly as passed in.
    pls = PLSRegression(
        n_components=params["n_components"],
        tol=params["tol"],
        scale=False,
        max_iter=1000,
    )
    pls.fit(X_fit, y_fit)

    holdout_pred = pls.predict(X_holdout)
    return np.sqrt(mean_squared_error(y_holdout, holdout_pred))

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for PLS: mean leave-one-out validation error.

    Samples ``n_components`` (2 .. number of feature columns) and ``tol``
    (1e-6 .. 1e-1), evaluates them on every leave-one-out split of
    ``X_cv``/``y_cv`` and returns the mean of the per-split RMSE values.

    Note: each leave-one-out split validates on a single sample, so the
    per-split RMSE is that sample's absolute error and the returned value is
    the mean absolute leave-one-out error.
    """
    config = {
        "n_components": trial.suggest_int("n_components", 2, X_cv.shape[1]),
        "tol": trial.suggest_float("tol", 1e-6, 1e-1),
    }

    splitter = LeaveOneOut()
    fold_rmse = np.array(
        [
            fit_and_validate_pls_model(
                X_cv, y_cv, train_index, test_index, config
            )
            for train_index, test_index in splitter.split(X_cv, y_cv)
        ]
    )

    # The per-split values are not stored on the trial; only their mean is
    # reported to the study.
    return np.mean(fold_rmse)

In [None]:
# Reuse the stored PLS study when its SQLite file exists; otherwise create the
# study and run the hyperparameter search (100 trials).
if os.path.exists("nuwee_sqlites/PLS.sqlite3"):

    study = optuna.load_study(
        study_name="Hyperparameter Tuning - PLS",
        storage="sqlite:///nuwee_sqlites/PLS.sqlite3",
    )

else:

    # SQLite can create the database file but not its parent folder, so make
    # sure the folder exists before the storage is opened.
    os.makedirs("nuwee_sqlites", exist_ok=True)

    study = optuna.create_study(
        study_name="Hyperparameter Tuning - PLS",
        storage="sqlite:///nuwee_sqlites/PLS.sqlite3",
        direction="minimize",
        load_if_exists=True,
    )

    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, show_progress_bar=True)

pls_study = study

## SVR

In [None]:
# import SVR from sklearn
from sklearn.svm import SVR

In [None]:
# print the parameters of the model
svr = SVR()

In [None]:
kernel = [
    "linear",
    "rbf",
    "sigmoid",
]

In [None]:
def fit_and_validate_svr_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an SVR on one split and return its validation RMSE.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc``
        Features and target.
    train_index, val_index : positional indices
        Rows used for fitting and for validation.
    params : dict
        Must provide ``"kernel"``, ``"C"``, ``"epsilon"`` and ``"gamma"``.

    Returns
    -------
    The root mean squared error on the validation rows.
    """
    X_fit, y_fit = X.iloc[train_index], y.iloc[train_index]
    X_holdout, y_holdout = X.iloc[val_index], y.iloc[val_index]

    svr_kwargs = {
        name: params[name] for name in ("kernel", "C", "epsilon", "gamma")
    }
    regressor = SVR(**svr_kwargs)
    regressor.fit(X_fit, y_fit)

    holdout_pred = regressor.predict(X_holdout)
    return np.sqrt(mean_squared_error(y_holdout, holdout_pred))

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for SVR: mean leave-one-out validation error.

    Samples the kernel (from the module-level ``kernel`` list) and ``C``,
    ``epsilon`` and ``gamma`` (each log-uniform in 1e-6 .. 1), evaluates them
    on every leave-one-out split of ``X_cv``/``y_cv`` and returns the mean of
    the per-split RMSE values.

    Note: each leave-one-out split validates on a single sample, so the
    per-split RMSE is that sample's absolute error and the returned value is
    the mean absolute leave-one-out error.
    """
    config = {
        "kernel": trial.suggest_categorical("kernel", kernel),
        "C": trial.suggest_float("C", 1e-6, 1, log=True),
        "epsilon": trial.suggest_float("epsilon", 1e-6, 1, log=True),
        "gamma": trial.suggest_float("gamma", 1e-6, 1, log=True),
    }

    splitter = LeaveOneOut()
    fold_rmse = np.array(
        [
            fit_and_validate_svr_model(
                X_cv, y_cv, train_index, test_index, config
            )
            for train_index, test_index in splitter.split(X_cv, y_cv)
        ]
    )

    # The per-split values are not stored on the trial; only their mean is
    # reported to the study.
    return np.mean(fold_rmse)

In [None]:
# Reuse the stored SVR study when its SQLite file exists; otherwise create the
# study and run the hyperparameter search (100 trials).
if os.path.exists("nuwee_sqlites/SVR.sqlite3"):

    study = optuna.load_study(
        study_name="Hyperparameter Tuning - SVR",
        storage="sqlite:///nuwee_sqlites/SVR.sqlite3",
    )

else:

    # SQLite can create the database file but not its parent folder, so make
    # sure the folder exists before the storage is opened.
    os.makedirs("nuwee_sqlites", exist_ok=True)

    study = optuna.create_study(
        study_name="Hyperparameter Tuning - SVR",
        storage="sqlite:///nuwee_sqlites/SVR.sqlite3",
        direction="minimize",
        load_if_exists=True,
    )

    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, show_progress_bar=True)

svr_study = study

## QRNN

In [None]:
from quantnn.qrnn import QRNN

In [None]:
quantiles = np.linspace(0.01, 0.99, 99)

def fit_and_validate_qrnn_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Train a QRNN on one split and return its validation RMSE.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc``
        Features and target.
    train_index, val_index : positional indices
        Rows used for training and for validation.
    params : dict
        Must provide ``"n_layers"``, ``"n_units"``, ``"activation"`` and
        ``"batch_size"``.

    Returns
    -------
    The root mean squared error between the validation targets and the mean
    of the model output over its last axis.
    """
    X_tr = X.iloc[train_index].to_numpy()
    X_val = X.iloc[val_index].to_numpy()
    y_tr = y.iloc[train_index]
    y_val = y.iloc[val_index]

    architecture = (
        params["n_layers"],
        params["n_units"],
        params["activation"],
    )
    model = QRNN(
        n_inputs=X_tr.shape[1],
        quantiles=quantiles,
        model=architecture,
    )

    # Fixed training budget: 50 epochs of AdamW with cosine annealing of the
    # learning rate over those epochs.
    n_epochs = 50
    optimizer = torch.optim.AdamW(model.model.parameters())
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    model.train(
        training_data=(np.array(X_tr), np.array(y_tr)),
        optimizer=optimizer,
        scheduler=scheduler,
        n_epochs=n_epochs,
        device="cpu",
        batch_size=params["batch_size"],
        logger=None,
    )

    with torch.no_grad():
        quantile_pred = model.predict(X_val)

    # Collapse the model output to one value per sample by averaging over the
    # last axis (presumably the quantile axis - confirm against quantnn).
    point_pred = quantile_pred.mean(axis=-1)
    return np.sqrt(mean_squared_error(y_val, point_pred))

In [None]:
activations = [
    "elu",
    "hardshrink",
    "hardtanh",
    "prelu",
    "relu",
    "selu",
    "celu",
    "sigmoid",
    "softplus",
    "softmin",
]

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for the QRNN: mean leave-one-out validation error.

    Samples the number of layers (1 .. 3), the units per layer (32 .. 512,
    log scale), the activation (from the module-level ``activations`` list)
    and the batch size (4, 8 or 16), evaluates them on every leave-one-out
    split of ``X_cv``/``y_cv`` and returns the mean of the per-split RMSE
    values.

    Note: each leave-one-out split validates on a single sample, so the
    per-split RMSE is that sample's absolute error and the returned value is
    the mean absolute leave-one-out error.
    """
    config = {
        "n_layers": trial.suggest_int("n_layers", 1, 3),
        "n_units": trial.suggest_int("n_units", 32, 512, log=True),
        "activation": trial.suggest_categorical("activation", activations),
        "batch_size": trial.suggest_categorical("batch_size", [4, 8, 16]),
    }

    splitter = LeaveOneOut()
    fold_rmse = np.array(
        [
            fit_and_validate_qrnn_model(
                X_cv, y_cv, train_index, test_index, config
            )
            for train_index, test_index in splitter.split(X_cv, y_cv)
        ]
    )

    # The per-split values are not stored on the trial; only their mean is
    # reported to the study.
    return np.mean(fold_rmse)

In [None]:
# Reuse the stored QRNN study when its SQLite file exists; otherwise create
# the study and run the hyperparameter search (100 trials).
if os.path.exists("nuwee_sqlites/QRNN.sqlite3"):

    study = optuna.load_study(
        study_name="Hyperparameter Tuning - QRNN",
        storage="sqlite:///nuwee_sqlites/QRNN.sqlite3",
    )

else:

    # SQLite can create the database file but not its parent folder, so make
    # sure the folder exists before the storage is opened.
    os.makedirs("nuwee_sqlites", exist_ok=True)

    study = optuna.create_study(
        study_name="Hyperparameter Tuning - QRNN",
        storage="sqlite:///nuwee_sqlites/QRNN.sqlite3",
        direction="minimize",
        load_if_exists=True,
    )

    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, show_progress_bar=True)

qrnn_study = study

## XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
def fit_and_validate_xgb_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an XGBoost regressor on one split and return its validation RMSE.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc``
        Features and target.
    train_index, val_index : positional indices
        Rows used for fitting and for validation.
    params : dict
        Keyword arguments forwarded to ``XGBRegressor`` (in addition to the
        fixed ``random_state=42``).

    Returns
    -------
    The root mean squared error on the validation rows.
    """
    X_fit, y_fit = X.iloc[train_index, :], y.iloc[train_index]
    X_holdout, y_holdout = X.iloc[val_index, :], y.iloc[val_index]

    regressor = XGBRegressor(random_state=42, **params)
    regressor.fit(X_fit, y_fit)

    holdout_pred = regressor.predict(X_holdout)
    return np.sqrt(mean_squared_error(y_holdout, holdout_pred))

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for linear XGBoost: mean leave-one-out validation error.

    Samples ``eta``, ``reg_lambda``, ``reg_alpha``, ``learning_rate``,
    ``n_estimators`` and ``updater`` for a ``gblinear`` booster, evaluates
    them on every leave-one-out split of ``X_cv``/``y_cv`` and returns the
    mean of the per-split RMSE values.

    Note: each leave-one-out split validates on a single sample, so the
    per-split RMSE is that sample's absolute error and the returned value is
    the mean absolute leave-one-out error.
    """
    # NOTE(review): both "eta" and "learning_rate" are sampled and passed to
    # the model; these look like two names for the same XGBoost setting -
    # confirm which one takes effect.
    sampled = {
        "eta": trial.suggest_float("eta", 1e-5, 1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1, log=True),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-5, 1, log=True
        ),
        "n_estimators": trial.suggest_int("n_estimators", 1, 500),
        "updater": trial.suggest_categorical(
            "updater", ["shotgun", "coord_descent"]
        ),
    }

    params = {
        "objective": "reg:squarederror",
        "booster": "gblinear",
        "eta": sampled["eta"],
        "reg_lambda": sampled["reg_lambda"],
        "reg_alpha": sampled["reg_alpha"],
        "learning_rate": sampled["learning_rate"],
        "updater": sampled["updater"],
        "n_estimators": sampled["n_estimators"],
        "eval_metric": "rmse",
    }

    splitter = LeaveOneOut()
    fold_rmse = np.array(
        [
            fit_and_validate_xgb_model(
                X_cv, y_cv, train_index, test_index, params
            )
            for train_index, test_index in splitter.split(X_cv, y_cv)
        ]
    )

    # The per-split values are not stored on the trial; only their mean is
    # reported to the study.
    return np.mean(fold_rmse)

In [None]:
# Reuse the stored XGB study when its SQLite file exists; otherwise create the
# study and run the hyperparameter search (100 trials).
if os.path.exists("nuwee_sqlites/XGB.sqlite3"):

    study = optuna.load_study(
        study_name="Hyperparameter Tuning - XGB",
        storage="sqlite:///nuwee_sqlites/XGB.sqlite3",
    )

else:

    # SQLite can create the database file but not its parent folder, so make
    # sure the folder exists before the storage is opened.
    os.makedirs("nuwee_sqlites", exist_ok=True)

    study = optuna.create_study(
        study_name="Hyperparameter Tuning - XGB",
        storage="sqlite:///nuwee_sqlites/XGB.sqlite3",
        direction="minimize",
        load_if_exists=True,
    )

    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, show_progress_bar=True)

xgb_study = study

# Comparison

In [None]:
# get all the studies

best_studies= {
    "PLS": pls_study.best_trial,
    "SVR": svr_study.best_trial,
    "QRNN": qrnn_study.best_trial,
    "XGB": xgb_study.best_trial,
}

In [None]:
# One row per model holding the objective value of its best trial, rounded
# to 3 decimals. The table is built in one step so that the column is numeric
# (not object dtype) and no module-level name such as `study` is rebound to a
# trial object by a loop.
comparison_df = pd.DataFrame(
    {
        'RMSE': [
            np.round(best_trial.value, 3)
            for best_trial in best_studies.values()
        ]
    },
    index=list(best_studies.keys()),
)
    

In [None]:
comparison_df

# Model Prediction with all common features

Since all the points were associated with cluster 0, we take the model that performed best on cluster 0 using all the common features and apply it to these samples.

In [None]:
# The prediction step needs the tuned XGB study: stop early when its SQLite
# storage is missing, otherwise load the study from it.
if not os.path.exists("nuwee_sqlites/XGB.sqlite3"):
    raise FileNotFoundError(
        "SQLite file not found. Please check the path."
    )

xgb_study = optuna.load_study(
    study_name="Hyperparameter Tuning - XGB",
    storage="sqlite:///nuwee_sqlites/XGB.sqlite3",
)


In [None]:
from xgboost import XGBRegressor

In [None]:
# Scale the feature columns of every site with the scaler fitted on the
# training data. The frames are overwritten in place, so this must run only
# once per session.
for site_df in (nuwee_site1_df, nuwee_site2_df, nuwee_site3_df):
    site_df[common_columns] = scaler.transform(site_df[common_columns])

In [None]:
n_iterations = 50

site1_preds = []
site2_preds = []
site3_preds = []

# The tuned hyperparameters do not change between iterations: read them once.
xgb_best_trial = xgb_study.best_trial

# Refitting the same model on the same data with the same seed reproduces (up
# to solver nondeterminism) the same predictions, which collapses the
# quantiles below onto the mean. Each iteration therefore trains on a
# bootstrap resample (rows drawn with replacement), so that the spread of the
# predictions reflects the sampling variability of the training set.
rng = np.random.default_rng(42)
n_train = len(X)

for _ in range(n_iterations):

    boot_idx = rng.integers(0, n_train, size=n_train)

    xgb = XGBRegressor(
        random_state=42,
        objective="reg:squarederror",
        booster="gblinear",
        eta=xgb_best_trial.params["eta"],
        reg_lambda=xgb_best_trial.params["reg_lambda"],
        reg_alpha=xgb_best_trial.params["reg_alpha"],
        learning_rate=xgb_best_trial.params["learning_rate"],
        updater=xgb_best_trial.params["updater"],
        n_estimators=xgb_best_trial.params["n_estimators"],
    )

    xgb.fit(X.iloc[boot_idx], y.iloc[boot_idx])

    site1_preds.append(xgb.predict(nuwee_site1_df[common_columns]))
    site2_preds.append(xgb.predict(nuwee_site2_df[common_columns]))
    site3_preds.append(xgb.predict(nuwee_site3_df[common_columns]))

# Per site: measured TTHMs, mean prediction over the iterations and the
# 2.5% / 97.5% quantiles of the predictions (axis 0 is the iteration axis).
eval_preds = {
    "y_test1": nuwee_site1_df['TTHMs'].values,
    "y_test2": nuwee_site2_df['TTHMs'].values,
    "y_test3": nuwee_site3_df['TTHMs'].values,
    "y_test_mean1": np.mean(site1_preds, axis=0),
    "y_test_mean2": np.mean(site2_preds, axis=0),
    "y_test_mean3": np.mean(site3_preds, axis=0),
    "y_test_lower1": np.quantile(site1_preds, 0.025, axis=0),
    "y_test_lower2": np.quantile(site2_preds, 0.025, axis=0),
    "y_test_lower3": np.quantile(site3_preds, 0.025, axis=0),
    "y_test_upper1": np.quantile(site1_preds, 0.975, axis=0),
    "y_test_upper2": np.quantile(site2_preds, 0.975, axis=0),
    "y_test_upper3": np.quantile(site3_preds, 0.975, axis=0),
}

In [None]:
import matplotlib.pyplot as plt

TODO: use different colours for the different sampling points.

In [None]:
# Site 1: predicted vs. measured TTHMs with the identity line as reference
y_test1 = eval_preds["y_test1"]
y_test_mean1 = eval_preds["y_test_mean1"]

scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
scatter_ax.plot(y_test1, y_test_mean1, "o")
scatter_ax.plot([0, 14], [0, 14], "--")
scatter_ax.set_xlabel("True")
scatter_ax.set_ylabel("Predicted")
scatter_ax.set_title("Site 1 - True vs Predicted")
plt.show()

In [None]:
# Site 2: predicted vs. measured TTHMs with the identity line as reference
y_test2 = eval_preds["y_test2"]
y_test_mean2 = eval_preds["y_test_mean2"]

scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
scatter_ax.plot(y_test2, y_test_mean2, "o")
scatter_ax.plot([0, 14], [0, 14], "--")
scatter_ax.set_xlabel("True")
scatter_ax.set_ylabel("Predicted")
scatter_ax.set_title("Site 2 - True vs Predicted")
plt.show()

In [None]:
# Site 3: predicted vs. measured TTHMs with the identity line as reference
y_test3 = eval_preds["y_test3"]
y_test_mean3 = eval_preds["y_test_mean3"]

scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
scatter_ax.plot(y_test3, y_test_mean3, "o")
scatter_ax.plot([0, 14], [0, 14], "--")
scatter_ax.set_xlabel("True")
scatter_ax.set_ylabel("Predicted")
scatter_ax.set_title("Site 3 - True vs Predicted")
plt.show()

In [None]:
import plotly.graph_objects as go

In [None]:
# Site 1: measured TTHMs and mean prediction over time, with the 2.5% - 97.5%
# quantile range of the predictions drawn as error bars.
y_test = eval_preds["y_test1"]
y_pred_mean = eval_preds["y_test_mean1"]
y_pred_lower = eval_preds["y_test_lower1"]
y_pred_upper = eval_preds["y_test_upper1"]
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=nuwee_site1_df['DateTime'],
        y=nuwee_site1_df['TTHMs'],
        mode="markers",
        name="True TTHMs",
        line=dict(color="black"),
        marker=dict(size=10),
    )
)

fig.add_trace(
    go.Scatter(
        x=nuwee_site1_df['DateTime'],
        y=y_pred_mean,
        mode="markers",
        name="Predicted TTHMs (95% PI)",
        line=dict(color="green"),
        marker=dict(size=10),
        # Plotly reads `array` / `arrayminus` as bar lengths measured from
        # `y`, so the absolute quantiles are converted to offsets from the
        # mean prediction.
        error_y=dict(
            type='data',
            symmetric=False,
            array=y_pred_upper - y_pred_mean,
            arrayminus=y_pred_mean - y_pred_lower,
            thickness=2,
            width=5,
            color="green",
        ),
    )
)


# horizontal legend anchored just above the top-right corner of the plot
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="right",
        x=1,
    ),
    margin=dict(l=10, r=10, t=30, b=10),
    title="Site 1"
)

fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="TTHMs (µg/L)")

# fig.update_yaxes(range=[0, 25])

# update the overall font
fig.update_layout(font=dict(family="Arial", size=18))

fig.show()

In [None]:
# Site 2: measured TTHMs and mean prediction over time, with the 2.5% - 97.5%
# quantile range of the predictions drawn as error bars.
y_test = eval_preds["y_test2"]
y_pred_mean = eval_preds["y_test_mean2"]
y_pred_lower = eval_preds["y_test_lower2"]
y_pred_upper = eval_preds["y_test_upper2"]
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=nuwee_site2_df['DateTime'],
        y=nuwee_site2_df['TTHMs'],
        mode="markers",
        name="True TTHMs",
        line=dict(color="black"),
        marker=dict(size=10),
    )
)

fig.add_trace(
    go.Scatter(
        x=nuwee_site2_df['DateTime'],
        y=y_pred_mean,
        mode="markers",
        name="Predicted TTHMs (95% PI)",
        line=dict(color="green"),
        marker=dict(size=10),
        # Plotly reads `array` / `arrayminus` as bar lengths measured from
        # `y`, so the absolute quantiles are converted to offsets from the
        # mean prediction.
        error_y=dict(
            type='data',
            symmetric=False,
            array=y_pred_upper - y_pred_mean,
            arrayminus=y_pred_mean - y_pred_lower,
            thickness=2,
            width=5,
            color="green",
        ),
    )
)

# horizontal legend anchored just above the top-right corner of the plot
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="right",
        x=1,
    ),
    margin=dict(l=10, r=10, t=30, b=10),
    title="Site 2"
)

fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="TTHMs (µg/L)")

# fig.update_yaxes(range=[0, 25])

# update the overall font
fig.update_layout(font=dict(family="Arial", size=18))

fig.show()

In [None]:
# Site 3: measured TTHMs and mean prediction over time, with the 2.5% - 97.5%
# quantile range of the predictions drawn as error bars.
y_test3 = eval_preds["y_test3"]
y_test_mean3 = eval_preds["y_test_mean3"]
y_test_lower3 = eval_preds["y_test_lower3"]
y_test_upper3 = eval_preds["y_test_upper3"]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=nuwee_site3_df['DateTime'],
        y=nuwee_site3_df['TTHMs'],
        mode="markers",
        name="True TTHMs",
        line=dict(color="black"),
        marker=dict(size=10),
    )
)

fig.add_trace(
    go.Scatter(
        x=nuwee_site3_df['DateTime'],
        y=y_test_mean3,
        mode="markers",
        name="Predicted TTHMs (95% PI)",
        line=dict(color="green"),
        marker=dict(size=10),
        # Plotly reads `array` / `arrayminus` as bar lengths measured from
        # `y`, so the absolute quantiles are converted to offsets from the
        # mean prediction.
        error_y=dict(
            type='data',
            symmetric=False,
            array=y_test_upper3 - y_test_mean3,
            arrayminus=y_test_mean3 - y_test_lower3,
            thickness=2,
            width=5,
            color="green",
        ),
    )
)

# horizontal legend anchored just above the top-right corner of the plot
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="right",
        x=1,
    ),
    margin=dict(l=10, r=10, t=30, b=10),
    title="Site 3"
)

fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="TTHMs (µg/L)")

# fig.update_yaxes(range=[0, 25])

# update the overall font
fig.update_layout(font=dict(family="Arial", size=18))

fig.show()