Modelling of supply points for soft-sensor.

In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import optuna
import torch

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error

In [None]:
# Project folders, all resolved relative to the notebook's parent directory.
utils_folder = os.path.join(os.pardir, "utils")
data_folder = os.path.join(os.pardir, "data")

# Sensor exports live in a sub-folder of the data folder.
sensor_folder = os.path.join(data_folder, "sensors")

In [None]:
# Load the grab-sample table used for modelling from the data folder.
grab_df = pd.read_excel(os.path.join(data_folder, "modelling_grab.xlsx"))

In [None]:
# Display the raw grab-sample table (before rows with missing values are dropped).
grab_df

In [None]:
# Keep only the rows without any missing value.
# The surviving rows keep their original index labels.
grab_df = grab_df.dropna()

In [None]:
# Number of rows/columns left after dropping incomplete rows.
grab_df.shape

In [None]:
# Map every cluster label to the list of supply-point codes it contains,
# printing the mapping along the way.
codes_dict = {}
for cluster in grab_df['Cluster'].unique():
    cluster_mask = grab_df['Cluster'] == cluster
    codes = grab_df.loc[cluster_mask, 'Code'].unique().tolist()
    print(f'Cluster {cluster}')
    codes_dict[cluster] = codes
    print(codes)

In [None]:
# One independent copy of the grab samples per cluster label (0, 1, 2).
cluster_0_df, cluster_1_df, cluster_2_df = (
    grab_df.loc[grab_df['Cluster'] == cluster_id].copy()
    for cluster_id in (0, 1, 2)
)

In [None]:
# Remove the identifier columns in place so only the features and the
# 'TTHMs' target remain in each cluster frame.
for cluster_df in (cluster_0_df, cluster_1_df, cluster_2_df):
    cluster_df.drop(columns=['Code', 'DateTime', 'Cluster'], inplace=True)

In [None]:
# Clusters that are modelled below.
# NOTE(review): cluster_2_df is built above but not included here - confirm this is intentional.
clusters = dict(
    cluster_0=cluster_0_df,
    cluster_1=cluster_1_df,
)

In [None]:
# Read every Excel workbook in the sensor folder; the dictionary key is the
# part of the file name before the first dot.
sensor_dict = {
    file_name.split(".")[0]: pd.read_excel(os.path.join(sensor_folder, file_name))
    for file_name in os.listdir(sensor_folder)
    if file_name.endswith(".xlsx")
}

In [None]:
# Show the cluster -> supply-point codes mapping built above.
codes_dict

In [None]:
# Sensor frames grouped by the cluster their supply-point code belongs to.
cluster_0_sensor_dict = {}
cluster_1_sensor_dict = {}
cluster_2_sensor_dict = {}

for sensor_name, sensor_frame in sensor_dict.items():
    # The first cluster listing this code wins; unknown codes are skipped.
    if sensor_name in codes_dict[0]:
        target_dict = cluster_0_sensor_dict
    elif sensor_name in codes_dict[1]:
        target_dict = cluster_1_sensor_dict
    elif sensor_name in codes_dict[2]:
        target_dict = cluster_2_sensor_dict
    else:
        continue
    target_dict[sensor_name] = sensor_frame

In [None]:
# UVA254 and turbidity are not present in the grab data, so they are removed
# (in place, when present) from every cluster-0 sensor frame for the moment.
for sensor_df in cluster_0_sensor_dict.values():
    sensor_df.drop(
        columns=['UVA254 (1/m)', 'Turbidity (NTU)'],
        errors='ignore',
        inplace=True,
    )

In [None]:
# Single MinMaxScaler instance shared (and re-fitted) by the scaling cell below.
scaler = MinMaxScaler()

In [None]:
# Scale the grab samples of each cluster and apply the SAME scaling to the
# sensor data of that cluster.
#
# The sensor frames were previously scaled with `fit_transform`, i.e. with a
# scaler re-fitted on each sensor's own min/max. The models are trained on the
# grab-sample scaling, so sensor inputs must go through `transform` of the
# scaler fitted on the grab samples to live on the same scale.
#
# NOTE: this cell replaces the entries of `clusters` (DataFrame -> (X, y)), so
# it is meant to be run once per kernel session.
sensor_dicts_by_cluster = {
    'cluster_0': cluster_0_sensor_dict,
    'cluster_1': cluster_1_sensor_dict,
}

for cluster_name in clusters.keys():
    df = clusters[cluster_name]
    X, y = df.drop(columns=['TTHMs']), df['TTHMs']

    # (Re)fit the shared scaler on this cluster's grab samples.
    # NOTE(review): the scaler is fitted before the train/test split, so the
    # test rows influence the scaling - confirm this is acceptable.
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    clusters[cluster_name] = X_scaled, y

    # Feature order used for training; sensor frames must follow it.
    X_columns = X.columns.tolist()

    # Sensors: clusters without a sensor dictionary are simply skipped.
    cluster_sensor_dict = sensor_dicts_by_cluster.get(cluster_name, {})
    for sensor_name in cluster_sensor_dict.keys():
        sensor_df = cluster_sensor_dict[sensor_name]

        # Select the training features in training order, then scale them with
        # the scaler fitted on the grab samples (no re-fit). Values outside the
        # grab-sample range map outside [0, 1] by design.
        X_sensor = pd.DataFrame(
            scaler.transform(sensor_df[X_columns]),
            columns=X_columns,
            index=sensor_df.index,
        )

        # Re-attach the timestamps as the last column.
        X_sensor['DateTime'] = sensor_df['DateTime']
        cluster_sensor_dict[sensor_name] = X_sensor
    

# Modelling

Different models will be used:

- PLS
- SVR
- QRNN
- XGBoost

In [None]:
# Replace each (X, y) pair by its 80/20 train/test split
# (X_train, X_test, y_train, y_test), using a fixed seed.
for cluster_name, (X, y) in clusters.items():
    clusters[cluster_name] = tuple(
        train_test_split(X, y, test_size=0.2, random_state=42)
    )

## PLS

Partial Least Squares (PLS) regression is a method that reduces the original predictor variables to a smaller set of components. These components are then used to perform the regression.

It projects the predictors (independent variables) and the response variable (dependent variable) into a new space that maximizes the covariance between them. The procedure identifies components (latent variables) that explain the most variance in the predictors while also being predictive of the response variable.

In [None]:
from sklearn.cross_decomposition import PLSRegression

In [None]:
# Instantiate PLSRegression with its defaults and print the parameters it
# exposes (useful to decide which ones to tune).
pls = PLSRegression()
print(pls.get_params())

In [None]:
def fit_and_validate_pls_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit a PLS regression on one fold and score it on the held-out rows.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` (features and target).
    train_index, val_index : positional indices of the training and
        validation rows.
    params : dict with the keys ``"n_components"`` and ``"tol"``.

    Returns
    -------
    The root mean squared error on the validation rows.
    """
    X_fit, X_holdout = X.iloc[train_index], X.iloc[val_index]
    y_fit, y_holdout = y.iloc[train_index], y.iloc[val_index]

    # scale=False: the model is fitted on the features as they are passed in.
    pls_model = PLSRegression(
        n_components=params["n_components"],
        tol=params["tol"],
        scale=False,
        max_iter=1000,
    )
    pls_model.fit(X_fit, y_fit)

    holdout_mse = mean_squared_error(y_holdout, pls_model.predict(X_holdout))
    return np.sqrt(holdout_mse)

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for PLS: mean leave-one-out validation score.

    Samples ``n_components`` (2 .. number of features) and ``tol``, fits one
    model per leave-one-out fold and returns the mean of the per-fold RMSE.
    With leave-one-out each fold holds a single sample, so the per-fold RMSE
    is the absolute error and the returned value is the mean absolute error.

    NOTE: the name ``objective`` is redefined by later cells of this notebook
    for the other models.
    """
    config = {
        "n_components": trial.suggest_int("n_components", 2, X_cv.shape[1]),
        "tol": trial.suggest_float("tol", 1e-6, 1e-1),
    }

    splitter = LeaveOneOut()
    fold_scores = np.array(
        [
            fit_and_validate_pls_model(X_cv, y_cv, fit_idx, holdout_idx, config)
            for fit_idx, holdout_idx in splitter.split(X_cv, y_cv)
        ]
    )

    # To store the individual fold scores with the trial, uncomment:
    # trial.set_user_attr("split_rmse", fold_scores)

    return fold_scores.mean()

In [None]:
# Hyperparameter search for PLS, one Optuna study per cluster, cached on disk.
pls_studies = {}

# SQLite creates the database file but not its parent directory: without this
# the first run on a fresh checkout fails when the study is created.
os.makedirs("supply_points_sqlites", exist_ok=True)

for cluster_name in clusters.keys():
    X_train, X_test, y_train, y_test = clusters[cluster_name]

    study_name = f"Hyperparameter Tuning - PLS_{cluster_name}"
    storage = f"sqlite:///supply_points_sqlites/PLS_{cluster_name}.sqlite3"

    if os.path.exists(f"supply_points_sqlites/PLS_{cluster_name}.sqlite3"):
        # A database already exists: reuse its trials instead of re-optimising.
        study = optuna.load_study(
            study_name=study_name,
            storage=storage,
        )
    else:
        study = optuna.create_study(
            study_name=study_name,
            storage=storage,
            direction="minimize",
            load_if_exists=True,
        )

        # `objective` must be the PLS objective defined just above (the name
        # is reused for the other models further down the notebook).
        study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=100, show_progress_bar=True)

    pls_studies[cluster_name] = study

## SVR

In [None]:
# import SVR from sklearn
from sklearn.svm import SVR

In [None]:
# Instantiate SVR with its defaults and print the parameters it exposes,
# as done for PLS above (the print was missing here).
svr = SVR()
print(svr.get_params())

In [None]:
# SVR kernels explored during hyperparameter tuning.
kernel = ["linear", "rbf", "sigmoid"]

In [None]:
def fit_and_validate_svr_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an SVR on one fold and score it on the held-out rows.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` (features and target).
    train_index, val_index : positional indices of the training and
        validation rows.
    params : dict with the keys ``"kernel"``, ``"C"``, ``"epsilon"`` and
        ``"gamma"``.

    Returns
    -------
    The root mean squared error on the validation rows.
    """
    X_fit, X_holdout = X.iloc[train_index], X.iloc[val_index]
    y_fit, y_holdout = y.iloc[train_index], y.iloc[val_index]

    svr_model = SVR(
        kernel=params["kernel"],
        C=params["C"],
        epsilon=params["epsilon"],
        gamma=params["gamma"],
    )
    svr_model.fit(X_fit, y_fit)

    holdout_mse = mean_squared_error(y_holdout, svr_model.predict(X_holdout))
    return np.sqrt(holdout_mse)

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for SVR: mean leave-one-out validation score.

    Samples the kernel (from the module-level ``kernel`` list) and, on a log
    scale between 1e-6 and 1, ``C``, ``epsilon`` and ``gamma``. One model is
    fitted per leave-one-out fold and the mean of the per-fold RMSE is
    returned. With leave-one-out each fold holds a single sample, so this is
    the mean absolute error.

    NOTE: this definition shadows the ``objective`` defined earlier for PLS
    and is itself redefined later in the notebook.
    """
    config = {
        "kernel": trial.suggest_categorical("kernel", kernel),
        "C": trial.suggest_float("C", 1e-6, 1, log=True),
        "epsilon": trial.suggest_float("epsilon", 1e-6, 1, log=True),
        "gamma": trial.suggest_float("gamma", 1e-6, 1, log=True),
    }

    splitter = LeaveOneOut()
    fold_scores = np.array(
        [
            fit_and_validate_svr_model(X_cv, y_cv, fit_idx, holdout_idx, config)
            for fit_idx, holdout_idx in splitter.split(X_cv, y_cv)
        ]
    )

    # To store the individual fold scores with the trial, uncomment:
    # trial.set_user_attr("split_rmse", fold_scores)

    return fold_scores.mean()

In [None]:
# Names of the clusters that will be tuned below.
clusters.keys()

In [None]:
# Hyperparameter search for SVR, one Optuna study per cluster, cached on disk.
svr_studies = {}

# SQLite creates the database file but not its parent directory: without this
# the first run on a fresh checkout fails when the study is created.
os.makedirs("supply_points_sqlites", exist_ok=True)

for cluster_name in clusters.keys():
    X_train, X_test, y_train, y_test = clusters[cluster_name]

    study_name = f"Hyperparameter Tuning - SVR_{cluster_name}"
    storage = f"sqlite:///supply_points_sqlites/SVR_{cluster_name}.sqlite3"

    if os.path.exists(f"supply_points_sqlites/SVR_{cluster_name}.sqlite3"):
        # A database already exists: reuse its trials instead of re-optimising.
        study = optuna.load_study(
            study_name=study_name,
            storage=storage,
        )
    else:
        study = optuna.create_study(
            study_name=study_name,
            storage=storage,
            direction="minimize",
            load_if_exists=True,
        )

        # `objective` must be the SVR objective defined just above (the name
        # is reused for every model in this notebook).
        study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=100, show_progress_bar=True)

    svr_studies[cluster_name] = study

## QRNN

In [None]:
from quantnn.qrnn import QRNN

In [None]:
# Quantile levels predicted by the QRNN during tuning: 0.01, 0.02, ..., 0.99.
quantiles = np.linspace(0.01, 0.99, 99)


def fit_and_validate_qrnn_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Train a QRNN on one fold and score it on the held-out rows.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` (features and target).
    train_index, val_index : positional indices of the training and
        validation rows.
    params : dict with the keys ``"n_layers"``, ``"n_units"``,
        ``"activation"`` and ``"batch_size"``.

    Returns
    -------
    The root mean squared error between the validation targets and the mean
    of the predicted quantiles (taken over the last axis of the prediction).
    """
    X_fit = X.iloc[train_index].to_numpy()
    X_holdout = X.iloc[val_index].to_numpy()
    y_fit, y_holdout = y.iloc[train_index], y.iloc[val_index]

    architecture = (params["n_layers"], params["n_units"], params["activation"])
    qrnn_model = QRNN(
        n_inputs=X_fit.shape[1],
        quantiles=quantiles,
        model=architecture,
    )

    # AdamW with a cosine-annealed learning rate over a fixed number of epochs.
    n_epochs = 50
    optimizer = torch.optim.AdamW(qrnn_model.model.parameters())
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    qrnn_model.train(
        training_data=(np.array(X_fit), np.array(y_fit)),
        optimizer=optimizer,
        scheduler=scheduler,
        n_epochs=n_epochs,
        device="cpu",
        batch_size=params["batch_size"],
        logger=None,
    )

    # No gradients are needed for inference.
    with torch.no_grad():
        quantile_pred = qrnn_model.predict(X_holdout)

    # Collapse the predicted quantiles into a single point estimate.
    point_pred = quantile_pred.mean(axis=-1)
    return np.sqrt(mean_squared_error(y_holdout, point_pred))

In [None]:
# Activation functions explored during QRNN hyperparameter tuning.
activations = [
    "elu", "hardshrink", "hardtanh", "prelu", "relu",
    "selu", "celu", "sigmoid", "softplus", "softmin",
]

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for the QRNN: mean leave-one-out validation score.

    Samples the number of layers (1-3), the number of units (32-512, log
    scale), the activation (from the module-level ``activations`` list) and
    the batch size (4, 8 or 16). One network is trained per leave-one-out
    fold and the mean of the per-fold RMSE is returned. With leave-one-out
    each fold holds a single sample, so this is the mean absolute error.

    NOTE: this definition shadows the ``objective`` functions defined earlier
    in the notebook and is itself redefined later.
    """
    config = {
        "n_layers": trial.suggest_int("n_layers", 1, 3),
        "n_units": trial.suggest_int("n_units", 32, 512, log=True),
        "activation": trial.suggest_categorical("activation", activations),
        "batch_size": trial.suggest_categorical("batch_size", [4, 8, 16]),
    }

    splitter = LeaveOneOut()
    fold_scores = np.array(
        [
            fit_and_validate_qrnn_model(X_cv, y_cv, fit_idx, holdout_idx, config)
            for fit_idx, holdout_idx in splitter.split(X_cv, y_cv)
        ]
    )

    # To store the individual fold scores with the trial, uncomment:
    # trial.set_user_attr("split_rmse", fold_scores)

    return fold_scores.mean()

In [None]:
# Hyperparameter search for the QRNN, one Optuna study per cluster, cached on disk.
qrnn_studies = {}

# SQLite creates the database file but not its parent directory: without this
# the first run on a fresh checkout fails when the study is created.
os.makedirs("supply_points_sqlites", exist_ok=True)

for cluster_name in clusters.keys():
    X_train, X_test, y_train, y_test = clusters[cluster_name]

    study_name = f"Hyperparameter Tuning - QRNN_{cluster_name}"
    storage = f"sqlite:///supply_points_sqlites/QRNN_{cluster_name}.sqlite3"

    if os.path.exists(f"supply_points_sqlites/QRNN_{cluster_name}.sqlite3"):
        # A database already exists: reuse its trials instead of re-optimising.
        study = optuna.load_study(
            study_name=study_name,
            storage=storage,
        )
    else:
        study = optuna.create_study(
            study_name=study_name,
            storage=storage,
            direction="minimize",
            load_if_exists=True,
        )

        # `objective` must be the QRNN objective defined just above (the name
        # is reused for every model in this notebook).
        study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=100, show_progress_bar=True)

    qrnn_studies[cluster_name] = study

## XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
def fit_and_validate_xgb_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an XGBRegressor on one fold and score it on the held-out rows.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` (features and target).
    train_index, val_index : positional indices of the training and
        validation rows.
    params : dict of keyword arguments forwarded to ``XGBRegressor``
        (``random_state=42`` is always set in addition).

    Returns
    -------
    The root mean squared error on the validation rows.
    """
    X_fit, X_holdout = X.iloc[train_index, :], X.iloc[val_index, :]
    y_fit, y_holdout = y.iloc[train_index], y.iloc[val_index]

    xgb_model = XGBRegressor(random_state=42, **params)
    xgb_model.fit(X_fit, y_fit)

    holdout_mse = mean_squared_error(y_holdout, xgb_model.predict(X_holdout))
    return np.sqrt(holdout_mse)

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for XGBoost (gblinear): mean leave-one-out score.

    Samples the regularisation terms, the step size, the number of estimators
    and the linear updater. One model is fitted per leave-one-out fold and the
    mean of the per-fold RMSE is returned. With leave-one-out each fold holds
    a single sample, so this is the mean absolute error.

    NOTE: this definition shadows the ``objective`` functions defined earlier
    in the notebook.
    NOTE(review): both "eta" and "learning_rate" are tuned; XGBoost documents
    them as aliases of the same parameter - confirm which one takes effect.
    """
    # The sampling order is kept fixed: eta, reg_lambda, reg_alpha,
    # learning_rate, n_estimators, updater.
    tuned = {
        "eta": trial.suggest_float("eta", 1e-5, 1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1, 500),
        "updater": trial.suggest_categorical("updater", ["shotgun", "coord_descent"]),
    }

    params = {
        "objective": "reg:squarederror",
        "booster": "gblinear",
        **tuned,
        "eval_metric": "rmse",
    }

    splitter = LeaveOneOut()
    fold_scores = np.array(
        [
            fit_and_validate_xgb_model(X_cv, y_cv, fit_idx, holdout_idx, params)
            for fit_idx, holdout_idx in splitter.split(X_cv, y_cv)
        ]
    )

    # To store the individual fold scores with the trial, uncomment:
    # trial.set_user_attr("split_rmse", fold_scores)

    return fold_scores.mean()

In [None]:
# Hyperparameter search for XGBoost, one Optuna study per cluster, cached on disk.
xgb_studies = {}

# SQLite creates the database file but not its parent directory: without this
# the first run on a fresh checkout fails when the study is created.
os.makedirs("supply_points_sqlites", exist_ok=True)

for cluster_name in clusters.keys():
    X_train, X_test, y_train, y_test = clusters[cluster_name]

    study_name = f"Hyperparameter Tuning - XGB_{cluster_name}"
    storage = f"sqlite:///supply_points_sqlites/XGB_{cluster_name}.sqlite3"

    if os.path.exists(f"supply_points_sqlites/XGB_{cluster_name}.sqlite3"):
        # A database already exists: reuse its trials instead of re-optimising.
        study = optuna.load_study(
            study_name=study_name,
            storage=storage,
        )
    else:
        study = optuna.create_study(
            study_name=study_name,
            storage=storage,
            direction="minimize",
            load_if_exists=True,
        )

        # `objective` must be the XGBoost objective defined just above (the
        # name is reused for every model in this notebook).
        study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=100, show_progress_bar=True)

    xgb_studies[cluster_name] = study

# Comparison

In [None]:
# Collect, for every cluster, the best trial of each model's study.
studies_by_model = {
    "PLS": pls_studies,
    "SVR": svr_studies,
    "QRNN": qrnn_studies,
    "XGB": xgb_studies,
}

best_studies = {
    cluster_name: {
        model_name: model_studies[cluster_name].best_trial
        for model_name, model_studies in studies_by_model.items()
    }
    for cluster_name in clusters.keys()
}

In [None]:
# Table of the best objective value (rounded to 3 decimals) per cluster (rows)
# and model (columns).
model_names = best_studies["cluster_0"].keys()
comparison_df = pd.DataFrame(
    index=list(best_studies.keys()),
    columns=model_names,
)

for cluster, best_trials in best_studies.items():
    for model, best_trial in best_trials.items():
        comparison_df.loc[cluster, model] = np.round(best_trial.value, 3)
    

In [None]:
# Show the comparison with models as rows and clusters as columns.
comparison_df.T

# Evaluation

In [None]:
# Filled by the evaluation loop below: test-set predictions and the trained
# model, both keyed by cluster name.
clusters_eval, cluster_models = {}, {}

In [None]:
# Evaluate the tuned QRNN on the test split of every cluster.
# The network is retrained `n_iterations` times and the predicted quantiles
# (2.5 %, 50 %, 97.5 %) are averaged over the runs to smooth out the
# randomness of the training.
n_iterations = 50
n_epochs = 50

for cluster_name in clusters.keys():

    # Loop-invariant inputs: fetched and converted once per cluster instead of
    # once per training run.
    qrnn_best_trial = qrnn_studies[cluster_name].best_trial
    X_train, X_test, y_train, y_test = clusters[cluster_name]
    X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()
    y_train_np = np.array(y_train)

    medians = []
    lower = []
    upper = []

    for _ in range(n_iterations):

        qrnn = QRNN(
            n_inputs=X_train.shape[1],
            quantiles=[0.025, 0.5, 0.975],
            model=(
                qrnn_best_trial.params["n_layers"],
                qrnn_best_trial.params["n_units"],
                qrnn_best_trial.params["activation"],
            ),
        )
        optimizer = torch.optim.AdamW(qrnn.model.parameters())
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

        qrnn.train(
            training_data=(X_train_np, y_train_np),
            optimizer=optimizer,
            scheduler=scheduler,
            n_epochs=n_epochs,
            device="cpu",
            batch_size=qrnn_best_trial.params["batch_size"],
            logger=None,
        )

        # Only the model of the last run is kept for the sensor predictions.
        cluster_models[cluster_name] = qrnn

        # One forward pass yields all three quantiles; gradients are not
        # needed for inference (same as in the cross-validation helper).
        with torch.no_grad():
            y_test_pred = qrnn.predict(X_test_np)

        # Column order follows the `quantiles` list passed to the model.
        lower.append(y_test_pred[:, 0])
        medians.append(y_test_pred[:, 1])
        upper.append(y_test_pred[:, 2])

    clusters_eval[cluster_name] = {
        "y_test": y_test,
        "y_test_median": np.mean(medians, axis=0),
        "y_test_lower": np.mean(lower, axis=0),
        "y_test_upper": np.mean(upper, axis=0),
    }
        

In [None]:
# Scatter of the true test targets against the averaged median prediction,
# with a dashed identity line from 0 to 14 as visual reference.
for cluster_name, eval_result in clusters_eval.items():
    y_test = eval_result["y_test"]
    y_test_median = eval_result["y_test_median"]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(y_test, y_test_median, "o")
    ax.plot([0, 14], [0, 14], "--")
    ax.set_xlabel("True")
    ax.set_ylabel("Predicted")
    ax.set_title(f"{cluster_name} - True vs Predicted")
    plt.show()

In [None]:
# Plot, per cluster, the true test targets and the QRNN median prediction
# with its 95 % prediction interval against the sampling time.

for cluster_name in clusters.keys():
    y_test = clusters_eval[cluster_name]["y_test"]
    y_pred_qrnn_median = clusters_eval[cluster_name]["y_test_median"]
    y_pred_qrnn_lower = clusters_eval[cluster_name]["y_test_lower"]
    y_pred_qrnn_upper = clusters_eval[cluster_name]["y_test_upper"]

    # y_test carries the index LABELS of grab_df. After dropna() labels and
    # positions no longer coincide, so the rows must be looked up by label
    # (.loc), not by position (.iloc).
    grab_df_test = grab_df.loc[y_test.index]

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=grab_df_test['DateTime'],
            y=y_test,
            mode="markers",
            name="True TTHMs",
            line=dict(color="black"),
            marker=dict(size=10),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=grab_df_test['DateTime'],
            y=y_pred_qrnn_median,
            mode="markers",
            name="Predicted TTHMs (95% PI)",
            line=dict(color="green"),
            marker=dict(size=10),
            # plotly expects the LENGTH of each error bar measured from `y`,
            # not the absolute bound, hence the differences to the median.
            error_y=dict(
                type='data',
                symmetric=False,
                array=y_pred_qrnn_upper - y_pred_qrnn_median,
                arrayminus=y_pred_qrnn_median - y_pred_qrnn_lower,
                thickness=2,
                width=5,
                color="green",
            ),
        )
    )

    # Numeric suffix of the cluster name, e.g. "cluster_0" -> 0.
    cluster_index = int(cluster_name.split("_")[-1])

    # get the legend inside the plot
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="right",
            x=1,
        ),
        margin=dict(l=10, r=10, t=30, b=10),
        title=f"Cluster {cluster_index} - QRNN Predictions",
    )

    fig.update_xaxes(title_text="Time")
    fig.update_yaxes(title_text="TTHMs (µg/L)")

    fig.update_yaxes(range=[0, 25])

    # update the overall font
    fig.update_layout(font=dict(family="Arial", size=18))

    fig.show()

# Sensor Prediction

In [None]:
# Sensor-based TTHMs predictions for the supply points of cluster 0:
# one subplot per supply point with the predicted median, the 95 % prediction
# interval (shaded band) and the measured grab samples.
fig = make_subplots(
    rows=len(cluster_0_sensor_dict.keys()),
    cols=1,
    subplot_titles=list(cluster_0_sensor_dict.keys()),
    shared_xaxes=True,
    vertical_spacing=0.05,
    x_title="Time",
    y_title="TTHMs (µg/L)",
)

colors = [
    "blue",
    "red",
    "green",
    "purple",
    "orange",
]

# The legend entry of the grab samples is shown for the first subplot only.
show_true_TTHMs = True

# Model trained on cluster 0 (the same for every supply point of the cluster).
model = cluster_models["cluster_0"]

for index, supply_point_name in enumerate(cluster_0_sensor_dict.keys()):

    sensor_df = cluster_0_sensor_dict[supply_point_name].copy()

    supply_points_grab = grab_df[grab_df['Code'] == supply_point_name].copy()

    # drop() keeps the stored column order of the sensor frame, whereas
    # Index.difference() returns the names sorted and would feed the features
    # to the model in a different order than the one used for training.
    X_sensor = sensor_df.drop(columns=["DateTime"]).to_numpy()

    # A single forward pass returns all quantiles: lower, median, upper.
    y_pred = model.predict(X_sensor)
    y_pred_lower = y_pred[:, 0]
    y_pred_median = y_pred[:, 1]
    y_pred_upper = y_pred[:, 2]

    sensor_df['DateTime'] = pd.to_datetime(sensor_df['DateTime'])

    # Band: the invisible lower bound is drawn first and the upper bound is
    # filled down to it ('tonexty' fills to the previous trace), so the
    # interval is shaded exactly once.
    fig.add_trace(
        go.Scatter(
            x=sensor_df['DateTime'],
            y=y_pred_lower,
            name="Predicted TTHMs (95% PI) Lower",
            line=dict(width=0),
            mode='lines',
            showlegend=False
        ),
        row=index + 1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=sensor_df['DateTime'],
            y=y_pred_upper,
            mode="lines",
            name="Predicted TTHMs (95% PI) Upper",
            line=dict(width=0),
            fillcolor='rgba(68, 68, 68, 0.3)',
            fill='tonexty',
            showlegend=False
        ),
        row=index + 1,
        col=1,
    )

    # Median on top of the band; the palette is cycled so that more supply
    # points than colours do not raise an IndexError.
    fig.add_trace(
        go.Scatter(
            x=sensor_df['DateTime'],
            y=y_pred_median,
            mode="lines",
            name="Predicted TTHMs (95% PI)",
            line=dict(color=colors[index % len(colors)]),
            showlegend=False
        ),
        row=index + 1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=supply_points_grab['DateTime'],
            y=supply_points_grab['TTHMs'],
            mode="markers",
            name="True TTHMs",
            line=dict(color="black"),
            marker=dict(size=10),
            showlegend=show_true_TTHMs,
        ),
        row=index + 1,
        col=1
    )

    show_true_TTHMs = False

    # Update y-axis range for each subplot
    fig.update_yaxes(range=[0, 15], row=index + 1, col=1)

fig.update_layout(
    font=dict(family="Arial", size=18),  # maintain font consistency
    width=1200,
    height=800,
    margin=dict(l=10, r=10, t=30, b=60),
    legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="right",
            x=1,
        ),
)

fig.show()

# fig.write_image(
#     "cluster_0_sensor.png",
#     scale=3,
# )

In [None]:
# Sensor-based TTHMs predictions for the supply points of cluster 1:
# one subplot per supply point with the predicted median, the 95 % prediction
# interval (shaded band) and the measured grab samples.
fig = make_subplots(
    rows=len(cluster_1_sensor_dict.keys()),
    cols=1,
    subplot_titles=list(cluster_1_sensor_dict.keys()),
    shared_xaxes=True,
    vertical_spacing=0.05,
    x_title="Time",
    y_title="TTHMs (µg/L)",
)

colors = [
    "brown",
    "olive",
    "teal",
    "navy",
    "pink",
]

# The legend entry of the grab samples is shown for the first subplot only.
show_true_TTHMs = True

# Model trained on cluster 1 (the same for every supply point of the cluster).
model = cluster_models["cluster_1"]

for index, supply_point_name in enumerate(cluster_1_sensor_dict.keys()):

    sensor_df = cluster_1_sensor_dict[supply_point_name].copy()

    supply_points_grab = grab_df[grab_df['Code'] == supply_point_name].copy()

    # drop() keeps the stored column order of the sensor frame, whereas
    # Index.difference() returns the names sorted and would feed the features
    # to the model in a different order than the one used for training.
    X_sensor = sensor_df.drop(columns=["DateTime"]).to_numpy()

    # A single forward pass returns all quantiles: lower, median, upper.
    y_pred = model.predict(X_sensor)
    y_pred_lower = y_pred[:, 0]
    y_pred_median = y_pred[:, 1]
    y_pred_upper = y_pred[:, 2]

    sensor_df['DateTime'] = pd.to_datetime(sensor_df['DateTime'])

    # Band: the invisible lower bound is drawn first and the upper bound is
    # filled down to it ('tonexty' fills to the previous trace), so the
    # interval is shaded exactly once.
    fig.add_trace(
        go.Scatter(
            x=sensor_df['DateTime'],
            y=y_pred_lower,
            name="Predicted TTHMs (95% PI) Lower",
            line=dict(width=0),
            mode='lines',
            showlegend=False
        ),
        row=index + 1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=sensor_df['DateTime'],
            y=y_pred_upper,
            mode="lines",
            name="Predicted TTHMs (95% PI) Upper",
            line=dict(width=0),
            fillcolor='rgba(68, 68, 68, 0.3)',
            fill='tonexty',
            showlegend=False
        ),
        row=index + 1,
        col=1,
    )

    # Median on top of the band; the palette is cycled so that more supply
    # points than colours do not raise an IndexError.
    fig.add_trace(
        go.Scatter(
            x=sensor_df['DateTime'],
            y=y_pred_median,
            mode="lines",
            name="Predicted TTHMs (95% PI)",
            line=dict(color=colors[index % len(colors)]),
            showlegend=False
        ),
        row=index + 1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=supply_points_grab['DateTime'],
            y=supply_points_grab['TTHMs'],
            mode="markers",
            name="True TTHMs",
            line=dict(color="black"),
            marker=dict(size=10),
            showlegend=show_true_TTHMs,
        ),
        row=index + 1,
        col=1
    )

    show_true_TTHMs = False

fig.update_layout(
    font=dict(family="Arial", size=18),  # maintain font consistency
    width=1200,
    height=800,
    margin=dict(l=10, r=10, t=30, b=60),
    legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="right",
            x=1,
        ),
)

fig.show()

# fig.write_image(
#     "cluster_1_sensor.png",
#     scale=3,
# )