# Variables With Prediction Modelling

Variables with future predictions are used to predict the Absorbance.

The variables are:
* Air temperature
* Daily Cumulated Rainfall
* Water Temperature
* Flow River (River Discharge in the Projections folder)


The following models are used:
* linear regression
* random forest
* gradient boosting (XGBoost)
* neural network

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import (
    variance_inflation_factor,
)

import statsmodels.api as sm

from quantile_forest import RandomForestQuantileRegressor

import xgboost as xgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

# Hyperparameter Optimization
import optuna

# Neural Network
from sklearn.neural_network import MLPRegressor

# MAPiE
from mapie.regression import MapieQuantileRegressor, MapieRegressor
from mapie.metrics import regression_coverage_score

plt.rcParams["font.size"] = 16

# Define paths

In [None]:
data_folder = os.path.join("..", "..", "..", "data", "tarragona")

raw_data_folder = os.path.join(data_folder, "raw_data")

# Load dataset

In [None]:
full_df = pd.read_excel(
    os.path.join(raw_data_folder, "raw_full_dataset.xlsx")
)

In [None]:
full_df

In [None]:
full_df.rename(
    columns={
        "flowriver": "Flow River",
        "cumulated_rainfall_24h": "Daily Cumulated Rainfall",
        "environmental_temperature": "Air Temperature",
        "nitrate": "Nitrate",
        "dissolvedoxygen": "Dissolved Oxygen",
        "turbidity": "Turbidity",
        "watertemperature": "Water Temperature",
        "redoxpotential": "Redox Potential",
        "ABS254": "Absorbance 254nm",
    },
    inplace=True,
)

In [None]:
full_df.isna().sum()

In [None]:
full_df = full_df.dropna()

# Remove Variables with no future projections

They are:
- Nitrate
- pH
- Ammonium
- Dissolved Oxygen
- Conductivity
- Redox Potential
- Turbidity

In [None]:
full_df.drop(
    columns=[
        "Nitrate",
        "pH",
        "Ammonium",
        "Dissolved Oxygen",
        "Conductivity",
        "Redox Potential",
        "Turbidity",
    ],
    inplace=True,
)

## Take the Monthly Average

In [None]:
full_df["Year"] = full_df["DateTime"].dt.year
full_df["Month"] = full_df["DateTime"].dt.month

In [None]:
# take the monthly average
monthly_avg_df = full_df.groupby(["Year", "Month"]).mean().reset_index()

In [None]:
full_df = monthly_avg_df

In [None]:
# Integer season code per calendar month: 1 = Winter (Dec-Feb),
# 2 = Spring (Mar-May), 3 = Summer (Jun-Aug). Any month value that is not
# listed in the table is coded 4 = Autumn.
season_code_by_month = {
    12: 1,
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
}
full_df["Season"] = full_df["Month"].map(
    lambda month: season_code_by_month.get(month, 4)
)

# Numeric time axis: POSIX timestamp of each row's DateTime.
full_df["Timestamp"] = full_df["DateTime"].map(
    lambda moment: moment.timestamp()
)

# Multicollinearity Test

In [None]:
# Multicollinearity check on the candidate predictors.
# The target ("Absorbance 254nm") is excluded: VIF measures how well each
# predictor is explained by the *other predictors*, so the response must not
# be part of the design matrix.
X = full_df[full_df.columns.difference(["DateTime", "Absorbance 254nm"])]

# variance_inflation_factor() regresses one column on the remaining ones
# exactly as given and does not add an intercept itself, so a constant
# column is appended to the design matrix (and left out of the report).
vif_design = sm.add_constant(X)

vif_test = pd.DataFrame()
vif_test["variable"] = X.columns
vif_test["VIF"] = [
    variance_inflation_factor(
        vif_design.values, vif_design.columns.get_loc(column)
    )
    for column in X.columns
]

In [None]:
vif_test

In [None]:
X = full_df[full_df.columns.difference(["DateTime"])]


vif_test = pd.DataFrame()
vif_test["variable"] = X.columns
vif_test["VIF"] = [
    variance_inflation_factor(X.values, i) for i in range(X.shape[1])
]

In [None]:
vif_test

# Linear Regression

## All Features + Year

In [None]:
setting = "All Features + Year"

### Split Data

In [None]:
further_features = ["Year", "Season", "Month", "Timestamp"]

to_drop = ["Timestamp", "Month", "Season"]

further_features = [
    feature for feature in further_features if feature not in to_drop
]

X_columns_to_drop = to_drop + ["Absorbance 254nm"]

X = full_df[full_df.columns.difference(X_columns_to_drop)]

y = full_df[["DateTime", "Absorbance 254nm"]]

In [None]:
# split data such that 2022 and 2023 are test data
X_train = X[X["DateTime"] < "2022-01-01"]
X_test = X[X["DateTime"] >= "2022-01-01"]

y_train = y[y["DateTime"] < "2022-01-01"]
y_test = y[y["DateTime"] >= "2022-01-01"]

In [None]:
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train[y_train.columns.difference(["DateTime"])]
y_test_fit = y_test[y_test.columns.difference(["DateTime"])]

In [None]:
scaler = StandardScaler()

X_train_fit = scaler.fit_transform(X_train_fit)
X_test_fit = scaler.transform(X_test_fit)

X_train_fit = pd.DataFrame(
    X_train_fit, columns=X_train.columns.difference(["DateTime"])
)
X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_test.columns.difference(["DateTime"])
)

### With available future projections

#### Train Model

In [None]:
model = sm.OLS(y_train_fit.values, sm.add_constant(X_train_fit)).fit()

In [None]:
results = model.summary2()

#### Results

In [None]:
print(results)

#### Predictions

In [None]:
# predict the model
predictions = model.get_prediction(
    sm.add_constant(X_test_fit)
).summary_frame(alpha=0.05)

In [None]:
train_res = model.resid

#### Plots

In [None]:
rmse = np.sqrt(mean_squared_error(y_test_fit, predictions["mean"]))
r2 = r2_score(y_test_fit, predictions["mean"])

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(y_test_fit, predictions["mean"], c="b", s=40, alpha=0.5)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
plt.figure(figsize=(10, 5))
plt.scatter(model.fittedvalues, train_res, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Train Set Residuals Plot")
plt.show()

In [None]:
# plot the fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train["DateTime"],
    y=model.fittedvalues.values,
    label="Fitted Values",
)

plt.show()

In [None]:
residuals = (
    y_test_fit["Absorbance 254nm"].values - predictions["mean"].values
)

In [None]:
# plot the residuals
plt.figure(figsize=(10, 5))
plt.scatter(predictions["mean"], residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
# sns.lineplot(x=y_train['DateTime'], y=model.fittedvalues.values, label='Fitted Values')
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=predictions["mean"].values,
    label="Predicted Values",
)
plt.fill_between(
    y_test["DateTime"],
    predictions["obs_ci_lower"],
    predictions["obs_ci_upper"],
    alpha=0.2,
    label="95% Prediction Interval",
)
plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
        f"AIC = {model.aic:.2f}",
    )
)

plt.text(
    y_train["DateTime"].iloc[0] - pd.Timedelta(days=120),
    75,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"Absorbance 254nm - Setting: {setting}")

plt.legend()
plt.show()

#### Store Results

In [None]:
lr_pred = predictions["mean"]
lr_lower_bound = predictions["obs_ci_lower"]
lr_upper_bound = predictions["obs_ci_upper"]
lr_rmse = rmse
lr_r2 = r2

## All Features + Year w/ log(y)

In [None]:
# Label shown in this section's plot title. The section fits all remaining
# features + Year on a log-transformed target; the previous label listed
# variables (Ammonium, Redox Potential, Turbidity) that were dropped earlier.
setting = "All Features + Year w/ log(y)"

### Split Data

In [None]:
# Time-derived helper columns; the ones listed in `to_drop` are excluded
# from the feature matrix, the remaining ones (here only "Year") are kept.
further_features = ["Year", "Season", "Month", "Timestamp"]

to_drop = ["Timestamp", "Month", "Season"]

further_features = [
    feature for feature in further_features if feature not in to_drop
]

X_columns_to_drop = to_drop + ["Absorbance 254nm"]

# Feature matrix (DateTime is kept for the chronological split below).
X = full_df[full_df.columns.difference(X_columns_to_drop)]

# Explicit copy: the target column is overwritten right below, and writing
# into a column selection of full_df triggers pandas' chained-assignment
# warning.
y = full_df[["DateTime", "Absorbance 254nm"]].copy()

# scale y to ln(y)
y["Absorbance 254nm"] = np.log(y["Absorbance 254nm"])

In [None]:
# split data such that 2022 and 2023 are test data
X_train = X[X["DateTime"] < "2022-01-01"]
X_test = X[X["DateTime"] >= "2022-01-01"]

y_train = y[y["DateTime"] < "2022-01-01"]
y_test = y[y["DateTime"] >= "2022-01-01"]

In [None]:
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train[y_train.columns.difference(["DateTime"])]
y_test_fit = y_test[y_test.columns.difference(["DateTime"])]

# scale the data
scaler = StandardScaler()

X_train_fit = scaler.fit_transform(X_train_fit)
X_test_fit = scaler.transform(X_test_fit)

X_train_fit = pd.DataFrame(
    X_train_fit, columns=X_train.columns.difference(["DateTime"])
)
X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_test.columns.difference(["DateTime"])
)

### Train Model

In [None]:
model = sm.OLS(y_train_fit.values, sm.add_constant(X_train_fit)).fit()

In [None]:
results = model.summary2()

### Results

In [None]:
print(results)

### Predictions

In [None]:
# predict the model
predictions = model.get_prediction(
    sm.add_constant(X_test_fit)
).summary_frame(alpha=0.05)

In [None]:
predictions

In [None]:
# Back-transform targets and predictions from ln(y) to the original
# absorbance scale so that the metrics and plots below are in the same
# units as the other models.
# y_train / y_test are boolean-mask selections of y, so .assign() is used to
# build new frames instead of writing into them (avoids pandas'
# SettingWithCopyWarning).
# NOTE: re-running this cell without re-running the split applies exp()
# a second time.
y_train = y_train.assign(
    **{"Absorbance 254nm": np.exp(y_train["Absorbance 254nm"])}
)
y_test = y_test.assign(
    **{"Absorbance 254nm": np.exp(y_test["Absorbance 254nm"])}
)
y_test_fit = np.exp(y_test_fit)

# NOTE(review): exp() of a log-scale mean is the median on the original scale
# when the errors are log-normal, so "mean" is not bias-corrected — confirm
# this is acceptable for the RMSE comparison.
predictions["mean"] = np.exp(predictions["mean"])
predictions["obs_ci_lower"] = np.exp(predictions["obs_ci_lower"])
predictions["obs_ci_upper"] = np.exp(predictions["obs_ci_upper"])

### Plots

In [None]:
rmse = np.sqrt(mean_squared_error(y_test_fit, predictions["mean"]))
r2 = r2_score(y_test_fit, predictions["mean"])

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(y_test_fit, predictions["mean"], c="b", s=40, alpha=0.5)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = (
    y_test_fit["Absorbance 254nm"].values - predictions["mean"].values
)
plt.figure(figsize=(10, 5))
plt.scatter(predictions["mean"], residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Residuals Distribution")
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=predictions["mean"].values,
    label="Predicted Values",
)
plt.fill_between(
    y_test["DateTime"],
    predictions["obs_ci_lower"],
    predictions["obs_ci_upper"],
    alpha=0.2,
    label="95% Prediction Interval",
)
plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
        f"AIC = {model.aic:.2f}",
    )
)

plt.text(
    y_train["DateTime"].iloc[0] - pd.Timedelta(days=120),
    75,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"Absorbance 254nm - Setting: {setting}")

plt.legend()
plt.show()

### Store Results

In [None]:
log_pred = predictions["mean"]
log_lower_bound = predictions["obs_ci_lower"]
log_upper_bound = predictions["obs_ci_upper"]
log_rmse = rmse
log_r2 = r2

## All Features + Year + Polynomial Features

### Split Data

In [None]:
# Label shown in this section's plot title; without it the title would
# still carry the label assigned in the previous section.
setting = "All Features + Year + Polynomial Features"

# Time-derived helper columns; those in `to_drop` are removed from the
# feature matrix, the rest (here only "Year") are kept out of the
# polynomial expansion and the scaling, and appended unchanged afterwards.
further_features = ["Year", "Season", "Month", "Timestamp"]

to_drop = ["Timestamp", "Month", "Season"]

further_features = [
    feature for feature in further_features if feature not in to_drop
]

X_columns_to_drop = to_drop + ["Absorbance 254nm"]

# Feature matrix (DateTime is kept for the chronological split).
X = full_df[full_df.columns.difference(X_columns_to_drop)]

y = full_df[["DateTime", "Absorbance 254nm"]]

In [None]:
# add polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = poly.fit_transform(
    X[X.columns.difference(["DateTime"] + further_features)]
)

X_poly = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(
        X.columns.difference(["DateTime"] + further_features)
    ),
)

X_poly["DateTime"] = X["DateTime"].values

X_poly[further_features] = X[further_features].values

In [None]:
# split data such that 2022 and 2023 are test data
X_train = X_poly[X_poly["DateTime"] < "2022-01-01"]
X_test = X_poly[X_poly["DateTime"] >= "2022-01-01"]

y_train = y[y["DateTime"] < "2022-01-01"]
y_test = y[y["DateTime"] >= "2022-01-01"]

In [None]:
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train[y_train.columns.difference(["DateTime"])]
y_test_fit = y_test[y_test.columns.difference(["DateTime"])]

In [None]:
scaler = StandardScaler()

X_train_fit = scaler.fit_transform(
    X_train_fit[X_train_fit.columns.difference(further_features)]
)
X_test_fit = scaler.transform(
    X_test_fit[X_test_fit.columns.difference(further_features)]
)

X_train_fit = pd.DataFrame(
    X_train_fit,
    columns=X_train.columns.difference(["DateTime"] + further_features),
)
X_test_fit = pd.DataFrame(
    X_test_fit,
    columns=X_test.columns.difference(["DateTime"] + further_features),
)

X_train_fit[further_features] = X_train[further_features].values
X_test_fit[further_features] = X_test[further_features].values

### Train Model

In [None]:
model = sm.OLS(y_train_fit.values, sm.add_constant(X_train_fit)).fit()

In [None]:
results = model.summary2()

### Results

In [None]:
print(results)

### Predictions

In [None]:
# predict the model
predictions = model.get_prediction(
    sm.add_constant(X_test_fit)
).summary_frame(alpha=0.05)

In [None]:
train_res = model.resid

### Plots

In [None]:
rmse = np.sqrt(mean_squared_error(y_test_fit, predictions["mean"]))
r2 = r2_score(y_test_fit, predictions["mean"])

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(y_test_fit, predictions["mean"], c="b", s=40, alpha=0.5)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
plt.figure(figsize=(10, 5))
plt.scatter(model.fittedvalues, train_res, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Train Set Residuals Plot")
plt.show()

In [None]:
# plot the fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train["DateTime"],
    y=model.fittedvalues.values,
    label="Fitted Values",
)

plt.show()

In [None]:
residuals = (
    y_test_fit["Absorbance 254nm"].values - predictions["mean"].values
)

In [None]:
# plot the residuals
plt.figure(figsize=(10, 5))
plt.scatter(predictions["mean"], residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
# sns.lineplot(x=y_train['DateTime'], y=model.fittedvalues.values, label='Fitted Values')
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=predictions["mean"].values,
    label="Predicted Values",
)
plt.fill_between(
    y_test["DateTime"],
    predictions["obs_ci_lower"],
    predictions["obs_ci_upper"],
    alpha=0.2,
    label="95% Prediction Interval",
)
plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
        f"AIC = {model.aic:.2f}",
    )
)

plt.text(
    y_train["DateTime"].iloc[0] - pd.Timedelta(days=120),
    75,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"Absorbance 254nm - Setting: {setting}")

plt.legend()
plt.show()

### Store Results

In [None]:
poly_pred = predictions["mean"]
poly_lower_bound = predictions["obs_ci_lower"]
poly_upper_bound = predictions["obs_ci_upper"]
poly_rmse = rmse
poly_r2 = r2

## Final Plot

In [None]:
# create a plot for the comparison of the models
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
    color="red",
)

# Linear Regression
sns.lineplot(
    x=y_test["DateTime"],
    y=lr_pred.values,
    label="Linear Regression",
    linestyle="--",
    color="blue",
)
plt.fill_between(
    y_test["DateTime"],
    lr_lower_bound,
    lr_upper_bound,
    alpha=0.2,
    color="blue",
)

# Linear Regression Log
sns.lineplot(
    x=y_test["DateTime"],
    y=log_pred.values,
    label="Log Linear Regression",
    linestyle="--",
    color="orange",
)
plt.fill_between(
    y_test["DateTime"],
    log_lower_bound,
    log_upper_bound,
    alpha=0.2,
    color="orange",
)

# Linear Regression Poly
sns.lineplot(
    x=y_test["DateTime"],
    y=poly_pred.values,
    label="Poly Linear Regression",
    linestyle="--",
    color="green",
)
plt.fill_between(
    y_test["DateTime"],
    poly_lower_bound,
    poly_upper_bound,
    alpha=0.2,
    color="green",
)

plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"Linear Regression RMSE = {lr_rmse:.2f}, R\u00b2 = {lr_r2:.2f}",
        f"Log Linear Regression RMSE = {log_rmse:.2f}, R\u00b2 = {log_r2:.2f}",
        f"Poly Linear Regression RMSE = {poly_rmse:.2f}, R\u00b2 = {poly_r2:.2f}",
    )
)

plt.text(
    y_test["DateTime"].iloc[0] - pd.Timedelta(days=20),
    33,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"Absorbance 254nm - Model Comparison")

plt.legend()
plt.show()

In [None]:
# create a plot for the comparison of the models
plt.figure(figsize=(20, 10))
# sns.lineplot(x=y_train['DateTime'], y=y_train['Absorbance 254nm'], label='Historical Data')
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
    color="red",
)

# Linear Regression
sns.lineplot(
    x=y_test["DateTime"],
    y=lr_pred.values,
    label="Linear Regression",
    linestyle="--",
    color="blue",
)
plt.fill_between(
    y_test["DateTime"],
    lr_lower_bound,
    lr_upper_bound,
    alpha=0.2,
    color="blue",
)

# Linear Regression Log
sns.lineplot(
    x=y_test["DateTime"],
    y=log_pred.values,
    label="Log Linear Regression",
    linestyle="--",
    color="orange",
)
plt.fill_between(
    y_test["DateTime"],
    log_lower_bound,
    log_upper_bound,
    alpha=0.2,
    color="orange",
)

# Linear Regression Poly
sns.lineplot(
    x=y_test["DateTime"],
    y=poly_pred.values,
    label="Poly Linear Regression",
    linestyle="--",
    color="green",
)
plt.fill_between(
    y_test["DateTime"],
    poly_lower_bound,
    poly_upper_bound,
    alpha=0.2,
    color="green",
)

# change the y range to see the differences
plt.ylim(-5, 41)

plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm (un.Abs/m)")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"Linear Regression RMSE = {lr_rmse:.2f}",
        f"Log Linear Regression RMSE = {log_rmse:.2f}",
        f"Poly Linear Regression RMSE = {poly_rmse:.2f}",
    )
)

plt.text(
    y_test["DateTime"].iloc[0] - pd.Timedelta(days=20),
    36,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"Absorbance 254nm (un.Abs/m) - Model Comparison")

plt.legend()
plt.show()

# Random Forest

In [None]:
setting = "All Features + Year"

### Split Data

In [None]:
X = full_df[
    full_df.columns.difference(
        ["Absorbance 254nm", "Season", "Month", "Timestamp"]
    )
]

y = full_df[["DateTime", "Absorbance 254nm"]]

In [None]:
# split data such that 2022 and 2023 are test data
X_train = X[X["DateTime"] < "2022-01-01"]
X_test = X[X["DateTime"] >= "2022-01-01"]

y_train = y[y["DateTime"] < "2022-01-01"]
y_test = y[y["DateTime"] >= "2022-01-01"]

In [None]:
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train[y_train.columns.difference(["DateTime"])]
y_test_fit = y_test[y_test.columns.difference(["DateTime"])]

### Train Model

#### Perform K-Fold Cross-Validation

K = 10 is used because the training set spans (almost) 10 years.

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [None]:
rf_model = RandomForestQuantileRegressor(
    n_estimators=20, max_features="log2", max_depth=7, random_state=42
)

cv = KFold(n_splits=10)

rf_ensemble = cross_validate(
    rf_model,
    X_train_fit,
    y_train_fit.values.flatten(),
    cv=cv,
    scoring="neg_mean_squared_error",
    return_estimator=True,
    return_train_score=True,
)

### Results

In [None]:
# Feature importances of the forests fitted during cross-validation,
# stacked with one row per fitted estimator and one column per feature.
fold_importances = np.array(
    [fitted.feature_importances_ for fitted in rf_ensemble["estimator"]]
)

# Mean and spread of each feature's importance across the fitted
# estimators, most important feature first.
feature_importance = pd.DataFrame(
    {
        "feature": X_train_fit.columns,
        "importance_mean": fold_importances.mean(axis=0),
        "importance_std": fold_importances.std(axis=0),
    }
).sort_values(by="importance_mean", ascending=False)

# Point per feature, error bar = standard deviation across estimators.
plt.figure(figsize=(25, 7.5))
plt.errorbar(
    feature_importance["feature"],
    feature_importance["importance_mean"],
    yerr=feature_importance["importance_std"],
    fmt="o",
)
plt.title("Feature Importance")
plt.show()

### Predictions

In [None]:
# Test-set predictions from every forest fitted during cross-validation
# (one row per fitted estimator; they are averaged in the next cell).
#
# The point prediction is the conditional median (quantile 0.5); the name
# `mean_predictions` is kept because later cells use it.
mean_predictions = np.array(
    [
        estimator.predict(X_test_fit, quantiles=0.5)
        for estimator in rf_ensemble["estimator"]
    ]
)
# Bounds of a central 95% interval (2.5% and 97.5% quantiles). The previous
# 0.05 / 0.95 quantiles gave a 90% interval, although the band is plotted as
# "95% Prediction Interval" and compared with the 95% OLS intervals.
lower_bound_predictions = np.array(
    [
        estimator.predict(X_test_fit, quantiles=0.025)
        for estimator in rf_ensemble["estimator"]
    ]
)
upper_bound_predictions = np.array(
    [
        estimator.predict(X_test_fit, quantiles=0.975)
        for estimator in rf_ensemble["estimator"]
    ]
)

In [None]:
mean_predictions = np.mean(mean_predictions, axis=0)
lower_bound_predictions = np.mean(lower_bound_predictions, axis=0)
upper_bound_predictions = np.mean(upper_bound_predictions, axis=0)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test_fit, mean_predictions))
r2 = r2_score(y_test_fit, mean_predictions)

print(f"RMSE: {rmse:.2f}")
print(f"R\u00b2: {r2:.2f}")

### Plots

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(y_test_fit.values, mean_predictions, c="b", s=40, alpha=0.5)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = y_test_fit["Absorbance 254nm"] - mean_predictions
plt.figure(figsize=(10, 5))
plt.scatter(mean_predictions, residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
# get training residuals
train_predictions = np.array(
    [
        estimator.predict(X_train_fit, quantiles=0.5)
        for estimator in rf_ensemble["estimator"]
    ]
)
train_predictions = np.mean(train_predictions, axis=0)

train_residuals = y_train_fit["Absorbance 254nm"] - train_predictions

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series of the residuals and of the train set
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
    ax=ax[0],
)
sns.lineplot(
    x=y_train["DateTime"],
    y=train_predictions,
    label="Fitted Values",
    ax=ax[0],
)

sns.lineplot(
    x=y_train["DateTime"],
    y=train_residuals,
    label="Training Residuals",
    ax=ax[1],
)

plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
)
sns.lineplot(
    x=y_test["DateTime"], y=mean_predictions, label="Predicted Values"
)
# plot std of predictions
plt.fill_between(
    y_test["DateTime"],
    lower_bound_predictions,
    y2=upper_bound_predictions,
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
    )
)

plt.text(
    y_train["DateTime"].iloc[0] - pd.Timedelta(days=120),
    75,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"Absorbance 254nm - Setting: {setting}")

plt.legend()
plt.show()

Comment: the model predicts an increasing trend, probably driven by one of the features.

### Store Results

In [None]:
rf_pred = mean_predictions
rf_lower_bound = lower_bound_predictions
rf_upper_bound = upper_bound_predictions
rf_rmse = rmse
rf_r2 = r2

# XGBoost

### Split Data

In [None]:
further_features = ["Year", "Season", "Month", "Timestamp"]

to_drop = ["Timestamp", "Season", "Month"]

further_features = [
    feature for feature in further_features if feature not in to_drop
]

X_columns_to_drop = to_drop + ["Absorbance 254nm"]

X = full_df[full_df.columns.difference(X_columns_to_drop)]

y = full_df[["DateTime", "Absorbance 254nm"]]

In [None]:
# add polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = poly.fit_transform(X[X.columns.difference(["DateTime"])])

X_poly = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(
        X.columns.difference(["DateTime"])
    ),
)

X_poly["DateTime"] = X["DateTime"].values

# X_poly[further_features] = X[further_features].values

In [None]:
# Chronological split of the polynomial feature frame: rows dated before
# 2021-01-01 are training data, rows from 2021-01-01 onwards are test data.
# NOTE(review): the linear-regression and random-forest sections split at
# 2022-01-01 (test = 2022 and 2023) — confirm the earlier cutoff is intended
# here, since it makes the test sets differ between models.
X_train = X_poly[X_poly["DateTime"] < "2021-01-01"]
X_test = X_poly[X_poly["DateTime"] >= "2021-01-01"]

y_train = y[y["DateTime"] < "2021-01-01"]
y_test = y[y["DateTime"] >= "2021-01-01"]

In [None]:
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train[y_train.columns.difference(["DateTime"])]
y_test_fit = y_test[y_test.columns.difference(["DateTime"])]

In [None]:
# %%script false --no-raise-error
# NOTE(review): the cell magic above is commented out, so this cell runs and
# re-splits the plain feature frame X, replacing the X_train / X_test that
# the earlier split cell built from X_poly — confirm this is intended.
# Rows dated before 2021-01-01 are training data, later rows are test data
# (the other model sections use 2022-01-01 as cutoff).
X_train = X[X["DateTime"] < "2021-01-01"]
X_test = X[X["DateTime"] >= "2021-01-01"]

y_train = y[y["DateTime"] < "2021-01-01"]
y_test = y[y["DateTime"] >= "2021-01-01"]

In [None]:
# %%script false --no-raise-error
# Feature matrices without the DateTime column.
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

# NOTE(review): unlike the other sections, the targets are plain copies of
# y_train / y_test and therefore still contain the DateTime column —
# confirm that downstream code selects "Absorbance 254nm" explicitly.
y_train_fit = y_train.copy()
y_test_fit = y_test.copy()

In [None]:
scaler = StandardScaler()
# scale the data and keep the column names
X_train_fit = scaler.fit_transform(X_train_fit)
X_test_fit = scaler.transform(X_test_fit)

X_train_fit = pd.DataFrame(
    X_train_fit, columns=X_train.columns.difference(["DateTime"])
)
X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_test.columns.difference(["DateTime"])
)

### Train Model

#### Learning and Validation Curves

In [None]:
def learning_curves(
    estimator,
    title,
    X,
    y,
    cv=None,
    train_sizes=np.linspace(0.3, 1.0, 5),
):
    """Plot training and validation RMSE against the training-set size.

    The scores come from scikit-learn's ``learning_curve`` with the
    ``neg_mean_squared_error`` scorer. Each curve is the square root of the
    mean MSE across folds; the shaded band around it is one standard
    deviation of the per-fold RMSE.

    Parameters
    ----------
    estimator
        Estimator passed unchanged to ``learning_curve``.
    title
        Title of the figure.
    X, y
        Training data and targets passed unchanged to ``learning_curve``.
    cv
        Cross-validation specification passed to ``learning_curve``.
    train_sizes
        Training-set sizes passed to ``learning_curve``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator,
        X,
        y,
        train_sizes=train_sizes,
        cv=cv,
        scoring="neg_mean_squared_error",
    )

    # Scores are negated MSE values with one row per training size and one
    # column per fold.
    train_scores_mean = np.sqrt(-np.mean(train_scores, axis=1))
    validation_scores_mean = np.sqrt(-np.mean(validation_scores, axis=1))
    # Spread in RMSE units: standard deviation of the per-fold RMSE (the
    # square root of the standard deviation of the MSE is not a spread of
    # the plotted quantity).
    train_scores_std = np.std(np.sqrt(-train_scores), axis=1)
    validation_scores_std = np.std(np.sqrt(-validation_scores), axis=1)

    # rc_context() restores the global rcParams on exit, so the font sizes
    # set here no longer leak into every plot drawn after this call.
    with plt.rc_context():
        plt.rcParams["font.size"] = 12
        plt.figure(figsize=(20, 7.5))
        plt.plot(
            train_sizes,
            train_scores_mean,
            "o-",
            color="r",
            label="Training error",
        )
        plt.plot(
            train_sizes,
            validation_scores_mean,
            "o-",
            color="g",
            label="Validation error",
        )
        plt.fill_between(
            train_sizes,
            train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std,
            alpha=0.1,
            color="r",
        )
        plt.fill_between(
            train_sizes,
            validation_scores_mean - validation_scores_std,
            validation_scores_mean + validation_scores_std,
            alpha=0.1,
            color="g",
        )

        plt.rcParams["font.size"] = 10
        plt.ylabel("RMSE", fontsize=14)
        plt.xlabel("Training set size", fontsize=14)
        plt.title(title, fontsize=18, y=1.03)
        plt.legend()
        plt.show()

In [None]:
def validation_curves(
    estimator,
    title,
    X,
    y,
    cv=None,
    param_name=None,
    param_range=None,
    fit_params=None,
):
    """Plot training and cross-validation RMSE over a hyperparameter range.

    Args:
        estimator: scikit-learn compatible estimator, forwarded to
            ``sklearn.model_selection.validation_curve``.
        title: Title of the figure.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation strategy forwarded to ``validation_curve``.
        param_name: Name of the estimator parameter that is varied.
        param_range: Values of ``param_name`` to evaluate.
        fit_params: Forwarded unchanged as ``fit_params``.

    Side effects:
        Shows a matplotlib figure, leaves the global
        ``plt.rcParams["font.size"]`` set to 10 and prints the value of
        ``param_range`` with the lowest mean cross-validation RMSE.
    """
    train_scores, test_scores = validation_curve(
        estimator,
        X,
        y,
        param_name=param_name,
        param_range=param_range,
        fit_params=fit_params,
        cv=cv,
        scoring="neg_mean_squared_error",
        n_jobs=4,
    )

    # The scores are negated MSE values with one column per CV fold.
    # Convert every fold to RMSE first so that the mean and the spread are
    # both expressed in RMSE units; sqrt(std(MSE)) is not a valid spread
    # for the plotted RMSE curve.
    train_rmse = np.sqrt(-train_scores)
    test_rmse = np.sqrt(-test_scores)

    train_scores_mean = train_rmse.mean(axis=1)
    train_scores_std = train_rmse.std(axis=1)
    test_scores_mean = test_rmse.mean(axis=1)
    test_scores_std = test_rmse.std(axis=1)

    plt.rcParams["font.size"] = 12
    plt.figure(figsize=(20, 7.5))
    plt.title(title, fontsize=20)
    plt.xlabel(param_name, fontsize=14)
    # the curves show RMSE (lower is better), not the raw sklearn score
    plt.ylabel("RMSE", fontsize=14)
    lw = 2
    plt.plot(
        param_range,
        train_scores_mean,
        label="Training score",
        color="darkorange",
        lw=lw,
    )
    plt.fill_between(
        param_range,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.2,
        color="darkorange",
        lw=lw,
    )
    plt.plot(
        param_range,
        test_scores_mean,
        label="Cross-validation score",
        color="navy",
        lw=lw,
    )
    plt.fill_between(
        param_range,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.2,
        color="navy",
        lw=lw,
    )
    plt.rcParams["font.size"] = 10
    plt.legend(loc="best")
    plt.grid(visible=True)
    plt.show()

    # report the parameter value with the lowest mean cross-validation RMSE
    print(
        f"{param_name} with lowest score:",
        param_range[np.argmin(test_scores_mean)],
    )

In [None]:
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    booster="gblinear",
    random_state=42,
)

In [None]:
title = "Learning Curves (XGB)"
learning_curves(
    model, title, X_train_fit, y_train_fit["Absorbance 254nm"], cv=5
)

In [None]:
param_name = "n_estimators"
param_range = np.arange(1, 100, 5)

In [None]:
# `model` in this section is the XGBoost regressor, so label the plots
# accordingly; `title` is reused by the following validation-curve cells.
title = "Validation Curves (XGB)"
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "learning_rate"
param_range = np.logspace(-3, -1, 10)

In [None]:
np.logspace(-3, -1, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "eta"
param_range = np.logspace(-5, 0, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "reg_alpha"
param_range = np.logspace(-8, 0, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "reg_lambda"
param_range = np.logspace(-5, -1, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "scale_pos_weight"
param_range = np.logspace(-5, 0, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "base_score"
param_range = np.linspace(0.1, 10, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "importance_type"
param_range = ["gain", "weight", "cover", "total_gain", "total_cover"]

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

#### Hyperparameter Tuning

In [None]:
def fit_and_validate_xgb_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an XGBoost regressor on one fold and return its hold-out RMSE.

    Args:
        X: Feature DataFrame; rows are selected positionally with ``iloc``.
        y: Target Series; rows are selected positionally with ``iloc``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Keyword arguments forwarded to ``xgb.XGBRegressor``.

    Returns:
        Root mean squared error of the predictions on the validation rows.
    """
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = xgb.XGBRegressor(random_state=42, **params)

    # train model
    model.fit(X_tr, y_tr)

    # obtain predictions
    y_val_pred = model.predict(X_val)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [None]:
def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean 5-fold RMSE of a gblinear XGBoost model.

    Samples the hyperparameters from ``trial``, evaluates them with a
    shuffled 5-fold split of the global ``X_train_fit`` / ``y_train_fit``
    and returns the mean hold-out RMSE (to be minimised).

    NOTE(review): XGBoost documents ``eta`` as an alias of
    ``learning_rate``; both are sampled and passed to the model here, so
    one of the two is presumably redundant - confirm which takes effect.
    """
    # sample in a fixed order so the trial's parameter order is stable
    sampled = {
        "eta": trial.suggest_float("eta", 1e-5, 1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1, log=True),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-5, 1, log=True
        ),
        "n_estimators": trial.suggest_int("n_estimators", 1, 500),
        "updater": trial.suggest_categorical(
            "updater", ["shotgun", "coord_descent"]
        ),
    }

    params = {
        "objective": "reg:squarederror",
        "booster": "gblinear",
        "eta": sampled["eta"],
        "reg_lambda": sampled["reg_lambda"],
        "reg_alpha": sampled["reg_alpha"],
        "learning_rate": sampled["learning_rate"],
        "updater": sampled["updater"],
        "n_estimators": sampled["n_estimators"],
        "eval_metric": "rmse",
    }

    splitter = KFold(n_splits=5, random_state=42, shuffle=True)
    fold_rmse = [
        fit_and_validate_xgb_model(
            X_train_fit,
            y_train_fit["Absorbance 254nm"],
            fit_rows,
            holdout_rows,
            params,
        )
        for fit_rows, holdout_rows in splitter.split(
            X_train_fit, y_train_fit["Absorbance 254nm"]
        )
    ]

    # Store the per-fold hold-out metrics on the trial (comment this line
    # out if they are not wanted). The key says "mae" but the stored values
    # are RMSE; the key is kept unchanged for existing studies.
    trial.set_user_attr("split_mae", fold_rmse)

    return np.mean(fold_rmse)

In [None]:
study = optuna.create_study(
    direction="minimize",
    storage="sqlite:///XGBoost.sqlite3",
    study_name="Hyperparameter Tuning - All Features"
    + " + "
    + str(further_features),
    load_if_exists=True,
)
study.optimize(
    objective,
    n_trials=200,
    show_progress_bar=True,
)

In [None]:
study.best_params

In [None]:
params = study.best_params
params["objective"] = "reg:squarederror"
params["booster"] = "gblinear"

In [None]:
# params['n_estimators'] = 150
# params['learning_rate'] = 0.3

In [None]:
param = {}
param["objective"] = "reg:squarederror"
param["booster"] = "gblinear"
param["n_estimators"] = 10
param["learning_rate"] = 0.5

In [None]:
params = {
    "objective": "reg:squarederror",
    "booster": "gblinear",
    "eta": 0.021222820197838683,
    "reg_lambda": 1.2716135487076726e-07,
    "reg_alpha": 0.25584966658518155,
    "learning_rate": 0.6602531811820622,
    "n_estimators": 369,
    "updater": "shotgun",
}

In [None]:
booster = xgb.XGBRegressor(
    random_state=42,
    **params,
)

booster.fit(X_train_fit, y_train_fit["Absorbance 254nm"])

### Feature Importance

In [None]:
# show the importance of each feature in the model
feature_importance = pd.DataFrame()
feature_importance["feature"] = booster.feature_names_in_
feature_importance["importance"] = booster.feature_importances_

feature_importance = feature_importance.sort_values(
    by="importance", ascending=False
)

# plot the importance of each feature
plt.figure(figsize=(25, 7.5))
plt.bar(
    x=feature_importance["feature"],
    height=feature_importance["importance"],
)

# rotate the x axis words by 45°
plt.xticks(rotation=45)

plt.title("Feature Importance")
plt.show()

### Predictions

In [None]:
# Wrap the fitted XGBoost model in MAPIE to obtain prediction intervals.
# NOTE(review): despite the `_cqr` name this is a plain MapieRegressor with
# method="naive", not conformalized quantile regression - presumably the
# name was carried over from another experiment; confirm.
mapie_cqr = MapieRegressor(booster, method="naive", random_state=42)
mapie_cqr.fit(X_train_fit, y_train_fit["Absorbance 254nm"])

In [None]:
alpha = 0.05

# Evaluate prediction and coverage level on testing set
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)
y_lower = y_pis_cqr[:, 0, 0]
y_upper = y_pis_cqr[:, 1, 0]

coverage_cqr = regression_coverage_score(
    y_test_fit["Absorbance 254nm"], y_lower, y_upper
)

In [None]:
print(f"Coverage: {coverage_cqr}")

#### Compute Measures

In [None]:
def nse(predictions, targets):
    """Return the Nash-Sutcliffe efficiency of ``predictions``.

    Computed as ``1 - sum((t - p)^2) / sum((t - mean(t))^2)``.

    Both inputs are converted to flat float arrays and compared by
    position. This avoids pandas index alignment (two Series with
    different indexes would otherwise produce NaNs that ``np.sum`` skips,
    silently yielding a perfect score) and allows plain lists.

    Args:
        predictions: Predicted values (array-like).
        targets: Observed values (array-like) of the same length.

    Returns:
        The efficiency; 1 means a perfect fit, 0 means no better than
        predicting the mean of ``targets``. If ``targets`` is constant the
        denominator is zero and the result is not finite.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    predictions = np.asarray(predictions, dtype=float).ravel()
    targets = np.asarray(targets, dtype=float).ravel()

    if predictions.shape != targets.shape:
        raise ValueError(
            "predictions and targets must have the same length, got "
            f"{predictions.shape[0]} and {targets.shape[0]}"
        )

    residual_ss = np.sum((targets - predictions) ** 2)
    total_ss = np.sum((targets - np.mean(targets)) ** 2)
    return 1 - residual_ss / total_ss

In [None]:
from sklearn.metrics import mean_absolute_error


rmse = np.sqrt(
    mean_squared_error(y_test_fit["Absorbance 254nm"], y_med)
)
r2 = r2_score(y_test_fit["Absorbance 254nm"], y_med)

mae = mean_absolute_error(y_test_fit["Absorbance 254nm"], y_med)

nseff = nse(y_med, y_test_fit["Absorbance 254nm"])

print(f"RMSE: {rmse:.2f}")
print(f"R\u00b2: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"NSE: {nseff:.2f}")

### Plots

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(
    y_test_fit["Absorbance 254nm"], y_med, c="b", s=40, alpha=0.5
)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = y_test_fit["Absorbance 254nm"] - y_med
plt.figure(figsize=(10, 5))
plt.scatter(y_med, residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
# get training residuals
train_predictions = booster.predict(X_train_fit)
train_residuals = y_train_fit["Absorbance 254nm"] - train_predictions

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train["DateTime"], y=train_predictions, label="Fitted Values"
)

plt.show()

In [None]:
# Time-series view: training history, test observations, test predictions
# and the prediction interval returned by MAPIE.
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
)
sns.lineplot(x=y_test["DateTime"], y=y_med, label="Predicted Values")
# shade the prediction interval between the lower and upper MAPIE bounds
plt.fill_between(
    y_test["DateTime"],
    y_lower,
    y2=y_upper,
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add RMSE, R² and coverage to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
        f"Coverage = {coverage_cqr:.2f}",
    )
)

# the box is anchored 120 days before the first training timestamp at y=35
plt.text(
    y_train["DateTime"].iloc[0] - pd.Timedelta(days=120),
    35,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(
    f"Absorbance 254nm - Setting: All Features + {further_features}"
)

plt.legend()
plt.show()

Comment: the model predicts an increasing trend, probably driven by one of the features.

### Store Results

In [None]:
boost_pred = y_med
boost_lower_bound = y_lower
boost_upper_bound = y_upper
boost_rmse = rmse
boost_r2 = r2
boost_mae = mae
boost_nse = nseff
boost_coverage = coverage_cqr

# LightGBM (Gradient-Boosted Trees with Linear Regressors on Leaves)

### Split Data

In [None]:
further_features = ["Year", "Season", "Month", "Timestamp"]

to_drop = ["Timestamp", "Season", "Month"]

further_features = [
    feature for feature in further_features if feature not in to_drop
]

X_columns_to_drop = to_drop + ["Absorbance 254nm"]

X = full_df[full_df.columns.difference(X_columns_to_drop)]

y = full_df[["DateTime", "Absorbance 254nm"]]

In [None]:
# add polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = poly.fit_transform(X[X.columns.difference(["DateTime"])])

X_poly = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(
        X.columns.difference(["DateTime"])
    ),
)

X_poly["DateTime"] = X["DateTime"].values

In [None]:
# chronological split: rows before 2021-01-01 are training data, rows from
# 2021-01-01 onwards are test data
# NOTE(review): the previous comment said "2022 and 2023 are test data",
# which does not match this cut-off - confirm the intended date.
X_train = X_poly[X_poly["DateTime"] < "2021-01-01"]
X_test = X_poly[X_poly["DateTime"] >= "2021-01-01"]

y_train = y[y["DateTime"] < "2021-01-01"]
y_test = y[y["DateTime"] >= "2021-01-01"]

In [None]:
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train.copy()
y_test_fit = y_test.copy()

In [None]:
# %%script false --no-raise-error
# split data such that 2022 and 2023 are test data
X_train = X[X["DateTime"] < "2021-01-01"]
X_test = X[X["DateTime"] >= "2021-01-01"]

y_train = y[y["DateTime"] < "2021-01-01"]
y_test = y[y["DateTime"] >= "2021-01-01"]

In [None]:
# %%script false --no-raise-error
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train.copy()
y_test_fit = y_test.copy()

In [None]:
scaler = StandardScaler()
# scale the data and keep the column names
X_train_fit = scaler.fit_transform(X_train_fit)
X_test_fit = scaler.transform(X_test_fit)

X_train_fit = pd.DataFrame(
    X_train_fit, columns=X_train.columns.difference(["DateTime"])
)
X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_test.columns.difference(["DateTime"])
)

### Train Model

#### Learning and Validation Curves

In [None]:
def learning_curves(
    estimator,
    title,
    X,
    y,
    cv=None,
    train_sizes=np.linspace(0.3, 1.0, 5),
):
    """Plot training and cross-validation RMSE against training-set size.

    Args:
        estimator: scikit-learn compatible estimator, forwarded to
            ``sklearn.model_selection.learning_curve``.
        title: Title of the figure.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation strategy forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Side effects:
        Shows a matplotlib figure and leaves the global
        ``plt.rcParams["font.size"]`` set to 10.
    """
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator,
        X,
        y,
        train_sizes=train_sizes,
        cv=cv,
        scoring="neg_mean_squared_error",
    )

    # The scores are negated MSE values with one column per CV fold.
    # Convert every fold to RMSE first so that the mean and the spread are
    # both expressed in RMSE units; taking sqrt(std(MSE)) would mix units
    # and does not describe the variability of the plotted quantity.
    train_rmse = np.sqrt(-train_scores)
    validation_rmse = np.sqrt(-validation_scores)

    train_scores_mean = train_rmse.mean(axis=1)
    train_scores_std = train_rmse.std(axis=1)
    validation_scores_mean = validation_rmse.mean(axis=1)
    validation_scores_std = validation_rmse.std(axis=1)

    plt.rcParams["font.size"] = 12
    plt.figure(figsize=(20, 7.5))
    plt.plot(
        train_sizes,
        train_scores_mean,
        "o-",
        color="r",
        label="Training error",
    )
    plt.plot(
        train_sizes,
        validation_scores_mean,
        "o-",
        color="g",
        label="Validation error",
    )
    # shaded bands: mean +/- one standard deviation across folds
    plt.fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    plt.fill_between(
        train_sizes,
        validation_scores_mean - validation_scores_std,
        validation_scores_mean + validation_scores_std,
        alpha=0.1,
        color="g",
    )

    plt.rcParams["font.size"] = 10
    plt.ylabel("RMSE", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.title(title, fontsize=18, y=1.03)
    plt.legend()
    plt.show()

In [None]:
def validation_curves(
    estimator,
    title,
    X,
    y,
    cv=None,
    param_name=None,
    param_range=None,
    fit_params=None,
):
    """Plot training and cross-validation RMSE over a hyperparameter range.

    Args:
        estimator: scikit-learn compatible estimator, forwarded to
            ``sklearn.model_selection.validation_curve``.
        title: Title of the figure.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation strategy forwarded to ``validation_curve``.
        param_name: Name of the estimator parameter that is varied.
        param_range: Values of ``param_name`` to evaluate.
        fit_params: Forwarded unchanged as ``fit_params``.

    Side effects:
        Shows a matplotlib figure, leaves the global
        ``plt.rcParams["font.size"]`` set to 10 and prints the value of
        ``param_range`` with the lowest mean cross-validation RMSE.
    """
    train_scores, test_scores = validation_curve(
        estimator,
        X,
        y,
        param_name=param_name,
        param_range=param_range,
        fit_params=fit_params,
        cv=cv,
        scoring="neg_mean_squared_error",
        n_jobs=4,
    )

    # The scores are negated MSE values with one column per CV fold.
    # Convert every fold to RMSE first so that the mean and the spread are
    # both expressed in RMSE units; sqrt(std(MSE)) is not a valid spread
    # for the plotted RMSE curve.
    train_rmse = np.sqrt(-train_scores)
    test_rmse = np.sqrt(-test_scores)

    train_scores_mean = train_rmse.mean(axis=1)
    train_scores_std = train_rmse.std(axis=1)
    test_scores_mean = test_rmse.mean(axis=1)
    test_scores_std = test_rmse.std(axis=1)

    plt.rcParams["font.size"] = 12
    plt.figure(figsize=(20, 7.5))
    plt.title(title, fontsize=20)
    plt.xlabel(param_name, fontsize=14)
    # the curves show RMSE (lower is better), not the raw sklearn score
    plt.ylabel("RMSE", fontsize=14)
    lw = 2
    plt.plot(
        param_range,
        train_scores_mean,
        label="Training score",
        color="darkorange",
        lw=lw,
    )
    plt.fill_between(
        param_range,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.2,
        color="darkorange",
        lw=lw,
    )
    plt.plot(
        param_range,
        test_scores_mean,
        label="Cross-validation score",
        color="navy",
        lw=lw,
    )
    plt.fill_between(
        param_range,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.2,
        color="navy",
        lw=lw,
    )
    plt.rcParams["font.size"] = 10
    plt.legend(loc="best")
    plt.grid(visible=True)
    plt.show()

    # report the parameter value with the lowest mean cross-validation RMSE
    print(
        f"{param_name} with lowest score:",
        param_range[np.argmin(test_scores_mean)],
    )

In [None]:
params_lgb = {
    "objective": "regression",
    "linear_tree": True,
    "random_state": 42,
}

In [None]:
model = LGBMRegressor(**params_lgb)

In [None]:
title = "Learning Curves (LGBM)"
learning_curves(
    model, title, X_train_fit, y_train_fit["Absorbance 254nm"], cv=5
)

In [None]:
param_name = "n_estimators"
param_range = np.arange(1, 100, 5)

In [None]:
title = "Validation Curves (LGBM)"
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "learning_rate"
param_range = np.logspace(-3, -1, 10)

In [None]:
np.logspace(-3, -1, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "num_leaves"
param_range = np.arange(2, 50, 2)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "lambda_l1"
param_range = np.logspace(-5, -1, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "lambda_l2"
param_range = np.logspace(-5, -1, 10)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "max_depth"
param_range = np.arange(2, 20, 1)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

In [None]:
param_name = "max_bin"
param_range = np.arange(2, 255, 5)

In [None]:
validation_curves(
    model,
    title,
    X_train_fit,
    y_train_fit["Absorbance 254nm"],
    cv=5,
    param_name=param_name,
    param_range=param_range,
)

#### Hyperparameter Tuning

In [None]:
def fit_and_validate_lgbm_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit a linear-tree LightGBM regressor on one fold; return its RMSE.

    Args:
        X: Feature DataFrame; rows are selected positionally with ``iloc``.
        y: Target Series; rows are selected positionally with ``iloc``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Optional dict of parameters applied with ``set_params`` on
            top of the fixed objective / random_state / linear_tree
            settings; ``None`` keeps the defaults.

    Returns:
        Root mean squared error of the predictions on the validation rows.
    """
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = LGBMRegressor(
        objective="regression",
        random_state=42,
        linear_tree=True,
    )

    if params is not None:
        model.set_params(**params)

    # train model
    model.fit(X_tr, y_tr)

    # obtain predictions
    y_val_pred = model.predict(X_val)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [None]:
def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean 5-fold RMSE of a linear-tree LightGBM model.

    Samples the hyperparameters from ``trial``, evaluates them with a
    shuffled 5-fold split of the global ``X_train_fit`` / ``y_train_fit``
    and returns the mean hold-out RMSE (to be minimised).

    ``subsample`` and ``min_child_samples`` are no longer sampled: the
    LightGBM parameter reference lists them as aliases of
    ``bagging_fraction`` and ``min_data_in_leaf``, which are already in the
    search space. Sampling both members of an alias pair adds a dimension
    that cannot independently influence the model.
    """
    config = {
        "n_estimators": trial.suggest_int(
            "n_estimators", 1, 20, step=1
        ),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-3, 1, log=True
        ),
        "max_depth": trial.suggest_int("max_depth", 2, 16, step=1),
        "num_leaves": trial.suggest_int("num_leaves", 2, 20, step=1),
        "min_data_in_leaf": trial.suggest_int(
            "min_data_in_leaf", 2, 50, step=1
        ),
        "lambda_l1": trial.suggest_float(
            "lambda_l1", 1e-3, 10, log=True
        ),
        "lambda_l2": trial.suggest_float(
            "lambda_l2", 1e-3, 10, log=True
        ),
        "min_split_gain": trial.suggest_float(
            "min_split_gain", 0, 15, step=0.5
        ),
        # NOTE(review): per the LightGBM docs bagging only takes effect
        # when bagging_freq > 0, which is not set here - confirm whether
        # this dimension is meant to be active.
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 1e-3, 1, log=True
        ),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 1e-3, 1, log=True
        ),
        "max_bin": trial.suggest_int("max_bin", 10, 500, step=10),
    }

    n_splits = 5
    cv = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    target = y_train_fit["Absorbance 254nm"]
    cv_rmse = [
        fit_and_validate_lgbm_model(
            X_train_fit,
            target,
            train_index,
            test_index,
            config,
        )
        for train_index, test_index in cv.split(X_train_fit, target)
    ]

    # Store the per-fold hold-out metrics on the trial (comment this line
    # out if they are not wanted). The key says "mae" but the stored values
    # are RMSE; the key is kept unchanged for existing studies.
    trial.set_user_attr("split_mae", cv_rmse)

    return np.mean(cv_rmse)

In [None]:
study = optuna.create_study(
    direction="minimize",
    storage="sqlite:///LGBM.sqlite3",
    study_name="Hyperparameter Tuning - All Features"
    + " + "
    + str(further_features),
    load_if_exists=True,
)
study.optimize(objective, n_trials=400, show_progress_bar=True)

In [None]:
study.best_params

In [None]:
params = study.best_params

# params['n_estimators'] = 10
# params["max_bin"] = 30
params["learning_rate"] = 0.6
# params['lambda_l2'] = 1

In [None]:
# %%script false --no-raise-error
params = {
    "n_estimators": 16,
    "learning_rate": 0.6192800859019298,
    "max_depth": 16,
    "num_leaves": 20,
    "min_data_in_leaf": 34,
    "lambda_l1": 1.8585248563175933,
    "lambda_l2": 0.020368547806226774,
    "min_split_gain": 2.5,
    "subsample": 0.5639096844841955,
    "bagging_fraction": 0.026474369917739878,
    "feature_fraction": 0.0012608584366219668,
    "min_child_samples": 33,
    "max_bin": 20,
}

#### Train model with mapie

In [None]:
alpha = 0.1
estimator = LGBMRegressor(
    objective="regression", random_state=42, linear_tree=True, **params
)

estimator.fit(X_train_fit, y_train_fit["Absorbance 254nm"])

In [None]:
# Wrap the already-fitted estimator (cv="prefit") and compute the conformity
# scores on the data passed to fit().
# NOTE(review): this is the same X_train_fit the estimator was trained on,
# not a held-out calibration set, so the residuals are in-sample and the
# resulting intervals are likely too narrow - consider calibrating on a
# separate split.
# NOTE(review): despite the `_cqr` name this is a plain MapieRegressor, not
# conformalized quantile regression.
mapie_cqr = MapieRegressor(estimator, cv="prefit", random_state=42)
mapie_cqr.fit(X_train_fit, y_train_fit["Absorbance 254nm"])

### Feature Importance

In [None]:
# show the importance of each feature in the model
feature_importance = pd.DataFrame()
feature_importance["feature"] = estimator.feature_name_
feature_importance["importance"] = estimator.feature_importances_

feature_importance = feature_importance.sort_values(
    by="importance", ascending=False
)

# plot the importance of each feature
plt.figure(figsize=(25, 7.5))
plt.bar(
    x=feature_importance["feature"],
    height=feature_importance["importance"],
)
plt.title("Feature Importance")
plt.show()

### Predictions

In [None]:
# Evaluate prediction and coverage level on testing set
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)
y_lower = y_pis_cqr[:, 0, 0]
y_upper = y_pis_cqr[:, 1, 0]

coverage_cqr = regression_coverage_score(
    y_test_fit["Absorbance 254nm"], y_lower, y_upper
)

In [None]:
print("Coverage:", coverage_cqr)

#### Compute Measures

In [None]:
rmse = np.sqrt(
    mean_squared_error(y_test_fit["Absorbance 254nm"], y_med)
)
r2 = r2_score(y_test_fit["Absorbance 254nm"], y_med)

mae = mean_absolute_error(y_test_fit["Absorbance 254nm"], y_med)

nseff = nse(y_med, y_test_fit["Absorbance 254nm"])

print(f"RMSE: {rmse:.2f}")
print(f"R\u00b2: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"NSE: {nseff:.2f}")

### Plots

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(
    y_test_fit["Absorbance 254nm"], y_med, c="b", s=40, alpha=0.5
)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = y_test_fit["Absorbance 254nm"] - y_med
plt.figure(figsize=(10, 5))
plt.scatter(y_med, residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
# get training residuals
train_predictions, _ = mapie_cqr.predict(X_train_fit, alpha=alpha)
train_residuals = y_train_fit["Absorbance 254nm"] - train_predictions

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train_fit["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train["DateTime"],
    y=train_predictions,
    label="Fitted Values",
)

plt.show()

In [None]:
# Time-series view: training history, test observations, test predictions
# and the prediction interval returned by MAPIE.
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train_fit["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test_fit["Absorbance 254nm"],
    label="True Values",
)
sns.lineplot(x=y_test["DateTime"], y=y_med, label="Predicted Values")

# The nominal interval level follows the `alpha` used for the predictions
# (0.1 in this section, i.e. 90%), so derive the legend label from it
# instead of hard-coding "95%".
interval_level = 1 - alpha

# shade the prediction interval between the lower and upper MAPIE bounds
plt.fill_between(
    y_test["DateTime"],
    y_lower.flatten(),
    y2=y_upper.flatten(),
    alpha=0.2,
    label=f"{interval_level:.0%} Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add RMSE, R² and coverage to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
        f"Coverage = {coverage_cqr:.3f}",
    )
)

# the box is anchored 120 days before the first training timestamp at y=35
plt.text(
    y_train["DateTime"].iloc[0] - pd.Timedelta(days=120),
    35,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(
    f"Absorbance 254nm - Setting: All Features + {further_features}"
)

plt.legend()
plt.show()

Comment: the model predicts an increasing trend, probably driven by one of the features.

### Store Results

In [None]:
lgbm_pred = y_med
lgbm_lower_bound = y_lower
lgbm_upper_bound = y_upper
lgbm_rmse = rmse
lgbm_r2 = r2
lgbm_mae = mae
lgbm_nse = nseff
lgbm_coverage = coverage_cqr

# Neural Network

## Split Data

In [None]:
further_features = ["Year", "Season", "Month", "Timestamp"]

to_drop = ["Timestamp", "Season", "Month"]

further_features = [
    feature for feature in further_features if feature not in to_drop
]

X_columns_to_drop = to_drop + ["Absorbance 254nm"]

X = full_df[full_df.columns.difference(X_columns_to_drop)]

y = full_df[["DateTime", "Absorbance 254nm"]]

In [None]:
# chronological split: rows before 2021-01-01 are training data, rows from
# 2021-01-01 onwards are test data
# NOTE(review): the previous comment said "2022 and 2023 are test data",
# which does not match this cut-off - confirm the intended date.
X_train = X[X["DateTime"] < "2021-01-01"]
X_test = X[X["DateTime"] >= "2021-01-01"]

y_train = y[y["DateTime"] < "2021-01-01"]
y_test = y[y["DateTime"] >= "2021-01-01"]

In [None]:
X_train_fit = X_train[X_train.columns.difference(["DateTime"])]
X_test_fit = X_test[X_test.columns.difference(["DateTime"])]

y_train_fit = y_train.copy()
y_test_fit = y_test.copy()

In [None]:
scaler = StandardScaler()
# scale the data and keep the column names
X_train_fit = scaler.fit_transform(X_train_fit)
X_test_fit = scaler.transform(X_test_fit)

X_train_fit = pd.DataFrame(
    X_train_fit, columns=X_train.columns.difference(["DateTime"])
)
X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_test.columns.difference(["DateTime"])
)

## Train the model

### Hyperparameter Tuning

In [None]:
def fit_and_validate_nn_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an MLP regressor on one fold and return its hold-out RMSE.

    Args:
        X: Feature DataFrame; rows are selected positionally with ``iloc``.
        y: Target Series; rows are selected positionally with ``iloc``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Dict with a ``"layers"`` entry (sequence of hidden layer
            sizes) plus any further ``MLPRegressor`` parameters. The dict
            itself is not modified.

    Returns:
        Root mean squared error of the predictions on the validation rows.

    Raises:
        KeyError: If ``params`` has no ``"layers"`` entry.
    """
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = MLPRegressor(
        random_state=42,
        hidden_layer_sizes=tuple(params["layers"]),
        max_iter=1000,
    )

    # "layers" is not an MLPRegressor parameter; it was consumed above
    param = params.copy()
    param.pop("layers")
    model.set_params(**param)

    # train model
    model.fit(X_tr, y_tr)

    # obtain predictions
    y_val_pred = model.predict(X_val)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [None]:
def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean 5-fold cross-validated RMSE of an MLPRegressor.

    Samples a network architecture and optimiser settings from ``trial``,
    evaluates them with a shuffled 5-fold split of the (scaled) training
    data via ``fit_and_validate_nn_model`` and returns the mean fold score.

    The per-fold scores are stored on the trial under the user attribute
    ``"split_mae"``; despite that key name, the stored values are the RMSE
    returned by ``fit_and_validate_nn_model``.
    """
    # Architecture: the number of layers is pinned to 2 (low == high).
    depth = trial.suggest_int("n_layers", 2, 2)
    hidden_units = [
        trial.suggest_int(f"n_units_{i}", 50, 100, step=5)
        for i in range(depth)
    ]

    config = {"layers": hidden_units}
    config["activation"] = trial.suggest_categorical(
        "activation", ["identity", "logistic", "tanh", "relu"]
    )
    config["solver"] = trial.suggest_categorical("solver", ["sgd", "adam"])
    config["alpha"] = trial.suggest_float("alpha", 1e-5, 1)
    config["learning_rate"] = trial.suggest_categorical(
        "learning_rate", ["constant", "invscaling", "adaptive"]
    )
    config["power_t"] = trial.suggest_float("power_t", 0.1, 1)
    config["beta_1"] = trial.suggest_float("beta_1", 0.1, 1)
    config["beta_2"] = trial.suggest_float("beta_2", 0.1, 1)
    config["epsilon"] = trial.suggest_float("epsilon", 1e-8, 1)
    config["early_stopping"] = True

    target = y_train_fit["Absorbance 254nm"]
    folds = KFold(n_splits=5, random_state=42, shuffle=True)

    # One validation score per fold, in fold order.
    fold_scores = [
        fit_and_validate_nn_model(
            X_train_fit,
            target,
            fit_idx,
            holdout_idx,
            config,
        )
        for fit_idx, holdout_idx in folds.split(X_train_fit, target)
    ]

    # Keep the individual fold hold-out scores with the trial
    # (comment this line out if they should not be stored).
    trial.set_user_attr("split_mae", fold_scores)

    return np.mean(fold_scores)

In [None]:
# Create the Optuna study, or reopen it if a study with this name already
# exists in the local SQLite file, then run 100 trials that minimise the
# value returned by `objective`.
study = optuna.create_study(
    study_name=f"Hyperparameter Tuning - All Features + {further_features}",
    storage="sqlite:///NeuralNetwork.sqlite3",
    load_if_exists=True,
    direction="minimize",
)
study.optimize(objective, show_progress_bar=True, n_trials=100)

In [None]:
# Display the hyperparameters of the best trial found so far.
study.best_params

In [None]:
# Hard-coded hyperparameters for the final neural network (same keys as the
# parameters sampled in `objective`), so the model can be rebuilt without
# re-running the study.
# NOTE(review): presumably copied from `study.best_params` - confirm.
params = dict(
    # architecture
    n_layers=2,
    n_units_0=85,
    n_units_1=75,
    activation="relu",
    # optimiser / regularisation
    solver="adam",
    alpha=0.7765540584565614,
    learning_rate="constant",
    power_t=0.3382710741601535,
    beta_1=0.19887581875693028,
    beta_2=0.984060053664114,
    epsilon=0.32827083622604075,
)

In [None]:
# Manual override of the tuned configuration: use the identity activation.
# (Other overrides that were tried: n_units_1 = 80, learning_rate = "invscaling".)
params["activation"] = "identity"

# Move the per-layer unit counts out of `params` into the list expected by
# MLPRegressor(hidden_layer_sizes=...); what remains in `params` afterwards
# can be passed to the estimator as keyword arguments.
hidden_layer_sizes = [
    params.pop(f"n_units_{i}") for i in range(params["n_layers"])
]

# "n_layers" is bookkeeping only and not an estimator argument.
params.pop("n_layers")

#### Train model with mapie

In [None]:
# Miscoverage level for the prediction intervals requested from MAPIE
# further below (alpha = 0.1 corresponds to nominal 90% intervals).
alpha = 0.1

# Final network: selected hyperparameters plus the same fixed settings that
# were used during tuning (seed, iteration cap, early stopping).
estimator = MLPRegressor(
    **params,
    hidden_layer_sizes=hidden_layer_sizes,
    early_stopping=True,
    max_iter=1000,
    random_state=42,
)

estimator.fit(X_train_fit, y_train_fit["Absorbance 254nm"])

In [None]:
# Calibrate the conformal prediction intervals.
#
# `estimator` was trained on X_train_fit, so wrapping it with cv="prefit" and
# calibrating on that very same data would derive the interval width from
# in-sample residuals, which yields over-confident (too narrow) intervals.
# There is no separate calibration set in this notebook, so use the
# cross-conformal CV+ method instead: MAPIE refits clones of the estimator on
# K folds and calibrates on the out-of-fold residuals. Point predictions
# still come from a model fitted on the full training set.
mapie_cqr = MapieRegressor(
    estimator,
    method="plus",
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    random_state=42,
)
mapie_cqr.fit(X_train_fit, y_train_fit["Absorbance 254nm"])

## Prediction

In [None]:
# Point predictions and prediction intervals for the test set at the
# requested miscoverage level.
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)

# The interval array is indexed as [sample, bound, alpha]; with a single
# alpha, bound 0 is the lower limit and bound 1 the upper limit.
y_lower, y_upper = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]

# Coverage score of the intervals against the observed test values.
coverage_cqr = regression_coverage_score(
    y_test_fit["Absorbance 254nm"],
    y_lower,
    y_upper,
)

In [None]:
# Report the coverage score of the test-set prediction intervals
# (to be compared with the nominal level 1 - alpha).
print("Coverage:", coverage_cqr)

#### Compute Measures

In [None]:
# Test-set error metrics of the point predictions.
mae = mean_absolute_error(y_test_fit["Absorbance 254nm"], y_med)
r2 = r2_score(y_test_fit["Absorbance 254nm"], y_med)
rmse = np.sqrt(mean_squared_error(y_test_fit["Absorbance 254nm"], y_med))

# `nse` is a helper defined elsewhere in this notebook; note that it is
# called with the predictions first and the observations second.
# NOTE(review): presumably the Nash-Sutcliffe efficiency - confirm.
nseff = nse(y_med, y_test_fit["Absorbance 254nm"])

print(
    f"RMSE: {rmse:.2f}",
    f"R\u00b2: {r2:.2f}",
    f"MAE: {mae:.2f}",
    f"NSE: {nseff:.2f}",
    sep="\n",
)

## Plots

In [None]:
# Scatter of predicted against observed test values; points on the dashed
# red identity line are perfect predictions.
plt.figure(figsize=(10, 5))
plt.scatter(
    y_test_fit["Absorbance 254nm"],
    y_med,
    s=40,
    c="b",
    alpha=0.5,
)
plt.axline((0, 0), (1, 1), linestyle="--", color="red")
plt.title("Predictions vs True Values")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.show()

In [None]:
# Test-set residuals (observed minus predicted) against the predictions;
# the dashed line marks zero error.
residuals = y_test_fit["Absorbance 254nm"] - y_med

plt.figure(figsize=(10, 5))
plt.scatter(y_med, residuals, s=40, c="b", alpha=0.5)
plt.axhline(y=0, linestyle="--", color="r")
plt.title("Test Set Residuals Plot")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

In [None]:
# Histogram of the test-set residuals with a kernel density estimate overlaid.
plt.figure(figsize=(10, 5))
sns.histplot(data=residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
# In-sample diagnostics: predict the training set (the intervals are not
# needed here, only the point predictions) and compute observed - fitted.
train_predictions, _ = mapie_cqr.predict(X_train_fit, alpha=alpha)
train_residuals = y_train_fit["Absorbance 254nm"] - train_predictions

In [None]:
# Training-set residuals against the fitted values; the dashed line marks
# zero error.
plt.figure(figsize=(10, 5))
plt.scatter(
    train_predictions.flatten(),
    train_residuals,
    s=40,
    c="b",
    alpha=0.5,
)
plt.axhline(y=0, linestyle="--", color="r")
plt.title("Training Set Residuals Plot")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

In [None]:
# Histogram of the training-set residuals with a kernel density estimate
# overlaid.
plt.figure(figsize=(10, 5))
sns.histplot(data=train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# Time series of the observed training data with the model's fitted values
# drawn on top.
plt.figure(figsize=(20, 10))
sns.lineplot(
    label="Historical Data",
    x=y_train_fit["DateTime"],
    y=y_train_fit["Absorbance 254nm"],
)
sns.lineplot(
    label="Fitted Values",
    x=y_train_fit["DateTime"],
    y=train_predictions,
)

plt.show()

In [None]:
# Time series of the training data, the observed test data and the
# neural-network test predictions with their prediction interval.
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train_fit["DateTime"],
    y=y_train_fit["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test_fit["DateTime"],
    y=y_test_fit["Absorbance 254nm"],
    label="True Values",
)
sns.lineplot(
    x=y_test_fit["DateTime"], y=y_med, label="Predicted Values"
)
# Shade the MAPIE prediction interval (lower/upper bounds, not a standard
# deviation). The label is derived from `alpha` so it always states the
# nominal level that was actually requested (alpha = 0.1 -> "90%"); the
# previously hard-coded "95%" did not match.
plt.fill_between(
    y_test_fit["DateTime"],
    y_lower.flatten(),
    y2=y_upper.flatten(),
    alpha=0.2,
    label=f"{1 - alpha:.0%} Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("Absorbance 254nm")

# add rmse, r2 and coverage to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
        f"Coverage = {coverage_cqr:.3f}",
    )
)

# Anchor the box slightly left of the first training timestamp.
plt.text(
    y_train_fit["DateTime"].iloc[0] - pd.Timedelta(days=120),
    35,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(
    f"Absorbance 254nm - Setting: All Features + {further_features}"
)

plt.legend()
plt.show()

## Store Results

In [None]:
# Keep the neural-network results under model-specific names so they are
# still available for the model comparison after the generic variables
# (y_med, rmse, ...) are reused.
nn_pred, nn_lower_bound, nn_upper_bound = y_med, y_lower, y_upper
nn_rmse, nn_r2, nn_mae, nn_nse = rmse, r2, mae, nseff
nn_coverage = coverage_cqr

# Final Plot

In [None]:
# Compare the test-set predictions and prediction intervals of the three
# models (XGBoost, Light GBM, neural network) against the observed series,
# with the training history shown for context.
#
# Changes relative to the previous version of this cell:
# * the stray, unused `from cProfile import label` import is removed;
# * the per-model sections are driven by one spec list, so labels, colours
#   and metrics cannot drift apart;
# * interval legend entries are labelled per model instead of three identical
#   "95% Prediction Interval" entries. The neural-network intervals are
#   computed with alpha = 0.1 (nominal 90%), so "95%" was not correct.
#   NOTE(review): confirm the nominal level used for XGBoost / Light GBM.
comparison_specs = [
    {
        "name": "XGBoost",
        "color": "blue",
        "pred": boost_pred,
        "lower": boost_lower_bound,
        "upper": boost_upper_bound,
        "rmse": boost_rmse,
        "r2": boost_r2,
        "mae": boost_mae,
    },
    {
        "name": "Light GBM",
        "color": "orange",
        "pred": lgbm_pred,
        "lower": lgbm_lower_bound,
        "upper": lgbm_upper_bound,
        "rmse": lgbm_rmse,
        "r2": lgbm_r2,
        "mae": lgbm_mae,
    },
    {
        "name": "Neural Network",
        "color": "green",
        "pred": nn_pred,
        "lower": nn_lower_bound,
        "upper": nn_upper_bound,
        "rmse": nn_rmse,
        "r2": nn_r2,
        "mae": nn_mae,
    },
]

plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train["DateTime"],
    y=y_train["Absorbance 254nm"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
    color="red",
)

# One dashed prediction line plus a shaded interval band per model.
for spec in comparison_specs:
    sns.lineplot(
        x=y_test["DateTime"],
        y=spec["pred"],
        label=spec["name"],
        linestyle="--",
        color=spec["color"],
    )
    plt.fill_between(
        y_test["DateTime"],
        spec["lower"],
        spec["upper"],
        alpha=0.2,
        color=spec["color"],
        label=f"{spec['name']} Prediction Interval",
    )

plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse, r2 and mae of every model to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    f"{spec['name']} RMSE = {spec['rmse']:.2f}, "
    f"R\u00b2 = {spec['r2']:.2f}, MAE = {spec['mae']:.2f}"
    for spec in comparison_specs
)

# Anchor the box at the first training timestamp, below the data.
plt.text(
    y_train["DateTime"].iloc[0],
    -2,
    s=text_string,
    fontsize=16,
    bbox=props,
)

plt.title("UVA254 - Model Comparison")

plt.legend()
plt.show()

In [None]:
# Same model comparison as above, restricted to the test period (the
# historical training series is intentionally not drawn).
#
# Changes relative to the previous version of this cell:
# * the prediction lines are labelled "XGBoost" and "Light GBM", matching
#   the data that is plotted (boost_pred / lgbm_pred) and the metrics box;
#   they were mislabelled "Linear Regression" and "Random Forest";
# * interval legend entries are labelled per model instead of three identical
#   "95% Prediction Interval" entries. The neural-network intervals are
#   computed with alpha = 0.1 (nominal 90%), so "95%" was not correct.
#   NOTE(review): confirm the nominal level used for XGBoost / Light GBM.
comparison_specs = [
    {
        "name": "XGBoost",
        "color": "blue",
        "pred": boost_pred,
        "lower": boost_lower_bound,
        "upper": boost_upper_bound,
        "rmse": boost_rmse,
        "r2": boost_r2,
        "mae": boost_mae,
    },
    {
        "name": "Light GBM",
        "color": "orange",
        "pred": lgbm_pred,
        "lower": lgbm_lower_bound,
        "upper": lgbm_upper_bound,
        "rmse": lgbm_rmse,
        "r2": lgbm_r2,
        "mae": lgbm_mae,
    },
    {
        "name": "Neural Network",
        "color": "green",
        "pred": nn_pred,
        "lower": nn_lower_bound,
        "upper": nn_upper_bound,
        "rmse": nn_rmse,
        "r2": nn_r2,
        "mae": nn_mae,
    },
]

plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_test["DateTime"],
    y=y_test["Absorbance 254nm"],
    label="True Values",
    color="red",
)

# One dashed prediction line plus a shaded interval band per model.
for spec in comparison_specs:
    sns.lineplot(
        x=y_test["DateTime"],
        y=spec["pred"],
        label=spec["name"],
        linestyle="--",
        color=spec["color"],
    )
    plt.fill_between(
        y_test["DateTime"],
        spec["lower"],
        spec["upper"],
        alpha=0.2,
        color=spec["color"],
        label=f"{spec['name']} Prediction Interval",
    )

# Fixed y-range so that the metrics box placed at y = 36 stays inside the axes.
plt.ylim(-5, 41)

plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse, r2 and mae of every model to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    f"{spec['name']} RMSE = {spec['rmse']:.2f}, "
    f"R\u00b2 = {spec['r2']:.2f}, MAE = {spec['mae']:.2f}"
    for spec in comparison_specs
)

# Anchor the box slightly left of the first test timestamp.
plt.text(
    y_test["DateTime"].iloc[0] - pd.Timedelta(days=20),
    36,
    s=text_string,
    fontsize=16,
    bbox=props,
)

plt.title("UVA254 - Model Comparison")

plt.legend()
plt.show()