# Variables With Prediction Modelling

Variables with future predictions are used to predict the Absorbance.

The variables are:
* Air temperature
* Daily Cumulated Rainfall
* Water Temperature
* Flow River (River Discharge in the Projections folder)


Five different models are used:
* linear regression
* random forest (quantile regression forest)
* XGBoost
* LightGBM
* neural network

In [171]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import (
    variance_inflation_factor,
)

import statsmodels.api as sm

from quantile_forest import RandomForestQuantileRegressor

import xgboost as xgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn.utils import resample

# Hyperparameter Optimization
import optuna

# Neural Network
from sklearn.neural_network import MLPRegressor


plt.rcParams["font.size"] = 16

# Define paths

In [172]:
# Root of the Tarragona data, three directory levels above the working directory.
data_folder = os.path.join(os.pardir, os.pardir, os.pardir, "data", "tarragona")

# Sub-folder that holds the unprocessed source files.
raw_data_folder = os.path.join(data_folder, "raw_data")

# Load dataset

In [173]:
# Read the raw Excel workbook into a DataFrame.
raw_dataset_path = os.path.join(raw_data_folder, "raw_full_dataset.xlsx")
full_df = pd.read_excel(raw_dataset_path)

In [None]:
full_df

In [175]:
# Replace the raw column identifiers with the readable labels used from here on.
readable_labels = {
    "flowriver": "Flow River",
    "cumulated_rainfall_24h": "Daily Cumulated Rainfall",
    "environmental_temperature": "Air Temperature",
    "nitrate": "Nitrate",
    "dissolvedoxygen": "Dissolved Oxygen",
    "turbidity": "Turbidity",
    "watertemperature": "Water Temperature",
    "redoxpotential": "Redox Potential",
    "ABS254": "UVA254",
}
full_df.rename(columns=readable_labels, inplace=True)

In [None]:
full_df.isna().sum()

In [177]:
full_df = full_df.dropna()

# Remove Variables with no future projections

They are:
- Nitrate
- pH
- Ammonium
- Dissolved Oxygen
- Conductivity
- Redox Potential

In [None]:
# Columns excluded from the modelling dataset.
# NOTE(review): besides the variables listed above as lacking projections, this
# list also removes turbidity, the outlier flag, and the rainfall / river-flow /
# air-temperature columns that the introduction names as predictors — confirm
# that dropping those three is intended.
# NOTE(review): the rename cell produces labels without units (e.g. "Nitrate"),
# while unit-suffixed labels are dropped here — verify against the headers of
# the raw file.
columns_to_remove = [
    "Nitrate (mg/L)",
    "pH",
    "Ammonium (mg/L)",
    "Dissolved Oxygen (mg/L)",
    "Conductivity (µS/cm)",
    "Redox Potential (mV)",
    "Turbidity (NTU)",
    "Daily Cumulated Rainfall",
    "Flow River",
    "Air Temperature",
    "is_outlier",
]
full_df.drop(columns=columns_to_remove, inplace=True)

## Take the Monthly Average

In [None]:
full_df["Year"] = full_df["DateTime"].dt.year
full_df["Month"] = full_df["DateTime"].dt.month

In [180]:
# take the monthly average
monthly_avg_df = full_df.groupby(["Year", "Month"]).mean().reset_index()

In [181]:
full_df = monthly_avg_df

In [182]:
# Integer season code per month:
# 1 = Winter (Dec-Feb), 2 = Spring (Mar-May), 3 = Summer (Jun-Aug),
# 4 = Autumn (every other month).
season_code_by_month = {
    12: 1, 1: 1, 2: 1,
    3: 2, 4: 2, 5: 2,
    6: 3, 7: 3, 8: 3,
}
full_df["Season"] = full_df["Month"].apply(
    lambda month: season_code_by_month.get(month, 4)
)

# Numeric version of DateTime: the value returned by Timestamp.timestamp().
full_df["Timestamp"] = full_df["DateTime"].apply(
    lambda moment: moment.timestamp()
)

# Multicollinearity Test

In [183]:
# Variance inflation factor of every column except DateTime.
X = full_df[full_df.columns.difference(["DateTime"])]

design_matrix = X.values
vif_test = pd.DataFrame(
    {
        "variable": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, column_position)
            for column_position in range(design_matrix.shape[1])
        ],
    }
)

In [None]:
vif_test

In [185]:
# Variance inflation factor of every column except DateTime.
# NOTE(review): this cell repeats the preceding VIF cell statement for
# statement, and nothing modifies full_df in between — it looks redundant;
# confirm whether a column was meant to be removed before re-running the test.
X = full_df[full_df.columns.difference(["DateTime"])]


vif_test = pd.DataFrame()
vif_test["variable"] = X.columns
vif_test["VIF"] = [
    variance_inflation_factor(X.values, i) for i in range(X.shape[1])
]

In [None]:
vif_test

In [187]:
def extend_features(df: pd.DataFrame, lags: int, rolling_window: int, poly_degree: int):
    """Augment a feature table with polynomial, lagged and rolling-mean columns.

    Args:
        df: Feature table; every column is fed to ``PolynomialFeatures``.
        lags: Number of lagged copies (shift 1..lags) created per column.
        rolling_window: Window length of the rolling mean created per column.
        poly_degree: Degree passed to ``PolynomialFeatures``.

    Returns:
        A new DataFrame, indexed like ``df``, holding the polynomial expansion
        of all input columns (without the constant term) followed by, for
        each input column other than "Year" and "Month", its
        ``<col>_lag<k>`` columns and its ``<col>_rolling<window>`` column.
        Gaps opened by shifting/rolling are back-filled, so the first rows
        repeat the earliest available value.
    """
    initial_features = df.columns

    # include_bias=False leaves out the constant "1" column instead of
    # generating it and dropping it afterwards.
    poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
    expanded = pd.DataFrame(
        poly.fit_transform(df),
        columns=poly.get_feature_names_out(initial_features),
        index=df.index,  # keep the caller's row labels instead of a fresh RangeIndex
    )

    # Collect the derived columns first and attach them in one concat;
    # inserting them one by one fragments the frame.
    derived = {}
    for col in initial_features.difference(["Year", "Month"]):
        base = expanded[col]
        for lag in range(1, lags + 1):
            derived[f"{col}_lag{lag}"] = base.shift(lag)
        derived[f"{col}_rolling{rolling_window}"] = base.rolling(rolling_window).mean()

    expanded = pd.concat(
        [expanded, pd.DataFrame(derived, index=expanded.index)], axis=1
    )

    # Shifts and rolling windows leave NaN in the first rows; back-fill them.
    return expanded.bfill()

# Prepare Dataset

In [188]:
# First instant of the test period; everything before it is training data.
split_date = pd.Timestamp("2022-01-01")

# add the year and month columns
full_df["Year"] = full_df["DateTime"].dt.year
full_df["Month"] = full_df["DateTime"].dt.month

# Keep the timestamps aside: they are removed from X before the feature
# expansion and scaling, and re-attached afterwards as the index.
datetime_column = full_df.dropna()["DateTime"]

X = full_df.drop(columns=["DateTime", "UVA254"])
y = full_df[["DateTime", "UVA254"]]

X = extend_features(X, lags=1, rolling_window=3, poly_degree=2)
cols = X.columns

# Normalize the data.
# The scaler is fitted on the training rows only, so the min/max of the test
# period does not leak into the training features. Test-period values can
# therefore fall outside [0, 1].
# NOTE(review): hyperparameter studies cached on disk were presumably tuned
# with the previous (full-data) scaling — consider re-running them.
is_train_row = (datetime_column < split_date).to_numpy()
scaler = MinMaxScaler()
scaler.fit(X.loc[is_train_row])
X = pd.DataFrame(scaler.transform(X), columns=cols)

# Add the datetime column back
X["DateTime"] = datetime_column.values

X = X.set_index("DateTime")
y = y.set_index("DateTime")

# sort the columns
X = X[sorted(X.columns)]

# Disjoint chronological split. Two inclusive label slices on the same date
# would place any row dated 2022-01-01 in both sets.
X_train = X.loc[X.index < split_date]
y_train = y.loc[y.index < split_date]

X_test = X.loc[X.index >= split_date]
y_test = y.loc[y.index >= split_date]

# Linear Regression

#### Train Model

In [189]:
model = sm.OLS(y_train.values, sm.add_constant(X_train)).fit()

In [190]:
results = model.summary2()

#### Results

In [None]:
print(results)

#### Predictions

In [None]:
# Predict the test period with the fitted OLS model (95 % intervals).
# has_constant="add" always adds the intercept column. With the default
# ("skip"), add_constant leaves the matrix untouched when any feature happens
# to be constant over the test rows, and the design matrix then no longer
# matches the fitted parameters.
predictions = model.get_prediction(
    sm.add_constant(X_test, has_constant="add")
).summary_frame(alpha=0.05)

In [193]:
train_res = model.resid

#### Plots

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, predictions["mean"]))
r2 = r2_score(y_test, predictions["mean"])

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(y_test, predictions["mean"], c="b", s=40, alpha=0.5)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
plt.figure(figsize=(10, 5))
plt.scatter(model.fittedvalues, train_res, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Train Set Residuals Plot")
plt.show()

In [None]:
# plot the fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train.index,
    y=model.fittedvalues.values,
    label="Fitted Values",
)

plt.show()

In [198]:
residuals = (
    y_test["UVA254"].values - predictions["mean"].values
)

In [None]:
# plot the residuals
plt.figure(figsize=(10, 5))
plt.scatter(predictions["mean"], residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
# sns.lineplot(x=y_train['DateTime'], y=model.fittedvalues.values, label='Fitted Values')
sns.lineplot(
    x=y_test.index,
    y=y_test["UVA254"],
    label="True Values",
)
sns.lineplot(
    x=y_test.index,
    y=predictions["mean"].values,
    label="Predicted Values",
)
plt.fill_between(
    y_test.index,
    predictions["obs_ci_lower"],
    predictions["obs_ci_upper"],
    alpha=0.2,
    label="95% Prediction Interval",
)
plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
        f"AIC = {model.aic:.2f}",
    )
)

plt.text(
    y_train.index[0],
    75,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"UVA254 Predictions")

plt.legend()
plt.show()

#### Store Results

In [202]:
lr_pred = predictions["mean"]
lr_lower_bound = predictions["obs_ci_lower"]
lr_upper_bound = predictions["obs_ci_upper"]
lr_rmse = rmse
lr_r2 = r2

# Random Forest

### Train Model

#### Perform Time-Series Cross-Validation

An expanding-window split (`TimeSeriesSplit`) with 10 folds is used, since the training set spans almost 10 years.

In [203]:
# Quantile regression forest scored with a 10-split time-series
# cross-validation. return_estimator=True keeps the model fitted on each
# split, so the fitted models can be reused as an ensemble.
cv = TimeSeriesSplit(n_splits=10)

rf_model = RandomForestQuantileRegressor(
    random_state=42,
    max_depth=7,
    max_features="log2",
    n_estimators=20,
)

rf_ensemble = cross_validate(
    estimator=rf_model,
    X=X_train,
    y=y_train.values.ravel(),
    scoring="neg_mean_squared_error",
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

### Results

In [None]:
# Feature importance of the forest: mean and spread across the models fitted
# on the individual cross-validation splits.
per_split_importances = np.array(
    [fitted.feature_importances_ for fitted in rf_ensemble["estimator"]]
)

feature_importance = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance_mean": per_split_importances.mean(axis=0),
        "importance_std": per_split_importances.std(axis=0),
    }
).sort_values(by="importance_mean", ascending=False)

# One marker per feature, with the standard deviation as error bar.
plt.figure(figsize=(25, 7.5))
plt.errorbar(
    x=feature_importance["feature"],
    y=feature_importance["importance_mean"],
    yerr=feature_importance["importance_std"],
    fmt="o",
)
plt.title("Feature Importance")
plt.show()

### Predictions

In [205]:
# Per-estimator quantile predictions on the test set; each array has one row
# per cross-validation estimator.
# The median is the point forecast. The 2.5 % / 97.5 % quantiles bound a 95 %
# interval, the level used for the linear regression (alpha=0.05) and stated
# in the plot legend. The former 5 % / 95 % levels only span a 90 % interval.
fold_estimators = rf_ensemble["estimator"]
quantile_levels = {"median": 0.5, "lower": 0.025, "upper": 0.975}

stacked_quantiles = {
    name: np.array(
        [
            estimator.predict(X_test, quantiles=level)
            for estimator in fold_estimators
        ]
    )
    for name, level in quantile_levels.items()
}

mean_predictions = stacked_quantiles["median"]
lower_bound_predictions = stacked_quantiles["lower"]
upper_bound_predictions = stacked_quantiles["upper"]

In [206]:
mean_predictions = np.mean(mean_predictions, axis=0)
lower_bound_predictions = np.mean(lower_bound_predictions, axis=0)
upper_bound_predictions = np.mean(upper_bound_predictions, axis=0)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, mean_predictions))
r2 = r2_score(y_test, mean_predictions)

print(f"RMSE: {rmse:.2f}")
print(f"R\u00b2: {r2:.2f}")

### Plots

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(y_test.values, mean_predictions, c="b", s=40, alpha=0.5)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = y_test["UVA254"] - mean_predictions
plt.figure(figsize=(10, 5))
plt.scatter(mean_predictions, residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [211]:
# get training residuals
train_predictions = np.array(
    [
        estimator.predict(X_train, quantiles=0.5)
        for estimator in rf_ensemble["estimator"]
    ]
)
train_predictions = np.mean(train_predictions, axis=0)

train_residuals = y_train["UVA254"] - train_predictions

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series of the residuals and of the train set
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
    ax=ax[0],
)
sns.lineplot(
    x=y_train.index,
    y=train_predictions,
    label="Fitted Values",
    ax=ax[0],
)

sns.lineplot(
    x=y_train.index,
    y=train_residuals,
    label="Training Residuals",
    ax=ax[1],
)

plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test.index,
    y=y_test["UVA254"],
    label="True Values",
)
sns.lineplot(
    x=y_test.index, y=mean_predictions, label="Predicted Values"
)
# plot std of predictions
plt.fill_between(
    y_test.index,
    lower_bound_predictions,
    y2=upper_bound_predictions,
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
    )
)

plt.text(
    y_train.index[6],
    35,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(f"UVA254 Predictions")

plt.legend()
plt.show()

Comment: the model predicts an increasing trend, probably driven by one of the features.

### Store Results

In [216]:
rf_pred = mean_predictions
rf_lower_bound = lower_bound_predictions
rf_upper_bound = upper_bound_predictions
rf_rmse = rmse
rf_r2 = r2

# XGBoost

### Train Model

#### Hyperparameter Tuning

In [217]:
def fit_and_validate_xgb_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an XGBoost regressor on one split and return its validation RMSE.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target aligned with ``X``; rows are selected with ``.iloc``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Keyword arguments forwarded to ``xgb.XGBRegressor``.

    Returns:
        Root mean squared error of the predictions on the validation rows.
    """
    X_tra, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tra, y_val = y.iloc[train_index], y.iloc[val_index]

    model = xgb.XGBRegressor(random_state=42, **params)
    model.fit(X_tra, y_tra)

    y_val_pred = model.predict(X_val)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
    # keyword is deprecated and no longer accepted by recent scikit-learn.
    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [218]:
def objective(trial: optuna.trial.Trial, X_tr, y_tr) -> float:
    """Optuna objective for the linear-booster XGBoost model.

    Samples one hyperparameter configuration from ``trial``, evaluates it with
    ``fit_and_validate_xgb_model`` on each of 5 ``TimeSeriesSplit`` folds of
    ``(X_tr, y_tr)`` and returns the mean of the fold scores.

    The per-fold scores are stored on the trial under the user attribute
    "split_mae" (the key says MAE, but the helper returns an RMSE).
    """
    # NOTE(review): both "eta" and "learning_rate" are tuned; XGBoost documents
    # eta as an alias of learning_rate — confirm which one takes effect.
    sampled = {
        "eta": trial.suggest_float("eta", 1e-5, 1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1, log=True),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-5, 1, log=True
        ),
        "n_estimators": trial.suggest_int("n_estimators", 1, 500),
        "updater": trial.suggest_categorical(
            "updater", ["shotgun", "coord_descent"]
        ),
    }

    params = {
        "objective": "reg:squarederror",
        "booster": "gblinear",
        **sampled,
        "eval_metric": "rmse",
    }

    splitter = TimeSeriesSplit(n_splits=5)
    fold_scores = [
        fit_and_validate_xgb_model(X_tr, y_tr, fit_rows, holdout_rows, params)
        for fit_rows, holdout_rows in splitter.split(X_tr, y_tr)
    ]

    # Keep the individual fold scores with the trial (remove this line if
    # they are not wanted).
    trial.set_user_attr("split_mae", fold_scores)

    return np.mean(fold_scores)

In [None]:
# Load the persisted study if the storage already holds it, create it
# otherwise, then run only the trials still missing from the 100-trial budget.
# Checking for the file and calling load_study fails when the file exists
# without this study, and never resumes an interrupted run.
# Every recorded trial counts towards the budget, whatever its state.
xgb_trial_budget = 100

study = optuna.create_study(
    direction="minimize",
    storage="sqlite:///XGBoost-Extended.sqlite3",
    study_name="Hyperparameter Tuning - XGBoost",
    load_if_exists=True,
)

xgb_trials_left = xgb_trial_budget - len(study.trials)
if xgb_trials_left > 0:
    study.optimize(
        lambda trial: objective(trial, X_train, y_train),
        n_trials=xgb_trials_left,
        show_progress_bar=True,
    )

In [None]:
study.best_params

In [221]:
# Best sampled hyperparameters plus the two fixed settings of the search.
params = {
    **study.best_params,
    "objective": "reg:squarederror",
    "booster": "gblinear",
}

In [222]:
# params['n_estimators'] = 150
# params['learning_rate'] = 0.3

In [223]:
# param = {}
# param["objective"] = "reg:squarederror"
# param["booster"] = "gblinear"
# param["n_estimators"] = 10
# param["learning_rate"] = 0.5

In [224]:
# params = {
#     "objective": "reg:squarederror",
#     "booster": "gblinear",
#     "eta": 0.021222820197838683,
#     "reg_lambda": 1.2716135487076726e-07,
#     "reg_alpha": 0.25584966658518155,
#     "learning_rate": 0.6602531811820622,
#     "n_estimators": 369,
#     "updater": "shotgun",
# }

In [None]:
booster = xgb.XGBRegressor(
    random_state=42,
    **params,
)

booster.fit(X_train, y_train["UVA254"])

### Feature Importance

In [None]:
# show the importance of each feature in the model
feature_importance = pd.DataFrame()
feature_importance["feature"] = booster.feature_names_in_
feature_importance["importance"] = booster.feature_importances_

feature_importance = feature_importance.sort_values(
    by="importance", ascending=False
)

# plot the importance of each feature
plt.figure(figsize=(25, 7.5))
plt.bar(
    x=feature_importance["feature"],
    height=feature_importance["importance"],
)

# rotate the x axis words by 45°
plt.xticks(rotation=45)

plt.title("Feature Importance")
plt.show()

### Predictions

In [227]:
# Bootstrap the training set: refit the tuned model on each resample and
# collect its test-set predictions (one column per iteration).
n_iterations = 100

n_size = len(X_train)
predictions = np.zeros((len(X_test), n_iterations))
metrics = []

# 1-D view of the target. y_test.values has shape (n, 1); subtracting an
# (n,) prediction vector from it broadcasts to an (n, n) matrix and yields a
# wrong MAE.
y_test_values = y_test["UVA254"].to_numpy()

for i in range(n_iterations):
    # Bootstrap sample (random state changes each iteration)
    X_resampled, y_resampled = resample(
        X_train, y_train, n_samples=n_size, random_state=i
    )

    # Train the model with the best hyperparameters
    model = xgb.XGBRegressor(**params, random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test set
    y_pred = model.predict(X_test)
    predictions[:, i] = y_pred

    # RMSE of this bootstrap model. sqrt(MSE) is used because the
    # squared=False keyword is no longer accepted by recent scikit-learn.
    metric = np.sqrt(mean_squared_error(y_test_values, y_pred))
    metrics.append(metric)

# Convert to a numpy array for easier calculation
metrics = np.array(metrics)

# Mean of the per-iteration RMSE values
mean_rmse = np.mean(metrics)

# 2.5th / 97.5th percentiles of the bootstrap predictions per test row
lower_bound = np.percentile(predictions, 2.5, axis=1)
upper_bound = np.percentile(predictions, 97.5, axis=1)

# Mean prediction per test row
mean_predictions = np.mean(predictions, axis=1)

# "model" is the regressor fitted in the last bootstrap iteration.
# "rmse" averages the per-iteration RMSEs, while "r2" and "mae" are computed
# on the mean prediction.
xgb_results = {
    "y_pred": mean_predictions,
    "y_pred_lower": lower_bound,
    "y_pred_upper": upper_bound,
    "model": model,
    "rmse": mean_rmse,
    "r2": r2_score(y_test_values, mean_predictions),
    "mae": np.mean(np.abs(y_test_values - mean_predictions)),
}

### Plots

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(
    y_test["UVA254"], xgb_results['y_pred'], c="b", s=40, alpha=0.5
)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = y_test["UVA254"] - xgb_results['y_pred']
plt.figure(figsize=(10, 5))
plt.scatter(xgb_results['y_pred'], residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [231]:
# get training residuals
train_predictions = booster.predict(X_train)
train_residuals = y_train["UVA254"] - train_predictions

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train.index, y=train_predictions, label="Fitted Values"
)

plt.show()

In [None]:
y_med = xgb_results["y_pred"]
y_lower = xgb_results["y_pred_lower"]
y_upper = xgb_results["y_pred_upper"]

rmse = xgb_results["rmse"]
r2 = xgb_results["r2"]

plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test.index,
    y=y_test["UVA254"],
    label="True Values",
)
sns.lineplot(x=y_test.index, y=y_med, label="Predicted Values")
# plot std of predictions
plt.fill_between(
    y_test.index,
    y_lower,
    y2=y_upper,
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
    )
)

plt.text(
    y_train.index[6],
    35,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(
    f"UVA254 Predictions"
)

plt.legend()
plt.show()

Comment: the model predicts an increasing trend, probably driven by one of the features.

### Store Results

In [236]:
boost_pred = y_med
boost_lower_bound = y_lower
boost_upper_bound = y_upper
boost_rmse = rmse
boost_r2 = r2
boost_mae = xgb_results["mae"]

# LightGBM (Gradient-Boosted Trees with Linear Regressors on leaves)

### Train Model

#### Hyperparameter Tuning

In [237]:
def fit_and_validate_lgbm_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit a linear-tree LightGBM regressor on one split; return validation RMSE.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target aligned with ``X``; rows are selected with ``.iloc``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Extra estimator parameters applied with ``set_params``, or
            ``None`` to keep the defaults set here.

    Returns:
        Root mean squared error of the predictions on the validation rows.
    """
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = LGBMRegressor(
        objective="regression",
        random_state=42,
        linear_tree=True,
    )

    if params is not None:
        model.set_params(**params)

    model.fit(X_tr, y_tr)

    y_val_pred = model.predict(X_val)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
    # keyword is deprecated and no longer accepted by recent scikit-learn.
    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [238]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective for the LightGBM model.

    Samples one hyperparameter configuration from ``trial``, evaluates it with
    ``fit_and_validate_lgbm_model`` on each of 5 ``TimeSeriesSplit`` folds of
    ``(X_cv, y_cv)`` and returns the mean of the fold scores.

    The per-fold scores are stored on the trial under the user attribute
    "split_mae" (the key says MAE, but the helper returns an RMSE).
    """
    # NOTE(review): LightGBM documents "subsample" as an alias of
    # "bagging_fraction" and "min_child_samples" as an alias of
    # "min_data_in_leaf"; both members of each pair are tuned here — confirm
    # which value LightGBM ends up using.
    search_space = {}
    search_space["n_estimators"] = trial.suggest_int(
        "n_estimators", 1, 20, step=1
    )
    search_space["learning_rate"] = trial.suggest_float(
        "learning_rate", 1e-3, 1, log=True
    )
    search_space["max_depth"] = trial.suggest_int("max_depth", 2, 16, step=1)
    search_space["num_leaves"] = trial.suggest_int("num_leaves", 2, 20, step=1)
    search_space["min_data_in_leaf"] = trial.suggest_int(
        "min_data_in_leaf", 2, 50, step=1
    )
    search_space["lambda_l1"] = trial.suggest_float(
        "lambda_l1", 1e-3, 10, log=True
    )
    search_space["lambda_l2"] = trial.suggest_float(
        "lambda_l2", 1e-3, 10, log=True
    )
    search_space["min_split_gain"] = trial.suggest_float(
        "min_split_gain", 0, 15, step=0.5
    )
    search_space["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    search_space["bagging_fraction"] = trial.suggest_float(
        "bagging_fraction", 1e-3, 1, log=True
    )
    search_space["feature_fraction"] = trial.suggest_float(
        "feature_fraction", 1e-3, 1, log=True
    )
    search_space["min_child_samples"] = trial.suggest_int(
        "min_child_samples", 20, 1000, log=True
    )
    search_space["max_bin"] = trial.suggest_int("max_bin", 10, 500, step=10)

    splitter = TimeSeriesSplit(n_splits=5)
    fold_scores = [
        fit_and_validate_lgbm_model(
            X_cv, y_cv, fit_rows, holdout_rows, search_space
        )
        for fit_rows, holdout_rows in splitter.split(X_cv, y_cv)
    ]

    # Keep the individual fold scores with the trial (remove this line if
    # they are not wanted).
    trial.set_user_attr("split_mae", fold_scores)

    return np.mean(fold_scores)

In [None]:
# Load the persisted study if the storage already holds it, create it
# otherwise, then run only the trials still missing from the 100-trial budget.
# Checking for the file and calling load_study fails when the file exists
# without this study, and never resumes an interrupted run.
# Every recorded trial counts towards the budget, whatever its state.
lgbm_trial_budget = 100

study = optuna.create_study(
    direction="minimize",
    storage="sqlite:///LGBM-Extended.sqlite3",
    study_name="Hyperparameter Tuning - LGBM",
    load_if_exists=True,
)

lgbm_trials_left = lgbm_trial_budget - len(study.trials)
if lgbm_trials_left > 0:
    study.optimize(
        lambda trial: objective(trial, X_train, y_train),
        n_trials=lgbm_trials_left,
        show_progress_bar=True,
    )

In [None]:
study.best_params

In [241]:
params = study.best_params

# params['n_estimators'] = 10
# params["max_bin"] = 30
# params["learning_rate"] = 0.6
# params['lambda_l2'] = 1

In [242]:
# %%script false --no-raise-error
# params = {
#     "n_estimators": 16,
#     "learning_rate": 0.6192800859019298,
#     "max_depth": 16,
#     "num_leaves": 20,
#     "min_data_in_leaf": 34,
#     "lambda_l1": 1.8585248563175933,
#     "lambda_l2": 0.020368547806226774,
#     "min_split_gain": 2.5,
#     "subsample": 0.5639096844841955,
#     "bagging_fraction": 0.026474369917739878,
#     "feature_fraction": 0.0012608584366219668,
#     "min_child_samples": 33,
#     "max_bin": 20,
# }

#### Train model

In [None]:


# Bootstrap the training set: refit the tuned model on each resample and
# collect its test-set predictions (one column per iteration).
n_iterations = 100

n_size = len(X_train)
predictions = np.zeros((len(X_test), n_iterations))
metrics = []

# 1-D view of the target. y_test.values has shape (n, 1); subtracting an
# (n,) prediction vector from it broadcasts to an (n, n) matrix and yields a
# wrong MAE.
y_test_values = y_test["UVA254"].to_numpy()

for i in range(n_iterations):
    # Bootstrap sample (random state changes each iteration)
    X_resampled, y_resampled = resample(
        X_train, y_train, n_samples=n_size, random_state=i
    )

    # Train the model with the best hyperparameters
    model = LGBMRegressor(
        objective="regression",
        random_state=42,
        linear_tree=True,
    )

    model.set_params(**params)

    model.fit(X_resampled, y_resampled)

    # Predict on the test set
    y_pred = model.predict(X_test)
    predictions[:, i] = y_pred

    # RMSE of this bootstrap model. sqrt(MSE) is used because the
    # squared=False keyword is no longer accepted by recent scikit-learn.
    metric = np.sqrt(mean_squared_error(y_test_values, y_pred))
    metrics.append(metric)

# Convert to a numpy array for easier calculation
metrics = np.array(metrics)

# Mean of the per-iteration RMSE values
mean_rmse = np.mean(metrics)

# 2.5th / 97.5th percentiles of the bootstrap predictions per test row
lower_bound = np.percentile(predictions, 2.5, axis=1)
upper_bound = np.percentile(predictions, 97.5, axis=1)

# Mean prediction per test row
mean_predictions = np.mean(predictions, axis=1)

# "model" is the regressor fitted in the last bootstrap iteration.
# "rmse" averages the per-iteration RMSEs, while "r2" and "mae" are computed
# on the mean prediction.
lgbm_results = {
    "y_pred": mean_predictions,
    "y_pred_lower": lower_bound,
    "y_pred_upper": upper_bound,
    "model": model,
    "rmse": mean_rmse,
    "r2": r2_score(y_test_values, mean_predictions),
    "mae": np.mean(np.abs(y_test_values - mean_predictions)),
}

### Feature Importance

In [None]:
# show the importance of each feature in the model
# NOTE(review): `model` is the regressor left over from the last bootstrap
# iteration of the training loop, so these importances describe a single
# resampled fit — confirm that this is the model meant to be inspected.
feature_importance = pd.DataFrame()
feature_importance["feature"] = model.feature_name_
feature_importance["importance"] = model.feature_importances_

# Most important features first.
feature_importance = feature_importance.sort_values(
    by="importance", ascending=False
)

# plot the importance of each feature
plt.figure(figsize=(25, 7.5))
plt.bar(
    x=feature_importance["feature"],
    height=feature_importance["importance"],
)
plt.title("Feature Importance")
plt.show()

### Predictions

### Plots

In [245]:
y_med = lgbm_results["y_pred"]
y_lower = lgbm_results["y_pred_lower"]
y_upper = lgbm_results["y_pred_upper"]

rmse = lgbm_results["rmse"]
r2 = lgbm_results["r2"]

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(
    y_test["UVA254"], y_med, c="b", s=40, alpha=0.5
)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = y_test["UVA254"] - y_med
plt.figure(figsize=(10, 5))
plt.scatter(y_med, residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [None]:
# get training residuals
# NOTE(review): `model` is the regressor left over from the last bootstrap
# iteration of the training loop, i.e. it was fitted on a resampled training
# set; the XGBoost section uses a model fitted on the full training set for
# the same diagnostic — confirm this difference is intended.
train_predictions = model.predict(X_train)
train_residuals = y_train["UVA254"] - train_predictions

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train.index,
    y=train_predictions,
    label="Fitted Values",
)

plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test.index,
    y=y_test["UVA254"],
    label="True Values",
)
sns.lineplot(x=y_test.index, y=y_med, label="Predicted Values")
# plot std of predictions
plt.fill_between(
    y_test.index,
    y_lower.flatten(),
    y2=y_upper.flatten(),
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"RMSE = {rmse:.2f}",
        f"R\u00b2 = {r2:.2f}",
    )
)

plt.text(
    y_train.index[6],
    35,
    s=text_string,
    fontsize=12,
    bbox=props,
)

plt.title(
    f"UVA254 Predictions"
)

plt.legend()
plt.show()

Comment: the model predicts an increasing trend, probably driven by one of the features.

### Store Results

In [254]:
lgbm_pred = y_med
lgbm_lower_bound = y_lower
lgbm_upper_bound = y_upper
lgbm_rmse = rmse
lgbm_r2 = r2
lgbm_mae = lgbm_results["mae"]

# Neural Network

## Train the model

### Hyperparameter Tuning

In [255]:
def fit_and_validate_nn_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an MLP regressor on one split and return its validation RMSE.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target aligned with ``X``; rows are selected with ``.iloc``.
        train_index: Positional indices of the training rows.
        val_index: Positional indices of the validation rows.
        params: Mapping with a "layers" entry (sequence of hidden-layer
            sizes); every other entry is applied to the estimator with
            ``set_params``. The mapping is not modified.

    Returns:
        Root mean squared error of the predictions on the validation rows.
    """
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = MLPRegressor(
        random_state=42,
        hidden_layer_sizes=tuple(params["layers"]),
        max_iter=1000,
    )

    # "layers" is consumed above; everything else maps onto estimator settings.
    estimator_settings = {
        name: value for name, value in params.items() if name != "layers"
    }
    model.set_params(**estimator_settings)

    model.fit(X_tr, y_tr)

    y_val_pred = model.predict(X_val)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
    # keyword is deprecated and no longer accepted by recent scikit-learn.
    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [256]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective: mean time-series CV score of one MLP configuration.

    Samples the network architecture and optimiser settings from ``trial``,
    scores the configuration on each of 5 ``TimeSeriesSplit`` folds with
    ``fit_and_validate_nn_model`` and returns the mean of the fold scores.

    The per-fold scores are also attached to the trial under the user
    attribute ``"split_mae"``. Despite that key name, the values are
    whatever ``fit_and_validate_nn_model`` returns (an RMSE in this file).
    """
    # Architecture: the layer count is sampled first, then one width per
    # layer. The order of the suggest calls is kept as-is.
    depth = trial.suggest_int("n_layers", 2, 2)
    layer_widths = []
    for layer_idx in range(depth):
        layer_widths.append(
            trial.suggest_int(f"n_units_{layer_idx}", 50, 100, step=5)
        )

    config = {"layers": layer_widths}
    config["activation"] = trial.suggest_categorical(
        "activation", ["identity", "logistic", "tanh", "relu"]
    )
    config["solver"] = trial.suggest_categorical("solver", ["sgd", "adam"])
    config["alpha"] = trial.suggest_float("alpha", 1e-5, 1)
    config["learning_rate"] = trial.suggest_categorical(
        "learning_rate", ["constant", "invscaling", "adaptive"]
    )
    config["power_t"] = trial.suggest_float("power_t", 0.1, 1)
    config["beta_1"] = trial.suggest_float("beta_1", 0.1, 1)
    config["beta_2"] = trial.suggest_float("beta_2", 0.1, 1)
    config["epsilon"] = trial.suggest_float("epsilon", 1e-8, 1)
    # Fixed, not tuned.
    config["early_stopping"] = True

    # Score the configuration on each chronological fold.
    splitter = TimeSeriesSplit(n_splits=5)
    fold_scores = [
        fit_and_validate_nn_model(X_cv, y_cv, fit_idx, holdout_idx, config)
        for fit_idx, holdout_idx in splitter.split(X_cv, y_cv)
    ]

    # Keep the individual fold scores with the trial for later inspection;
    # comment this line out if they are not wanted.
    trial.set_user_attr("split_mae", fold_scores)

    return np.mean(fold_scores)

In [257]:
# Reuse the tuning results stored in the local SQLite file when it exists;
# otherwise create the study and run the search (100 trials) once.
if not os.path.exists("MLP-Extended.sqlite3"):
    study = optuna.create_study(
        study_name="Hyperparameter Tuning - MLP",
        storage="sqlite:///MLP-Extended.sqlite3",
        direction="minimize",
        load_if_exists=True,
    )
    study.optimize(
        lambda trial: objective(trial, X_train, y_train),
        n_trials=100,
        show_progress_bar=True,
    )
else:
    study = optuna.load_study(
        study_name="Hyperparameter Tuning - MLP",
        storage="sqlite:///MLP-Extended.sqlite3",
    )

In [258]:
# Hyperparameters of the study's best trial, as a flat dict keyed by the
# names passed to the trial's suggest_* calls.
params = study.best_params

In [259]:
# Hard-coded hyperparameter set, intentionally left disabled.
# NOTE(review): presumably the best parameters from an earlier tuning run,
# kept as a fallback for when the Optuna storage is unavailable - confirm
# before re-enabling.
# params = {
#     "n_layers": 2,
#     "n_units_0": 85,
#     "n_units_1": 75,
#     "activation": "relu",
#     "solver": "adam",
#     "alpha": 0.7765540584565614,
#     "learning_rate": "constant",
#     "power_t": 0.3382710741601535,
#     "beta_1": 0.19887581875693028,
#     "beta_2": 0.984060053664114,
#     "epsilon": 0.32827083622604075,
# }

#### Train model

In [None]:
# Bootstrap ensemble of MLPs: refit the tuned network on `n_iterations`
# resampled copies of the training set and aggregate the test predictions.
n_size = len(X_train)
predictions = np.zeros((len(X_test), n_iterations))
metrics = []

# `n_layers` / `n_units_<k>` are tuning bookkeeping, not MLPRegressor
# arguments: turn them into `hidden_layer_sizes` and strip them from the
# keyword set. This does not depend on the iteration, so do it once.
hidden_layer_sizes = [
    params[f"n_units_{k}"] for k in range(params["n_layers"])
]
params_copy = params.copy()
for j in range(params["n_layers"]):
    params_copy.pop(f"n_units_{j}")
params_copy.pop("n_layers")

# NOTE(review): the tuning objective in this notebook fixes
# early_stopping=True, but that flag is not a suggested parameter and is
# therefore absent from `params`; the fits below use MLPRegressor's default.
# Confirm this difference is intended.
for i in range(n_iterations):
    # Bootstrap sample (the resampling seed changes each iteration; the
    # network initialisation seed stays fixed).
    X_resampled, y_resampled = resample(
        X_train, y_train, n_samples=n_size, random_state=i
    )

    model = MLPRegressor(
        random_state=42,
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=1000,
    )
    model.set_params(**params_copy)

    # Fit on the bootstrap sample, then predict the test set.
    model.fit(X_resampled, y_resampled.values.ravel())
    y_pred = model.predict(X_test)
    predictions[:, i] = y_pred

    # Test RMSE of this single fit. `squared=False` was removed from
    # scikit-learn's mean_squared_error, so take the square root explicitly.
    metrics.append(np.sqrt(mean_squared_error(y_test, y_pred)))

# Convert to a numpy array for easier calculation
metrics = np.array(metrics)

# Mean of the per-fit RMSEs (not the RMSE of the averaged prediction)
mean_rmse = np.mean(metrics)

# 2.5th / 97.5th percentiles of the bootstrap predictions per test point
lower_bound = np.percentile(predictions, 2.5, axis=1)
upper_bound = np.percentile(predictions, 97.5, axis=1)

# Ensemble point prediction: mean over the bootstrap fits
mean_predictions = np.mean(predictions, axis=1)

mlp_results = {
    "y_pred": mean_predictions,
    "y_pred_lower": lower_bound,
    "y_pred_upper": upper_bound,
    # Only the estimator from the final bootstrap iteration is kept.
    "model": model,
    "rmse": mean_rmse,
    "r2": r2_score(y_test, mean_predictions),
    # Flatten the target first: subtracting an (n,) vector from the (n, 1)
    # `y_test.values` would broadcast to an (n, n) matrix and average all
    # pairwise differences instead of the element-wise errors.
    "mae": np.mean(np.abs(y_test.values.ravel() - mean_predictions)),
}

In [261]:
# Evaluate prediction and coverage level on testing set
y_med = mlp_results["y_pred"]
y_lower = mlp_results["y_pred_lower"]
y_upper = mlp_results["y_pred_upper"]

rmse = mlp_results["rmse"]
r2 = mlp_results["r2"]

## Plots

In [None]:
# plot the true vs predicted values
plt.figure(figsize=(10, 5))
plt.scatter(
    y_test["UVA254"], y_med, c="b", s=40, alpha=0.5
)
plt.axline([0, 0], [1, 1], color="red", linestyle="--")
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title(f"Predictions vs True Values")
plt.show()

In [None]:
# plot the residuals
residuals = y_test["UVA254"] - y_med
plt.figure(figsize=(10, 5))
plt.scatter(y_med, residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Test Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the residuals
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True)
plt.title("Test Set Residuals Distribution")
plt.show()

In [265]:
# get training residuals
train_predictions = model.predict(X_train)
train_residuals = y_train["UVA254"] - train_predictions

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(
    train_predictions.flatten(), train_residuals, c="b", s=40, alpha=0.5
)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train["DateTime"],
    y=train_predictions,
    label="Fitted Values",
)

plt.show()

In [None]:
# Forecast overview on a single time axis: the training history, the
# observed test values, and the current model's predictions (`y_med`)
# with the band spanned by `y_lower` / `y_upper`.
plt.figure(figsize=(20, 10))

# Observed target, training period followed by test period.
sns.lineplot(x=y_train.index, y=y_train["UVA254"], label="Historical Data")
sns.lineplot(x=y_test.index, y=y_test["UVA254"], label="True Values")

# Point predictions over the test period.
sns.lineplot(x=y_test.index, y=y_med, label="Predicted Values")

# Shaded region between the lower and upper prediction bounds.
plt.fill_between(
    y_test.index,
    y_lower.flatten(),
    y2=y_upper.flatten(),
    color="g",
    alpha=0.2,
    label="95% Prediction Interval",
)

plt.xlabel("DateTime")
plt.ylabel("UVA254")

# Summary box with the test metrics, anchored near the start of the
# training period.
props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
text_string = "\n".join([f"RMSE = {rmse:.2f}", f"R\u00b2 = {r2:.2f}"])
plt.text(y_train.index[6], 35, s=text_string, fontsize=12, bbox=props)

plt.title("UVA254 Predictions")
plt.legend()
plt.show()

## Store Results

In [271]:
# Snapshot the generic result names under neural-network-specific names
# for use in the final comparison plots.
nn_pred, nn_lower_bound, nn_upper_bound = y_med, y_lower, y_upper
nn_rmse, nn_r2 = rmse, r2
nn_mae = mlp_results["mae"]

# Final Plot

In [None]:
# create a plot for the comparison of the models
from cProfile import label


plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train.index,
    y=y_train["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_test.index,
    y=y_test["UVA254"],
    label="True Values",
    color="red",
)

# Linear Regression
sns.lineplot(
    x=y_test.index,
    y=boost_pred,
    label="XGBoost",
    linestyle="--",
    color="blue",
)
plt.fill_between(
    y_test.index,
    boost_lower_bound,
    boost_upper_bound,
    alpha=0.2,
    color="blue",
    label="95% Prediction Interval",
)

# Random Forest
sns.lineplot(
    x=y_test.index,
    y=lgbm_pred,
    label="Light GBM",
    linestyle="--",
    color="orange",
)
plt.fill_between(
    y_test.index,
    lgbm_lower_bound,
    lgbm_upper_bound,
    alpha=0.2,
    color="orange",
    label="95% Prediction Interval",
)

# Neural Network
sns.lineplot(
    x=y_test.index,
    y=nn_pred,
    label="Neural Network",
    linestyle="--",
    color="green",
)
plt.fill_between(
    y_test.index,
    nn_lower_bound,
    nn_upper_bound,
    alpha=0.2,
    color="green",
    label="95% Prediction Interval",
)

plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"XGBoost RMSE = {boost_rmse:.2f}, R\u00b2 = {boost_r2:.2f}, MAE = {boost_mae:.2f}",
        f"Light GBM RMSE = {lgbm_rmse:.2f}, R\u00b2 = {lgbm_r2:.2f}, MAE = {lgbm_mae:.2f}",
        f"Neural Network RMSE = {nn_rmse:.2f}, R\u00b2 = {nn_r2:.2f} MAE = {nn_mae:.2f}",
    )
)

plt.text(
    y_train.index[6],
    -2,
    s=text_string,
    fontsize=16,
    bbox=props,
)

plt.title(f"UVA254 - Model Comparison")

plt.legend()
plt.show()

In [None]:
# create a plot for the comparison of the models
plt.figure(figsize=(20, 10))
# sns.lineplot(x=y_train['DateTime'], y=y_train['UVA254'], label='Historical Data')
sns.lineplot(
    x=y_test.index,
    y=y_test["UVA254"],
    label="True Values",
    color="red",
)

# XGBoost
sns.lineplot(
    x=y_test.index,
    y=boost_pred,
    label="XGBoost",
    linestyle="--",
    color="blue",
)
plt.fill_between(
    y_test.index,
    boost_lower_bound,
    boost_upper_bound,
    alpha=0.2,
    color="blue",
    label="95% Prediction Interval",
)

# Light GBM
sns.lineplot(
    x=y_test.index,
    y=lgbm_pred,
    label="Light GBM",
    linestyle="--",
    color="orange",
)
plt.fill_between(
    y_test.index,
    lgbm_lower_bound,
    lgbm_upper_bound,
    alpha=0.2,
    color="orange",
    label="95% Prediction Interval",
)

# Neural Network
sns.lineplot(
    x=y_test.index,
    y=nn_pred,
    label="MLP Neural Network",
    linestyle="--",
    color="green",
)
plt.fill_between(
    y_test.index,
    nn_lower_bound,
    nn_upper_bound,
    alpha=0.2,
    color="green",
    label="95% Prediction Interval",
)

plt.ylim(-5, 41)

plt.xlabel("DateTime")
plt.ylabel("UVA254")

# add rmse and r2 to the plot in a box
props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"XGBoost RMSE = {boost_rmse:.2f}, R\u00b2 = {boost_r2:.2f}, MAE = {boost_mae:.2f}",
        f"Light GBM RMSE = {lgbm_rmse:.2f}, R\u00b2 = {lgbm_r2:.2f}, MAE = {lgbm_mae:.2f}",
        f"Neural Network RMSE = {nn_rmse:.2f}, R\u00b2 = {nn_r2:.2f} MAE = {nn_mae:.2f}",
    )
)

plt.text(
    y_test.index[6],
    36,
    s=text_string,
    fontsize=16,
    bbox=props,
)

plt.title(f"UVA254 - Model Comparison")

plt.legend()
plt.show()