In [45]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import resample

import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor

import optuna

# Define Paths and Load Data

In [46]:
data_folder = os.path.join("..", "..", "..", "data", "berlin")
clean_data_folder = os.path.join(data_folder, "clean_data")

In [47]:
surface_df = pd.read_excel(os.path.join(clean_data_folder, "surface.xlsx"))

In [48]:
ground_df = pd.read_excel(os.path.join(clean_data_folder, "ground.xlsx"))

In [49]:
diff_columns = ["DateTime", "Station"]
bacteria_columns = [
    "E.Coli (MPN/100ml)",
    "Enterococcus (MPN/100ml)",
    "Coliform (MPN/100ml)"
]

# Modelling

In [50]:
def extend_features(df: pd.DataFrame, lags: int, rolling_window: int, poly_degree: int):
    
    initial_features = df.columns
    # add polynomial features
    poly = PolynomialFeatures(degree=poly_degree)
    df_poly = poly.fit_transform(df)
    df = pd.DataFrame(df_poly, columns=poly.get_feature_names_out(df.columns))
    
    # add lagged, rolling and expanding features for each variable in df
    for col in initial_features.difference(["Year", "Month"]):
        for lag in range(1, lags + 1):
            df[f"{col}_lag{lag}"] = df[col].shift(lag)
            
        df[f"{col}_rolling{rolling_window}"] = df[col].rolling(rolling_window).mean()
        
    # fill NaN values with bfill
    df.bfill(inplace=True)
    
    df.drop(columns=['1'], inplace=True)
    return df

## Surface

In [51]:
drop_columns = [
    "Ammonium (mg/l)",
    "Conductivity (µS/cm)",
    "Dissolved Oxygen (mg/l)",
    "Nitrate (mg/l)",
    "pH",
]

In [52]:
surface_df.drop(columns=drop_columns, inplace=True)

In [53]:
train_size = 0.7

In [54]:
datasets = {}

# Prepare the data for the models
for station_id in surface_df['Station'].unique():
    df = surface_df[surface_df['Station'] == station_id]
    
    # add the year and month columns
    df["Year"] = df["DateTime"].dt.year
    df["Month"] = df["DateTime"].dt.month
    
    # Save the datetime column for later (drop diff returns error
    # if I remove it before)
    datetime_column = df.drop(columns=bacteria_columns).dropna()["DateTime"]
    
    df = df.drop(columns=diff_columns + bacteria_columns).dropna()
    
    X = df.drop(columns=["DOC (mg/l)"])
    y = df[["DOC (mg/l)"]]
    
    X = extend_features(X, lags=1, rolling_window=3, poly_degree=2)
    
    # Normalize the data
    scaler = MinMaxScaler()
    cols = X.columns
    
    X = scaler.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)
    
    # Add the datetime column back
    X["DateTime"] = datetime_column.values
    y["DateTime"] = datetime_column.values
    
    
    X = X.set_index("DateTime")
    y = y.set_index("DateTime")
    
    X_tr, X_ts = X[:int(train_size * len(X))], X[int(train_size * len(X)):]
    y_tr, y_ts = y[:int(train_size * len(y))], y[int(train_size * len(y)):]
    
    datasets[station_id] = (X_tr, X_ts, y_tr, y_ts)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

### Linear Regression

In [55]:
lr_results = {}

for station_id in surface_df['Station'].unique():
    X_tr, X_ts, y_tr, y_ts = datasets[station_id]
    
    model = sm.OLS(y_tr, sm.add_constant(X_tr))
    results = model.fit()
    
    predictions = results.get_prediction(sm.add_constant(X_ts)).summary_frame(alpha=0.05)
    
    lr_results[station_id] = {
        "y_pred": predictions['mean'],
        "y_pred_lower": predictions['mean_ci_lower'],
        "y_pred_upper": predictions['mean_ci_upper'],
        "model": results,
        "rmse": np.sqrt(mean_squared_error(y_ts, predictions["mean"])),
        "r2": r2_score(y_ts, predictions["mean"]),
    }

### Random Forest

#### Hyperparameter Tuning

In [56]:
def fit_and_validate_rf(
    X,
    y,
    train_index,
    val_index,
    params,
):
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = RandomForestRegressor(
        n_estimators=params["n_estimators"],
        max_depth=params["max_depth"],
        min_samples_split=params["min_samples_split"],
        min_samples_leaf=params["min_samples_leaf"],
        random_state=42,
    )

    # train model
    _ = model.fit(X_tr, y_tr)

    # obtain predictions
    y_val_pred = model.predict(X_val)

    # return metrics
    return mean_squared_error(y_val, y_val_pred, squared=False)

In [57]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    # define the hyperparameters to search over
    
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 500, step=10),
        "max_depth": trial.suggest_int("max_depth", 1, 32),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
    }
    
    
    n_splits = 5
    cv = TimeSeriesSplit(n_splits=n_splits)
    cv_rmse = [None] * n_splits
    for i, (train_index, test_index) in enumerate(cv.split(X_cv)):
        cv_rmse[i] = fit_and_validate_rf(
            X_cv, y_cv, train_index, test_index, params
        )

    # saving the individual fold holdout metrics
    # uncomment this line if you don't want this
    trial.set_user_attr("split_rmse", cv_rmse)

    return np.mean(cv_rmse)

In [58]:
rf_studies = {}

for station_id in surface_df['Station'].unique():
    
    X_tr, _, y_tr, _ = datasets[station_id]
    
    if os.path.exists(f"RandomForest-Station{station_id}-Extended.sqlite3"):
        
        study = optuna.load_study(
        study_name="Hyperparameter Tuning - RandomForest"
        + " + "
        + f"Station {station_id}",
        storage=f"sqlite:///RandomForest-Station{station_id}-Extended.sqlite3",
    )

    else:
        
        study = optuna.create_study(
            direction="minimize",
            storage=f"sqlite:///RandomForest-Station{station_id}-Extended.sqlite3",
            study_name="Hyperparameter Tuning - RandomForest"
            + " + "
            + f"Station {station_id}",
            load_if_exists=True,
        )
        study.optimize(lambda trial: objective(trial, X_tr, y_tr), n_trials=100, show_progress_bar=True)
    
    rf_studies[station_id] = study

#### Prediction

In [59]:
rf_results = {}

n_iterations = 100

for station_id in surface_df['Station'].unique():
    params = rf_studies[station_id].best_params
    
    X_tr, X_ts, y_tr, y_ts = datasets[station_id]
    
    n_size = len(X_tr)
    predictions = np.zeros((len(X_ts), n_iterations))
    metrics = []
    
    for i in range(n_iterations):
    # Bootstrap sample (random state changes each iteration)
        X_resampled, y_resampled = resample(X_tr, y_tr, n_samples=n_size, random_state=i)
        
        # Train the model with the best hyperparameters
        model = RandomForestRegressor(random_state=42, **params)
        model.fit(X_resampled, y_resampled)
        
        # Predict on the validation set
        y_pred = model.predict(X_ts)
        predictions[:, i] = y_pred
        
        # Calculate and store the metric (e.g., RMSE)
        metric = mean_squared_error(y_ts, y_pred, squared=False)
        metrics.append(metric)
        
    
    
    # Convert to a numpy array for easier calculation
    metrics = np.array(metrics)
    
    # Calculate the mean RMSE
    mean_rmse = np.mean(metrics)
    
    # Calculate 95% confidence interval of the predictions
    lower_bound = np.percentile(predictions, 2.5, axis=1)
    upper_bound = np.percentile(predictions, 97.5, axis=1)
    
    # Calculate the mean predictions
    mean_predictions = np.mean(predictions, axis=1)
    
    rf_results[station_id] = {
        "y_pred": mean_predictions,
        "y_pred_lower": lower_bound,
        "y_pred_upper": upper_bound,
        "model": model,
        "rmse": mean_rmse,
        "r2": r2_score(y_ts, mean_predictions),
    }


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

### XGBoost

#### Hyperparameter Tuning

In [60]:
def fit_and_validate_xgb_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = xgb.XGBRegressor(random_state=42, **params)

    # train model
    _ = model.fit(X_tr, y_tr)

    # obtain predictions
    y_val_pred = model.predict(X_val)

    # return metrics
    return mean_squared_error(y_val, y_val_pred, squared=False)

In [61]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    eta = trial.suggest_float("eta", 1e-5, 1, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 1, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 1, log=True)
    learning_rate = trial.suggest_float(
        "learning_rate", 1e-5, 1, log=True
    )
    n_estimators = trial.suggest_int("n_estimators", 1, 500)
    updater = trial.suggest_categorical(
        "updater", ["shotgun", "coord_descent"]
    )

    params = {
        "objective": "reg:squarederror",
        "booster": "gblinear",
        "eta": eta,
        "reg_lambda": reg_lambda,
        "reg_alpha": reg_alpha,
        "learning_rate": learning_rate,
        "updater": updater,
        "n_estimators": n_estimators,
        "eval_metric": "rmse",
    }

    n_splits = 5
    cv = TimeSeriesSplit(n_splits=n_splits)
    cv_rmse = [None] * n_splits
    for i, (train_index, test_index) in enumerate(
        cv.split(X_cv, y_cv)
    ):
        cv_rmse[i] = fit_and_validate_xgb_model(
            X_cv,
            y_cv,
            train_index,
            test_index,
            params,
        )

    # saving the individual fold holdout metrics
    # uncomment this line if you don't want this
    trial.set_user_attr("split_rmse", cv_rmse)

    return np.mean(cv_rmse)

In [62]:
xgb_studies = {}

for station_id in surface_df['Station'].unique():
    
    X_tr, _, y_tr, _ = datasets[station_id]

    if os.path.exists(f"XGBoost-Station{station_id}-Extended.sqlite3"):
            
        study = optuna.load_study(
        study_name="Hyperparameter Tuning - XGBoost"
        + " + "
        + f"Station{station_id}",
        storage=f"sqlite:///XGBoost-Station{station_id}-Extended.sqlite3",
        )
            
    else:
            
        study = optuna.create_study(
            direction="minimize",
            storage=f"sqlite:///XGBoost-Station{station_id}-Extended.sqlite3",
            study_name="Hyperparameter Tuning - XGBoost"
            + " + "
            + f"Station{station_id}",
            load_if_exists=True,
        )
        study.optimize(lambda trial: objective(trial, X_tr, y_tr), n_trials=100, show_progress_bar=True)
            
    xgb_studies[station_id] = study

#### Prediction

In [63]:
xgb_results = {}

n_iterations = 100

for station_id in surface_df['Station'].unique():
    params = xgb_studies[station_id].best_params
    
    params["objective"] = "reg:squarederror"
    params["booster"] = "gblinear"
    
    X_tr, X_ts, y_tr, y_ts = datasets[station_id]
    
    n_size = len(X_tr)
    predictions = np.zeros((len(X_ts), n_iterations))
    metrics = []
    
    for i in range(n_iterations):
    # Bootstrap sample (random state changes each iteration)
        X_resampled, y_resampled = resample(X_tr, y_tr, n_samples=n_size, random_state=i)
        
        # Train the model with the best hyperparameters
        model = xgb.XGBRegressor(**params, random_state=42)
        model.fit(X_resampled, y_resampled)
        
        # Predict on the validation set
        y_pred = model.predict(X_ts)
        predictions[:, i] = y_pred
        
        # Calculate and store the metric (e.g., RMSE)
        metric = mean_squared_error(y_ts, y_pred, squared=False)
        metrics.append(metric)
        
    
    
    # Convert to a numpy array for easier calculation
    metrics = np.array(metrics)
    
    # Calculate the mean RMSE
    mean_rmse = np.mean(metrics)
    
    # Calculate 95% confidence interval of the predictions
    lower_bound = np.percentile(predictions, 2.5, axis=1)
    upper_bound = np.percentile(predictions, 97.5, axis=1)
    
    # Calculate the mean predictions
    mean_predictions = np.mean(predictions, axis=1)
    
    xgb_results[station_id] = {
        "y_pred": mean_predictions,
        "y_pred_lower": lower_bound,
        "y_pred_upper": upper_bound,
        "model": model,
        "rmse": mean_rmse,
        "r2": r2_score(y_ts, mean_predictions),
    }

### LightGBM

#### Hyperparameter Tuning

In [64]:
def fit_and_validate_lgbm_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = LGBMRegressor(
        objective="regression",
        random_state=42,
        linear_tree=True,
    )

    if params is not None:
        model.set_params(**params)

    # train model
    _ = model.fit(X_tr, y_tr)

    # obtain predictions
    y_val_pred = model.predict(X_val)

    # return metrics
    return mean_squared_error(y_val, y_val_pred, squared=False)

In [65]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    config = {
        "n_estimators": trial.suggest_int(
            "n_estimators", 1, 20, step=1
        ),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-3, 1, log=True
        ),
        "max_depth": trial.suggest_int("max_depth", 2, 16, step=1),
        "num_leaves": trial.suggest_int("num_leaves", 2, 20, step=1),
        "min_data_in_leaf": trial.suggest_int(
            "min_data_in_leaf", 2, 50, step=1
        ),
        "lambda_l1": trial.suggest_float(
            "lambda_l1", 1e-3, 10, log=True
        ),
        "lambda_l2": trial.suggest_float(
            "lambda_l2", 1e-3, 10, log=True
        ),
        "min_split_gain": trial.suggest_float(
            "min_split_gain", 0, 15, step=0.5
        ),
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 1e-3, 1, log=True
        ),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 1e-3, 1, log=True
        ),
        "min_child_samples": trial.suggest_int(
            "min_child_samples", 20, 1000, log=True
        ),
        "max_bin": trial.suggest_int("max_bin", 10, 500, step=10),
    }

    n_splits = 5
    cv = TimeSeriesSplit(n_splits=n_splits)
    cv_rmse = [None] * n_splits
    for i, (train_index, test_index) in enumerate(
        cv.split(X_cv, y_cv)
    ):
        cv_rmse[i] = fit_and_validate_lgbm_model(
            X_cv,
            y_cv,
            train_index,
            test_index,
            config,
        )
        
    # saving the individual fold holdout metrics
    # uncomment this line if you don't want this
    trial.set_user_attr("split_rmse", cv_rmse)
    
    return np.mean(cv_rmse)

In [66]:
lgbm_studies = {}

for station_id in surface_df['Station'].unique():
        
        X_tr, _, y_tr, _ = datasets[station_id]
    
        if os.path.exists(f"LGBM-Station{station_id}-Extended.sqlite3"):
                
            study = optuna.load_study(
            study_name="Hyperparameter Tuning - LGBM"
            + " + "
            + f"Station {station_id}",
            storage=f"sqlite:///LGBM-Station{station_id}-Extended.sqlite3",
            )
                
        else:
                
            study = optuna.create_study(
                direction="minimize",
                storage=f"sqlite:///LGBM-Station{station_id}-Extended.sqlite3",
                study_name="Hyperparameter Tuning - LGBM"
                + " + "
                + f"Station {station_id}",
                load_if_exists=True,
            )
            study.optimize(lambda trial: objective(trial, X_tr, y_tr), n_trials=100, show_progress_bar=True)
                
        lgbm_studies[station_id] = study

#### Prediction

In [67]:
lgbm_results = {}

n_iterations = 100

for station_id in surface_df['Station'].unique():
    params = lgbm_studies[station_id].best_params
    
    X_tr, X_ts, y_tr, y_ts = datasets[station_id]
    
    n_size = len(X_tr)
    predictions = np.zeros((len(X_ts), n_iterations))
    metrics = []
    
    for i in range(n_iterations):
    # Bootstrap sample (random state changes each iteration)
        X_resampled, y_resampled = resample(X_tr, y_tr, n_samples=n_size, random_state=i)
        
        # Train the model with the best hyperparameters
        model = LGBMRegressor(
        objective="regression",
        random_state=42,
        linear_tree=True,
        )
        
        model.set_params(**params)
        
        model.fit(X_resampled, y_resampled)
        
        # Predict on the validation set
        y_pred = model.predict(X_ts)
        predictions[:, i] = y_pred
        
        # Calculate and store the metric (e.g., RMSE)
        metric = mean_squared_error(y_ts, y_pred, squared=False)
        metrics.append(metric)
        
    
    
    # Convert to a numpy array for easier calculation
    metrics = np.array(metrics)
    
    # Calculate the mean RMSE
    mean_rmse = np.mean(metrics)
    
    # Calculate 95% confidence interval of the predictions
    lower_bound = np.percentile(predictions, 2.5, axis=1)
    upper_bound = np.percentile(predictions, 97.5, axis=1)
    
    # Calculate the mean predictions
    mean_predictions = np.mean(predictions, axis=1)
    
    lgbm_results[station_id] = {
        "y_pred": mean_predictions,
        "y_pred_lower": lower_bound,
        "y_pred_upper": upper_bound,
        "model": model,
        "rmse": mean_rmse,
        "r2": r2_score(y_ts, mean_predictions),
    }

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 689
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 35
[LightGBM] [Info] Start training from score 5.933745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 675
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 35
[LightGBM] [Info] Start training from score 5.839389
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 607
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 35
[LightGBM] [Info] Start training from s

### MultiLayerPerceptron

#### Hyperparameter Tuning

In [68]:
def fit_and_validate_nn_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    X_tr, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    model = MLPRegressor(
        random_state=42,
        hidden_layer_sizes=tuple(params["layers"]),
        max_iter=1000,
    )

    param = params.copy()
    param.pop("layers")
    model.set_params(**param)

    # train model
    _ = model.fit(X_tr, y_tr.values.ravel())

    # obtain predictions
    y_val_pred = model.predict(X_val)

    # return metrics
    return mean_squared_error(y_val, y_val_pred, squared=False)

In [69]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    config = {
        "layers": [
            trial.suggest_int(f"n_units_{i}", 50, 100, step=5)
            for i in range(trial.suggest_int("n_layers", 2, 2))
        ],
        "activation": trial.suggest_categorical(
            "activation", ["identity", "logistic", "tanh", "relu"]
        ),
        "solver": trial.suggest_categorical("solver", ["sgd", "adam"]),
        "alpha": trial.suggest_float("alpha", 1e-5, 1),
        "learning_rate": trial.suggest_categorical(
            "learning_rate", ["constant", "invscaling", "adaptive"]
        ),
        "power_t": trial.suggest_float("power_t", 0.1, 1),
        "beta_1": trial.suggest_float("beta_1", 0.1, 1),
        "beta_2": trial.suggest_float("beta_2", 0.1, 1),
        "epsilon": trial.suggest_float("epsilon", 1e-8, 1),
        "early_stopping": True,
    }

    n_splits = 5
    cv = TimeSeriesSplit(n_splits=n_splits)
    cv_rmse = [None] * n_splits
    for i, (train_index, test_index) in enumerate(
        cv.split(X_cv, y_cv)
    ):
        cv_rmse[i] = fit_and_validate_nn_model(
            X_cv,
            y_cv,
            train_index,
            test_index,
            config,
        )
        
    # saving the individual fold holdout metrics
    # uncomment this line if you don't want this
    trial.set_user_attr("split_rmse", cv_rmse)
    
    return np.mean(cv_rmse)

In [70]:
mlp_studies = {}

for station_id in surface_df['Station'].unique():
            
    X_tr, _, y_tr, _ = datasets[station_id]

    if os.path.exists(f"MLP-Station{station_id}-Extended.sqlite3"):
            
        study = optuna.load_study(
        study_name="Hyperparameter Tuning - MLP"
        + " + "
        + f"Station {station_id}",
        storage=f"sqlite:///MLP-Station{station_id}-Extended.sqlite3",
        )
            
    else:
            
        study = optuna.create_study(
            direction="minimize",
            storage=f"sqlite:///MLP-Station{station_id}-Extended.sqlite3",
            study_name="Hyperparameter Tuning - MLP"
            + " + "
            + f"Station {station_id}",
            load_if_exists=True,
        )
        study.optimize(lambda trial: objective(trial, X_tr, y_tr), n_trials=100, show_progress_bar=True)
            
    mlp_studies[station_id] = study

#### Prediction

In [71]:
mlp_results = {}

n_iterations = 100

for station_id in surface_df['Station'].unique():
    params = mlp_studies[station_id].best_params
    
    X_tr, X_ts, y_tr, y_ts = datasets[station_id]
    
    n_size = len(X_tr)
    predictions = np.zeros((len(X_ts), n_iterations))
    metrics = []
    
    for i in range(n_iterations):
        
        # Copy since we will be modifying the params
        params_copy = params.copy()
        
        # Bootstrap sample (random state changes each iteration)
        X_resampled, y_resampled = resample(X_tr, y_tr, n_samples=n_size, random_state=i)
        
        
        hidden_layer_sizes = [
            params_copy[f"n_units_{k}"] for k in range(params_copy["n_layers"])
        ]

        for j in range(params_copy["n_layers"]):
            params_copy.pop(f"n_units_{j}")

        params_copy.pop("n_layers")
            
        model = MLPRegressor(
            random_state=42,
            hidden_layer_sizes=hidden_layer_sizes,
            max_iter=1000,
        )
    
        model.set_params(**params_copy)
        
        # Predict on the validation set
        model.fit(X_resampled, y_resampled.values.ravel())
        y_pred = model.predict(X_ts)
        predictions[:, i] = y_pred
        
        # Calculate and store the metric (e.g., RMSE)
        metric = mean_squared_error(y_ts, y_pred, squared=False)
        metrics.append(metric)
        
    
    
    # Convert to a numpy array for easier calculation
    metrics = np.array(metrics)
    
    # Calculate the mean RMSE
    mean_rmse = np.mean(metrics)
    
    # Calculate 95% confidence interval of the predictions
    lower_bound = np.percentile(predictions, 2.5, axis=1)
    upper_bound = np.percentile(predictions, 97.5, axis=1)
    
    # Calculate the mean predictions
    mean_predictions = np.mean(predictions, axis=1)
    
    mlp_results[station_id] = {
        "y_pred": mean_predictions,
        "y_pred_lower": lower_bound,
        "y_pred_upper": upper_bound,
        "model": model,
        "rmse": mean_rmse,
        "r2": r2_score(y_ts, mean_predictions),
    }


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't conver

### Plot

In [72]:
# plot the results

for station_id in surface_df['Station'].unique():
    print(f"=== Station {station_id} ===")
    
    X_tr, X_ts, y_tr, y_ts = datasets[station_id]
    
    lr_result = lr_results[station_id]
    rf_result = rf_results[station_id]
    xgb_result = xgb_results[station_id]
    lgbm_result = lgbm_results[station_id]
    mlp_result = mlp_results[station_id]
    
    rmse_lr = lr_result["rmse"]
    rmse_rf = rf_result["rmse"]
    rmse_xgb = xgb_result["rmse"]
    rmse_lgbm = lgbm_result["rmse"]
    rmse_mlp = mlp_result["rmse"]
    
    r2_lr = lr_result["r2"]
    r2_rf = rf_result["r2"]
    r2_xgb = xgb_result["r2"]
    r2_lgbm = lgbm_result["r2"]
    r2_mlp = mlp_result["r2"]
    
    print(f"Linear Regression RMSE: {rmse_lr}")
    print(f"Random Forest RMSE: {rmse_rf}")
    print(f"XGBoost RMSE: {rmse_xgb}")
    print(f"LightGBM RMSE: {rmse_lgbm}")
    print(f"Neural Network RMSE: {rmse_mlp}")
    
    print()
    
    print(f"Linear Regression R2: {r2_lr}")
    print(f"Random Forest R2: {r2_rf}")
    print(f"XGBoost R2: {r2_xgb}")
    print(f"LightGBM R2: {r2_lgbm}")
    print(f"Neural Network R2: {r2_mlp}")
    
    
    
    fig = go.Figure()
    
    # TRUE
    
    # add both the training and testing data in a unique trace
    y_true = pd.concat([y_tr, y_ts])
    
    fig.add_trace(
        go.Scatter(
            x=y_true.index,
            y=y_true["DOC (mg/l)"],
            mode="lines",
            name="True",
            line=dict(color="black"),
        )
    )
    
    # LINEAR REGRESSION
    
    fig.add_trace(
        go.Scatter(
            x=y_ts.index,
            y=lr_result["y_pred"],
            mode="lines",
            name="Linear Regression",
            line=dict(color="blue"),
        )
    )
    
    # add confidence intervals
    fig.add_traces(
        [
            go.Scatter(
                x=y_ts.index,
                y=lr_result["y_pred_lower"],
                mode="lines",
                line_color="blue",
                line=dict(dash="dash"),
                showlegend=False,
            ),
            go.Scatter(
                x=y_ts.index,
                y=lr_result["y_pred_upper"],
                mode="lines",
                line_color="blue",
                line=dict(dash="dash"),
                name="95% CI",
                fill="tonexty",
                fillcolor="rgba(0,0,255,0.2)"
            )
        ]
    )
    
    # RANDOM FOREST
    
    fig.add_trace(
        go.Scatter(
            x=y_ts.index,
            y=rf_result["y_pred"],
            mode="lines",
            name="Random Forest",
            line=dict(color="red"),
        )
    )
    
    # add confidence intervals
    fig.add_traces(
        [
            go.Scatter(
                x=y_ts.index,
                y=rf_result["y_pred_lower"],
                mode="lines",
                line_color="red",
                line=dict(dash="dash"),
                showlegend=False,
            ),
            go.Scatter(
                x=y_ts.index,
                y=rf_result["y_pred_upper"],
                mode="lines",
                line_color="red",
                line=dict(dash="dash"),
                name="95% CI",
                fill="tonexty",
                fillcolor="rgba(255,0,0,0.2)"
            )
        ]
    )
    
    # XGBOOST
    
    fig.add_trace(
        go.Scatter(
            x=y_ts.index,
            y=xgb_result["y_pred"],
            mode="lines",
            name="XGBoost",
            line=dict(color="green"),
        )
    )
    
    # add confidence intervals
    fig.add_traces(
        [
            go.Scatter(
                x=y_ts.index,
                y=xgb_result["y_pred_lower"],
                mode="lines",
                line_color="green",
                line=dict(dash="dash"),
                showlegend=False,
            ),
            go.Scatter(
                x=y_ts.index,
                y=xgb_result["y_pred_upper"],
                mode="lines",
                line_color="green",
                line=dict(dash="dash"),
                name="95% CI",
                fill="tonexty",
                fillcolor="rgba(0,255,0,0.2)"
            )
        ]
    )
    
    # LGBM
    
    fig.add_trace(
        go.Scatter(
            x=y_ts.index,
            y=lgbm_result["y_pred"],
            mode="lines",
            name="LightGBM",
            line=dict(color="purple"),
        )
    )
    
    # add confidence intervals
    fig.add_traces(
        [
            go.Scatter(
                x=y_ts.index,
                y=lgbm_result["y_pred_lower"],
                mode="lines",
                line_color="purple",
                line=dict(dash="dash"),
                showlegend=False,
            ),
            go.Scatter(
                x=y_ts.index,
                y=lgbm_result["y_pred_upper"],
                mode="lines",
                line_color="purple",
                line=dict(dash="dash"),
                name="95% CI",
                fill="tonexty",
                fillcolor="rgba(128,0,128,0.2)"
            )
        ]
    )
    
    # MLP
    
    fig.add_trace(
        go.Scatter(
            x=y_ts.index,
            y=mlp_result["y_pred"],
            mode="lines",
            name="Neural Network",
            line=dict(color="orange"),
        )
    )
    
    # add confidence intervals
    fig.add_traces(
        [
            go.Scatter(
                x=y_ts.index,
                y=mlp_result["y_pred_lower"],
                mode="lines",
                line_color="orange",
                line=dict(dash="dash"),
                showlegend=False,
            ),
            go.Scatter(
                x=y_ts.index,
                y=mlp_result["y_pred_upper"],
                mode="lines",
                line_color="orange",
                line=dict(dash="dash"),
                name="95% CI",
                fill="tonexty",
                fillcolor="rgba(255,165,0,0.2)"
            )
        ]
    )
    
    fig.update_layout(
        title=f"DOC (mg/l) - Station {station_id}",
        xaxis_title="Date",
        yaxis_title="DOC (mg/l)",
    )
    
    fig.show()

=== Station 105 ===
Linear Regression RMSE: 1.4523545291338733
Random Forest RMSE: 0.47947562127127646
XGBoost RMSE: 0.4434667483009191
LightGBM RMSE: 0.4729026078070028
Neural Network RMSE: 0.7502811502385773

Linear Regression R2: -5.60415433597191
Random Forest R2: 0.43202495458589885
XGBoost R2: 0.4677063430034678
LightGBM R2: 0.5174024740259568
Neural Network R2: -0.7433654980645912


=== Station 305 ===
Linear Regression RMSE: 1.1645765110772852
Random Forest RMSE: 1.0386456382676683
XGBoost RMSE: 1.1060707602592261
LightGBM RMSE: 1.2591437629168016
Neural Network RMSE: 1.1430613713701858

Linear Regression R2: 0.03535718607322469
Random Forest R2: 0.26260160333982396
XGBoost R2: 0.13938134651117895
LightGBM R2: 0.2184771531399231
Neural Network R2: 0.07947169913624186


=== Station 325 ===
Linear Regression RMSE: 2.910769905701122
Random Forest RMSE: 1.0530980061718223
XGBoost RMSE: 0.8139665796807232
LightGBM RMSE: 0.8469181798027792
Neural Network RMSE: 1.3088656665016887

Linear Regression R2: -13.274895017920912
Random Forest R2: -0.809443026076218
XGBoost R2: -0.10700649646879934
LightGBM R2: -0.03542972119791754
Neural Network R2: -1.8444077848201084
