In [None]:
import os

import pandas as pd
import numpy as np

import pickle

import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations
import xgboost as xgb

import optuna

# Define Paths and Load Data

In [None]:
# Dataset root: four directory levels above the working directory, then data/berlin.
data_folder = os.path.join(*([".."] * 4), "data", "berlin")
# Sub-folder of the dataset root from which the cleaned spreadsheets are read.
clean_data_folder = os.path.join(data_folder, "clean_data")

In [None]:
# Read surface.xlsx from the clean-data folder into a DataFrame.
surface_path = os.path.join(clean_data_folder, "surface.xlsx")
surface_df = pd.read_excel(surface_path)

In [None]:
# Read ground.xlsx from the clean-data folder into a DataFrame.
ground_path = os.path.join(clean_data_folder, "ground.xlsx")
ground_df = pd.read_excel(ground_path)

In [None]:
# Identifier columns that are removed before building the feature matrix.
diff_columns = [
    "DateTime",
    "Station",
]

# Bacteria measurement columns; these are dropped before modelling.
bacteria_columns = [
    "E.Coli (MPN/100ml)",
    "Enterococcus (MPN/100ml)",
    "Coliform (MPN/100ml)",
]

# Modelling

In [None]:
def extend_features(df: pd.DataFrame, lags: int, rolling_window: int, poly_degree: int):
    """Expand a numeric feature frame with polynomial, lagged and rolling-mean columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input features, one row per observation. Rows are treated as ordered,
        because lag and rolling features look at preceding rows.
    lags : int
        Number of lagged copies (``<col>_lag1`` .. ``<col>_lag{lags}``) to add
        for every original column except ``Year`` and ``Month``.
    rolling_window : int
        Window length of the rolling mean (``<col>_rolling{rolling_window}``).
    poly_degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pd.DataFrame
        Polynomial terms (without the constant bias column) followed by the
        lag / rolling columns, carrying the same index as ``df``.
    """
    initial_features = df.columns

    # Polynomial expansion. The bias term is switched off rather than created
    # and dropped afterwards by the literal column name '1', and the input
    # index is kept so the result stays aligned with the caller's data.
    poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
    extended = pd.DataFrame(
        poly.fit_transform(df),
        columns=poly.get_feature_names_out(initial_features),
        index=df.index,
    )

    # Lagged and rolling-mean features for each original variable (calendar
    # columns excluded). Collected first and attached in a single concat so
    # the frame is not grown one column at a time.
    derived = {}
    for col in initial_features.difference(["Year", "Month"]):
        for lag in range(1, lags + 1):
            derived[f"{col}_lag{lag}"] = extended[col].shift(lag)

        derived[f"{col}_rolling{rolling_window}"] = (
            extended[col].rolling(rolling_window).mean()
        )

    if derived:
        extended = pd.concat([extended, pd.DataFrame(derived)], axis=1)

    # shift/rolling leave NaN in the leading rows; back-fill them.
    # NOTE(review): back-filling copies later values into earlier rows, i.e.
    # the first rows see information from their own future - confirm that this
    # is acceptable for the modelling setup.
    return extended.bfill()

## Surface

In [None]:
# Fraction of the rows (taken from the start of the data) used as training set.
train_size = 0.70
# Value matched against the "Station" column to select the station to model.
station_id = 105

In [None]:
all_datasets = {}

# Prepare the data for the models

# Explicit copy: the filtered frame is modified below, and writing into a
# slice of surface_df triggers SettingWithCopyWarning.
df = surface_df[surface_df["Station"] == station_id].copy()

# add the year and month columns
df["Year"] = df["DateTime"].dt.year
df["Month"] = df["DateTime"].dt.month

# Drop the bacteria columns and incomplete rows once, and order the rows
# chronologically because the train/test split below is purely positional.
df = (
    df.drop(columns=bacteria_columns)
    .dropna()
    .sort_values("DateTime", kind="stable")
)

# Keep the timestamps of the surviving rows; they become the index later.
datetime_column = df["DateTime"]

df = df.drop(columns=diff_columns)

X = df.drop(columns=["DOC (mg/l)"])
# Copy so that adding the DateTime column does not write into a slice of df.
y = df[["DOC (mg/l)"]].copy()

# X = extend_features(X, lags=1, rolling_window=3, poly_degree=2)

# Position of the first test row; the first `split_at` rows form the train set.
split_at = int(len(X) * train_size)

# Normalize the data. The scaler is fitted on the training rows only so that
# the min/max of the hold-out rows do not leak into the features; hold-out
# values may therefore fall outside [0, 1].
scaler = MinMaxScaler()
cols = X.columns

scaler.fit(X.iloc[:split_at])
X = pd.DataFrame(scaler.transform(X), columns=cols)

# Add the datetime column back
X["DateTime"] = datetime_column.values
y["DateTime"] = datetime_column.values

X = X.set_index("DateTime")
y = y.set_index("DateTime")

# The target split is identical for every feature subset.
y_tr = y.iloc[:split_at]
y_ts = y.iloc[split_at:]

# create a different dataset for every possible combination of features
# (2**n_features - 1 entries, keyed by the sorted tuple of column names)

for i in range(1, len(X.columns) + 1):
    for subset in combinations(X.columns, i):

        # order the subset
        subset = sorted(subset)

        # X_tr / y_tr keep their names on purpose: after the loop they hold the
        # full-feature training split, which the tuning cell passes to the
        # objective.
        X_tr = X[subset].iloc[:split_at]
        X_ts = X[subset].iloc[split_at:]

        all_datasets[tuple(subset)] = (X_tr, X_ts, y_tr, y_ts)

### XGBoost

#### Hyperparameter Tuning

In [None]:
def fit_and_validate_xgb_model(
    X,
    y,
    train_index,
    val_index,
    params,
):
    """Fit an XGBoost regressor on one fold and return its validation RMSE.

    Parameters
    ----------
    X, y : pandas objects
        Features and target; rows are selected positionally with ``.iloc``.
    train_index, val_index : array-like of int
        Positional row indices of the training and validation parts.
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor`` (``random_state``
        is fixed to 42 here and must not be part of ``params``).

    Returns
    -------
    float
        Root mean squared error on the validation rows.
    """
    X_fit, X_holdout = X.iloc[train_index, :], X.iloc[val_index, :]
    y_fit, y_holdout = y.iloc[train_index], y.iloc[val_index]

    model = xgb.XGBRegressor(random_state=42, **params)

    # train model
    model.fit(X_fit, y_fit)

    # obtain predictions
    y_holdout_pred = model.predict(X_holdout)

    # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error is
    # deprecated and no longer accepted by recent scikit-learn releases. For a
    # single target column the value is identical.
    return float(np.sqrt(mean_squared_error(y_holdout, y_holdout_pred)))

In [None]:
def objective(trial: optuna.trial.Trial, X_cv, y_cv) -> float:
    """Optuna objective: mean time-series cross-validated RMSE of a gblinear model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X_cv, y_cv : pandas objects
        Features and target used for cross-validation; rows are split in order
        by ``TimeSeriesSplit`` with 5 splits.

    Returns
    -------
    float
        Mean RMSE over the folds. The per-fold values are also stored on the
        trial under the user attribute ``"split_rmse"``.
    """
    # `eta` is an alias of `learning_rate` in XGBoost, so only one of them is
    # sampled; tuning both would spend a search dimension on a value that is
    # overridden by the other.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 1, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 1, log=True)
    learning_rate = trial.suggest_float(
        "learning_rate", 1e-5, 1, log=True
    )
    n_estimators = trial.suggest_int("n_estimators", 1, 500)
    updater = trial.suggest_categorical(
        "updater", ["shotgun", "coord_descent"]
    )

    params = {
        "objective": "reg:squarederror",
        "booster": "gblinear",
        "reg_lambda": reg_lambda,
        "reg_alpha": reg_alpha,
        "learning_rate": learning_rate,
        "updater": updater,
        "n_estimators": n_estimators,
        "eval_metric": "rmse",
    }

    n_splits = 5
    cv = TimeSeriesSplit(n_splits=n_splits)

    # One validation RMSE per fold, in fold order.
    cv_rmse = [
        fit_and_validate_xgb_model(X_cv, y_cv, train_index, test_index, params)
        for train_index, test_index in cv.split(X_cv, y_cv)
    ]

    # saving the individual fold holdout metrics
    # comment this line out if you don't want this
    trial.set_user_attr("split_rmse", cv_rmse)

    return float(np.mean(cv_rmse))

In [None]:
xgb_studies = {}

# Name and storage must stay exactly as they are so that studies saved by
# earlier runs are found again.
study_name = f"Hyperparameter Tuning - XGBoost + Station{station_id}"
storage_url = f"sqlite:///XGBoost-Station{station_id}.sqlite3"
n_trials_target = 100

# Tune on the training split that contains every feature. It is looked up by
# key instead of relying on X_tr / y_tr left over from a loop in another cell.
full_feature_split = all_datasets[tuple(sorted(X.columns))]
X_tune, y_tune = full_feature_split[0], full_feature_split[2]

# create_study(load_if_exists=True) covers both a fresh start and an existing
# database, including a database file that does not contain the study yet.
study = optuna.create_study(
    direction="minimize",
    storage=storage_url,
    study_name=study_name,
    load_if_exists=True,
)

# Only run the trials that are still missing, so an interrupted search is
# resumed and a finished one is simply reused.
n_completed = sum(
    t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
)
n_remaining = n_trials_target - n_completed

if n_remaining > 0:
    study.optimize(
        lambda trial: objective(trial, X_tune, y_tune),
        n_trials=n_remaining,
        show_progress_bar=True,
    )

xgb_studies[station_id] = study

#### Prediction

In [None]:
xgb_results = {}

# Best hyperparameters found by the study, plus the fixed model settings that
# are not part of the search space.
params = {
    **xgb_studies[station_id].best_params,
    "objective": "reg:squarederror",
    "booster": "gblinear",
}

# Train one model per feature subset and score it on that subset's test split.
for subset, (X_tr, X_ts, y_tr, y_ts) in all_datasets.items():

    model = xgb.XGBRegressor(random_state=42, **params)

    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_ts)

    xgb_results[subset] = {
        "y_pred": y_pred,
        "y_true": y_ts,
        "r2_score": r2_score(y_ts, y_pred),
        # sqrt(MSE) instead of `squared=False`, which is deprecated and no
        # longer accepted by recent scikit-learn releases.
        "rmse": float(np.sqrt(mean_squared_error(y_ts, y_pred))),
    }

In [None]:
# Key of xgb_results (a tuple of feature names) with the lowest test RMSE.
best_subset = min(xgb_results.items(), key=lambda entry: entry[1]["rmse"])[0]

In [None]:
# Persist the per-subset results dictionary next to the notebook.
results_path = f"XGBoost-Station{station_id}.pickle"
with open(results_path, "wb") as results_file:
    pickle.dump(xgb_results, results_file)