In [None]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import resample

import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor

import optuna

# Define Paths and Load Data

In [None]:
# Data directories, relative to the notebook's working directory: the Berlin
# data sits two levels up, and the cleaned files live in a sub-folder of it.
data_folder = os.path.join(os.pardir, os.pardir, "data", "berlin")
clean_data_folder = os.path.join(data_folder, "clean_data")

In [None]:
# Read the cleaned "surface" workbook into a DataFrame.
surface_path = os.path.join(clean_data_folder, "surface.xlsx")
surface_df = pd.read_excel(surface_path)

In [None]:
# Read the cleaned "ground" workbook into a DataFrame.
ground_path = os.path.join(clean_data_folder, "ground.xlsx")
ground_df = pd.read_excel(ground_path)

In [None]:
# Identifier columns (timestamp and station); they are dropped before the
# remaining columns are used as model inputs.
diff_columns = [
    "DateTime",
    "Station",
]

# Bacteria count columns; they are dropped together with the identifiers
# when the modelling frames are built.
bacteria_columns = [
    "E.Coli (MPN/100ml)",
    "Enterococcus (MPN/100ml)",
    "Coliform (MPN/100ml)",
]

# Modelling

In [None]:
def extend_features(df: pd.DataFrame, lags: int, rolling_window: int, poly_degree: int) -> pd.DataFrame:
    """Expand a feature frame with polynomial, lagged and rolling-mean columns.

    The input columns are first expanded into all polynomial terms up to
    ``poly_degree`` (the constant bias term is not generated). Then, for every
    original column except "Year" and "Month", ``lags`` shifted copies and one
    rolling mean over ``rolling_window`` rows are appended. NaNs introduced at
    the head of the frame by shifting / rolling are back-filled.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric feature frame. Shifts and rolling windows are positional, so
        rows are presumably expected in chronological order - TODO confirm
        against callers.
    lags : int
        Number of lag columns (``<col>_lag1`` ... ``<col>_lag<lags>``) to add
        per original column; ``0`` adds none.
    rolling_window : int
        Window length, in rows, of the ``<col>_rolling<rolling_window>`` mean.
    poly_degree : int
        Maximum degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pd.DataFrame
        New frame with the polynomial columns followed by the lag / rolling
        columns. It is built from a plain array, so it carries a default
        ``RangeIndex`` rather than the index of ``df``. ``df`` itself is not
        modified.
    """
    initial_features = df.columns

    # add polynomial features; include_bias=False leaves out the constant
    # column, so there is no need to generate it and later drop it by its
    # generated name '1' (which would also hit an input column named "1").
    poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
    poly_values = poly.fit_transform(df)
    extended = pd.DataFrame(
        poly_values,
        columns=poly.get_feature_names_out(initial_features),
    )

    # add lagged and rolling features for each original variable; the
    # degree-1 polynomial terms keep the original column names, so they can
    # be looked up by name here.
    for col in initial_features.difference(["Year", "Month"]):
        for lag in range(1, lags + 1):
            extended[f"{col}_lag{lag}"] = extended[col].shift(lag)

        extended[f"{col}_rolling{rolling_window}"] = extended[col].rolling(rolling_window).mean()

    # fill the leading NaN values created above with the next valid value
    return extended.bfill()

## Surface

In [None]:
# Fraction of each station's rows (taken from the start of the frame) that
# goes into the training split; the remainder is the test split.
train_size = 0.7

In [None]:
# Maps station id -> (X_tr, X_ts, y_tr, y_ts), all indexed by DateTime
datasets = {}

# Prepare the data for the models: one train/test split per station
for station_id in surface_df['Station'].unique():
    # Work on an explicit copy so the column assignments below operate on an
    # independent frame (no SettingWithCopyWarning, no link to surface_df).
    df = surface_df[surface_df['Station'] == station_id].copy()

    # add the year and month columns
    df["Year"] = df["DateTime"].dt.year
    df["Month"] = df["DateTime"].dt.month

    # Ignore the bacteria columns and keep only complete rows. The timestamps
    # are taken from this same filtered frame so they are guaranteed to be
    # row-aligned with the features; only then are the identifiers dropped.
    df = df.drop(columns=bacteria_columns).dropna()
    datetime_column = df["DateTime"]
    df = df.drop(columns=diff_columns)

    X = df.drop(columns=["DOC (mg/l)"])
    y = df[["DOC (mg/l)"]]

    # X = extend_features(X, lags=1, rolling_window=3, poly_degree=2)

    # Row position of the train/test boundary.
    # NOTE(review): the split is positional, so it assumes each station's rows
    # are already in chronological order - confirm, or sort by DateTime first.
    split_idx = int(train_size * len(X))

    # Normalize the data. The scaler is fitted on the training rows only so
    # that the min/max of the test period does not leak into the training
    # features; test values may therefore fall slightly outside [0, 1].
    # If the training slice would be empty, fall back to fitting on all rows.
    scaler = MinMaxScaler()
    cols = X.columns
    scaler.fit(X.iloc[:split_idx] if split_idx > 0 else X)
    X = pd.DataFrame(scaler.transform(X), columns=cols)

    # Add the datetime column back and use it as the index
    X = X.assign(DateTime=datetime_column.values).set_index("DateTime")
    y = y.assign(DateTime=datetime_column.values).set_index("DateTime")

    X_tr, X_ts = X.iloc[:split_idx], X.iloc[split_idx:]
    y_tr, y_ts = y.iloc[:split_idx], y.iloc[split_idx:]

    datasets[station_id] = (X_tr, X_ts, y_tr, y_ts)