In [None]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import resample

import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor

import optuna

from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Define Paths and Load Data

In [None]:
# Relative paths to the Berlin data folder and its "clean_data" subfolder.
data_folder = os.path.join(os.pardir, os.pardir, "data", "berlin")
clean_data_folder = os.path.join(data_folder, "clean_data")

In [None]:
# Load the surface dataset (surface.xlsx) from the clean data folder.
surface_path = os.path.join(clean_data_folder, "surface.xlsx")
surface_df = pd.read_excel(surface_path)

In [None]:
# Load the ground dataset (ground.xlsx) from the clean data folder.
ground_path = os.path.join(clean_data_folder, "ground.xlsx")
ground_df = pd.read_excel(ground_path)

In [None]:
# Identifier columns and bacteria measurements; both groups are dropped
# from the station frames before the VIF and feature-selection steps.
diff_columns = ["DateTime", "Station"]
bacteria_columns = [
    f"{organism} (MPN/100ml)"
    for organism in ("E.Coli", "Enterococcus", "Coliform")
]

# Multicollinearity Test

We use the following rules of thumb for interpreting VIF values:

* VIF = 1: There is no correlation between a given predictor variable and any other predictor variables in the model.
* VIF between 1 and 5: There is moderate correlation between a given predictor variable and other predictor variables in the model.
* VIF > 5: There is severe correlation between a given predictor variable and other predictor variables in the model.

## Surface

In [None]:
# List the available columns of the surface dataset.
surface_df.columns.tolist()

In [None]:
# Variance inflation factors, computed separately for every station.
# DOC is the response; the remaining physico-chemical variables are the
# predictors whose mutual correlation is being assessed.
vif_terms = [
    'Q("Air Temperature (°C)")',
    'Q("Ammonium (mg/l)")',
    'Q("Conductivity (µS/cm)")',
    'Q("Dissolved Oxygen (mg/l)")',
    'Q("Nitrate (mg/l)")',
    'Q("Water Temperature (°C)")',
    'pH',
    'Q("Flow River Rate (m³/s)")',
    'Q("Cumulated Rainfall (mm)")',
]
vif_formula = 'Q("DOC (mg/l)") ~ ' + "+".join(vif_terms)

vifs = {}
for station_id in surface_df['Station'].unique():
    station_rows = surface_df[surface_df['Station'] == station_id]

    # Keep only complete rows of the modelling variables.
    station_rows = station_rows.drop(columns=diff_columns + bacteria_columns).dropna()

    # Design matrix for the DOC regression. patsy adds an "Intercept"
    # column by default, so it also shows up as a row in the VIF table.
    doc_response, design_matrix = dmatrices(
        vif_formula,
        data=station_rows,
        return_type='dataframe'
    )

    # One row per design-matrix column with its VIF.
    vifs[station_id] = pd.DataFrame(
        {
            'variable': design_matrix.columns,
            'VIF': [
                variance_inflation_factor(design_matrix.values, col_idx)
                for col_idx in range(design_matrix.shape[1])
            ],
        }
    )

In [None]:
# VIF table computed for station 105.
vifs[105]

In [None]:
# VIF table computed for station 305.
vifs[305]

In [None]:
# VIF table computed for station 325.
vifs[325]

# Feature Selection

## Surface

In [None]:
def backward_feature_selection(
    X_tr, y_tr, X_ts, y_ts, pvalue_threshold=0.05
):
    """Backward elimination of OLS predictors driven by p-values.

    Starting from all columns of ``X_tr``, an OLS model with an intercept is
    fitted on the training split and evaluated on the test split. The
    predictor with the largest p-value is then removed and the model is
    refitted; this repeats until no remaining predictor has a p-value
    greater than or equal to ``pvalue_threshold``.

    Parameters
    ----------
    X_tr, X_ts : pandas.DataFrame
        Training / test predictors with identical column names.
    y_tr, y_ts : pandas.DataFrame or pandas.Series
        Training / test target. A single-column frame or a series; it is
        flattened to one dimension internally.
    pvalue_threshold : float, default 0.05
        A predictor is eliminated while its p-value is >= this value.

    Returns
    -------
    results : dict
        Keyed by 0, 1, 2, ... (one entry per fitted model). Each value is a
        dict with ``y_pred``, ``y_lower_bound``, ``y_upper_bound`` (95 %
        observation interval), ``train_res``, ``residuals``, ``rmse``,
        ``r2``, ``aic`` and the fitted ``model``.
    summaries : dict
        Keyed by 1, 2, 3, ... ; ``model.summary2()`` of each fitted model.
    bfs_insights : dict
        Keyed by 1, 2, 3, ... ; a DataFrame with one row per feature used at
        that step (columns ``step``, ``n_features``, ``features``,
        ``dropped_feature``). ``dropped_feature`` is the predictor removed
        just before this step's fit (``None`` for the first step).

    Notes
    -----
    ``results`` is keyed from 0 while ``summaries`` / ``bfs_insights`` are
    keyed from 1; entry ``k`` of the latter two corresponds to
    ``results[k - 1]``.
    """
    included = list(X_tr.columns)

    # Flatten the targets once so the function does not depend on a
    # particular target column name.
    y_train = np.asarray(y_tr)
    y_true = np.asarray(y_ts).ravel()

    summaries = {}
    results = {}
    bfs_insights = {}
    index = 0
    excluded_variable = None

    while True:
        # has_constant="add" forces the intercept column even when one of the
        # selected predictors happens to be constant in this split; with the
        # default ("skip") the intercept would silently be omitted, so the
        # train and test design matrices could end up with different widths
        # and the first p-value would no longer belong to the intercept.
        model = sm.OLS(
            y_train, sm.add_constant(X_tr[included], has_constant="add")
        ).fit()

        # PREDICTION STEP
        predictions = model.get_prediction(
            sm.add_constant(X_ts[included], has_constant="add")
        ).summary_frame(alpha=0.05)
        y_pred = predictions["mean"]

        results[index] = {
            "y_pred": y_pred,
            "y_lower_bound": predictions["obs_ci_lower"],
            "y_upper_bound": predictions["obs_ci_upper"],
            "train_res": model.resid,
            "residuals": y_true - y_pred.values,
            "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
            "r2": r2_score(y_true, y_pred),
            "aic": model.aic,
            "model": model,
        }

        # FEATURE SELECTION STEP
        index += 1

        bfs_insights[index] = pd.DataFrame(
            {
                "step": index,
                "n_features": len(included),
                "features": included,
                "dropped_feature": excluded_variable,
            }
        )
        summaries[index] = model.summary2()

        # The intercept is always the first coefficient (see add_constant
        # above); it is never a candidate for elimination.
        pvalues = model.pvalues.iloc[1:]

        # Stop when every remaining predictor is significant. An empty or
        # all-NaN p-value vector yields NaN here, which also stops the loop.
        if not (pvalues.max() >= pvalue_threshold):
            break

        excluded_variable = pvalues.idxmax()
        included.remove(excluded_variable)

    return results, summaries, bfs_insights

In [None]:
def extend_features(df: pd.DataFrame, lags: int, rolling_window: int, poly_degree: int):
    """Expand a feature frame with polynomial, lagged and rolling-mean columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric input features. Columns named "Year" and "Month" take part in
        the polynomial expansion but get no lag / rolling columns.
    lags : int
        Number of lagged copies (``<col>_lag1`` ... ``<col>_lag{lags}``)
        created per column.
    rolling_window : int
        Window length of the rolling mean stored as
        ``<col>_rolling{rolling_window}``.
    poly_degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        The polynomial expansion of ``df`` followed by the lag and rolling
        columns, carrying the same index as ``df``. NaNs introduced at the
        start by shifting / rolling are back-filled.
    """
    base_columns = df.columns

    # Polynomial expansion. The input index is kept so the result still
    # aligns row-for-row with any target that shares the input's index.
    poly = PolynomialFeatures(degree=poly_degree)
    expanded = pd.DataFrame(
        poly.fit_transform(df),
        columns=poly.get_feature_names_out(base_columns),
        index=df.index,
    )

    # Lagged and rolling-mean versions of every original variable except the
    # calendar columns. They are collected first and attached in one concat
    # to avoid inserting columns into the frame one at a time.
    derived = {}
    for col in base_columns.difference(["Year", "Month"]):
        series = expanded[col]
        for lag in range(1, lags + 1):
            derived[f"{col}_lag{lag}"] = series.shift(lag)
        derived[f"{col}_rolling{rolling_window}"] = series.rolling(rolling_window).mean()

    if derived:
        expanded = pd.concat([expanded, pd.DataFrame(derived)], axis=1)

    # Leading NaNs (from shift / rolling) are filled with the next valid value.
    return expanded.bfill()

In [None]:
# Share of each station's rows (in their existing order) used for training;
# the remaining tail is the test split.
TRAIN_FRACTION = 0.7
# Predictors are eliminated while their p-value is >= this threshold.
PVALUE_THRESHOLD = 0.05

results_dict = {}
summaries_dict = {}
bfs_insights_dict = {}

for station_id in surface_df['Station'].unique():
    # Work on a copy so the Year/Month assignments below do not write into a
    # slice of surface_df (avoids SettingWithCopyWarning).
    df = surface_df[surface_df['Station'] == station_id].copy()

    # add the year and month columns
    df["Year"] = df["DateTime"].dt.year
    df["Month"] = df["DateTime"].dt.month

    df = df.drop(columns=diff_columns + bacteria_columns).dropna()

    X = df.drop(columns=["DOC (mg/l)"])
    y = df[["DOC (mg/l)"]]

    # Optional feature expansion, currently disabled:
    # X = extend_features(X, lags=2, rolling_window=3, poly_degree=2)

    cols = X.columns
    split_at = int(TRAIN_FRACTION * len(X))

    # Normalize the data. The scaler is fitted on the training rows only so
    # that the value ranges of the test rows do not leak into training; the
    # same transformation is then applied to all rows.
    scaler = MinMaxScaler().fit(X.iloc[:split_at])
    X = pd.DataFrame(scaler.transform(X), columns=cols)

    X_tr, X_ts = X.iloc[:split_at], X.iloc[split_at:]
    y_tr, y_ts = y.iloc[:split_at], y.iloc[split_at:]

    results, summaries, bfs_insights = backward_feature_selection(
        X_tr, y_tr, X_ts, y_ts, pvalue_threshold=PVALUE_THRESHOLD
    )

    results_dict[station_id] = results
    summaries_dict[station_id] = summaries
    bfs_insights_dict[station_id] = bfs_insights

In [None]:
# Print, for every station, the report of each elimination step followed by
# the feature set of the last step.
for station_id in surface_df['Station'].unique():
    print(f"=== Station {station_id} ===")

    summaries = summaries_dict[station_id]
    bfs_insights = bfs_insights_dict[station_id]
    results = results_dict[station_id]

    for i, step_summary in summaries.items():
        step = bfs_insights[i]["step"].iloc[0]
        step_frame = bfs_insights[step]

        print(f"================= Step {step} =================")
        print()
        print(f"N Features: {step_frame['n_features'].iloc[0]}")
        print(f"Features: {step_frame['features'].values.tolist()}")
        print(
            f"Dropped Feature: {step_frame['dropped_feature'].iloc[0]}"
        )

        print()
        print(step_summary)
        print("\n")

    print("\n\n")
    print("=============================================")
    print("\n\n")

    # `step` still holds the last step visited by the loop above.
    final_frame = bfs_insights[step]
    print(f"Final Features for Station {station_id}")
    print("=================")
    print()
    print(f"N Features: {final_frame['n_features'].iloc[0]}")
    print(f"Features: {final_frame['features'].values}")
        