In [52]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import resample

import xgboost as xgb

import optuna

# Define Paths and Load Data

In [53]:
# Folder layout, relative to the notebook: ../../data/berlin holds both the
# cleaned measurements and the projections; the "cat" projections sit one
# level below the projections folder.
data_folder = os.path.join("..", "..", "data", "berlin")
clean_data_folder, projections_folder = (
    os.path.join(data_folder, subfolder)
    for subfolder in ("clean_data", "projections")
)
cat_projections_folder = os.path.join(projections_folder, "cat")

In [54]:
# Read surface.xlsx from the clean-data folder into a DataFrame.
surface_path = os.path.join(clean_data_folder, "surface.xlsx")
surface_df = pd.read_excel(surface_path)

In [55]:
# Read ground.xlsx from the clean-data folder into a DataFrame.
ground_path = os.path.join(clean_data_folder, "ground.xlsx")
ground_df = pd.read_excel(ground_path)

In [56]:
# Load the four "cat" projection workbooks (river flow, air temperature,
# precipitation, water temperature) in that order, one DataFrame each.
(
    cat_flow_river_projections,
    cat_air_temp_projections,
    cat_precip_projections,
    cat_water_temp_projections,
) = (
    pd.read_excel(os.path.join(cat_projections_folder, workbook))
    for workbook in (
        "flow_river.xlsx",
        "air_temp.xlsx",
        "precip.xlsx",
        "water_temp.xlsx",
    )
)

In [57]:
# Identifier columns (timestamp and station) that are removed before modelling.
diff_columns = [
    "DateTime",
    "Station",
]

# Bacteria measurement columns; these are dropped from the feature frames.
bacteria_columns = [
    "E.Coli (MPN/100ml)",
    "Enterococcus (MPN/100ml)",
    "Coliform (MPN/100ml)",
]

# Modelling

In [58]:
def extend_features(df: pd.DataFrame, lags: int, rolling_window: int, poly_degree: int):
    """Augment a feature frame with polynomial, lagged and rolling-mean columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input features, one row per observation. Row order matters because
        the lag and rolling features are computed over consecutive rows.
    lags : int
        Number of shifted copies (``<col>_lag1`` ... ``<col>_lag<lags>``)
        added for every original column except "Year" and "Month".
    rolling_window : int
        Window length of the rolling mean (``<col>_rolling<rolling_window>``)
        added for the same columns.
    poly_degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pd.DataFrame
        Polynomial features (without the constant bias column ``'1'``)
        followed by the lag and rolling columns. NaNs introduced by shifting
        and rolling are back-filled. The result carries a fresh default
        index; the index of ``df`` is not preserved.
    """
    base_columns = df.columns

    # Polynomial expansion; the generated names keep the original column
    # names for the degree-1 terms, so they can still be looked up below.
    expander = PolynomialFeatures(degree=poly_degree)
    expanded = expander.fit_transform(df)
    features = pd.DataFrame(
        expanded, columns=expander.get_feature_names_out(base_columns)
    )

    # Lagged copies and a rolling mean for every original variable, skipping
    # the calendar columns.
    for name in base_columns.difference(["Year", "Month"]):
        series = features[name]
        for step in range(1, lags + 1):
            features[f"{name}_lag{step}"] = series.shift(step)

        features[f"{name}_rolling{rolling_window}"] = series.rolling(rolling_window).mean()

    # The first rows of the shifted/rolling columns are NaN: take the next
    # available value instead.
    features = features.bfill()

    # '1' is the bias column produced by PolynomialFeatures.
    return features.drop(columns=['1'])

## Surface

In [59]:
# Water-quality columns that are removed from the surface data before
# modelling.
drop_columns = [
    "Ammonium (mg/l)",
    "Conductivity (µS/cm)",
    "Dissolved Oxygen (mg/l)",
    "Nitrate (mg/l)",
    "pH",
]

In [60]:
# Remove the unused water-quality columns.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
surface_df = surface_df.drop(columns=drop_columns, errors="ignore")

### Prepare Data

#### Historical

In [None]:
# Train on the whole historical record of each station; the fitted models are
# later applied to the projections.

datasets = {}

# Build one (X, y) pair per station
for station_id in surface_df['Station'].unique():
    # Work on an independent, chronologically ordered copy:
    # - .copy() so the Year/Month assignments below do not write onto a slice
    #   of surface_df (SettingWithCopyWarning);
    # - the sort because the lag/rolling features built by extend_features
    #   and the later `index[-1]` "end of history" lookup rely on row order.
    df = (
        surface_df[surface_df['Station'] == station_id]
        .sort_values("DateTime", kind="stable")
        .copy()
    )

    # add the year and month columns
    df["Year"] = df["DateTime"].dt.year
    df["Month"] = df["DateTime"].dt.month

    # Drop incomplete rows once and only then split off DateTime, so the saved
    # timestamps are guaranteed to line up row-for-row with X and y.
    df = df.drop(columns=bacteria_columns).dropna()
    datetime_column = df["DateTime"]
    df = df.drop(columns=diff_columns)

    X = df.drop(columns=["DOC (mg/l)"])
    # .copy(): a DateTime column is added to y below
    y = df[["DOC (mg/l)"]].copy()

    # sort the X columns
    X = X.reindex(sorted(X.columns), axis=1)

    X = extend_features(X, lags=1, rolling_window=3, poly_degree=2)

    # Normalize the data (the result is a plain array, so the column names
    # are saved first and re-applied)
    scaler = MinMaxScaler()
    cols = X.columns

    X = scaler.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)

    # Add the datetime column back; .values because X now has a fresh default
    # index that does not match datetime_column's index
    X["DateTime"] = datetime_column.values
    y["DateTime"] = datetime_column.values

    X = X.set_index("DateTime")
    y = y.set_index("DateTime")

    datasets[station_id] = (X, y)

#### Projections

In [None]:
# Prepare projections dataset

projections = [
    cat_flow_river_projections,
    cat_air_temp_projections,
    cat_precip_projections,
    cat_water_temp_projections
]


def _expand_to_30_years(scenario_df, column, years, months):
    """Repeat each (year, month) projection value over the 30 years ending in that year.

    Parameters
    ----------
    scenario_df : pd.DataFrame
        Projections of one scenario for one station, indexed by DateTime.
    column : str
        Name of the value column to expand.
    years, months : iterable of int
        Calendar years and months to look up in ``scenario_df``.

    Returns
    -------
    pd.DataFrame
        Columns ``DateTime`` and ``column``, sorted by DateTime, with a clean
        0..n-1 index. For every (year, month) pair it holds 30 rows dated at
        the end of ``month`` in ``year - 29`` ... ``year``, all carrying the
        value found for that pair.

    Raises
    ------
    IndexError
        If ``scenario_df`` has no row for one of the (year, month) pairs.
    """
    index_years = scenario_df.index.year
    index_months = scenario_df.index.month

    frames = []
    for year in years:
        for month in months:
            in_period = (index_years == year) & (index_months == month)
            value = scenario_df.loc[in_period, column].values[0]

            # Month-end dates of `month` for the 30 years ending in `year`.
            # Built explicitly rather than with date_range(freq='Y'), whose
            # alias is deprecated since pandas 2.2.
            date_range = pd.DatetimeIndex(
                [
                    pd.Timestamp(year=y, month=int(month), day=1)
                    for y in range(int(year) - 29, int(year) + 1)
                ]
            ) + pd.offsets.MonthEnd(0)

            frames.append(
                pd.DataFrame({
                    'DateTime': date_range,
                    column: np.full(date_range.shape, value),
                })
            )

    if not frames:
        return pd.DataFrame(columns=['DateTime', column])

    # Concatenate once (no quadratic growth, no empty seed frame) and leave a
    # unique index so later column assignments align unambiguously.
    return (
        pd.concat(frames, ignore_index=True)
        .sort_values(by='DateTime', kind='stable')
        .reset_index(drop=True)
    )


projections_per_site = {}

for station_id in surface_df['Station'].unique():

    projections_per_site[station_id] = {
        'rcp45': pd.DataFrame(),
        'rcp85': pd.DataFrame()
    }

    for projection in projections:

        cat_projections_df = projection[projection["Station"] == station_id]

        # The value column is taken to be the second column of the workbook
        column = cat_projections_df.columns.to_list()[1]

        # set_index returns new frames, so nothing is written onto a slice
        cat_rcp45 = cat_projections_df[cat_projections_df['label'] == "rcp45"].set_index("DateTime")
        cat_rcp85 = cat_projections_df[cat_projections_df['label'] == "rcp85"].set_index("DateTime")

        # Extend cat_rcp45 and cat_rcp85 to a 30 years period.
        # The years/months of the RCP4.5 frame drive both scenarios, so both
        # expanded frames cover the same dates.
        years = cat_rcp45.index.year.unique()
        months = cat_rcp45.index.month.unique()

        expanded = {
            'rcp45': _expand_to_30_years(cat_rcp45, column, years, months),
            'rcp85': _expand_to_30_years(cat_rcp85, column, years, months),
        }

        is_first_projection = projections_per_site[station_id]['rcp45'].shape[0] == 0

        for scenario, new_df in expanded.items():
            if is_first_projection:
                projections_per_site[station_id][scenario] = new_df
                continue

            site_df = projections_per_site[station_id][scenario]

            # Columns are combined row by row, so the dates must be identical;
            # fail loudly instead of silently misaligning values.
            if not site_df['DateTime'].equals(new_df['DateTime']):
                raise ValueError(
                    f"Projection '{column}' for station {station_id} ({scenario}) "
                    "does not cover the same dates as the projections added before it"
                )

            site_df[column] = new_df[column]
        

In [63]:
# Turn the per-station projection frames into model-ready feature frames

projections_datasets = {}

for station_id in surface_df['Station'].unique():

    site_projections = projections_per_site[station_id]
    rcp45_df = site_projections['rcp45']
    rcp85_df = site_projections['rcp85']

    # Calendar features. These are written onto the frames stored in
    # projections_per_site, which therefore gain Year/Month columns too.
    for scenario_df in (rcp45_df, rcp85_df):
        scenario_df["Year"] = scenario_df["DateTime"].dt.year
        scenario_df["Month"] = scenario_df["DateTime"].dt.month

    # Timestamps of the RCP4.5 frame; re-attached to BOTH scenarios further
    # down, once the features have gone through the scaler.
    datetime_column = rcp45_df["DateTime"]

    # Features only, columns in alphabetical order
    rcp45_df = rcp45_df.drop(columns=['DateTime'])
    rcp45_df = rcp45_df.reindex(sorted(rcp45_df.columns), axis=1)
    rcp45_df = extend_features(rcp45_df, lags=1, rolling_window=3, poly_degree=2)

    rcp85_df = rcp85_df.drop(columns=['DateTime'])
    rcp85_df = rcp85_df.reindex(sorted(rcp85_df.columns), axis=1)
    rcp85_df = extend_features(rcp85_df, lags=1, rolling_window=3, poly_degree=2)

    # Min-max scaling. The column names of the RCP4.5 frame are reused for
    # both scaled frames.
    # NOTE(review): the scaler is fit on each projection frame itself (and
    # separately per scenario), not on the historical features used for
    # training - confirm this is intended.
    scaler = MinMaxScaler()
    cols = rcp45_df.columns

    rcp45_df = pd.DataFrame(scaler.fit_transform(rcp45_df), columns=cols)
    rcp85_df = pd.DataFrame(scaler.fit_transform(rcp85_df), columns=cols)

    # Restore the timestamps as the index
    rcp45_df = rcp45_df.assign(DateTime=datetime_column.values).set_index("DateTime")
    rcp85_df = rcp85_df.assign(DateTime=datetime_column.values).set_index("DateTime")

    projections_datasets[station_id] = {
        'rcp45': rcp45_df,
        'rcp85': rcp85_df,
    }

### Train (load tuned XGBoost hyperparameters)

In [64]:
# One stored Optuna study per station, each read from its own SQLite file in
# the current working directory.
xgb_studies = {}

for station_id in surface_df['Station'].unique():

    study_name = (
        "Hyperparameter Tuning - XGBoost"
        + " + "
        + f"Station{station_id}"
    )
    storage = f"sqlite:///XGBoost-Station{station_id}-Extended.sqlite3"

    study = optuna.load_study(study_name=study_name, storage=storage)

    xgb_studies[station_id] = study

### Prediction

In [65]:
xgb_results = {}

# Number of bootstrap refits per station
n_iterations = 100

for station_id in surface_df['Station'].unique():
    # Best hyperparameters of the station's study, with objective and booster
    # forced to a squared-error linear model
    params = xgb_studies[station_id].best_params
    params.update({"objective": "reg:squarederror", "booster": "gblinear"})

    X_tr, y_tr = datasets[station_id]

    # Same (alphabetical) column order for training and projection features
    X_tr = X_tr.reindex(sorted(X_tr.columns), axis=1)
    scenario_inputs = {}
    for scenario in ('rcp45', 'rcp85'):
        scenario_frame = projections_datasets[station_id][scenario]
        scenario_inputs[scenario] = scenario_frame.reindex(
            sorted(scenario_frame.columns), axis=1
        )

    n_size = len(X_tr)
    # One column of predictions per bootstrap iteration
    scenario_predictions = {
        scenario: np.zeros((len(features), n_iterations))
        for scenario, features in scenario_inputs.items()
    }
    metrics = []  # not populated in this cell

    for i in range(n_iterations):
        # Bootstrap sample of the training rows (the seed changes each iteration)
        X_resampled, y_resampled = resample(X_tr, y_tr, n_samples=n_size, random_state=i)

        # Refit with the best hyperparameters
        model = xgb.XGBRegressor(**params, random_state=42)
        model.fit(X_resampled, y_resampled)

        # Predict both scenarios with this refit
        for scenario, features in scenario_inputs.items():
            scenario_predictions[scenario][:, i] = model.predict(features)

    # Per time step: mean over the refits and the 2.5th / 97.5th percentiles
    # as the 95% interval
    xgb_results[station_id] = {
        scenario: {
            'mean': np.mean(draws, axis=1),
            'lower_bound': np.percentile(draws, 2.5, axis=1),
            'upper_bound': np.percentile(draws, 97.5, axis=1)
        }
        for scenario, draws in scenario_predictions.items()
    }

### Plot

#### RCP4.5

In [None]:
# Per station: historical DOC followed by the RCP4.5 mean projection and its
# 95% band

for station_id in surface_df['Station'].unique():
    print(f"=== Station {station_id} ===")

    y_true = datasets[station_id][1]
    observed_doc = y_true['DOC (mg/l)']

    # Last historical timestamp: projections are shown from here on
    end_date = y_true.index[-1]

    # Timestamps belonging to the prediction arrays
    datetime_column = projections_datasets[station_id]['rcp45'].index
    station_rcp45 = xgb_results[station_id]['rcp45']

    # Index the arrays by date and keep only the part from end_date onwards
    mean_rcp45, lower_bound_rcp45, upper_bound_rcp45 = (
        pd.Series(station_rcp45[key], index=datetime_column).loc[end_date:]
        for key in ('mean', 'lower_bound', 'upper_bound')
    )

    # Prepend the historical series so every curve starts with the observations
    combined_mean, combined_lower_bound, combined_upper_bound = (
        pd.concat([observed_doc, projected])
        for projected in (mean_rcp45, lower_bound_rcp45, upper_bound_rcp45)
    )

    traces = [
        go.Scatter(
            x=combined_mean.index,
            y=combined_mean,
            mode='lines',
            line=dict(color='blue'),
            name='RCP4.5 Projections'
        ),
        # Zero-width line: only serves as the lower edge of the band below
        go.Scatter(
            x=combined_lower_bound.index,
            y=combined_lower_bound,
            mode='lines',
            line=dict(color='blue', width=0),
            showlegend=False
        ),
        # Upper edge, filled down to the previous (lower-bound) trace
        go.Scatter(
            x=combined_upper_bound.index,
            y=combined_upper_bound,
            mode='lines',
            line=dict(color='red', width=0),
            fill='tonexty',
            fillcolor='rgba(0, 100, 80, 0.2)',
            name='95% CI',
            showlegend=True,
        ),
        # Observations drawn last, on top of the combined curve
        go.Scatter(
            x=y_true.index,
            y=y_true['DOC (mg/l)'],
            mode='lines',
            line=dict(color='black'),
            name='Historical Data'
        ),
    ]

    fig = go.Figure()
    for trace in traces:
        fig.add_trace(trace)

    fig.update_layout(
        title=f"Station {station_id} - RCP4.5",
        xaxis_title="Date",
        yaxis_title="DOC (mg/l)"
    )

    fig.show()

#### RCP8.5

In [None]:
# Per station: historical DOC followed by the RCP8.5 mean projection and its
# 95% band

for station_id in surface_df['Station'].unique():
    print(f"=== Station {station_id} ===")

    y_true = datasets[station_id][1]
    observed_doc = y_true['DOC (mg/l)']

    # Last historical timestamp: projections are shown from here on
    end_date = y_true.index[-1]

    # Timestamps belonging to the prediction arrays
    datetime_column = projections_datasets[station_id]['rcp85'].index
    station_rcp85 = xgb_results[station_id]['rcp85']

    # Index the arrays by date and keep only the part from end_date onwards
    mean_rcp85, lower_bound_rcp85, upper_bound_rcp85 = (
        pd.Series(station_rcp85[key], index=datetime_column).loc[end_date:]
        for key in ('mean', 'lower_bound', 'upper_bound')
    )

    # Prepend the historical series so every curve starts with the observations
    combined_mean, combined_lower_bound, combined_upper_bound = (
        pd.concat([observed_doc, projected])
        for projected in (mean_rcp85, lower_bound_rcp85, upper_bound_rcp85)
    )

    traces = [
        go.Scatter(
            x=combined_mean.index,
            y=combined_mean,
            mode='lines',
            line=dict(color='red'),
            name='RCP8.5 Projections'
        ),
        # Zero-width line: only serves as the lower edge of the band below
        go.Scatter(
            x=combined_lower_bound.index,
            y=combined_lower_bound,
            mode='lines',
            line=dict(color='red', width=0),
            showlegend=False
        ),
        # Upper edge, filled down to the previous (lower-bound) trace
        go.Scatter(
            x=combined_upper_bound.index,
            y=combined_upper_bound,
            mode='lines',
            line=dict(color='red', width=0),
            fill='tonexty',
            fillcolor='rgba(255, 0, 0, 0.2)',
            name='95% CI',
            showlegend=True,
        ),
        # Observations drawn last, on top of the combined curve
        go.Scatter(
            x=y_true.index,
            y=y_true['DOC (mg/l)'],
            mode='lines',
            line=dict(color='black'),
            name='Historical Data'
        ),
    ]

    fig = go.Figure()
    for trace in traces:
        fig.add_trace(trace)

    fig.update_layout(
        title=f"Station {station_id} - RCP8.5",
        xaxis_title="Date",
        yaxis_title="DOC (mg/l)"
    )

    fig.show()

### Trend Analysis