# Berlin Data

DOC and TOC time series for the Havel River at the inflow to the city (Konradshöhe, Messstellennummer 305) and at the downstream station (Schleuse Spandau, Messstellennummer 320).

Two groundwater stations are attached with quality data only; DOC/TOC is not measured there, but UV254 and other parameters are. The groundwater stations are not influenced by bank filtrate and represent near-natural conditions (for a city like Berlin).

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.tsa.seasonal as smt
from googletrans import Translator

import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL

from sklearn.preprocessing import MinMaxScaler

from plotly.subplots import make_subplots
from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)

from scipy.stats import pearsonr

# Define Paths

In [None]:
# Root of the Berlin data set, given as a relative path.
data_folder = os.path.join("..", "..", "data", "berlin")

# First-level folders below the data root.
raw_data_folder, clean_data_folder, data_info_folder = (
    os.path.join(data_folder, name)
    for name in ("raw_data", "clean_data", "data_info")
)

# Raw inputs are grouped by source type.
ground_water_folder, surface_water_folder, meteorological_folder = (
    os.path.join(raw_data_folder, name)
    for name in ("ground water", "surface water", "meteorological")
)

# Load Data

## Ground Water

In [None]:
# Ground-water quality measurements.
gw_quality_csv = os.path.join(
    ground_water_folder, "time-series_ground-water_quality.csv"
)
ts_gw_df = pd.read_csv(gw_quality_csv)

In [None]:
# Preview the raw ground-water table as loaded from the CSV.
ts_gw_df

In [None]:
# German column headers of the ground-water table -> English names used in
# the rest of the notebook.
gw_column_names = {
    "Messstellennummer": "Station ID",
    "Datum": "DateTime",
    "Einheit": "Unit",
    "Messwert": "Value",
}
ts_gw_df.rename(columns=gw_column_names, inplace=True)

## Surface Water

In [None]:
# Surface-water quality measurements.
sw_quality_csv = os.path.join(
    surface_water_folder, "time-series_surface-water_quality.csv"
)
ts_sw_df = pd.read_csv(sw_quality_csv)

In [None]:
# Surface-water flow measurements.
sw_flow_csv = os.path.join(
    surface_water_folder, "time-series_surface-water_flow.csv"
)
flow_df = pd.read_csv(sw_flow_csv)

In [None]:
# Preview the raw surface-water quality table as loaded from the CSV.
ts_sw_df

In [None]:
# German column headers of the surface-water table -> English names.
sw_column_names = {
    "Messstelle": "Station",
    "Messstellennummer": "Station ID",
    "Datum": "DateTime",
    "Einheit": "Unit",
    "Wert": "Value",
    "Bestimmungsgrenze": "LOQ",
}
ts_sw_df.rename(columns=sw_column_names, inplace=True)

# Columns that are not used further in this notebook.
sw_unused_columns = ["Entnahmetiefe [m]", "Vorzeichen", "Messmethode"]
ts_sw_df.drop(columns=sw_unused_columns, inplace=True)

In [None]:
# Preview the raw flow table as loaded from the CSV.
flow_df

In [None]:
# German column headers of the flow table -> English names; the daily mean
# value ("Tagesmittelwert") becomes "Flow River".
flow_column_names = {
    "Messstellennummer": "Station ID",
    "Datum": "DateTime",
    "Einheit": "Unit",
    "Tagesmittelwert": "Flow River",
}
flow_df.rename(columns=flow_column_names, inplace=True)

## Meteorological

Daily Measurements

In [None]:
# Daily meteorological observations; this file is semicolon-separated.
meteo_csv = os.path.join(
    meteorological_folder,
    "produkt_klima_tag_19480101_20231231_00433.csv",
)
meteo_df = pd.read_csv(meteo_csv, sep=";")

In [None]:
# List the raw column names exactly as read from the file.
meteo_df.columns.to_list()

In [None]:
# Raw meteorological headers -> readable English names with units. Several
# raw headers are padded with leading spaces, so the keys must keep that
# padding to match.
meteo_column_names = {
    "STATIONS_ID": "Station ID",
    "MESS_DATUM": "DateTime",
    "  FX": "Wind Speed Max (m/s)",
    "  FM": "Wind Speed Mean (m/s)",
    " RSK": "Cumulated Rainfall (mm)",
    "RSKF": "Cumulated Rainfall Type",
    " SDK": "Sunshine Duration (hours)",
    "SHK_TAG": "Snow Height (cm)",
    "  NM": "Cloud Coverage (1/8)",
    " VPM": "Vapor Pressure (hPa)",
    "  PM": "Pressure (hPa)",
    " TMK": "Temperature Mean (°C)",
    " UPM": "Humidity (%)",
    " TXK": "Temperature Max at 2m (°C)",
    " TNK": "Temperature Min at 2m (°C)",
    " TGK": "Temperature Min at 5cm (°C)",
}
meteo_df.rename(columns=meteo_column_names, inplace=True)

# Preprocess Data

## Ground Water Dataset

In [None]:
# googletrans client. In the visible part of the notebook it is referenced
# only by the commented-out translation of the parameter names.
translator = Translator()

In [None]:
# Distinct parameter names present in the ground-water table.
parameters = ts_gw_df["Parameter"].unique()

In [None]:
# Show the parameter names as a plain Python list.
parameters.tolist()

In [None]:
# parameters_translated = [translator.translate(item, dest='en').text for item in parameters.tolist()]

In [None]:
# Free-text checklist of parameters, kept as a bare string so the cell just
# echoes it.
# NOTE(review): the meaning of the leading "-" on some entries is not
# documented here — confirm with the author.
"""Cumulated rainfall
-Environmental temperature
-Water temperature
-Conductivity
-Flow river
Turbidity
-Absorbance 254 nm
-Ammonium
Dissolved oxygen
-Nitrate
-pH
Redox potential"""

In [None]:
# parameters_translated

### Build Dataset per Station

In [None]:
# Ground-water parameters kept for the analysis: German name as it appears in
# the "Parameter" column -> English label including the unit.
variables = {
    "Temperatur (Luft)": "Air Temperature (°C)",
    "Temperatur (Wasser)": "Water Temperature (°C)",
    "UV-Adsorption (254)": "UVA254 (1/m)",
    "Leitfähigkeit 25°C vor Ort": "Conductivity (µS/cm)",
    "Ammonium": "Ammonium (mg/l)",
    "Nitrat": "Nitrate (mg/l)",
    "pH-Wert (Feld)": "pH",
    # The halogenated methanes all carry the same unit suffix.
    **{
        german: f"{english} (µg/l)"
        for german, english in (
            ("Dichlormethan", "Dichloromethane"),
            ("Trichlormethan", "Trichloromethane"),
            ("Tetrachlormethan", "Tetrachloromethane"),
            ("Bromoform", "Bromoform"),
            ("Bromdichlormethan", "Bromodichloromethane"),
            ("Dibromchlormethan", "Dibromochloromethane"),
        )
    },
}

In [None]:
# Keep only the parameters of interest and switch to their English labels.
# .copy() makes ground_df an independent frame, so the assignment below does
# not write into a filtered slice of ts_gw_df (SettingWithCopyWarning).
ground_df = ts_gw_df[ts_gw_df["Parameter"].isin(variables.keys())].copy()

ground_df["Parameter"] = ground_df["Parameter"].map(variables)

In [None]:
# Station IDs that remain after filtering on the selected parameters.
ground_df["Station ID"].unique()

In [None]:
# Parse the sampling dates into datetime values. assign() returns a new frame
# instead of writing a column into what may be a filtered slice of ts_gw_df,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): no explicit date format is given. If the CSV stores day-first
# dates (e.g. 31.12.2020), pass dayfirst=True or format= — confirm against
# the raw file.
ground_df = ground_df.assign(DateTime=pd.to_datetime(ground_df["DateTime"]))

In [None]:
# One wide table per station: rows are sampling dates, columns are parameters.
# pivot_table aggregates with its default (mean) when a date holds several
# values for the same parameter.
gw_stations_dict = {
    station: ground_df[ground_df["Station ID"] == station].pivot_table(
        index=pd.Grouper("DateTime"),
        columns="Parameter",
        values="Value",
    )
    for station in ground_df["Station ID"].unique()
}

### Analyze Stations

In [None]:
# Empty summary table: one row per statistic, one column per
# (station, parameter) pair. The cells are filled station by station below.
info_rows = pd.Index(
    [
        "N Samples",
        "% Missing Values",
        "Frequency (days)",
        "Start Date",
        "End Date",
    ],
    name="Info",
)
info_columns = pd.MultiIndex.from_product(
    [ground_df["Station ID"].unique(), variables.values()],
    names=["Station ID", "Parameter"],
)
ground_info_df = pd.DataFrame(index=info_rows, columns=info_columns)

#### 5049 - Treptow-Köpenick

In [None]:
# Work on station 5049. This binds the same object that is stored in
# gw_stations_dict (no copy), so in-place edits below also change that entry.
station_df = gw_stations_dict[5049]

In [None]:
# Fraction (0-1) of missing values per parameter.
station_df.isna().sum() / station_df.shape[0]

In [None]:
# Count how often each gap between consecutive sampling dates occurs; the
# most common gap is the effective sampling frequency.
station_df.index.to_series().diff().value_counts()

In [None]:
# most of the time series have a frequency of 6 months

In [None]:
# Parameters available at this station.
station_df.columns.to_list()

##### Time series

In [None]:
# One interactive line chart per parameter of station 5049.
for column in station_df.columns:
    axis_labels = {"Date": "Date", column: column}
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 5049",
        labels=axis_labels,
    )
    # Explicit axis titles and a larger font for readability.
    fig.update_layout(
        xaxis_title="Date", yaxis_title=column, font={"size": 18}
    )
    fig.show()

##### Boxplots

In [None]:
# Yearly box plots for every parameter of station 5049: one box per calendar
# year found in the index.
for column in station_df.columns:
    column_df = station_df[column]
    fig = go.Figure()

    for year in column_df.index.year.unique():
        in_year = column_df.index.year == year
        fig.add_trace(go.Box(y=column_df[in_year], name=year))

    fig.update_layout(
        title=f"{column} at station 5049",
        xaxis_title="Year",
        yaxis_title=column,
    )
    fig.show()

##### Invalid Values

In [None]:
# Negative concentrations are not valid measurements: replace them with NaN
# in each affected column.
for column in (
    "Ammonium (mg/l)",
    "Nitrate (mg/l)",
    "Dichloromethane (µg/l)",
    "Tetrachloromethane (µg/l)",
    "Trichloromethane (µg/l)",
):
    negative = station_df[column] < 0
    station_df.loc[negative, [column]] = np.nan

##### Store Info

In [None]:
# Summarise every parameter of station 5049 in ground_info_df.
for column in station_df.columns:
    observed = station_df[column].dropna()

    # At least two observations are needed to describe a time span.
    if observed.shape[0] <= 1:
        continue

    start_date = observed.index.min().strftime("%Y-%m-%d")
    end_date = observed.index.max().strftime("%Y-%m-%d")

    # Rows between the first and the last observation of this parameter.
    df = station_df[start_date:end_date][column]

    print(f"Start date for {column}: {start_date}")
    print(f"End date for {column}: {end_date}")

    # Fraction (0-1) of rows without a value inside that span.
    # NOTE(review): stored under the "% Missing Values" label although it is
    # a fraction — confirm how downstream code interprets it.
    missing_values = df.isna().sum() / df.shape[0]
    print(f"Missing values for {column}: {missing_values}")

    # Most common gap between consecutive rows, in whole days.
    frequency = df.index.to_series().diff().value_counts().index[0].days
    print(f"Frequency for {column}: {frequency}")
    print()

    column_info = {
        "N Samples": observed.shape[0],
        "% Missing Values": missing_values,
        "Frequency (days)": frequency,
        "Start Date": start_date,
        "End Date": end_date,
    }
    for info_name, info_value in column_info.items():
        ground_info_df.loc[info_name, (5049, column)] = info_value

##### Outliers and Missing Values Imputation

In [None]:
# Fraction (0-1) of missing values per parameter after invalid values were
# set to NaN.
station_df.isna().sum() / station_df.shape[0]

In [None]:
# Remove the chlorinated-methane columns before the outlier analysis
# (presumably because they are too sparse — see the missing-value fractions).
# The drop is in place, so the entry in gw_stations_dict changes as well.
dropped_columns = [
    "Dichloromethane (µg/l)",
    "Tetrachloromethane (µg/l)",
    "Trichloromethane (µg/l)",
]
station_df.drop(columns=dropped_columns, inplace=True)

In [None]:
# Flag outliers for every remaining parameter of the current station with two
# independent methods and plot both results in one figure:
#   * STL: a point is an outlier when its residual lies more than 3 standard
#     deviations away from the mean residual.
#   * Prophet: a point is an outlier when its in-sample error exceeds
#     1.5 times the width of the predicted uncertainty interval.
# Row 1 shows the series with both fits and the flagged points, row 2 the STL
# residuals with the threshold band, row 3 the Prophet error and uncertainty.

for column in station_df.columns:
    # Monthly medians of the observed values; months without a sample are
    # filled by time-weighted interpolation.
    df = station_df[column].dropna().resample("M").median()
    df = df.interpolate(method="time")

    date_range = station_df[column].dropna().index
    date_range = date_range.min(), date_range.max()

    # Start the series at the first occurrence of the calendar month in which
    # it ends, so that it starts and finishes in the same month.
    start_index = df[df.index.month == date_range[1].month].index[0]
    df = df.loc[start_index:]

    # === STL decomposition ===

    stl = STL(df, period=12)
    result = stl.fit()
    seasonal, trend, resid = result.seasonal, result.trend, result.resid

    denoised_df = trend + seasonal

    mean_resid = np.mean(resid)
    std_resid = np.std(resid)
    threshold = 3 * std_resid

    outliers_index = resid[
        (resid > mean_resid + threshold)
        | (resid < mean_resid - threshold)
    ].index

    print("===== STL =====")
    print()

    # Goodness of fit of trend + seasonal against the monthly series.
    MAE = mean_absolute_error(df, denoised_df)
    print(f"Mean Absolute Error (MAE): {np.round(MAE, 2)}")

    MEDAE = median_absolute_error(df, denoised_df)
    print(f"Median Absolute Error (MedAE): {np.round(MEDAE, 2)}")

    MSE = mean_squared_error(df, denoised_df)
    print(f"Mean Squared Error (MSE): {np.round(MSE, 2)}")

    # Square root of the untruncated MSE (casting the MSE to int first would
    # floor it and distort the RMSE).
    RMSE = np.sqrt(MSE)
    print(f"Root Mean Squared Error (RMSE): {np.round(RMSE, 2)}")

    # scikit-learn returns MAPE as a fraction; scale it by 100 so the printed
    # value matches the "%" label.
    MAPE = mean_absolute_percentage_error(df, denoised_df)
    print(
        f"Mean Absolute Percentage Error (MAPE): {np.round(MAPE * 100, 2)} %"
    )

    # Spread of the data versus spread of the residuals.
    std_df = df.std()

    print()
    print(f"Data std: {std_df}, Resid std: {std_resid}")

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

    threshold_line = dict(dash="dash", color="black")
    stl_traces = [
        (1, go.Scatter(x=df.index, y=df, mode="lines", name="Original")),
        (
            1,
            go.Scatter(
                x=df.index,
                y=denoised_df,
                mode="lines",
                name="Trend + Seasonal (STL)",
            ),
        ),
        (
            1,
            go.Scatter(
                x=outliers_index,
                y=df.loc[outliers_index],
                mode="markers",
                name="Outliers (STL)",
            ),
        ),
        (2, go.Scatter(x=df.index, y=resid, mode="lines", name="Residuals")),
        (
            2,
            go.Scatter(
                x=outliers_index,
                y=resid.loc[outliers_index],
                mode="markers",
                name="Outliers (STL)",
            ),
        ),
        # Upper and lower outlier thresholds drawn as horizontal lines.
        (
            2,
            go.Scatter(
                x=df.index,
                y=[mean_resid + threshold] * df.shape[0],
                mode="lines",
                name="Threshold",
                line=threshold_line,
            ),
        ),
        (
            2,
            go.Scatter(
                x=df.index,
                y=[mean_resid - threshold] * df.shape[0],
                mode="lines",
                name="Threshold",
                line=threshold_line,
            ),
        ),
    ]
    for row, trace in stl_traces:
        fig.add_trace(trace, row=row, col=1)

    # ===== Prophet =====

    # Prophet expects a frame with a "ds" (timestamp) and a "y" (value)
    # column. rename_axis returns a new object, so the name of an index that
    # may be shared with other series is left untouched.
    df = df.rename_axis("ds").reset_index(name="y")

    model = Prophet()
    model.fit(df)
    # periods=0 adds no future steps: the prediction covers the fitted dates.
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Put predictions and observations side by side.
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Prediction error and width of the uncertainty interval.
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection: error larger than factor times the interval width.
    factor = 1.5
    forecasting_final["anomaly"] = np.where(
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"],
        "Yes",
        "No",
    )

    print("===== Prophet =====")

    # scikit-learn metrics take (y_true, y_pred); the order matters for MAPE,
    # which divides by the true values.
    y_true = forecasting_final["y"]
    y_pred = forecasting_final["yhat"]

    MAE = mean_absolute_error(y_true, y_pred)
    print(f"Mean Absolute Error (MAE): {np.round(MAE, 2)}")

    MEDAE = median_absolute_error(y_true, y_pred)
    print(f"Median Absolute Error (MedAE): {np.round(MEDAE, 2)}")

    MSE = mean_squared_error(y_true, y_pred)
    print(f"Mean Squared Error (MSE): {np.round(MSE, 2)}")

    RMSE = np.sqrt(MSE)
    print(f"Root Mean Squared Error (RMSE): {np.round(RMSE, 2)}")

    MAPE = mean_absolute_percentage_error(y_true, y_pred)
    print(
        f"Mean Absolute Percentage Error (MAPE): {np.round(MAPE * 100, 2)} %"
    )

    anomaly = forecasting_final[forecasting_final["anomaly"] == "Yes"]

    prophet_traces = [
        (
            1,
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["yhat"],
                mode="lines",
                name="Prediction (Prophet)",
            ),
        ),
        (
            1,
            go.Scatter(
                x=anomaly["ds"],
                y=anomaly["y"],
                mode="markers",
                name="Outliers (Prophet)",
            ),
        ),
        (
            3,
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["error"],
                mode="lines",
                name="Error",
            ),
        ),
        (
            3,
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["uncertainty"],
                mode="lines",
                name="Uncertainty",
            ),
        ),
        (
            3,
            go.Scatter(
                x=anomaly["ds"],
                y=anomaly["error"],
                mode="markers",
                name="Outliers (Prophet)",
            ),
        ),
    ]
    for row, trace in prophet_traces:
        fig.add_trace(trace, row=row, col=1)

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        height=800,
        width=1000,
    )

    fig.show()

In [None]:
# Flag outliers with Prophet on the observed (not resampled) series of every
# parameter: a point is an outlier when its in-sample error exceeds 1.5 times
# the width of the predicted uncertainty interval. Row 1 of the figure shows
# the series, the prediction and the flagged points; row 2 shows the error and
# the uncertainty.

for column in station_df.columns:
    df = station_df[column].dropna()

    date_range = station_df[column].dropna().index
    date_range = date_range.min(), date_range.max()

    # Start the series at the first occurrence of the calendar month in which
    # it ends, so that it starts and finishes in the same month.
    start_index = df[df.index.month == date_range[1].month].index[0]
    df = df.loc[start_index:]

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

    fig.add_trace(
        go.Scatter(x=df.index, y=df, mode="lines", name="Original"),
        row=1,
        col=1,
    )

    # ===== Prophet =====

    # Prophet expects a frame with a "ds" (timestamp) and a "y" (value)
    # column. rename_axis returns a new object, so the name of an index that
    # may be shared with other series is left untouched.
    df = df.rename_axis("ds").reset_index(name="y")

    model = Prophet()
    model.fit(df)
    # periods=0 adds no future steps: the prediction covers the fitted dates.
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Put predictions and observations side by side.
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Prediction error and width of the uncertainty interval.
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection: error larger than factor times the interval width.
    factor = 1.5
    forecasting_final["anomaly"] = np.where(
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"],
        "Yes",
        "No",
    )

    print("===== Prophet =====")

    # scikit-learn metrics take (y_true, y_pred); the order matters for MAPE,
    # which divides by the true values.
    y_true = forecasting_final["y"]
    y_pred = forecasting_final["yhat"]

    MAE = mean_absolute_error(y_true, y_pred)
    print(f"Mean Absolute Error (MAE): {np.round(MAE, 2)}")

    MEDAE = median_absolute_error(y_true, y_pred)
    print(f"Median Absolute Error (MedAE): {np.round(MEDAE, 2)}")

    MSE = mean_squared_error(y_true, y_pred)
    print(f"Mean Squared Error (MSE): {np.round(MSE, 2)}")

    # Square root of the untruncated MSE (casting the MSE to int first would
    # floor it and distort the RMSE).
    RMSE = np.sqrt(MSE)
    print(f"Root Mean Squared Error (RMSE): {np.round(RMSE, 2)}")

    # scikit-learn returns MAPE as a fraction; scale it by 100 so the printed
    # value matches the "%" label.
    MAPE = mean_absolute_percentage_error(y_true, y_pred)
    print(
        f"Mean Absolute Percentage Error (MAPE): {np.round(MAPE * 100, 2)} %"
    )

    anomaly = forecasting_final[forecasting_final["anomaly"] == "Yes"]

    prophet_traces = [
        (
            1,
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["yhat"],
                mode="lines",
                name="Prediction (Prophet)",
            ),
        ),
        (
            1,
            go.Scatter(
                x=anomaly["ds"],
                y=anomaly["y"],
                mode="markers",
                name="Outliers (Prophet)",
            ),
        ),
        (
            2,
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["error"],
                mode="lines",
                name="Error",
            ),
        ),
        (
            2,
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["uncertainty"],
                mode="lines",
                name="Uncertainty",
            ),
        ),
        (
            2,
            go.Scatter(
                x=anomaly["ds"],
                y=anomaly["error"],
                mode="markers",
                name="Outliers (Prophet)",
            ),
        ),
    ]
    for row, trace in prophet_traces:
        fig.add_trace(trace, row=row, col=1)

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        height=800,
        width=1000,
    )

    fig.show()

In [None]:
%%script false --no-raise-error
# Disabled cell: the %%script false magic hands the body to the `false`
# program, so none of the Python below is executed.
# no outliers detected
# What the code would do if enabled: resample to monthly medians, flag
# Prophet anomalies per column, drop them, then resample and interpolate again.

# create copy such that the processed columns do not affect the original dataframe until the end
station_df_copy = station_df.copy()

station_df_copy = station_df_copy.resample("M").median()

for column in station_df.columns:
    df = station_df[column].copy()

    df.dropna(inplace=True)

    df = df.resample("M").median()

    df.interpolate(method="time", inplace=True)

    df.index.name = "ds"

    df = df.reset_index()

    df.rename(columns={column: "y"}, inplace=True)

    # using prophet

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the fitted dates only
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection: error larger than factor times the interval width
    factor = 1.5
    forecasting_final["anomaly"] = forecasting_final.apply(
        lambda x: "Yes"
        if (np.abs(x["error"]) > factor * x["uncertainty"])
        else "No",
        axis=1,
    )

    # remove the outliers
    forecasting_final = forecasting_final[
        forecasting_final["anomaly"] == "No"
    ]

    df = forecasting_final[["ds", "y"]]

    df.set_index("ds", inplace=True)

    df.rename(columns={"y": column}, inplace=True)

    # redo the resampling since the outliers have been removed and
    # some months may have been removed
    df = df.resample("M").median()

    df.interpolate(method="time", inplace=True)

    # write the cleaned column back into the monthly copy
    station_df_copy.loc[df.index, column] = df[column]


station_df = station_df_copy

In [None]:
# Bring every parameter onto a monthly grid (median per month) and fill the
# gaps between a parameter's first and last observation by time-weighted
# interpolation. The result is built in a separate frame and swapped in at
# the end, so the loop always reads the untouched input.
station_df_copy = station_df.copy().resample("M").median()

for column in station_df.columns:
    df = (
        station_df[column]
        .dropna()
        .resample("M")
        .median()
        .interpolate(method="time")
    )
    station_df_copy.loc[df.index, column] = df

station_df = station_df_copy

In [None]:
# Final visual check of the imputed monthly series, one chart per parameter.

for column in station_df.columns:
    fig = go.Figure(
        data=go.Scatter(
            x=station_df.index,
            y=station_df[column],
            mode="lines",
            name="Original",
        )
    )
    fig.update_layout(
        title=column,
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )
    fig.show()

In [None]:
# Keep only the period in which UVA254 was measured: from its first to its
# last non-missing value.
uva254_dates = station_df["UVA254 (1/m)"].dropna().index
start_date, end_date = uva254_dates.min(), uva254_dates.max()

station_df = station_df.loc[start_date:end_date]

In [None]:
# Keep the cleaned table of station 5049 under its own name (same object,
# not a copy).
gw_5049_df = station_df

In [None]:
# TODO: move this cell to the other notebook.
# For every parameter: decompose the series with STL (period 12), fit an
# ordinary least-squares line over the sample position, print its slope and
# p-value, and plot the data, the STL trend and the fitted line.
for column in station_df.columns:
    df = station_df[column].dropna()

    date_range = station_df[column].dropna().index
    date_range = date_range.min(), date_range.max()

    # Start the series at the first occurrence of the calendar month in which
    # it ends, so that it starts and finishes in the same month.
    start_index = df[df.index.month == date_range[1].month].index[0]
    df = df.loc[start_index:]

    stl = STL(df, period=12)
    result = stl.fit()
    seasonal, trend, resid = result.seasonal, result.trend, result.resid

    # Linear regression of the values on their position (0, 1, 2, ...) with
    # an intercept.
    # NOTE(review): the original comment said the regression is computed "on
    # trend", but the fit uses the data itself — confirm which is intended.
    X = np.arange(df.shape[0])
    X = sm.add_constant(X)
    y = df.copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # Fitted line on the same dates as the data.
    line = pd.Series(results.predict(X), index=df.index)

    # Element 0 is the intercept, element 1 the slope. np.asarray makes the
    # lookup positional whether statsmodels returns an array or a labelled
    # Series (integer keys on a labelled Series are deprecated in pandas).
    slope = np.asarray(results.params)[1]
    print(f"{column} - Slope: {slope}")

    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df,
            mode="lines",
            name="Original",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=trend.index,
            y=trend,
            mode="lines",
            name="Trend",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode="lines",
            name="Linear Regression",
            line=dict(dash="dash", color="black"),
        ),
    )

    # Not used in this cell; kept because these notebook-level names are
    # overwritten here in the original and later cells might read them.
    start_date = df.index.min()
    end_date = df.index.max()

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

#### 7285 - Steglitz-Zehlendorf

In [None]:
# Work on station 7285. This binds the same object that is stored in
# gw_stations_dict (no copy), so in-place edits below also change that entry.
station_df = gw_stations_dict[7285]

In [None]:
# Fraction (0-1) of missing values per parameter.
station_df.isna().sum() / station_df.shape[0]

In [None]:
# Count how often each gap between consecutive sampling dates occurs; the
# most common gap is the effective sampling frequency.
station_df.index.to_series().diff().value_counts()

In [None]:
# most of the time series have a frequency of 6 months

##### Time series

In [None]:
# One interactive line chart per parameter of station 7285.
for column in station_df.columns:
    axis_labels = {"DateTime": "DateTime", column: column}
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 7285",
        labels=axis_labels,
    )
    # Explicit axis titles and a larger font for readability.
    fig.update_layout(
        xaxis_title="Date", yaxis_title=column, font={"size": 18}
    )
    fig.show()

##### Boxplots

In [None]:
# Yearly box plots for every parameter of station 7285: one box per calendar
# year found in the index.
for column in station_df.columns:
    column_df = station_df[column]
    fig = go.Figure()

    for year in column_df.index.year.unique():
        in_year = column_df.index.year == year
        fig.add_trace(go.Box(y=column_df[in_year], name=year))

    fig.update_layout(
        title=f"{column} at station 7285",
        xaxis_title="Year",
        yaxis_title=column,
    )
    fig.show()

##### Invalid Values

In [None]:
# Parameters available at this station.
station_df.columns.to_list()

In [None]:
# Columns of station 7285 that are checked for invalid (negative) values.
# The same list is reused further down to drop these columns.
cols = [
    "Bromodichloromethane (µg/l)",
    "Bromoform (µg/l)",
    "Dibromochloromethane (µg/l)",
    "Dichloromethane (µg/l)",
    "Tetrachloromethane (µg/l)",
    "Trichloromethane (µg/l)",
    "Nitrate (mg/l)",
]

# Negative concentrations are not valid measurements: replace them with NaN.
for column in cols:
    negative = station_df[column] < 0
    station_df.loc[negative, [column]] = np.nan

##### Store Info

In [None]:
# Summarise every parameter of station 7285 in ground_info_df.
for column in station_df.columns:
    observed = station_df[column].dropna()

    # At least two observations are needed to describe a time span.
    if observed.shape[0] < 2:
        continue

    start_date = observed.index.min().strftime("%Y-%m-%d")
    end_date = observed.index.max().strftime("%Y-%m-%d")

    # Rows between the first and the last observation of this parameter.
    df = station_df[start_date:end_date][column]

    print(f"Start date for {column}: {start_date}")
    print(f"End date for {column}: {end_date}")

    # Fraction (0-1) of rows without a value inside that span.
    # NOTE(review): stored under the "% Missing Values" label although it is
    # a fraction — confirm how downstream code interprets it.
    missing_values = df.isna().sum() / df.shape[0]
    print(f"Missing values for {column}: {missing_values}")

    # Most common gap between consecutive rows, in whole days.
    frequency = df.index.to_series().diff().value_counts().index[0].days
    print(f"Frequency for {column}: {frequency}")

    column_info = {
        "N Samples": observed.shape[0],
        "% Missing Values": missing_values,
        "Frequency (days)": frequency,
        "Start Date": start_date,
        "End Date": end_date,
    }
    for info_name, info_value in column_info.items():
        ground_info_df.loc[info_name, (7285, column)] = info_value

##### Outliers and Missing Values Imputation

In [None]:
# Fraction (0-1) of missing values per parameter after invalid values were
# set to NaN.
station_df.isna().sum() / station_df.shape[0]

In [None]:
# Remove the columns listed in cols before the outlier analysis. The drop is
# in place, so the entry in gw_stations_dict changes as well.
station_df.drop(columns=cols, inplace=True)

In [None]:
# define the outliers through the STL decomposition

for column in station_df.columns:
    df = station_df[column].copy()

    df.dropna(inplace=True)

    df = df.resample("M").median()

    df.interpolate(method="time", inplace=True)

    date_range = station_df[column].dropna().index
    date_range = date_range.min(), date_range.max()

    # make sure that the dataframe starts and finishes in the same month
    start_index = df[df.index.month == date_range[1].month].index[0]

    # Slice the dataframe to start from the found index
    df = df.loc[start_index:]

    # === STL decomposition ===

    stl = STL(df, period=12)

    result = stl.fit()
    seasonal, trend, resid = result.seasonal, result.trend, result.resid

    denoised_df = trend + seasonal

    mean_resid = np.mean(resid)
    std_resid = np.std(resid)

    threshold = 3 * std_resid

    outliers_index = resid[
        (resid > mean_resid + threshold)
        | (resid < mean_resid - threshold)
    ].index

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df,
            mode="lines",
            name="Original",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=denoised_df,
            mode="lines",
            name="Trend + Seasonal (STL)",
        ),
        row=1,
        col=1,
    )

    print("===== STL =====")
    print()

    # compute rmse between the original and the denoised
    MAE = mean_absolute_error(df, denoised_df)
    print("Mean Absolute Error (MAE): " + str(np.round(MAE, 2)))

    # Median Absolute Error (MedAE)
    MEDAE = median_absolute_error(df, denoised_df)
    print("Median Absolute Error (MedAE): " + str(np.round(MEDAE, 2)))

    # Mean Squared Error (MSE)
    MSE = mean_squared_error(df, denoised_df)
    print("Mean Squared Error (MSE): " + str(np.round(MSE, 2)))

    # Root Mean Squarred Error (RMSE)
    RMSE = np.sqrt(int(mean_squared_error(df, denoised_df)))
    print("Root Mean Squared Error (RMSE): " + str(np.round(RMSE, 2)))

    # Mean Absolute Percentage Error (MAPE)
    MAPE = mean_absolute_percentage_error(df, denoised_df)
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(MAPE, 2))
        + " %"
    )

    # compute std of original and std of residuals
    std_df = df.std()

    print()
    print(f"Data std: {std_df}, Resid std: {std_resid}")

    fig.add_trace(
        go.Scatter(
            x=outliers_index,
            y=df.loc[outliers_index],
            mode="markers",
            name="Outliers (STL)",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=resid,
            mode="lines",
            name="Residuals",
        ),
        row=2,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=outliers_index,
            y=resid.loc[outliers_index],
            mode="markers",
            name="Outliers (STL)",
        ),
        row=2,
        col=1,
    )

    # plot the threshold
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=[mean_resid + threshold] * df.shape[0],
            mode="lines",
            name="Threshold",
            line=dict(dash="dash", color="black"),
        ),
        row=2,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=[mean_resid - threshold] * df.shape[0],
            mode="lines",
            name="Threshold",
            line=dict(dash="dash", color="black"),
        ),
        row=2,
        col=1,
    )

    # ===== Prophet =====

    df.index.name = "ds"

    df = df.reset_index()

    df.rename(columns={column: "y"}, inplace=True)

    # using prophet

    model = Prophet()
    model.fit(df)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection
    factor = 1.5
    forecasting_final["anomaly"] = forecasting_final.apply(
        lambda x: "Yes"
        if (np.abs(x["error"]) > factor * x["uncertainty"])
        else "No",
        axis=1,
    )

    print("===== Prophet =====")

    # Mean Absolute Error (MAE)
    MAE = mean_absolute_error(
        forecasting_final["yhat"], forecasting_final["y"]
    )
    print("Mean Absolute Error (MAE): " + str(np.round(MAE, 2)))

    # Median Absolute Error (MedAE)
    MEDAE = median_absolute_error(
        forecasting_final["yhat"], forecasting_final["y"]
    )
    print("Median Absolute Error (MedAE): " + str(np.round(MEDAE, 2)))

    # Mean Squared Error (MSE)
    MSE = mean_squared_error(
        forecasting_final["yhat"], forecasting_final["y"]
    )
    print("Mean Squared Error (MSE): " + str(np.round(MSE, 2)))

    # Root Mean Squarred Error (RMSE)
    RMSE = np.sqrt(
        int(
            mean_squared_error(
                forecasting_final["yhat"], forecasting_final["y"]
            )
        )
    )
    print("Root Mean Squared Error (RMSE): " + str(np.round(RMSE, 2)))

    # Mean Absolute Percentage Error (MAPE)
    MAPE = mean_absolute_percentage_error(
        forecasting_final["yhat"], forecasting_final["y"]
    )
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(MAPE, 2))
        + " %"
    )

    anomaly = forecasting_final[forecasting_final["anomaly"] == "Yes"]

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["yhat"],
            mode="lines",
            name="Prediction (Prophet)",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=anomaly["ds"],
            y=anomaly["y"],
            mode="markers",
            name="Outliers (Prophet)",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["error"],
            mode="lines",
            name="Error",
        ),
        row=3,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["uncertainty"],
            mode="lines",
            name="Uncertainty",
        ),
        row=3,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=anomaly["ds"],
            y=anomaly["error"],
            mode="markers",
            name="Outliers (Prophet)",
        ),
        row=3,
        col=1,
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        height=800,
        width=1000,
    )

    fig.show()

In [None]:
# Prophet is used to remove outliers: a month is discarded when its in-sample
# prediction error exceeds `factor` times the width of Prophet's prediction
# interval; discarded months are then re-filled by interpolation.

# work on a monthly copy such that the processed columns do not affect the
# original dataframe until the end
station_df_copy = station_df.copy()

station_df_copy = station_df_copy.resample("M").median()

for column in station_df.columns:
    # monthly medians of the available samples; months without samples that
    # lie between the first and the last sample are interpolated in time
    df = station_df[column].dropna().resample("M").median()
    df = df.interpolate(method="time")

    # Prophet expects the timestamps in "ds" and the values in "y"
    df.index.name = "ds"
    df = df.reset_index().rename(columns={column: "y"})

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the historical dates only
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with the original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection (vectorised; same rule as a row-wise comparison)
    factor = 1.5
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    # remove the outliers
    forecasting_final = forecasting_final[
        forecasting_final["anomaly"] == "No"
    ]

    # build the cleaned series with chained, non-inplace calls so that no
    # slice of forecasting_final is modified in place
    df = (
        forecasting_final[["ds", "y"]]
        .set_index("ds")
        .rename(columns={"y": column})
    )

    # redo the resampling since the outliers have been removed and
    # some months may have been removed
    df = df.resample("M").median()
    df = df.interpolate(method="time")

    # Assign by index alignment instead of `.loc[df.index, column]`:
    # months outside the cleaned range (i.e. outliers at the very start or
    # end of the series) become NaN rather than keeping their raw value.
    station_df_copy[column] = df[column]


station_df = station_df_copy

In [None]:
# final check: draw every cleaned column of the station as its own line chart

for column in station_df.columns:
    cleaned_trace = go.Scatter(
        x=station_df.index,
        y=station_df[column],
        mode="lines",
        name="Original",
    )

    fig = go.Figure()
    fig.add_trace(cleaned_trace)

    fig.update_layout(
        title=column,
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )

    fig.show()

In [None]:
# keep only the period between the first and the last valid UVA254 value
uva_dates = station_df["UVA254 (1/m)"].dropna().index
start_date, end_date = uva_dates.min(), uva_dates.max()

station_df = station_df.loc[start_date:end_date]

In [None]:
# Keep the cleaned table of station 7285 under its own name. A copy is taken
# because station_df is a row slice at this point and gw_7285_df receives new
# columns and an index reset later on; without the copy those changes would
# also hit station_df and pandas would warn about writing to a slice.
gw_7285_df = station_df.copy()

In [None]:
# TODO: move this cell to the other notebook
# For every column: STL decomposition (period of 12 samples) and an ordinary
# least-squares line through the series, plotted together with the STL trend.
for column in station_df.columns:
    df = station_df[column].dropna()

    date_range = df.index.min(), df.index.max()

    # make sure that the dataframe starts and finishes in the same month:
    # begin at the first sample that falls in the calendar month of the last one
    start_index = df[df.index.month == date_range[1].month].index[0]

    # Slice the dataframe to start from the found index
    df = df.loc[start_index:]

    stl = STL(df, period=12)

    result = stl.fit()
    seasonal, trend, resid = result.seasonal, result.trend, result.resid

    # linear regression of the series on the sample position (0, 1, 2, ...)
    # NOTE(review): the original comment said the regression is computed "on
    # trend", but the response passed to OLS is the series itself, not
    # `trend` - confirm which one is intended.
    X = np.arange(df.shape[0])
    X = sm.add_constant(X)
    y = df.copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # fitted regression line on the same dates as the series
    line = pd.Series(results.predict(X), index=df.index)

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df,
            mode="lines",
            name="Original",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=trend.index,
            y=trend,
            mode="lines",
            name="Trend",
        )
    )

    # Entry 0 is the constant, entry 1 the slope. Convert to a plain array
    # first so the lookup is explicitly positional and does not depend on
    # integer indexing of a labelled pandas Series.
    slope = np.asarray(results.params)[1]
    print(f"{column} - Slope: {slope}")

    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - P-value: {p_value}")

    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode="lines",
            name="Linear Regression",
            line=dict(dash="dash", color="black"),
        ),
    )

    # not used further in this cell; kept so the notebook-level names keep
    # describing the period that was just plotted
    start_date = df.index.min()
    end_date = df.index.max()

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

### Build Unique Ground Water Dataset

In [None]:
# Combine both ground-water stations into one long dataframe.
# Each station table gets its station number as an extra column and its
# date index turned into a regular "DateTime" column before stacking.
for station_number, station_table in ((5049, gw_5049_df), (7285, gw_7285_df)):
    station_table["Station"] = station_number
    station_table.index.name = "DateTime"
    station_table.reset_index(inplace=True)

gw_df = pd.concat([gw_5049_df, gw_7285_df], axis=0)

### UVA254 vs Ammonium

In [None]:
# Scatter UVA254 against Ammonium for every ground-water station and overlay
# the ordinary-least-squares line fitted per station.
colors = ["blue", "red"]

fig = go.Figure()

for position, (station_id, station_df) in enumerate(
    gw_stations_dict.items()
):
    # use only the dates on which both variables are available
    df = station_df[["Ammonium (mg/l)", "UVA254 (1/m)"]].dropna()

    # regress UVA254 on Ammonium with an intercept
    X = sm.add_constant(df["Ammonium (mg/l)"])
    y = df["UVA254 (1/m)"]

    model = sm.OLS(y, X)
    results = model.fit()

    line = pd.Series(results.predict(X), index=df.index)

    # read the Ammonium coefficient by label instead of by integer position
    slope = results.params["Ammonium (mg/l)"]
    p_value = results.pvalues["Ammonium (mg/l)"]

    print(f"Station {station_id} - Slope: {slope}")
    print(f"Station {station_id} - P-value: {p_value}")

    # Walk the palette from its end (first station red, second blue) and
    # wrap around, so that more stations than colours do not raise and the
    # palette list itself is left untouched.
    color = colors[-1 - position % len(colors)]

    fig.add_trace(
        go.Scatter(
            x=X["Ammonium (mg/l)"],
            y=y,
            mode="markers",
            name=f"Station {station_id}",
            marker=dict(size=8, opacity=0.7, color=color),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=X["Ammonium (mg/l)"],
            y=line,
            mode="lines",
            name=f"Linear Regression Station {station_id}",
            line=dict(dash="dash", color=color),
        )
    )

fig.update_layout(
    xaxis_title="Ammonium (mg/l)",
    yaxis_title="UVA254 (1/m)",
    font=dict(
        size=18,
    ),
    title="Ground Water",
)

fig.show()

## Surface Water Dataset

In [None]:
# distinct parameter names available in the raw surface-water time series
parameters = ts_sw_df["Parameter"].unique()

In [None]:
# display the parameter names as a plain Python list
parameters.tolist()

In [None]:
# parameters_translated = [translator.translate(item, dest='en').text for item in parameters.tolist()]

In [None]:
# Free-text checklist of candidate variables, written as a bare string
# expression (it is not assigned to any name).
# NOTE(review): the leading "-" presumably marks the variables that are
# available in this dataset - confirm.
"""
Cumulated rainfall
-Environmental temperature
-Water temperature
-Conductivity
-Flow river
Turbidity
-Absorbance 254 nm
-Ammonium
-Dissolved oxygen
-Nitrate
-pH
Redox potential
"""

In [None]:
# parameters_translated

### Build Dataset per Station

In [None]:
# Mapping from the German parameter names used in the "Parameter" column of
# the surface-water data to the English variable names (with units) used in
# the rest of the notebook. Only the parameters listed here are kept when the
# surface-water dataset is filtered.
variables = {
    "Lufttemperatur": "Air Temperature (°C)",
    "Wassertemperatur": "Water Temperature (°C)",
    "Spektraler Absorptionskoeffizient (SAK) 254nm": "UVA254 (1/m)",
    "Leitfähigkeit": "Conductivity (µS/cm)",
    "Ammonium-Stickstoff": "Ammonium (mg/l)",
    "Nitrat-Stickstoff": "Nitrate (mg/l)",
    "pH-Wert": "pH",
    "DOC (Gelöster organischer Kohlenstoff)": "DOC (mg/l)",
    "TOC (Organischer Kohlenstoff)": "TOC (mg/l)",
    "Sauerstoff-Gehalt": "Dissolved Oxygen (mg/l)",
    "Coliforme B.": "Coliform (MPN/100ml)",
    "E.Coli": "E.Coli (MPN/100ml)",
    "BSB1 (Biochem. Sauerstoffbedarf, 24h)": "BOD (mg/l)",
    "Intestinale Enterokokken": "Enterococcus (MPN/100ml)",
}

In [None]:
# Keep only the parameters listed in `variables` and translate their names.
# The filtered rows are copied so that the column assignment below writes to
# an independent dataframe instead of a slice of ts_sw_df.
surface_df = ts_sw_df[ts_sw_df["Parameter"].isin(variables.keys())].copy()

surface_df["Parameter"] = surface_df["Parameter"].map(variables)

In [None]:
# station identifiers present in the filtered surface-water data
surface_df["Station ID"].unique()

In [None]:
# distinct values of the "Station" column
# NOTE(review): presumably the station names matching the IDs above - confirm
surface_df["Station"].unique()

In [None]:
# parse the DateTime column into pandas datetimes; the per-station pivots
# below group the measurements by this column
surface_df["DateTime"] = pd.to_datetime(surface_df["DateTime"])

In [None]:
# One wide table per station: rows are sampling timestamps, columns are the
# (English) parameter names, cells hold the measured values.
stations_dict = {}
for station in surface_df["Station ID"].unique():
    belongs_to_station = surface_df["Station ID"] == station
    station_df = surface_df.loc[belongs_to_station].pivot_table(
        values="Value",
        index=pd.Grouper("DateTime"),
        columns="Parameter",
    )
    stations_dict[station] = station_df

#### Separate Bacteria Variables

In [None]:
# English names (as produced by the `variables` mapping) of the
# bacteriological parameters that are collected in a separate dataset
bacteria_columns = [
    "E.Coli (MPN/100ml)",
    "Coliform (MPN/100ml)",
    "Enterococcus (MPN/100ml)",
]

In [None]:
# already build the final dataset for the bacteria:
# one table per station holding the "Value" and "LOQ" fields of the
# bacteriological parameters, indexed by calendar date

bacteria_dict = {}
for station in surface_df["Station ID"].unique():
    station_df = surface_df[surface_df["Station ID"] == station]
    # columns form a two-level index: (field, parameter)
    station_df = station_df.pivot_table(
        index=pd.Grouper("DateTime"),
        columns="Parameter",
        values=["Value", "LOQ"],
    )

    # get only the bacteria columns; copy the selection so that the
    # assignments below act on an independent dataframe, not on a slice
    is_bacteria = station_df.columns.get_level_values(1).isin(
        bacteria_columns
    )
    station_df = station_df.loc[:, is_bacteria].copy()

    station_df["Station"] = station
    # keep the calendar date only (the time of day is dropped)
    station_df.index = station_df.index.date

    bacteria_dict[station] = station_df

In [None]:
# stack the per-station bacteria tables row-wise into a single dataframe
bacteria_df = pd.concat(bacteria_dict.values(), axis=0)

In [None]:
# display the combined bacteria dataset
bacteria_df

In [None]:
# Filter every station table against the index of its bacteria table.
# NOTE(review): this removes ROWS whose index value is found in the index of
# bacteria_dict[station]; the bacteria COLUMNS themselves are not dropped
# here, although the purpose is described as removing the bacteria from the
# station datasets - confirm that row filtering is what is intended.
for station in stations_dict:
    bacteria_dates = bacteria_dict[station].index
    station_df = stations_dict[station]
    keep_row = ~station_df.index.isin(bacteria_dates)
    station_df = station_df[keep_row]
    stations_dict[station] = station_df

### Analyze Stations

Coliform: /100 ml

E.Coli: /100 ml

#### Plot Variables

In [None]:
# variables measured at every station: start from the columns of station 105
# and narrow the set down with each station's columns
common_columns = set(stations_dict[105].columns)
for station_id, station_df in stations_dict.items():
    common_columns &= set(station_df.columns)

In [None]:
# one figure per shared variable, with one line per station
for column in common_columns:
    fig = go.Figure()

    for station_id, station_df in stations_dict.items():
        # plot only the timestamps at which this variable has a value
        column_df = station_df[column].dropna()

        station_trace = go.Scatter(
            x=column_df.index,
            y=column_df,
            mode="lines",
            name=f"Station {station_id}",
        )
        fig.add_trace(station_trace)

    fig.update_layout(
        title=column,
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )
    fig.show()

In [None]:
# Empty summary table: one row per piece of information, one column per
# (station, variable) pair. It is filled station by station further below.
info_rows = pd.Index(
    [
        "N Samples",
        "% Missing Values",
        "Frequency (days)",
        "Start Date",
        "End Date",
    ],
    name="Info",
)

info_columns = pd.MultiIndex.from_product(
    [surface_df["Station ID"].unique(), variables.values()],
    names=["Station ID", "Parameter"],
)

surface_info_df = pd.DataFrame(index=info_rows, columns=info_columns)

#### 105 - Dämeritzsee-Seemitte

In [None]:
# work on station 105; note that this is a reference to the table stored in
# stations_dict, not a copy
station_df = stations_dict[105]

In [None]:
# Attach the river flow measured at flow station 5827101 to the water-quality
# table, matching on the calendar date (the time of day is ignored).
flow_df["DateTime"] = pd.to_datetime(flow_df["DateTime"])

from_flow_station = flow_df["Station ID"] == 5827101
station_flow_df = flow_df.loc[
    from_flow_station, ["DateTime", "Flow River"]
].set_index("DateTime")

# reduce both indexes to plain dates so they can be matched day by day
station_flow_df.index = station_flow_df.index.date
station_df.index = station_df.index.date

# left join: every water-quality row is kept, flow is added where available
station_df = station_df.merge(
    station_flow_df, how="left", left_index=True, right_index=True
)

# back to a datetime index for the time-series operations that follow
station_df.index = pd.to_datetime(station_df.index)

station_df = station_df.rename(
    columns={"Flow River": "Flow River Rate (m³/s)"}
)

In [None]:
# for each column, print the share (0-1) of missing values between its first
# and its last valid sample
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index
    date_range = valid_dates.min(), valid_dates.max()
    first_date, last_date = date_range

    df = station_df.loc[first_date:last_date, column]

    missing_values = df.isna().sum() / len(df)
    print(f"{column}: {missing_values}")

In [None]:
# count how often each gap between consecutive index entries occurs; the most
# common gaps indicate the sampling frequency of the time series
station_df.index.to_series().diff().value_counts()

In [None]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [None]:
# line chart of every column; the title reports the period in which the
# column actually has values
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index
    date_range = valid_dates.min(), valid_dates.max()

    chart_title = f"{column} at station 105 - Range: {date_range[0].date()} - {date_range[1].date()}"
    axis_labels = {"DateTime": "DateTime", column: column}

    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=chart_title,
        labels=axis_labels,
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )

    fig.show()

##### Boxplots

In [None]:
# one figure per column with one box per calendar year
for column in station_df.columns:
    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure()
    for year in sample_years.unique():
        values_of_year = column_df[sample_years == year]
        fig.add_trace(go.Box(y=values_of_year, name=year))

    fig.update_layout(
        title=f"{column} at station 105",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Invalid Values

In [None]:
# list the columns of the station table
station_df.columns.to_list()

In [None]:
# Replace physically implausible measurements with NaN.
# Each entry maps a column to the test that marks a value as invalid.
# The range checks for the bacteria columns (E.Coli below 0 or above 3000,
# Coliform below 0 or from 5000 upwards) are disabled.
invalid_value_rules = {
    "DOC (mg/l)": lambda values: values <= 0,
    "TOC (mg/l)": lambda values: values <= 0,
    "Flow River Rate (m³/s)": lambda values: values < 0,
    "Dissolved Oxygen (mg/l)": lambda values: values < 0,
    "BOD (mg/l)": lambda values: (values < 0) | (values > 100),
    "Ammonium (mg/l)": lambda values: values < 0,
    "Nitrate (mg/l)": lambda values: values < 0,
    # pH readings below 7 are treated as invalid
    "pH": lambda values: values < 7,
}

for checked_column, is_invalid in invalid_value_rules.items():
    invalid_rows = is_invalid(station_df[checked_column])
    station_df.loc[invalid_rows, checked_column] = np.nan

In [None]:
# same yearly boxplots as before, now on the data without invalid values
for column in station_df.columns:
    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure()
    for year in sample_years.unique():
        values_of_year = column_df[sample_years == year]
        fig.add_trace(go.Box(y=values_of_year, name=year))

    fig.update_layout(
        title=f"{column} at station 105",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Store Info

In [None]:
# fill the summary table with the figures of station 105, column by column
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index

    start_date = valid_dates.min().strftime("%Y-%m-%d")
    end_date = valid_dates.max().strftime("%Y-%m-%d")

    # restrict to the period in which the column has values
    df = station_df[column][start_date:end_date]

    missing_values = df.isna().sum() / df.shape[0] * 100

    # most common gap between consecutive rows of the whole station table
    # (derived from the shared index, so it is the same for every column)
    most_common_gap = (
        station_df.index.to_series().diff().value_counts().index[0]
    )

    column_info = {
        "N Samples": len(valid_dates),
        "% Missing Values": missing_values,
        "Frequency (days)": most_common_gap.days,
        "Start Date": start_date,
        "End Date": end_date,
    }
    for info_name, info_value in column_info.items():
        surface_info_df.loc[info_name, (105, column)] = info_value

##### Outliers and Missing Values Imputation

In [None]:
# for each column, print the share (0-1) of missing values between its first
# and its last valid sample, followed by an empty line
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index
    date_range = valid_dates.min(), valid_dates.max()
    first_date, last_date = date_range

    df = station_df.loc[first_date:last_date, column]

    missing_values = df.isna().sum() / len(df)
    print(f"{column}: {missing_values}")
    print()

In [None]:
# exclude UVA254 from the outlier analysis of this station
# NOTE(review): presumably because of its share of missing values printed
# above - confirm
station_df.drop(columns="UVA254 (1/m)", inplace=True)

In [None]:
# Exploratory Prophet fit per column: in-sample prediction, anomaly flags,
# error metrics and a two-row diagnostic figure (data on top, error below).
for column in station_df.columns:
    # monthly medians of the available samples; months without samples are
    # left as NaN here (no interpolation)
    df = station_df[column].dropna().resample("M").median()

    date_range = station_df[column].dropna().index
    date_range = date_range.min(), date_range.max()

    # make sure that the dataframe starts and finishes in the same month:
    # begin at the first entry that falls in the calendar month of the last one
    start_index = df[df.index.month == date_range[1].month].index[0]

    # Slice the dataframe to start from the found index
    df = df.loc[start_index:]

    # Prophet expects the timestamps in "ds" and the values in "y"
    df.index.name = "ds"
    df = df.reset_index().rename(columns={column: "y"})

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the historical dates only
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with the original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection: error larger than `factor` times the width of the
    # prediction interval (months without an observation are never flagged)
    factor = 1.5
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    # Score only the months that have an observation: the metric functions
    # reject NaN, and gaps are not interpolated in this cell.
    scored = forecasting_final.dropna(subset=["y"])
    y_true, y_pred = scored["y"], scored["yhat"]

    # Mean Absolute Error (MAE)
    MAE = mean_absolute_error(y_true, y_pred)
    print("Mean Absolute Error (MAE): " + str(np.round(MAE, 2)))

    # Median Absolute Error (MedAE)
    MEDAE = median_absolute_error(y_true, y_pred)
    print("Median Absolute Error (MedAE): " + str(np.round(MEDAE, 2)))

    # Mean Squared Error (MSE)
    MSE = mean_squared_error(y_true, y_pred)
    print("Mean Squared Error (MSE): " + str(np.round(MSE, 2)))

    # Root Mean Squared Error (RMSE): root of the untruncated MSE
    # (casting the MSE to int first would floor it, e.g. 0.8 -> 0)
    RMSE = np.sqrt(MSE)
    print("Root Mean Squared Error (RMSE): " + str(np.round(RMSE, 2)))

    # Mean Absolute Percentage Error (MAPE): observations go first so the
    # error is relative to the measured value; scikit-learn returns a
    # fraction, hence the factor 100 to match the "%" that is printed
    MAPE = mean_absolute_percentage_error(y_true, y_pred) * 100
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(MAPE, 2))
        + " %"
    )

    anomaly = forecasting_final[forecasting_final["anomaly"] == "Yes"]

    fig = make_subplots(
        rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1
    )

    # row 1: observations, prediction and flagged anomalies
    fig.add_trace(
        go.Scatter(
            x=df["ds"],
            y=df["y"],
            mode="lines",
            name="Original",
            line=dict(color="blue"),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["yhat"],
            mode="lines",
            name="Prediction",
            line=dict(color="red"),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=anomaly["ds"],
            y=anomaly["y"],
            mode="markers",
            name="Anomaly",
        ),
        row=1,
        col=1,
    )

    # row 2: prediction error against the width of the prediction interval
    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["error"],
            mode="lines",
            name="Error",
            line=dict(color="green"),
        ),
        row=2,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["uncertainty"],
            mode="lines",
            name="Uncertainty",
            line=dict(color="orange"),
        ),
        row=2,
        col=1,
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
    )

    fig.show()

In [None]:
# define the outliers through the STL decomposition and compare them with the
# anomalies flagged by Prophet; per column, a three-row figure is drawn:
# row 1 data and fits, row 2 STL residuals, row 3 Prophet error

for column in station_df.columns:
    # monthly medians, gaps between first and last sample interpolated in time
    df = station_df[column].dropna().resample("M").median()
    df = df.interpolate(method="time")

    date_range = station_df[column].dropna().index
    date_range = date_range.min(), date_range.max()

    # make sure that the dataframe starts and finishes in the same month:
    # begin at the first entry that falls in the calendar month of the last one
    start_index = df[df.index.month == date_range[1].month].index[0]

    # Slice the dataframe to start from the found index
    df = df.loc[start_index:]

    # === STL decomposition ===

    stl = STL(df, period=12)

    result = stl.fit()
    seasonal, trend, resid = result.seasonal, result.trend, result.resid

    # the series without its residual component
    denoised_df = trend + seasonal

    mean_resid = np.mean(resid)
    std_resid = np.std(resid)

    # a residual further than three standard deviations from the mean
    # residual marks an outlier
    threshold = 3 * std_resid

    outliers_index = resid[
        (resid > mean_resid + threshold)
        | (resid < mean_resid - threshold)
    ].index

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df,
            mode="lines",
            name="Original",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=denoised_df,
            mode="lines",
            name="Trend + Seasonal (STL)",
        ),
        row=1,
        col=1,
    )

    print("===== STL =====")
    print()

    # error metrics between the original and the denoised series

    # Mean Absolute Error (MAE)
    MAE = mean_absolute_error(df, denoised_df)
    print("Mean Absolute Error (MAE): " + str(np.round(MAE, 2)))

    # Median Absolute Error (MedAE)
    MEDAE = median_absolute_error(df, denoised_df)
    print("Median Absolute Error (MedAE): " + str(np.round(MEDAE, 2)))

    # Mean Squared Error (MSE)
    MSE = mean_squared_error(df, denoised_df)
    print("Mean Squared Error (MSE): " + str(np.round(MSE, 2)))

    # Root Mean Squared Error (RMSE): root of the untruncated MSE
    # (casting the MSE to int first would floor it, e.g. 0.8 -> 0)
    RMSE = np.sqrt(MSE)
    print("Root Mean Squared Error (RMSE): " + str(np.round(RMSE, 2)))

    # Mean Absolute Percentage Error (MAPE): scikit-learn returns a
    # fraction, hence the factor 100 to match the "%" that is printed
    MAPE = mean_absolute_percentage_error(df, denoised_df) * 100
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(MAPE, 2))
        + " %"
    )

    # compute std of original and std of residuals
    std_df = df.std()

    print()
    print(f"Data std: {std_df}, Resid std: {std_resid}")

    fig.add_trace(
        go.Scatter(
            x=outliers_index,
            y=df.loc[outliers_index],
            mode="markers",
            name="Outliers (STL)",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=resid,
            mode="lines",
            name="Residuals",
        ),
        row=2,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=outliers_index,
            y=resid.loc[outliers_index],
            mode="markers",
            name="Outliers (STL)",
        ),
        row=2,
        col=1,
    )

    # plot the upper and the lower threshold as horizontal dashed lines
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=[mean_resid + threshold] * df.shape[0],
            mode="lines",
            name="Threshold",
            line=dict(dash="dash", color="black"),
        ),
        row=2,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=[mean_resid - threshold] * df.shape[0],
            mode="lines",
            name="Threshold",
            line=dict(dash="dash", color="black"),
        ),
        row=2,
        col=1,
    )

    # ===== Prophet =====

    # Prophet expects the timestamps in "ds" and the values in "y"
    df.index.name = "ds"
    df = df.reset_index().rename(columns={column: "y"})

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the historical dates only
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with the original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection: error larger than `factor` times the width of the
    # prediction interval
    factor = 1.5
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    print("===== Prophet =====")

    # observations first, predictions second (matters for MAPE, which is
    # relative to its first argument)
    y_true = forecasting_final["y"]
    y_pred = forecasting_final["yhat"]

    # Mean Absolute Error (MAE)
    MAE = mean_absolute_error(y_true, y_pred)
    print("Mean Absolute Error (MAE): " + str(np.round(MAE, 2)))

    # Median Absolute Error (MedAE)
    MEDAE = median_absolute_error(y_true, y_pred)
    print("Median Absolute Error (MedAE): " + str(np.round(MEDAE, 2)))

    # Mean Squared Error (MSE)
    MSE = mean_squared_error(y_true, y_pred)
    print("Mean Squared Error (MSE): " + str(np.round(MSE, 2)))

    # Root Mean Squared Error (RMSE): root of the untruncated MSE
    RMSE = np.sqrt(MSE)
    print("Root Mean Squared Error (RMSE): " + str(np.round(RMSE, 2)))

    # Mean Absolute Percentage Error (MAPE), converted to percent
    MAPE = mean_absolute_percentage_error(y_true, y_pred) * 100
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(MAPE, 2))
        + " %"
    )

    anomaly = forecasting_final[forecasting_final["anomaly"] == "Yes"]

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["yhat"],
            mode="lines",
            name="Prediction (Prophet)",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=anomaly["ds"],
            y=anomaly["y"],
            mode="markers",
            name="Outliers (Prophet)",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["error"],
            mode="lines",
            name="Error",
        ),
        row=3,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["uncertainty"],
            mode="lines",
            name="Uncertainty",
        ),
        row=3,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=anomaly["ds"],
            y=anomaly["error"],
            mode="markers",
            name="Outliers (Prophet)",
        ),
        row=3,
        col=1,
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        height=800,
        width=1000,
    )

    fig.show()

In [None]:
# Prophet is used to remove outliers: a month is discarded when its in-sample
# prediction error exceeds `factor` times the width of Prophet's prediction
# interval; discarded months are then re-filled by interpolation.

# work on a monthly copy such that the processed columns do not affect the
# original dataframe until the end
station_df_copy = station_df.copy()

station_df_copy = station_df_copy.resample("M").median()

for column in station_df.columns:
    # monthly medians of the available samples; months without samples that
    # lie between the first and the last sample are interpolated in time
    df = station_df[column].dropna().resample("M").median()
    df = df.interpolate(method="time")

    # Prophet expects the timestamps in "ds" and the values in "y"
    df.index.name = "ds"
    df = df.reset_index().rename(columns={column: "y"})

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the historical dates only
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with the original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection (vectorised; same rule as a row-wise comparison)
    factor = 1.5
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    # remove the outliers
    forecasting_final = forecasting_final[
        forecasting_final["anomaly"] == "No"
    ]

    # build the cleaned series with chained, non-inplace calls so that no
    # slice of forecasting_final is modified in place
    df = (
        forecasting_final[["ds", "y"]]
        .set_index("ds")
        .rename(columns={"y": column})
    )

    # redo the resampling since the outliers have been removed and
    # some months may have been removed
    df = df.resample("M").median()
    df = df.interpolate(method="time")

    # Assign by index alignment instead of `.loc[df.index, column]`:
    # months outside the cleaned range (i.e. outliers at the very start or
    # end of the series) become NaN rather than keeping their raw value.
    station_df_copy[column] = df[column]


station_df = station_df_copy

In [None]:
# Final visual check: draw one line chart per column of the cleaned frame.

for column in station_df.columns:
    series_trace = go.Scatter(
        x=station_df.index,
        y=station_df[column],
        mode="lines",
        name="Original",
    )

    fig = go.Figure(data=[series_trace])
    fig.update_layout(
        title=column,
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )
    fig.show()

In [None]:
# Keep only the period between the first and last valid DOC measurement.
doc_dates = station_df["DOC (mg/l)"].dropna().index
start_date, end_date = doc_dates.min(), doc_dates.max()

station_df = station_df[start_date:end_date]

In [None]:
# Keep the processed frame under a station-specific name. This is an alias,
# not a copy: both names refer to the same DataFrame object.
sw_105_df = station_df

In [None]:
# Trend analysis: for every column plot the series, its STL trend component
# and an ordinary-least-squares line fitted to the series.
for column in station_df.columns:
    df = station_df[column].dropna()

    # Start at the first sample whose calendar month equals the month of the
    # last sample, so the analysed window begins and ends in the same month.
    start_index = df[df.index.month == df.index.max().month].index[0]
    df = df.loc[start_index:]

    # Only the trend component of the STL decomposition is used below.
    trend = STL(df, period=12).fit().trend

    # OLS of the values on the sample position 0..n-1 plus an intercept.
    # Note that the regression is fitted to the series itself, not to the
    # STL trend component.
    X = sm.add_constant(np.arange(df.shape[0]))
    results = sm.OLS(df, X).fit()

    # Fitted regression line on the same time axis as the data.
    line = pd.Series(results.predict(X), index=df.index)

    # Coefficients are ordered [intercept, slope]. Going through NumPy keeps
    # the lookup positional whether statsmodels returns a plain array or a
    # labelled Series (integer keys on a labelled Series are deprecated).
    slope = np.asarray(results.params)[1]
    print(f"{column} - Slope: {slope}")

    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df,
            mode="lines",
            name="Original",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=trend.index,
            y=trend,
            mode="lines",
            name="Trend",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode="lines",
            name="Linear Regression",
            line=dict(dash="dash", color="black"),
        ),
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

#### 305 - Oberhavel-Konradshöhe

In [None]:
# Use the entry stored under key 305 in stations_dict as the working frame.
station_df = stations_dict[305]

In [None]:
# Attach the river flow of flow station 5815911 to the water-quality samples.
flow_df["DateTime"] = pd.to_datetime(flow_df["DateTime"])

station_flow_df = flow_df[flow_df["Station ID"] == 5815911]

station_flow_df = station_flow_df[["DateTime", "Flow River"]].set_index(
    "DateTime"
)

# Match on the calendar date only: the time of day is dropped on both sides.
station_flow_df.index = station_flow_df.index.date

# Copy before replacing the index: station_df is the very object held in
# stations_dict, and overwriting its index in place would leave that entry
# with plain date objects and make this cell fail when re-run.
station_df = station_df.copy()
station_df.index = station_df.index.date

# Left merge keeps every water-quality sample, with NaN where no flow value
# exists for that date.
station_df = station_df.merge(
    station_flow_df, left_index=True, right_index=True, how="left"
)

station_df = station_df.rename(
    columns={"Flow River": "Flow River Rate (m³/s)"}
)

# Restore a datetime index for the time-series operations that follow.
station_df.index = pd.to_datetime(station_df.index)

In [None]:
# For each column, print the share of missing values (a fraction between 0
# and 1) inside the span from its first to its last valid sample.
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index
    first_valid, last_valid = valid_dates.min(), valid_dates.max()

    df = station_df[first_valid:last_valid][column]

    n_missing = df.isna().sum()
    missing_values = n_missing / df.shape[0]
    print(f"{column}: {missing_values}")

In [None]:
# Count how often each gap between consecutive timestamps occurs; the most
# frequent gap indicates the typical sampling interval of the series.
station_df.index.to_series().diff().value_counts()

In [None]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [None]:
# One line chart per column; the title shows the dates of the first and last
# valid sample of that column.
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index
    first_date = valid_dates.min().date()
    last_date = valid_dates.max().date()

    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 305 - Range: {first_date} - {last_date}",
        labels={"DateTime": "DateTime", column: column},
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )
    fig.show()

##### Boxplots

In [None]:
# One figure per column with one box per calendar year.
for column in station_df.columns:
    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 305",
        xaxis_title="Year",
        yaxis_title=column,
    )
    fig.show()

##### Invalid Values

In [None]:
# Replace values outside the accepted range of each parameter with NaN.
# Each rule maps a column name to a predicate that flags the invalid values.
invalid_value_rules = {
    "DOC (mg/l)": lambda s: (s <= 0) | (s >= 20),
    "TOC (mg/l)": lambda s: s <= 0,
    "Flow River Rate (m³/s)": lambda s: s < 0,
    "Dissolved Oxygen (mg/l)": lambda s: s < 0,
    # Disabled rules, kept for reference:
    # "E.Coli (MPN/100ml)": lambda s: (s < 0) | (s > 3000),
    # "Coliform (MPN/100ml)": lambda s: (s < 0) | (s >= 5000),
    "BOD (mg/l)": lambda s: (s < 0) | (s > 100),
    "Ammonium (mg/l)": lambda s: s < 0,
    "Nitrate (mg/l)": lambda s: s < 0,
}

for rule_column, is_invalid in invalid_value_rules.items():
    invalid_rows = is_invalid(station_df[rule_column])
    station_df.loc[invalid_rows, rule_column] = np.nan

In [None]:
# Yearly boxplots again, now that invalid values have been set to NaN.
for column in station_df.columns:
    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 305",
        xaxis_title="Year",
        yaxis_title=column,
    )
    fig.show()

##### Store Info

In [None]:
# Record per-column summary statistics for station 305 in surface_info_df.

# Most frequent gap between consecutive timestamps, in whole days. It depends
# only on the frame's index, so it is computed once for all columns.
frequency_days = (
    station_df.index.to_series().diff().value_counts().index[0].days
)

for column in station_df.columns:
    valid = station_df[column].dropna()

    start_date = valid.index.min().strftime("%Y-%m-%d")
    end_date = valid.index.max().strftime("%Y-%m-%d")

    # The missing share is measured only between the first and last valid
    # sample of the column, and stored as a percentage.
    df = station_df[column][start_date:end_date]
    missing_values = df.isna().sum() / df.shape[0] * 100

    surface_info_df.loc["N Samples", (305, column)] = valid.shape[0]
    surface_info_df.loc[
        "% Missing Values", (305, column)
    ] = missing_values
    surface_info_df.loc["Frequency (days)", (305, column)] = frequency_days
    surface_info_df.loc["Start Date", (305, column)] = start_date
    surface_info_df.loc["End Date", (305, column)] = end_date

##### Outliers and Missing Values Imputation

In [None]:
# For each column, print the share of missing values (a fraction between 0
# and 1) inside the span from its first to its last valid sample.
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index
    first_valid, last_valid = valid_dates.min(), valid_dates.max()

    df = station_df[first_valid:last_valid][column]

    n_missing = df.isna().sum()
    missing_values = n_missing / df.shape[0]
    print(f"{column}: {missing_values}")
    print()

In [None]:
# Flag outliers in every column with two methods (STL residuals and Prophet)
# and compare them in one three-row figure:
#   row 1 - monthly series, STL reconstruction, Prophet prediction, outliers
#   row 2 - STL residuals with the mean +/- 3 std band
#   row 3 - Prophet error and width of its uncertainty interval


def _print_fit_metrics(y_true, y_pred):
    """Print MAE, MedAE, MSE, RMSE and MAPE of ``y_pred`` against ``y_true``."""
    mae = mean_absolute_error(y_true, y_pred)
    print("Mean Absolute Error (MAE): " + str(np.round(mae, 2)))

    medae = median_absolute_error(y_true, y_pred)
    print("Median Absolute Error (MedAE): " + str(np.round(medae, 2)))

    mse = mean_squared_error(y_true, y_pred)
    print("Mean Squared Error (MSE): " + str(np.round(mse, 2)))

    # Root of the exact MSE; truncating the MSE to an integer first would
    # report 0 for every MSE below 1.
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE): " + str(np.round(rmse, 2)))

    # scikit-learn returns MAPE as a fraction; scale it to match the "%" label.
    mape = 100 * mean_absolute_percentage_error(y_true, y_pred)
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(mape, 2))
        + " %"
    )


def _add_scatter(fig, row, x, y, name, mode="lines", **scatter_kwargs):
    """Add one Scatter trace to subplot ``row`` (column 1) of ``fig``."""
    fig.add_trace(
        go.Scatter(x=x, y=y, mode=mode, name=name, **scatter_kwargs),
        row=row,
        col=1,
    )


for column in station_df.columns:
    # Monthly medians of the valid samples; empty months are interpolated.
    df = station_df[column].dropna().resample("M").median()
    df = df.interpolate(method="time")

    # Start at the first month whose calendar month equals that of the last
    # valid sample, so the window begins and ends in the same month.
    last_valid = station_df[column].dropna().index.max()
    start_index = df[df.index.month == last_valid.month].index[0]
    df = df.loc[start_index:]

    # === STL decomposition ===

    result = STL(df, period=12).fit()
    resid = result.resid

    # Series with the residual removed.
    denoised_df = result.trend + result.seasonal

    mean_resid = np.mean(resid)
    std_resid = np.std(resid)

    # A month is an STL outlier when its residual lies more than three
    # standard deviations away from the mean residual.
    threshold = 3 * std_resid
    outliers_index = resid[
        (resid > mean_resid + threshold)
        | (resid < mean_resid - threshold)
    ].index

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

    _add_scatter(fig, 1, df.index, df, "Original")
    _add_scatter(fig, 1, df.index, denoised_df, "Trend + Seasonal (STL)")

    print("===== STL =====")
    print()

    # Error of the STL reconstruction against the monthly series.
    _print_fit_metrics(df, denoised_df)

    # Spread of the data compared with the spread of the residuals.
    std_df = df.std()

    print()
    print(f"Data std: {std_df}, Resid std: {std_resid}")

    _add_scatter(
        fig,
        1,
        outliers_index,
        df.loc[outliers_index],
        "Outliers (STL)",
        mode="markers",
    )
    _add_scatter(fig, 2, df.index, resid, "Residuals")
    _add_scatter(
        fig,
        2,
        outliers_index,
        resid.loc[outliers_index],
        "Outliers (STL)",
        mode="markers",
    )

    # Upper and lower outlier thresholds as horizontal dashed lines.
    for bound in (mean_resid + threshold, mean_resid - threshold):
        _add_scatter(
            fig,
            2,
            df.index,
            [bound] * df.shape[0],
            "Threshold",
            line=dict(dash="dash", color="black"),
        )

    # ===== Prophet =====

    # Prophet expects a frame with a "ds" (timestamp) and a "y" (value) column.
    df = df.rename("y").rename_axis("ds").reset_index()

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the training timestamps only (in-sample fit).
    forecast = model.predict(model.make_future_dataframe(periods=0))

    # Put predictions and observations side by side.
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"] - forecasting_final["yhat_lower"]
    )

    # A month is a Prophet outlier when the absolute error exceeds `factor`
    # times the width of the uncertainty interval.
    factor = 1.5
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    print("===== Prophet =====")

    # Observations are the ground truth and Prophet's yhat the prediction;
    # the order matters for MAPE, which divides by the true values.
    _print_fit_metrics(forecasting_final["y"], forecasting_final["yhat"])

    anomaly = forecasting_final[is_anomaly]

    _add_scatter(
        fig,
        1,
        forecasting_final["ds"],
        forecasting_final["yhat"],
        "Prediction (Prophet)",
    )
    _add_scatter(
        fig,
        1,
        anomaly["ds"],
        anomaly["y"],
        "Outliers (Prophet)",
        mode="markers",
    )
    _add_scatter(
        fig, 3, forecasting_final["ds"], forecasting_final["error"], "Error"
    )
    _add_scatter(
        fig,
        3,
        forecasting_final["ds"],
        forecasting_final["uncertainty"],
        "Uncertainty",
    )
    _add_scatter(
        fig,
        3,
        anomaly["ds"],
        anomaly["error"],
        "Outliers (Prophet)",
        mode="markers",
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        height=800,
        width=1000,
    )

    fig.show()

In [None]:
# Remove outliers with Prophet and write the cleaned monthly series back.

# A month is an outlier when |observed - predicted| exceeds `factor` times
# the width of Prophet's uncertainty interval (yhat_upper - yhat_lower).
factor = 1.5

# Work on a monthly-median copy so the original frame is only replaced once
# every column has been processed.
station_df_copy = station_df.resample("M").median()

for column in station_df.columns:
    # Monthly medians of the valid samples; months without samples are filled
    # by time-based interpolation.
    monthly = station_df[column].dropna().resample("M").median()
    monthly = monthly.interpolate(method="time")

    # Prophet expects a frame with a "ds" (timestamp) and a "y" (value) column.
    df = monthly.rename("y").rename_axis("ds").reset_index()

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the training timestamps only (in-sample fit).
    forecast = model.predict(model.make_future_dataframe(periods=0))

    # Put predictions and observations side by side.
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"] - forecasting_final["yhat_lower"]
    )

    # Vectorised anomaly test (same rule as a row-wise comparison).
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )

    # Drop the anomalous months, then resample/interpolate again so the gaps
    # they leave are filled. Building the series in one chain avoids in-place
    # edits on a filtered slice.
    cleaned = (
        forecasting_final.loc[~is_anomaly, ["ds", "y"]]
        .set_index("ds")["y"]
        .resample("M")
        .median()
        .interpolate(method="time")
    )

    # NOTE(review): only the months spanned by `cleaned` are overwritten, so
    # an anomaly in the very first or last month of a column keeps its
    # original monthly median - confirm whether that is intended.
    station_df_copy.loc[cleaned.index, column] = cleaned


station_df = station_df_copy

In [None]:
# Final visual check: draw one line chart per column of the cleaned frame.

for column in station_df.columns:
    series_trace = go.Scatter(
        x=station_df.index,
        y=station_df[column],
        mode="lines",
        name="Original",
    )

    fig = go.Figure(data=[series_trace])
    fig.update_layout(
        title=column,
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )
    fig.show()

In [None]:
# Keep only the period between the first and last valid DOC measurement.
doc_dates = station_df["DOC (mg/l)"].dropna().index
start_date, end_date = doc_dates.min(), doc_dates.max()

station_df = station_df[start_date:end_date]

In [None]:
# Keep the processed frame under a station-specific name. This is an alias,
# not a copy: both names refer to the same DataFrame object.
sw_305_df = station_df

##### Trend

In [None]:
# Trend analysis: for every column plot the series, its STL trend component
# and an ordinary-least-squares line fitted to the series.
for column in station_df.columns:
    df = station_df[column].dropna()

    # Start at the first sample whose calendar month equals the month of the
    # last sample, so the analysed window begins and ends in the same month.
    start_index = df[df.index.month == df.index.max().month].index[0]
    df = df.loc[start_index:]

    # Only the trend component of the STL decomposition is used below.
    trend = STL(df, period=12).fit().trend

    # OLS of the values on the sample position 0..n-1 plus an intercept.
    # Note that the regression is fitted to the series itself, not to the
    # STL trend component.
    X = sm.add_constant(np.arange(df.shape[0]))
    results = sm.OLS(df, X).fit()

    # Fitted regression line on the same time axis as the data.
    line = pd.Series(results.predict(X), index=df.index)

    # Coefficients are ordered [intercept, slope]. Going through NumPy keeps
    # the lookup positional whether statsmodels returns a plain array or a
    # labelled Series (integer keys on a labelled Series are deprecated).
    slope = np.asarray(results.params)[1]
    print(f"{column} - Slope: {slope}")

    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df,
            mode="lines",
            name="Original",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=trend.index,
            y=trend,
            mode="lines",
            name="Trend",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode="lines",
            name="Linear Regression",
            line=dict(dash="dash", color="black"),
        ),
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

#### 325 - Havel-Pichelsdorfer Gemünd

In [None]:
# Use the entry stored under key 325 in stations_dict as the working frame.
station_df = stations_dict[325]

In [None]:
# Attach the river flow of flow station 5803200 to the water-quality samples.
flow_df["DateTime"] = pd.to_datetime(flow_df["DateTime"])

station_flow_df = flow_df[flow_df["Station ID"] == 5803200]

station_flow_df = station_flow_df[["DateTime", "Flow River"]].set_index(
    "DateTime"
)

# Match on the calendar date only: the time of day is dropped on both sides.
station_flow_df.index = station_flow_df.index.date

# Copy before replacing the index: station_df is the very object held in
# stations_dict, and overwriting its index in place would leave that entry
# with plain date objects and make this cell fail when re-run.
station_df = station_df.copy()
station_df.index = station_df.index.date

# Left merge keeps every water-quality sample, with NaN where no flow value
# exists for that date.
station_df = station_df.merge(
    station_flow_df, left_index=True, right_index=True, how="left"
)

station_df = station_df.rename(
    columns={"Flow River": "Flow River Rate (m³/s)"}
)

# Restore a datetime index for the time-series operations that follow.
station_df.index = pd.to_datetime(station_df.index)

In [None]:
# Fraction of missing values per column over all rows of the frame
# (0 = no value missing, 1 = every value missing).
station_df.isna().sum() / station_df.shape[0]

In [None]:
# Count how often each gap between consecutive timestamps occurs; the most
# frequent gap indicates the typical sampling interval of the series.
station_df.index.to_series().diff().value_counts()

In [None]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [None]:
# One line chart per column of the station 325 frame.
for column in station_df.columns:
    chart_title = f"{column} at station 325"
    axis_labels = {"DateTime": "DateTime", column: column}

    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=chart_title,
        labels=axis_labels,
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font={"size": 18},
    )
    fig.show()

##### Boxplots

In [None]:
# One figure per column with one box per calendar year.
for column in station_df.columns:
    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 325",
        xaxis_title="Year",
        yaxis_title=column,
    )
    fig.show()

##### Invalid Values

In [None]:
# Replace values outside the accepted range of each parameter with NaN.
# Each rule maps a column name to a predicate that flags the invalid values.
invalid_value_rules = {
    "DOC (mg/l)": lambda s: (s > 15) | (s < 4.5),
    "TOC (mg/l)": lambda s: s <= 0,
    "Flow River Rate (m³/s)": lambda s: s < 0,
    "Dissolved Oxygen (mg/l)": lambda s: s < 0,
    # Disabled rules, kept for reference:
    # "E.Coli (MPN/100ml)": lambda s: (s < 0) | (s > 3000),
    # "Coliform (MPN/100ml)": lambda s: (s < 0) | (s >= 5000),
    "BOD (mg/l)": lambda s: (s < 0) | (s > 100),
    "Ammonium (mg/l)": lambda s: s < 0,
    "Nitrate (mg/l)": lambda s: s < 0,
    "pH": lambda s: s < 7,
}

for rule_column, is_invalid in invalid_value_rules.items():
    invalid_rows = is_invalid(station_df[rule_column])
    station_df.loc[invalid_rows, rule_column] = np.nan

In [None]:
# Yearly boxplots again, now that invalid values have been set to NaN.
for column in station_df.columns:
    fig = go.Figure()
    column_df = station_df[column]

    # One box per calendar year present in the index.
    for year in column_df.index.year.unique():
        fig.add_trace(
            go.Box(y=column_df[column_df.index.year == year], name=year)
        )
    fig.update_layout(
        # The working frame is station 325 here, so the title must name 325.
        title=f"{column} at station 325",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Store Info

In [None]:
# Record per-column summary statistics for station 325 in surface_info_df.

# Most frequent gap between consecutive timestamps, in whole days. It depends
# only on the frame's index, so it is computed once for all columns.
frequency_days = (
    station_df.index.to_series().diff().value_counts().index[0].days
)

for column in station_df.columns:
    valid = station_df[column].dropna()

    start_date = valid.index.min().strftime("%Y-%m-%d")
    end_date = valid.index.max().strftime("%Y-%m-%d")

    # The missing share is measured only between the first and last valid
    # sample of the column, and stored as a percentage.
    df = station_df[column][start_date:end_date]
    missing_values = df.isna().sum() / df.shape[0] * 100

    surface_info_df.loc["N Samples", (325, column)] = valid.shape[0]
    surface_info_df.loc[
        "% Missing Values", (325, column)
    ] = missing_values
    surface_info_df.loc["Frequency (days)", (325, column)] = frequency_days
    surface_info_df.loc["Start Date", (325, column)] = start_date
    surface_info_df.loc["End Date", (325, column)] = end_date

##### Outliers and Missing Values Imputation

In [None]:
# For each column, print the share of missing values (a fraction between 0
# and 1) inside the span from its first to its last valid sample.
for column in station_df.columns:
    valid_dates = station_df[column].dropna().index
    first_valid, last_valid = valid_dates.min(), valid_dates.max()

    df = station_df[first_valid:last_valid][column]

    n_missing = df.isna().sum()
    missing_values = n_missing / df.shape[0]
    print(f"{column}: {missing_values}")
    print()

In [None]:
# Flag outliers in every column with two methods (STL residuals and Prophet)
# and compare them in one three-row figure:
#   row 1 - monthly series, STL reconstruction, Prophet prediction, outliers
#   row 2 - STL residuals with the mean +/- 3 std band
#   row 3 - Prophet error and width of its uncertainty interval


def _print_fit_metrics(y_true, y_pred):
    """Print MAE, MedAE, MSE, RMSE and MAPE of ``y_pred`` against ``y_true``."""
    mae = mean_absolute_error(y_true, y_pred)
    print("Mean Absolute Error (MAE): " + str(np.round(mae, 2)))

    medae = median_absolute_error(y_true, y_pred)
    print("Median Absolute Error (MedAE): " + str(np.round(medae, 2)))

    mse = mean_squared_error(y_true, y_pred)
    print("Mean Squared Error (MSE): " + str(np.round(mse, 2)))

    # Root of the exact MSE; truncating the MSE to an integer first would
    # report 0 for every MSE below 1.
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE): " + str(np.round(rmse, 2)))

    # scikit-learn returns MAPE as a fraction; scale it to match the "%" label.
    mape = 100 * mean_absolute_percentage_error(y_true, y_pred)
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(mape, 2))
        + " %"
    )


def _add_scatter(fig, row, x, y, name, mode="lines", **scatter_kwargs):
    """Add one Scatter trace to subplot ``row`` (column 1) of ``fig``."""
    fig.add_trace(
        go.Scatter(x=x, y=y, mode=mode, name=name, **scatter_kwargs),
        row=row,
        col=1,
    )


for column in station_df.columns:
    # Monthly medians of the valid samples; empty months are interpolated.
    df = station_df[column].dropna().resample("M").median()
    df = df.interpolate(method="time")

    # Start at the first month whose calendar month equals that of the last
    # valid sample, so the window begins and ends in the same month.
    last_valid = station_df[column].dropna().index.max()
    start_index = df[df.index.month == last_valid.month].index[0]
    df = df.loc[start_index:]

    # === STL decomposition ===

    result = STL(df, period=12).fit()
    resid = result.resid

    # Series with the residual removed.
    denoised_df = result.trend + result.seasonal

    mean_resid = np.mean(resid)
    std_resid = np.std(resid)

    # A month is an STL outlier when its residual lies more than three
    # standard deviations away from the mean residual.
    threshold = 3 * std_resid
    outliers_index = resid[
        (resid > mean_resid + threshold)
        | (resid < mean_resid - threshold)
    ].index

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

    _add_scatter(fig, 1, df.index, df, "Original")
    _add_scatter(fig, 1, df.index, denoised_df, "Trend + Seasonal (STL)")

    print("===== STL =====")
    print()

    # Error of the STL reconstruction against the monthly series.
    _print_fit_metrics(df, denoised_df)

    # Spread of the data compared with the spread of the residuals.
    std_df = df.std()

    print()
    print(f"Data std: {std_df}, Resid std: {std_resid}")

    _add_scatter(
        fig,
        1,
        outliers_index,
        df.loc[outliers_index],
        "Outliers (STL)",
        mode="markers",
    )
    _add_scatter(fig, 2, df.index, resid, "Residuals")
    _add_scatter(
        fig,
        2,
        outliers_index,
        resid.loc[outliers_index],
        "Outliers (STL)",
        mode="markers",
    )

    # Upper and lower outlier thresholds as horizontal dashed lines.
    for bound in (mean_resid + threshold, mean_resid - threshold):
        _add_scatter(
            fig,
            2,
            df.index,
            [bound] * df.shape[0],
            "Threshold",
            line=dict(dash="dash", color="black"),
        )

    # ===== Prophet =====

    # Prophet expects a frame with a "ds" (timestamp) and a "y" (value) column.
    df = df.rename("y").rename_axis("ds").reset_index()

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the training timestamps only (in-sample fit).
    forecast = model.predict(model.make_future_dataframe(periods=0))

    # Put predictions and observations side by side.
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"] - forecasting_final["yhat_lower"]
    )

    # A month is a Prophet outlier when the absolute error exceeds `factor`
    # times the width of the uncertainty interval.
    factor = 1.5
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    print("===== Prophet =====")

    # Observations are the ground truth and Prophet's yhat the prediction;
    # the order matters for MAPE, which divides by the true values.
    _print_fit_metrics(forecasting_final["y"], forecasting_final["yhat"])

    anomaly = forecasting_final[is_anomaly]

    _add_scatter(
        fig,
        1,
        forecasting_final["ds"],
        forecasting_final["yhat"],
        "Prediction (Prophet)",
    )
    _add_scatter(
        fig,
        1,
        anomaly["ds"],
        anomaly["y"],
        "Outliers (Prophet)",
        mode="markers",
    )
    _add_scatter(
        fig, 3, forecasting_final["ds"], forecasting_final["error"], "Error"
    )
    _add_scatter(
        fig,
        3,
        forecasting_final["ds"],
        forecasting_final["uncertainty"],
        "Uncertainty",
    )
    _add_scatter(
        fig,
        3,
        anomaly["ds"],
        anomaly["error"],
        "Outliers (Prophet)",
        mode="markers",
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        height=800,
        width=1000,
    )

    fig.show()

In [None]:
# Prophet is used to remove outliers

# Work on a monthly copy so that the original dataframe is only replaced once
# every column has been processed.
station_df_copy = station_df.resample("M").median()

# A monthly value is an outlier when its absolute prediction error exceeds
# `factor` times the width of Prophet's prediction interval.
factor = 1.5

for column in station_df.columns:
    # monthly medians over the observed range of the column, gaps interpolated
    monthly = station_df[column].dropna().resample("M").median()
    monthly = monthly.interpolate(method="time")

    # Prophet cannot be fitted on fewer than two observations: leave the
    # column as it is instead of aborting the whole cell
    if monthly.count() < 2:
        continue

    # Prophet expects a dataframe with a "ds" (date) and a "y" (value) column
    df = monthly.rename("y").rename_axis("ds").reset_index()

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the historical dates only, no forecast horizon
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with the original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection (vectorised instead of a row-wise apply)
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    # Blank the flagged months and interpolate over them again. Masking the
    # full monthly series (instead of dropping rows and resampling) also
    # covers outliers in the first or last month: a leading outlier becomes
    # NaN, a trailing one takes the last valid value.
    anomaly_dates = forecasting_final.loc[is_anomaly, "ds"]
    cleaned = monthly.copy()
    cleaned[cleaned.index.isin(anomaly_dates)] = np.nan
    cleaned = cleaned.interpolate(method="time")

    station_df_copy.loc[cleaned.index, column] = cleaned


station_df = station_df_copy

In [None]:
# final check: one line plot per column of the cleaned station dataframe

layout_font = dict(size=18)

for column in station_df.columns:
    series_trace = go.Scatter(
        x=station_df.index,
        y=station_df[column],
        mode="lines",
        name="Original",
    )

    fig = go.Figure(data=[series_trace])
    fig.update_layout(
        title=column,
        xaxis_title="Date",
        yaxis_title=column,
        font=layout_font,
    )

    fig.show()

In [None]:
# Keep only the rows between the first and the last available DOC value
doc_dates = station_df["DOC (mg/l)"].dropna().index
start_date, end_date = doc_dates.min(), doc_dates.max()

station_df = station_df.loc[start_date:end_date]

In [None]:
# Keep an independent copy: station_df is a slice at this point and the
# station dataframes are modified in place later on (new "Station" column,
# index reset), which must not leak back into station_df.
sw_325_df = station_df.copy()

##### Trend

In [None]:
# For every column: STL trend plus a linear regression of the series on time
for column in station_df.columns:
    df = station_df[column].dropna()

    # make sure that the series starts and finishes in the same month:
    # start from the first occurrence of the month in which it ends
    last_month = df.index.max().month
    start_index = df[df.index.month == last_month].index[0]

    # Slice the series to start from the found index
    df = df.loc[start_index:]

    # only the trend component of the decomposition is plotted
    trend = STL(df, period=12).fit().trend

    # Linear regression of the observed series (not of the STL trend) on the
    # time step 0, 1, 2, ...
    # NOTE(review): the previous comment said "on trend" - confirm which of
    # the two is intended.
    X = sm.add_constant(np.arange(df.shape[0]))
    results = sm.OLS(df, X).fit()

    # fitted regression line on the same dates as the data
    line = pd.Series(results.predict(X), index=df.index)

    # np.asarray gives positional access whether statsmodels returns the
    # estimates as an array or as a label-indexed Series
    slope = np.asarray(results.params)[1]
    p_value = np.asarray(results.pvalues)[1]

    print(f"{column} - Slope: {slope}")
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df,
            mode="lines",
            name="Original",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=trend.index,
            y=trend,
            mode="lines",
            name="Trend",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode="lines",
            name="Linear Regression",
            line=dict(dash="dash", color="black"),
        ),
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

### Build Unique Surface Water Dataset

In [None]:
# Build one dataframe for all surface water stations: tag every station
# dataframe with its station number, turn the date index into a "DateTime"
# column and stack the stations on top of each other.
station_frames = (
    (105, sw_105_df),
    (305, sw_305_df),
    (325, sw_325_df),
)

for station_number, frame in station_frames:
    frame["Station"] = station_number
    frame.index.name = "DateTime"
    frame.reset_index(inplace=True)

# merge the dataframes
sw_df = pd.concat([sw_105_df, sw_305_df, sw_325_df])

### DOC vs TOC per station

In [None]:
# Scatter plot of TOC against DOC for every station, with the overall
# ordinary-least-squares line and its equation.
for station_id, station_df in stations_dict.items():
    # plotly express is only used to fit the overall OLS trendline
    fit_fig = px.scatter(
        station_df,
        x="DOC (mg/l)",
        y="TOC (mg/l)",
        trendline="ols",
        trendline_color_override="red",
        trendline_scope="overall",
    )
    results = px.get_trendline_results(fit_fig)

    # slope and intercept of the trendline
    fit = results.iloc[0]["px_fit_results"]
    intercept = fit.params[0]
    slope = fit.params[1]

    # fixed DOC range over which the regression line is drawn
    x = np.linspace(2, 14, 100)

    data_trace = go.Scatter(
        x=station_df["DOC (mg/l)"],
        y=station_df["TOC (mg/l)"],
        mode="markers",
        name="Data",
        marker=dict(size=8, color="blue", opacity=0.7),
    )
    regression_trace = go.Scatter(
        x=x,
        y=slope * x + intercept,
        mode="lines",
        name="Linear Regression",
        line=dict(color="red", width=2),
    )

    fig = go.Figure()
    fig.add_trace(data_trace)
    fig.add_annotation(
        x=0.9,
        y=0.1,
        xref="paper",
        yref="paper",
        text=f"y = {slope:.2f}x + {intercept:.2f}",
        showarrow=False,
        font=dict(size=18, color="red"),
    )
    fig.add_trace(regression_trace)

    # the legend sits top-left for station 105 and top-right otherwise
    if station_id == 105:
        legend_side = dict(xanchor="left", x=0.01)
    else:
        legend_side = dict(xanchor="right", x=0.99)

    fig.update_layout(
        xaxis_title="DOC (mg/l)",
        yaxis_title="TOC (mg/l)",
        font=dict(
            size=18,
        ),
        title=f"DOC vs TOC at station {station_id}",
        legend=dict(yanchor="top", y=0.99, **legend_side),
    )

    fig.show(width=20, height=10)

### DOC vs Ammonium

In [None]:
# DOC against ammonium for all surface water stations in a single figure,
# with one linear regression per station.
colors = ["blue", "red", "green"]

fig = go.Figure()

for position, (station_id, station_df) in enumerate(stations_dict.items()):
    station_df.index = pd.to_datetime(station_df.index)

    # monthly medians, gaps interpolated in time
    monthly = station_df.resample("M").median().interpolate(method="time")

    df = monthly[["Ammonium (mg/l)", "DOC (mg/l)"]].dropna()

    # compute linear regression and plot the line
    X = sm.add_constant(df["Ammonium (mg/l)"])
    y = df["DOC (mg/l)"]

    results = sm.OLS(y, X).fit()

    line = pd.Series(results.predict(X), index=df.index)

    # the estimates are label-indexed (const, Ammonium), so the slope has to
    # be read by position explicitly
    slope = results.params.iloc[1]
    p_value = results.pvalues.iloc[1]

    print(f"Station {station_id} - Slope: {slope}")
    print(f"Station {station_id} - P-value: {p_value}")

    # Walk the palette from its end (same colours as popping it) without
    # emptying the list; wraps around when there are more stations than
    # colours.
    color = colors[-(position % len(colors)) - 1]

    fig.add_trace(
        go.Scatter(
            x=df["Ammonium (mg/l)"],
            y=df["DOC (mg/l)"],
            mode="markers",
            name=f"Station {station_id}",
            marker=dict(size=8, opacity=0.7, color=color),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=df["Ammonium (mg/l)"],
            y=line,
            mode="lines",
            name=f"Linear Regression Station {station_id}",
            line=dict(dash="dash", color=color),
        )
    )


fig.update_layout(
    xaxis_title="Ammonium (mg/l)",
    yaxis_title="DOC (mg/l)",
    font=dict(
        size=18,
    ),
    title="Surface Water",
)

fig.show()

## Meteorological

In [None]:
# Meteorological columns that are excluded from the per-column analysis
# below and dropped before the monthly resampling.
diff_columns = ["QN_3", "QN_4", "eor", "Cumulated Rainfall Type"]

In [None]:
# display the meteorological dataframe as the cell output
meteo_df

In [None]:
# Parse the compact YYYYMMDD stamps and use them as the dataframe index
parsed_dates = pd.to_datetime(meteo_df["DateTime"], format="%Y%m%d")
meteo_df["DateTime"] = parsed_dates

meteo_df.set_index("DateTime", inplace=True)

In [None]:
# the station identifier is not needed for the analysis below
meteo_df.drop(columns=["Station ID"], inplace=True)

In [None]:
# Negative rainfall is not a valid measurement: mark it as missing
rain_column = "Cumulated Rainfall (mm)"
is_negative_rain = meteo_df[rain_column] < 0

meteo_df.loc[is_negative_rain, [rain_column]] = np.nan

In [None]:
# Set every value equal to -999 (integer or float) to NaN, in all columns.
# presumably -999 is the missing-value marker of the source files - confirm
meteo_df.replace(-999, np.nan, inplace=True)

### Analyze Station

In [None]:
# Summary table: one row per piece of information, one column per
# meteorological parameter (the columns are added when the info is stored).
info_rows = [
    "N Samples",
    "% Missing Values",
    "Frequency (days)",
    "Start Date",
    "End Date",
]

meteo_info_df = pd.DataFrame(
    index=pd.Index(info_rows, name="Info"),
    columns=pd.Index(["Parameter"]),
)

#### Timeseries

In [None]:
# One time series plot per analysed meteorological column
top_right_legend = dict(yanchor="top", y=0.99, xanchor="right", x=0.99)

for column in meteo_df.columns.difference(diff_columns):
    column_trace = go.Scatter(
        x=meteo_df.index,
        y=meteo_df[column],
        mode="lines",
        name=column,
        line=dict(color="blue"),
    )

    fig = go.Figure(data=[column_trace])
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(size=18),
        title=column,
        legend=top_right_legend,
    )

    fig.show()

#### Boxplots

In [None]:
# One figure per analysed column, holding one box per calendar year
for column in meteo_df.columns.difference(diff_columns):
    column_df = meteo_df[column]
    years = column_df.index.year

    fig = go.Figure()

    for year in years.unique():
        yearly_values = column_df[years == year]
        fig.add_trace(go.Box(y=yearly_values, name=year))

    fig.update_layout(
        title=f"{column} at airport",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

#### Store Info

In [None]:
# Print and store, for every analysed column: observed date range, share of
# missing values inside that range, most common sampling step and number of
# samples.
for column in meteo_df.columns.difference(diff_columns):
    valid_dates = meteo_df[column].dropna().index
    start_date = valid_dates.min().strftime("%Y-%m-%d")
    end_date = valid_dates.max().strftime("%Y-%m-%d")

    # values of the column between its first and last valid observation
    df = meteo_df[start_date:end_date][column]

    print(f"Start date for {column}: {start_date}")
    print(f"End date for {column}: {end_date}")

    # NOTE(review): this is a fraction in [0, 1], although it is stored in
    # the "% Missing Values" row - confirm the intended unit
    missing_values = df.isna().sum() / df.shape[0]
    print(f"Missing values for {column}: {missing_values}")

    # most frequent gap between consecutive timestamps, in whole days
    time_steps = df.index.to_series().diff()
    frequency = time_steps.value_counts().index[0].days
    print(f"Frequency for {column}: {frequency}")

    column_info = {
        "N Samples": meteo_df[column].dropna().shape[0],
        "% Missing Values": missing_values,
        "Frequency (days)": frequency,
        "Start Date": start_date,
        "End Date": end_date,
    }
    for info_name, info_value in column_info.items():
        meteo_info_df.loc[info_name, column] = info_value

#### Outliers and Missing Values Imputation

In [None]:
# define the outliers through the STL decomposition and through Prophet


def _print_error_metrics(y_true, y_pred):
    """Print MAE, MedAE, MSE, RMSE and MAPE of ``y_pred`` against ``y_true``.

    ``y_true`` holds the observed values and ``y_pred`` the modelled ones.
    The order matters for MAPE, which divides by the observed values.
    """
    mae = mean_absolute_error(y_true, y_pred)
    print("Mean Absolute Error (MAE): " + str(np.round(mae, 2)))

    medae = median_absolute_error(y_true, y_pred)
    print("Median Absolute Error (MedAE): " + str(np.round(medae, 2)))

    mse = mean_squared_error(y_true, y_pred)
    print("Mean Squared Error (MSE): " + str(np.round(mse, 2)))

    # root of the exact MSE: truncating it to an integer first distorts the
    # result (and gives 0 for every MSE below 1)
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE): " + str(np.round(rmse, 2)))

    # scikit-learn returns MAPE as a fraction: scale it to match the "%"
    mape = 100 * mean_absolute_percentage_error(y_true, y_pred)
    print(
        "Mean Absolute Percentage Error (MAPE): "
        + str(np.round(mape, 2))
        + " %"
    )


def _add_scatter(fig, row, x, y, name, mode="lines", **trace_options):
    """Add a scatter trace to the given row of a single-column subplot figure."""
    fig.add_trace(
        go.Scatter(x=x, y=y, mode=mode, name=name, **trace_options),
        row=row,
        col=1,
    )


for column in meteo_df.columns.difference(diff_columns):
    observed = meteo_df[column].dropna()

    # monthly medians, gaps interpolated in time
    df = observed.resample("M").median()
    df.interpolate(method="time", inplace=True)

    # make sure that the series starts and finishes in the same month:
    # start from the first occurrence of the month of the last observation
    last_month = observed.index.max().month
    start_index = df[df.index.month == last_month].index[0]
    df = df.loc[start_index:]

    # === STL decomposition ===

    result = STL(df, period=12).fit()
    resid = result.resid

    denoised_df = result.trend + result.seasonal

    mean_resid = np.mean(resid)
    std_resid = np.std(resid)

    # residuals further than 3 standard deviations from their mean are
    # flagged as outliers
    threshold = 3 * std_resid
    upper_bound = mean_resid + threshold
    lower_bound = mean_resid - threshold

    outliers_index = resid[
        (resid > upper_bound) | (resid < lower_bound)
    ].index

    # row 1: data and models, row 2: STL residuals, row 3: Prophet error
    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

    _add_scatter(fig, 1, df.index, df, "Original")
    _add_scatter(fig, 1, df.index, denoised_df, "Trend + Seasonal (STL)")

    print("===== STL =====")
    print()

    # error between the original and the denoised series
    _print_error_metrics(df, denoised_df)

    # compute std of original and std of residuals
    std_df = df.std()

    print()
    print(f"Data std: {std_df}, Resid std: {std_resid}")

    _add_scatter(
        fig,
        1,
        outliers_index,
        df.loc[outliers_index],
        "Outliers (STL)",
        mode="markers",
    )

    _add_scatter(fig, 2, df.index, resid, "Residuals")
    _add_scatter(
        fig,
        2,
        outliers_index,
        resid.loc[outliers_index],
        "Outliers (STL)",
        mode="markers",
    )

    # plot the threshold
    for bound in (upper_bound, lower_bound):
        _add_scatter(
            fig,
            2,
            df.index,
            [bound] * df.shape[0],
            "Threshold",
            line=dict(dash="dash", color="black"),
        )

    # ===== Prophet =====

    # Prophet expects a dataframe with a "ds" (date) and a "y" (value) column
    df = df.rename("y").rename_axis("ds").reset_index()

    model = Prophet()
    model.fit(df)
    # periods=0: predict on the historical dates only, no forecast horizon
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with the original data
    forecasting_final = pd.merge(
        forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]],
        df,
        how="inner",
        on="ds",
    )

    # Calculate the prediction error and uncertainty
    forecasting_final["error"] = (
        forecasting_final["y"] - forecasting_final["yhat"]
    )
    forecasting_final["uncertainty"] = (
        forecasting_final["yhat_upper"]
        - forecasting_final["yhat_lower"]
    )

    # Anomaly detection: the error exceeds `factor` times the width of the
    # prediction interval
    factor = 1.5
    is_anomaly = (
        forecasting_final["error"].abs()
        > factor * forecasting_final["uncertainty"]
    )
    forecasting_final["anomaly"] = np.where(is_anomaly, "Yes", "No")

    print("===== Prophet =====")

    # observed values first, predictions second
    _print_error_metrics(
        forecasting_final["y"], forecasting_final["yhat"]
    )

    anomaly = forecasting_final[forecasting_final["anomaly"] == "Yes"]

    _add_scatter(
        fig,
        1,
        forecasting_final["ds"],
        forecasting_final["yhat"],
        "Prediction (Prophet)",
    )
    _add_scatter(
        fig,
        1,
        anomaly["ds"],
        anomaly["y"],
        "Outliers (Prophet)",
        mode="markers",
    )

    _add_scatter(
        fig, 3, forecasting_final["ds"], forecasting_final["error"], "Error"
    )
    _add_scatter(
        fig,
        3,
        forecasting_final["ds"],
        forecasting_final["uncertainty"],
        "Uncertainty",
    )
    _add_scatter(
        fig,
        3,
        anomaly["ds"],
        anomaly["error"],
        "Outliers (Prophet)",
        mode="markers",
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        height=800,
        width=1000,
    )

    fig.show()

In [None]:
# No need to remove outliers: drop the columns that were left out of the
# analysis, then move to monthly medians and interpolate the gaps in time
meteo_df.drop(columns=diff_columns, inplace=True)

meteo_df = meteo_df.resample("M").median().interpolate(method="time")

In [None]:
# final check: one line plot per column of the monthly meteo dataframe

layout_font = dict(size=18)

for column in meteo_df.columns:
    series_trace = go.Scatter(
        x=meteo_df.index,
        y=meteo_df[column],
        mode="lines",
        name="Original",
    )

    fig = go.Figure(data=[series_trace])
    fig.update_layout(
        title=column,
        xaxis_title="Date",
        yaxis_title=column,
        font=layout_font,
    )

    fig.show()

# Add Meteo Data to Surface and Ground Water

## Surface Water

In [None]:
# The air temperature measured at the airport has to be compared with the
# one measured at the stations first: draw all series in one figure.
airport_trace = go.Scatter(
    x=meteo_df.index,
    y=meteo_df["Temperature Mean (°C)"],
    mode="lines",
    name="Airport",
    line=dict(color="blue"),
)

fig = go.Figure(data=[airport_trace])

# sort=False keeps the stations in their order of first appearance
for station_id, station_df in sw_df.groupby("Station", sort=False):
    fig.add_trace(
        go.Scatter(
            x=station_df["DateTime"],
            y=station_df["Air Temperature (°C)"],
            mode="lines",
            name=f"Station {station_id}",
        )
    )

# last expression of the cell: the returned figure is the cell output
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    font=dict(size=18),
    title="Temperature",
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

In [None]:
# Pearson correlation between the airport and the station air temperature,
# computed on their common date range and annotated on a comparison plot.

for station_id in sw_df["Station"].unique():
    full_station_df = sw_df[sw_df["Station"] == station_id]
    station_dates = full_station_df["DateTime"]

    # take the common date range with the airport
    start_date = max(station_dates.min(), meteo_df.index.min())
    end_date = min(station_dates.max(), meteo_df.index.max())

    airport_df = meteo_df[start_date:end_date].copy()

    # station rows inside the common date range (both ends included)
    in_common_range = station_dates.between(start_date, end_date)
    station_df = full_station_df[in_common_range]

    # assumes both series have one value per month over the common range,
    # so that they can be paired by position
    corr, _ = pearsonr(
        airport_df["Temperature Mean (°C)"],
        station_df["Air Temperature (°C)"],
    )

    airport_trace = go.Scatter(
        x=airport_df.index,
        y=airport_df["Temperature Mean (°C)"],
        mode="lines",
        name="Airport",
        line=dict(color="blue"),
    )
    # the station is drawn over its whole date range
    station_trace = go.Scatter(
        x=full_station_df["DateTime"],
        y=full_station_df["Air Temperature (°C)"],
        mode="lines",
        name=f"Station {station_id}",
    )

    fig = go.Figure()
    fig.add_trace(airport_trace)
    fig.add_trace(station_trace)

    # add the correlation to the plot
    fig.add_annotation(
        x=0.01,
        y=0.95,
        xref="paper",
        yref="paper",
        text=f"Pearson Correlation: {corr:.2f}",
        showarrow=False,
        font=dict(size=18),
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Air Temperature (°C)",
        font=dict(size=18),
    )

    fig.show()

In [None]:
# Station air temperature against airport air temperature, one scatter plot
# per station, with the bisector as visual reference.
for station_id in sw_df["Station"].unique():
    full_station_df = sw_df[sw_df["Station"] == station_id]
    station_dates = full_station_df["DateTime"]

    # take the common date range with the airport
    start_date = max(station_dates.min(), meteo_df.index.min())
    end_date = min(station_dates.max(), meteo_df.index.max())

    airport_df = meteo_df[start_date:end_date].copy()

    # station rows inside the common date range (both ends included)
    station_df = full_station_df[
        station_dates.between(start_date, end_date)
    ]

    data_trace = go.Scatter(
        x=airport_df["Temperature Mean (°C)"],
        y=station_df["Air Temperature (°C)"],
        mode="markers",
        name="Data",
        marker=dict(size=8, color="blue", opacity=0.7),
    )
    # line on the bisector: points on it have equal temperatures
    bisector_trace = go.Scatter(
        x=[-10, 40],
        y=[-10, 40],
        mode="lines",
        name="Bisector",
        line=dict(color="red", width=2, dash="dash"),
    )

    fig = go.Figure()
    fig.add_trace(data_trace)
    fig.add_trace(bisector_trace)

    fig.update_layout(
        xaxis_title="Airport",
        yaxis_title=f"Station {station_id}",
        font=dict(size=18),
        title="Air Temperature",
        legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    )

    fig.show()

In [None]:
# list the meteorological columns that are available
meteo_df.columns.to_list()

In [None]:
# Reindex sw_df first to have unique indices (the stations were stacked with
# their own row numbers). The previous index is kept as an "index" column.
sw_df.reset_index(inplace=True)

In [None]:
# The correlation is high between the airport and the stations,
# so we can add the airport data variables to the stations

# Add the rainfall data to the stations by looking up the date of every
# station row in the airport series. Matching on the date, instead of copying
# the airport values by position, does not depend on both sides having the
# same number of rows in the same order. Dates outside the airport range
# stay NaN.
airport_rainfall = meteo_df["Cumulated Rainfall (mm)"]

sw_df["Cumulated Rainfall (mm)"] = sw_df["DateTime"].map(airport_rainfall)

In [None]:
# Rows without airport rainfall get 0. Assign the result back: an in-place
# fillna on a column selection is chained assignment and is not guaranteed
# to modify sw_df.
sw_df["Cumulated Rainfall (mm)"] = sw_df[
    "Cumulated Rainfall (mm)"
].fillna(value=0)

In [None]:
# number of missing values per column
sw_df.isna().sum()

In [None]:
# remove the "index" column, i.e. the old row labels left by reset_index
sw_df.drop(columns=["index"], inplace=True)

In [None]:
# Print, for every station and every variable, the first and the last date
# with an available value.

for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    variables = station_df.columns.difference(["DateTime", "Station"])

    for column in variables:
        # rows where both the date and the variable are available
        available = station_df[["DateTime", column]].dropna()
        start_date = available["DateTime"].min()
        end_date = available["DateTime"].max()

        print(f"Station {station_id} - {column}")
        print(f"Start date: {start_date}")
        print(f"End date: {end_date}")
        print()

In [None]:
# Fix ammonium last value: carry the last available value forward.
# The fill is done per station so that a value of one station is never
# carried into the rows of the next one, and the result is assigned back
# because an in-place ffill on a column selection is chained assignment and
# is not guaranteed to modify sw_df.
sw_df["Ammonium (mg/l)"] = sw_df.groupby("Station")[
    "Ammonium (mg/l)"
].ffill()

## Ground Water

In [None]:
# The air temperature measured at the airport has to be compared with the
# one measured at the ground water stations first: draw all series in one
# figure.
airport_trace = go.Scatter(
    x=meteo_df.index,
    y=meteo_df["Temperature Mean (°C)"],
    mode="lines",
    name="Airport",
    line=dict(color="blue"),
)

fig = go.Figure(data=[airport_trace])

for station_id, station_df in gw_stations_dict.items():
    fig.add_trace(
        go.Scatter(
            x=station_df.index,
            y=station_df["Air Temperature (°C)"],
            mode="lines",
            name=f"Station {station_id}",
        )
    )

# last expression of the cell: the returned figure is the cell output
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    font=dict(size=18),
    title="Temperature",
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

In [None]:
# Pearson correlation between the airport and the ground water station air
# temperature, on the months where the station has a temperature value.

for station_id, raw_station_df in gw_stations_dict.items():
    station_df = raw_station_df.resample("M").median()

    # get dates for the station where the temperature is not nan
    dates = station_df["Air Temperature (°C)"].dropna().index

    station_df = station_df.loc[dates]
    airport_df = meteo_df.loc[dates].copy()

    airport_temperature = airport_df["Temperature Mean (°C)"]
    station_temperature = station_df["Air Temperature (°C)"]

    # compute pearson correlation
    corr, _ = pearsonr(airport_temperature, station_temperature)

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=airport_df.index,
            y=airport_temperature,
            mode="lines",
            name="Airport",
            line=dict(color="blue"),
        )
    )

    # the station values are drawn twice: as markers and as a connecting
    # line (the latter without an explicit legend name)
    fig.add_trace(
        go.Scatter(
            x=station_df.index,
            y=station_temperature,
            mode="markers",
            name=f"Station {station_id}",
        )
    )
    fig.add_trace(
        go.Scatter(
            x=station_df.index,
            y=station_temperature,
            mode="lines",
        )
    )

    # add the correlation to the plot
    fig.add_annotation(
        x=0.01,
        y=0.95,
        xref="paper",
        yref="paper",
        text=f"Pearson Correlation: {corr:.2f}",
        showarrow=False,
        font=dict(size=18),
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Air Temperature (°C)",
        font=dict(size=18),
    )

    fig.show()

In [None]:
# Station air temperature against airport air temperature, one scatter plot
# per ground water station, plus the slope and p-value of a linear regression
# of the station temperature on the airport temperature.
for station_id in gw_stations_dict.keys():
    station_df = gw_stations_dict[station_id]

    station_df = station_df.resample("M").median()

    # get dates for the station where the temperature is not nan
    dates = station_df["Air Temperature (°C)"].dropna().index

    station_df = station_df.loc[dates]

    airport_df = meteo_df.loc[dates].copy()

    X = sm.add_constant(airport_df["Temperature Mean (°C)"])
    y = station_df["Air Temperature (°C)"]

    results = sm.OLS(y, X).fit()

    # the estimates are label-indexed (const, temperature), so the slope has
    # to be read by position explicitly
    slope = results.params.iloc[1]
    p_value = results.pvalues.iloc[1]

    print(f"Station {station_id} - Slope: {slope}")
    print(f"Station {station_id} - P-value: {p_value}")

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=airport_df["Temperature Mean (°C)"],
            y=station_df["Air Temperature (°C)"],
            mode="markers",
            name="Data",
            marker=dict(size=8, color="blue", opacity=0.7),
        )
    )

    # add line on bisector: points on it have equal temperatures
    fig.add_trace(
        go.Scatter(
            x=[-10, 40],
            y=[-10, 40],
            mode="lines",
            name="Bisector",
            line=dict(color="red", width=2, dash="dash"),
        )
    )

    fig.update_layout(
        xaxis_title="Airport",
        yaxis_title=f"Station {station_id}",
        font=dict(
            size=18,
        ),
        title="Air Temperature",
        legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    )

    fig.show()

In [None]:
# list the meteorological columns that are available
meteo_df.columns.to_list()

In [None]:
# Reindex gw_df first to have unique indices.
# The previous index is kept as an "index" column.
gw_df.reset_index(inplace=True)

In [None]:
# The correlation is high between the airport and the stations,
# so we can add the airport data variables to the stations

# Add the rainfall data to the stations by looking up the date of every
# ground water row in the airport series. Matching on the date, instead of
# copying the airport values by position, does not depend on both sides
# having the same number of rows in the same order. Dates outside the airport
# range stay NaN.
airport_rainfall = meteo_df["Cumulated Rainfall (mm)"]

gw_df["Cumulated Rainfall (mm)"] = gw_df["DateTime"].map(airport_rainfall)

In [None]:
# number of missing values in each column of gw_df
gw_df.isna().sum()

In [None]:
# remove the "index" column from gw_df, in place
gw_df.drop(columns=["index"], inplace=True)

# Store Results

In [None]:
# sort the columns of every info table by label, in place
for info_df in (
    surface_info_df,
    ground_info_df,
    bacteria_info_df,
    meteo_info_df,
):
    info_df.sort_index(axis=1, inplace=True)

In [None]:
# %%script false --no-raise-error
# write each info table to its own workbook inside the data-info folder
info_tables = {
    "surface_water_info.xlsx": surface_info_df,
    "ground_water_info.xlsx": ground_info_df,
    "bacteria_info.xlsx": bacteria_info_df,
    "meteo_info.xlsx": meteo_info_df,
}

for file_name, info_df in info_tables.items():
    info_df.to_excel(os.path.join(data_info_folder, file_name))

In [None]:
# export the surface- and ground-water tables to the clean-data folder,
# leaving the row index out of the workbooks
for file_name, clean_df in (
    ("surface.xlsx", sw_df),
    ("ground.xlsx", gw_df),
):
    clean_df.to_excel(
        os.path.join(clean_data_folder, file_name), index=False
    )

# UVA254 Raw Analysis

In [None]:
# keep only the rows of the ground-water series whose parameter is
# "UV-Adsorption (254)"; copy so later edits do not touch ts_gw_df
is_uva254 = ts_gw_df["Parameter"] == "UV-Adsorption (254)"
ts_uva254_df = ts_gw_df.loc[is_uva254].copy()

In [None]:
# preview the UV254 subset (shown as the cell output)
ts_uva254_df

In [None]:
# parse the date column; entries that do not match YYYY-MM-DD become NaT
parsed_dates = pd.to_datetime(
    ts_uva254_df["DateTime"],
    errors="coerce",
    format="%Y-%m-%d",
)
ts_uva254_df["DateTime"] = parsed_dates

In [None]:
# derive the calendar year and month of every measurement
date_parts = ts_uva254_df["DateTime"].dt
ts_uva254_df["Year"] = date_parts.year
ts_uva254_df["Month"] = date_parts.month

In [None]:
# number of UV254 rows per station ID
counts = ts_uva254_df["Station ID"].value_counts()

In [None]:
# UV254 time series, one line per station.
# The date column of ts_uva254_df is called "DateTime" (it is the renamed
# "Datum" column, parsed and used under that name in the other cells);
# there is no "Date" column, so x must reference "DateTime".
fig = px.line(
    ts_uva254_df,
    x="DateTime",
    y="Value",
    color="Station ID",
)

fig.update_layout(
    title={
        "text": "UV-Adsorption (254)",
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis_title="Date",
    yaxis_title="Value",
)

fig.show()

### Station 7285

In [None]:
# isolate the UV254 measurements of station 7285 as an independent copy
is_station_7285 = ts_uva254_df["Station ID"] == 7285
station_7285_df = ts_uva254_df.loc[is_station_7285].copy()

In [None]:
# season of each measurement from its month number; any month that is not
# listed below (September-November, or a missing value) maps to "Autumn"
season_by_month = {
    12: "Winter",
    1: "Winter",
    2: "Winter",
    3: "Spring",
    4: "Spring",
    5: "Spring",
    6: "Summer",
    7: "Summer",
    8: "Summer",
}

station_7285_df["Season"] = station_7285_df["Month"].apply(
    lambda month: season_by_month.get(month, "Autumn")
)

In [None]:
# station 7285 over time, drawn as one coloured line per season
season_plot_title = {
    "text": "UV-Adsorption (254) at station 7285",
    "x": 0.5,
    "xanchor": "center",
}

fig = px.line(station_7285_df, x="DateTime", y="Value", color="Season")
fig.update_layout(
    title=season_plot_title,
    xaxis_title="DateTime",
    yaxis_title="Value",
)
fig.show()

In [None]:
# mean value and number of measurements per year at station 7285
value_stats = {"Value": ["mean", "count"]}
yearly_groups = station_7285_df.groupby(["Year"])
mean_station_7285_df = yearly_groups.agg(value_stats).reset_index().copy()

In [None]:
# show the per-year mean and count computed for station 7285
mean_station_7285_df

In [None]:
# mean value and number of measurements per season at station 7285
# (rebinds mean_station_7285_df)
value_stats = {"Value": ["mean", "count"]}
seasonal_groups = station_7285_df.groupby(["Season"])
mean_station_7285_df = seasonal_groups.agg(value_stats).reset_index().copy()

In [None]:
# show the per-season mean and count computed for station 7285
mean_station_7285_df

In [None]:
# most of the measurements were taken in autumn and spring, i.e. roughly twice a year

In [None]:
# Additive moving-average decomposition of the station 7285 series.
ts = station_7285_df[["DateTime", "Value"]].copy()

# `ts` only contains "DateTime" and "Value", so the time index has to be
# built from "DateTime" (set_index("Date") raises a KeyError).
# period=2 presumably reflects the roughly two measurements per year at
# this station - confirm.
result_7285 = smt.seasonal_decompose(
    ts.set_index("DateTime"), model="additive", period=2
)

In [None]:
# Compare the trend component of the decomposition with the raw series.
fig = go.Figure()

# trend component of the period-2 decomposition
fig.add_trace(
    go.Scatter(
        x=result_7285.trend.index,
        y=result_7285.trend,
        mode="lines+markers",
        name="MA period=2",
        line=dict(color="blue"),
    )
)

# original measurements; `ts` has no "Date" column, its time column is
# "DateTime"
fig.add_trace(
    go.Scatter(
        x=ts["DateTime"],
        y=ts["Value"],
        mode="lines+markers",
        name="Original",
        line=dict(color="red"),
    )
)

fig.show()

### Station 5130