# Berlin Data

Time series for the Havel River inflow to the city (Konradshöhe, Messstellennummer 305) and the downstream station (Schleuse Spandau, Messstellennummer 320), DOC and TOC.

Two groundwater stations are attached, with quality data only; no DOC/TOC is measured there, but UV254 and other parameters are. The groundwater stations are not influenced by bank filtrate and represent near-natural conditions (for a city like Berlin).

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import statsmodels.tsa.seasonal as smt
from googletrans import Translator

import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL

from sklearn.preprocessing import MinMaxScaler

# Define Paths

In [None]:
# Root of the Berlin dataset, addressed relative to this notebook's directory.
data_folder = os.path.join("..", "..", "data", "berlin")

# First-level subfolders: the raw CSV exports and the exported summary tables.
raw_data_folder, data_info_folder = (
    os.path.join(data_folder, subfolder)
    for subfolder in ("raw_data", "data_info")
)

# The raw exports are split into ground-water and surface-water folders.
ground_water_folder, surface_water_folder = (
    os.path.join(raw_data_folder, subfolder)
    for subfolder in ("ground water", "surface water")
)

# Load Data

## Ground Water

In [None]:
# Read the raw ground-water quality time series from its CSV export.
ground_water_csv = os.path.join(
    ground_water_folder, "time-series_ground-water_quality.csv"
)
ts_gw_df = pd.read_csv(ground_water_csv)

In [None]:
# Display the raw ground-water table to inspect its columns and values.
ts_gw_df

In [None]:
# Map the German column headers to the English names used by the rest of
# the notebook; the frame is modified in place.
gw_column_names = {
    "Messstellennummer": "Station ID",
    "Datum": "DateTime",
    "Einheit": "Unit",
    "Messwert": "Value",
}
ts_gw_df.rename(columns=gw_column_names, inplace=True)

## Surface Water

In [None]:
# Read the raw surface-water quality time series from its CSV export.
surface_water_csv = os.path.join(
    surface_water_folder, "time-series_surface-water_quality.csv"
)
ts_sw_df = pd.read_csv(surface_water_csv)

In [None]:
# Read the river flow time series from its CSV export.
flow_csv = os.path.join(
    surface_water_folder, "time-series_surface-water_flow.csv"
)
flow_df = pd.read_csv(flow_csv)

In [None]:
# Display the raw surface-water table to inspect its columns and values.
ts_sw_df

In [None]:
# Map the German column headers to the English names used by the rest of
# the notebook; the frame is modified in place.
sw_column_names = {
    "Messstelle": "Station",
    "Messstellennummer": "Station ID",
    "Datum": "DateTime",
    "Einheit": "Unit",
    "Wert": "Value",
}
ts_sw_df.rename(columns=sw_column_names, inplace=True)

# Remove the columns that are not referenced again in this notebook section.
unused_sw_columns = [
    'Entnahmetiefe [m]',
    'Vorzeichen',
    'Bestimmungsgrenze',
    'Messmethode',
]
ts_sw_df.drop(columns=unused_sw_columns, inplace=True)

In [None]:
# Display the raw flow table to inspect its columns and values.
flow_df

In [None]:
# Map the German column headers to English names; the daily mean column
# becomes "Flow River". The frame is modified in place.
flow_column_names = {
    "Messstellennummer": "Station ID",
    "Datum": "DateTime",
    "Einheit": "Unit",
    "Tagesmittelwert": "Flow River",
}
flow_df.rename(columns=flow_column_names, inplace=True)

# Preprocess Data

## Ground Water Dataset

In [None]:
# Checklist of candidate variables, kept as a bare string; it is not read by
# any later code.
# NOTE(review): the "-" prefix presumably marks the variables available for
# the ground-water stations — confirm with the author.
"""Cumulated rainfall
-Environmental temperature
-Water temperature
-Conductivity
-Flow river
Turbidity
-Absorbance 254 nm
-Ammonium
Dissolved oxygen
-Nitrate
-pH
Redox potential"""

### Build Dataset per Station

In [None]:
# Ground-water parameters kept for the analysis: raw (German) parameter
# label -> English name used as column header downstream.
variables = {
    "Temperatur (Luft)": "Air Temperature",
    "Temperatur (Wasser)": "Water Temperature",
    "UV-Adsorption (254)": "Absorbance 254nm",
    "Leitfähigkeit 25°C vor Ort": "Conductivity",
    "Ammonium (N)": "Ammonium",
    "Nitrat (N)": "Nitrate",
    "pH-Wert (Feld)": "pH",
}

In [None]:
# Keep only the measurements of the selected parameters. The explicit copy
# makes ground_df an independent frame, so the column assignment below does
# not write into a slice of ts_gw_df (which triggers SettingWithCopyWarning).
ground_df = ts_gw_df[ts_gw_df['Parameter'].isin(list(variables))].copy()

# Replace the raw parameter labels with their English names.
ground_df['Parameter'] = ground_df['Parameter'].map(variables)

In [None]:
# List the distinct ground-water station IDs present in the filtered data.
ground_df['Station ID'].unique()

In [None]:
# Parse the sampling timestamps. assign() builds a new frame instead of
# writing a column into what may still be a slice of ts_gw_df.
# NOTE(review): if the raw dates are written day-first (e.g. "31.12.2015"),
# pass dayfirst=True or an explicit format to pd.to_datetime — confirm
# against the CSV.
ground_df = ground_df.assign(DateTime=pd.to_datetime(ground_df['DateTime']))

In [None]:
# One wide table per ground-water station: rows are sampling timestamps,
# columns are parameters, cells are the pivot_table aggregate (its default)
# of the measured values. Stations keep their order of first appearance.
stations_dict = {
    station_id: ground_df.loc[ground_df['Station ID'] == station_id].pivot_table(
        index=pd.Grouper(key='DateTime'),
        columns='Parameter',
        values='Value',
    )
    for station_id in ground_df['Station ID'].unique()
}

### Analyze Stations

In [None]:
# Empty summary table: one row per statistic, one column per
# (station, parameter) pair. It is filled in station by station below.
info_rows = pd.Index(
    ['N Samples', '% Missing Values', 'Frequency (days)', 'Start Date', 'End Date'],
    name='Info',
)
info_columns = pd.MultiIndex.from_product(
    [ground_df['Station ID'].unique(), variables.values()],
    names=['Station ID', 'Parameter'],
)
ground_info_df = pd.DataFrame(index=info_rows, columns=info_columns)

#### 5130 - Treptow-Köpenick

In [None]:
# Select the wide table of ground-water station 5130 for the cells below.
station_df = stations_dict[5130]

In [None]:
# Share of missing values per parameter, as a fraction of the rows.
station_df.isna().mean()

In [None]:
# Count how often each gap between consecutive sampling timestamps occurs.
sampling_gaps = station_df.index.to_series().diff()
sampling_gaps.value_counts()

In [None]:
# most of the time series have a frequency of 6 months

##### Time series

In [None]:
# Draw one interactive line chart per parameter of station 5130.
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 5130",
        labels={'Date': 'Date', column: column},
    )
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title=column,
        font={'size': 18},
    )
    fig.show()

##### Boxplots

In [None]:
# Yearly distribution of each parameter: one box per calendar year.
for column in station_df.columns:

    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 5130",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Store Info

In [None]:
# Store the summary statistics of station 5130 in ground_info_df.
station_id = 5130

# Most frequent gap between consecutive sampling timestamps. It depends only
# on the index, so it is computed once instead of once per column; NaN when
# the table has fewer than two rows.
gap_counts = station_df.index.to_series().diff().value_counts()
modal_gap_days = gap_counts.index[0].days if not gap_counts.empty else np.nan

for column in station_df.columns:
    valid = station_df[column].dropna()

    ground_info_df.loc['N Samples', (station_id, column)] = valid.shape[0]
    # Stored as a fraction of the rows (0-1), despite the row label.
    ground_info_df.loc['% Missing Values', (station_id, column)] = station_df[column].isna().mean()
    ground_info_df.loc['Frequency (days)', (station_id, column)] = modal_gap_days

    if valid.empty:
        # No valid sample: min()/max() would be NaT, which cannot be formatted.
        continue
    ground_info_df.loc['Start Date', (station_id, column)] = valid.index.min().strftime("%Y-%m-%d")
    ground_info_df.loc['End Date', (station_id, column)] = valid.index.max().strftime("%Y-%m-%d")

##### Trend

In [None]:
# Aggregate to one median value per month, then fill the empty months by
# interpolating along the time index.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of
# 'ME'; switch once the minimum supported pandas version allows it.
station_df = station_df.resample('M').median().interpolate(method='time')

In [None]:
# STL decomposition and linear trend test for every parameter of the
# station. station_df is expected to hold monthly values at this point.
for column in station_df.columns:

    series = station_df[column].dropna()
    if series.empty:
        # Nothing to decompose for a parameter without any valid value.
        continue

    # Start at the first sample whose calendar month equals the month of the
    # last sample, so the series begins and ends in the same month.
    final_month = series.index.max().month
    series = series.loc[series.index[series.index.month == final_month][0]:]

    # Seasonal-trend decomposition with a period of 12 samples.
    trend = STL(series, period=12).fit().trend

    # Ordinary least squares of the values against the sample position.
    # NOTE(review): the regression is fitted on the series itself, not on
    # the STL trend — confirm this is intended.
    X = sm.add_constant(np.arange(series.shape[0]))
    results = sm.OLS(series, X).fit()
    line = pd.Series(results.predict(X), index=series.index)

    # Read the coefficients by position through NumPy: integer lookups on a
    # label-indexed pandas Series are deprecated.
    slope = np.asarray(results.params)[1]
    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - Slope: {slope}")
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=series.index, y=series, mode='lines', name='Original')
    )
    fig.add_trace(
        go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend')
    )
    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode='lines',
            name='Linear Regression',
            line=dict(dash='dash', color='black'),
        )
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(size=18),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

#### 7285 - Steglitz-Zehlendorf

In [None]:
# Select the wide table of ground-water station 7285 for the cells below.
station_df = stations_dict[7285]

In [None]:
# Share of missing values per parameter, as a fraction of the rows.
station_df.isna().mean()

In [None]:
# Count how often each gap between consecutive sampling timestamps occurs.
sampling_gaps = station_df.index.to_series().diff()
sampling_gaps.value_counts()

In [None]:
# most of the time series have a frequency of 6 months

##### Time series

In [None]:
# Draw one interactive line chart per parameter of station 7285.
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 7285",
        labels={'DateTime': 'DateTime', column: column},
    )
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title=column,
        font={'size': 18},
    )
    fig.show()

##### Boxplots

In [None]:
# Yearly distribution of each parameter: one box per calendar year.
for column in station_df.columns:

    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 7285",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Store Info

In [None]:
# Store the summary statistics of station 7285 in ground_info_df.
station_id = 7285

# Most frequent gap between consecutive sampling timestamps. It depends only
# on the index, so it is computed once instead of once per column; NaN when
# the table has fewer than two rows.
gap_counts = station_df.index.to_series().diff().value_counts()
modal_gap_days = gap_counts.index[0].days if not gap_counts.empty else np.nan

for column in station_df.columns:
    valid = station_df[column].dropna()

    ground_info_df.loc['N Samples', (station_id, column)] = valid.shape[0]
    # Stored as a fraction of the rows (0-1), despite the row label.
    ground_info_df.loc['% Missing Values', (station_id, column)] = station_df[column].isna().mean()
    ground_info_df.loc['Frequency (days)', (station_id, column)] = modal_gap_days

    if valid.empty:
        # No valid sample: min()/max() would be NaT, which cannot be formatted.
        continue
    ground_info_df.loc['Start Date', (station_id, column)] = valid.index.min().strftime("%Y-%m-%d")
    ground_info_df.loc['End Date', (station_id, column)] = valid.index.max().strftime("%Y-%m-%d")

##### Trend

In [None]:
# Aggregate to one median value per month, then fill the empty months by
# interpolating along the time index.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of
# 'ME'; switch once the minimum supported pandas version allows it.
station_df = station_df.resample('M').median().interpolate(method='time')

In [None]:
# STL decomposition and linear trend test for every parameter of the
# station. station_df is expected to hold monthly values at this point.
for column in station_df.columns:

    series = station_df[column].dropna()
    if series.empty:
        # Nothing to decompose for a parameter without any valid value.
        continue

    # Start at the first sample whose calendar month equals the month of the
    # last sample, so the series begins and ends in the same month.
    final_month = series.index.max().month
    series = series.loc[series.index[series.index.month == final_month][0]:]

    # Seasonal-trend decomposition with a period of 12 samples.
    trend = STL(series, period=12).fit().trend

    # Ordinary least squares of the values against the sample position.
    # NOTE(review): the regression is fitted on the series itself, not on
    # the STL trend — confirm this is intended.
    X = sm.add_constant(np.arange(series.shape[0]))
    results = sm.OLS(series, X).fit()
    line = pd.Series(results.predict(X), index=series.index)

    # Read the coefficients by position through NumPy: integer lookups on a
    # label-indexed pandas Series are deprecated.
    slope = np.asarray(results.params)[1]
    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - Slope: {slope}")
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=series.index, y=series, mode='lines', name='Original')
    )
    fig.add_trace(
        go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend')
    )
    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode='lines',
            name='Linear Regression',
            line=dict(dash='dash', color='black'),
        )
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(size=18),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

### UVA254 vs Ammonium

In [None]:
# Scatter of absorbance (254 nm) against ammonium for every ground-water
# station, each with its own least-squares regression line.
colors = ['blue', 'red']

fig = go.Figure()

for position, (station_id, station_df) in enumerate(stations_dict.items()):

    # Keep only the samples where both parameters were measured.
    df = station_df[['Ammonium', 'Absorbance 254nm']].dropna()

    X = sm.add_constant(df['Ammonium'])
    y = df['Absorbance 254nm']
    results = sm.OLS(y, X).fit()

    line = pd.Series(results.predict(X), index=df.index)

    # Read the coefficients by position through NumPy: integer lookups on a
    # label-indexed pandas Series are deprecated.
    slope = np.asarray(results.params)[1]
    p_value = np.asarray(results.pvalues)[1]

    print(f"Station {station_id} - Slope: {slope}")
    print(f"Station {station_id} - P-value: {p_value}")

    # Walk the palette from its last entry backwards (first station 'red',
    # second 'blue') and wrap around, so that neither the list is consumed
    # nor a third station raises IndexError.
    color = colors[-1 - position % len(colors)]

    fig.add_trace(
        go.Scatter(
            x=df['Ammonium'],
            y=y,
            mode='markers',
            name=f"Station {station_id}",
            marker=dict(size=8, opacity=0.7, color=color),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=df['Ammonium'],
            y=line,
            mode='lines',
            name=f"Linear Regression Station {station_id}",
            line=dict(dash='dash', color=color),
        )
    )

fig.update_layout(
    xaxis_title='Ammonium',
    yaxis_title='Absorbance 254nm',
    font=dict(size=18),
    title='Ground Water',
)

fig.show()

## Surface Water Dataset

In [None]:
# Checklist of candidate variables, kept as a bare string; it is not read by
# any later code.
# NOTE(review): the "-" prefix presumably marks the variables available for
# the surface-water stations — confirm with the author.
"""
Cumulated rainfall
-Environmental temperature
-Water temperature
-Conductivity
-Flow river
Turbidity
-Absorbance 254 nm
-Ammonium
-Dissolved oxygen
-Nitrate
-pH
Redox potential
"""

### Build Dataset per Station

NB: there is only one station (305) for which there are measurements of flow close to it (5815911)

In [None]:
# Surface-water parameters currently kept for the analysis: raw (German)
# parameter label -> English name used as column header downstream.
variables = {
    "Lufttemperatur": "Air Temperature",
    "Wassertemperatur": "Water Temperature",
    "Spektraler Absorptionskoeffizient (SAK) 254nm": "Absorbance 254nm",
    "Leitfähigkeit": "Conductivity",
    "Ammonium-Stickstoff": "Ammonium",
    "Nitrat-Stickstoff": "Nitrate",
    "pH-Wert": "pH",
    "DOC (Gelöster organischer Kohlenstoff)": "DOC",
    "TOC (Organischer Kohlenstoff)": "TOC",
    "Sauerstoff-Gehalt": "Dissolved Oxygen",
}

In [None]:
# Keep only the measurements of the selected parameters. The explicit copy
# makes surface_df an independent frame, so the column assignment below does
# not write into a slice of ts_sw_df (which triggers SettingWithCopyWarning).
surface_df = ts_sw_df[ts_sw_df['Parameter'].isin(list(variables))].copy()

# Replace the raw parameter labels with their English names.
surface_df['Parameter'] = surface_df['Parameter'].map(variables)

In [None]:
# List the distinct surface-water station IDs present in the filtered data.
surface_df['Station ID'].unique()

In [None]:
# List the distinct surface-water station names present in the filtered data.
surface_df['Station'].unique()

In [None]:
# Parse the sampling timestamps. assign() builds a new frame instead of
# writing a column into what may still be a slice of ts_sw_df.
# NOTE(review): if the raw dates are written day-first (e.g. "31.12.2015"),
# pass dayfirst=True or an explicit format to pd.to_datetime — confirm
# against the CSV.
surface_df = surface_df.assign(DateTime=pd.to_datetime(surface_df['DateTime']))

In [None]:
# One wide table per surface-water station: rows are sampling timestamps,
# columns are parameters, cells are the pivot_table aggregate (its default)
# of the measured values. Stations keep their order of first appearance.
stations_dict = {
    station_id: surface_df.loc[surface_df['Station ID'] == station_id].pivot_table(
        index=pd.Grouper(key='DateTime'),
        columns='Parameter',
        values='Value',
    )
    for station_id in surface_df['Station ID'].unique()
}

### Analyze Stations

In [None]:
# Empty summary table: one row per statistic, one column per
# (station, parameter) pair. It is filled in station by station below.
info_rows = pd.Index(
    ['N Samples', '% Missing Values', 'Frequency (days)', 'Start Date', 'End Date'],
    name='Info',
)
info_columns = pd.MultiIndex.from_product(
    [surface_df['Station ID'].unique(), variables.values()],
    names=['Station ID', 'Parameter'],
)
surface_info_df = pd.DataFrame(index=info_rows, columns=info_columns)

#### 105 - Dämeritzsee-Seemitte

In [None]:
# Select the wide table of surface-water station 105 for the cells below.
station_df = stations_dict[105]

In [None]:
# Attach the daily river flow of gauge 5827101 to the quality samples of the
# station, matching on the calendar day.
flow_df['DateTime'] = pd.to_datetime(flow_df['DateTime'])

station_flow_df = flow_df.loc[
    flow_df['Station ID'] == 5827101, ['DateTime', 'Flow River']
].set_index('DateTime')

# Truncate both indexes to midnight so that samples and flow records of the
# same day line up. The quality table is copied first: assigning the index on
# the object held in stations_dict would alter that stored table and make
# this cell fail when it is run a second time.
station_flow_df.index = station_flow_df.index.normalize()
station_df = station_df.copy()
station_df.index = station_df.index.normalize()

# Left merge: every quality sample is kept, flow is NaN where no record of
# the same day exists.
station_df = station_df.merge(
    station_flow_df,
    left_index=True,
    right_index=True,
    how='left',
)

In [None]:
# Share of missing values per parameter, as a fraction of the rows.
station_df.isna().mean()

In [None]:
# Count how often each gap between consecutive sampling timestamps occurs.
sampling_gaps = station_df.index.to_series().diff()
sampling_gaps.value_counts()

In [None]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [None]:
# Draw one interactive line chart per parameter of station 105; the title
# reports the span of dates for which the parameter has values.
for column in station_df.columns:

    sampled_dates = station_df[column].dropna().index
    if sampled_dates.empty:
        # A column without any value (possible for the left-merged flow) has
        # no date range: min()/max() would be NaT and NaT.date() raises.
        continue
    first_date, last_date = sampled_dates.min(), sampled_dates.max()

    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 105 - Range: {first_date.date()} - {last_date.date()}",
        labels={'DateTime': 'DateTime', column: column},
    )

    fig.update_layout(
        xaxis_title='Date',
        yaxis_title=column,
        font=dict(size=18),
    )

    fig.show()

##### Boxplots

In [None]:
# Yearly distribution of each parameter: one box per calendar year.
for column in station_df.columns:

    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 105",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Store Info

In [None]:
# Store the summary statistics of station 105 in surface_info_df.
station_id = 105

# Most frequent gap between consecutive sampling timestamps. It depends only
# on the index, so it is computed once instead of once per column; NaN when
# the table has fewer than two rows.
gap_counts = station_df.index.to_series().diff().value_counts()
modal_gap_days = gap_counts.index[0].days if not gap_counts.empty else np.nan

for column in station_df.columns:
    valid = station_df[column].dropna()

    surface_info_df.loc['N Samples', (station_id, column)] = valid.shape[0]
    # Stored as a fraction of the rows (0-1), despite the row label.
    surface_info_df.loc['% Missing Values', (station_id, column)] = station_df[column].isna().mean()
    surface_info_df.loc['Frequency (days)', (station_id, column)] = modal_gap_days

    if valid.empty:
        # No valid sample (possible for the merged flow column): min()/max()
        # would be NaT, which cannot be formatted.
        continue
    surface_info_df.loc['Start Date', (station_id, column)] = valid.index.min().strftime("%Y-%m-%d")
    surface_info_df.loc['End Date', (station_id, column)] = valid.index.max().strftime("%Y-%m-%d")

In [None]:
# Blank out non-positive DOC values so they are treated as missing.
station_df['DOC'] = station_df['DOC'].mask(station_df['DOC'] <= 0)

##### Trends

In [None]:
# Aggregate to one median value per month, then fill the empty months by
# interpolating along the time index.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of
# 'ME'; switch once the minimum supported pandas version allows it.
station_df = station_df.resample('M').median().interpolate(method='time')

In [None]:
# STL decomposition and linear trend test for every parameter of the
# station. station_df is expected to hold monthly values at this point.
for column in station_df.columns:

    series = station_df[column].dropna()
    if series.empty:
        # Nothing to decompose for a parameter without any valid value.
        continue

    # Start at the first sample whose calendar month equals the month of the
    # last sample, so the series begins and ends in the same month.
    final_month = series.index.max().month
    series = series.loc[series.index[series.index.month == final_month][0]:]

    # Seasonal-trend decomposition with a period of 12 samples.
    trend = STL(series, period=12).fit().trend

    # Ordinary least squares of the values against the sample position.
    # NOTE(review): the regression is fitted on the series itself, not on
    # the STL trend — confirm this is intended.
    X = sm.add_constant(np.arange(series.shape[0]))
    results = sm.OLS(series, X).fit()
    line = pd.Series(results.predict(X), index=series.index)

    # Read the coefficients by position through NumPy: integer lookups on a
    # label-indexed pandas Series are deprecated.
    slope = np.asarray(results.params)[1]
    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - Slope: {slope}")
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=series.index, y=series, mode='lines', name='Original')
    )
    fig.add_trace(
        go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend')
    )
    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode='lines',
            name='Linear Regression',
            line=dict(dash='dash', color='black'),
        )
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(size=18),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

#### 305 - Oberhavel-Konradshöhe

In [None]:
# Select the wide table of surface-water station 305 for the cells below.
station_df = stations_dict[305]

In [None]:
# Attach the daily river flow of gauge 5815911 to the quality samples of the
# station, matching on the calendar day.
flow_df['DateTime'] = pd.to_datetime(flow_df['DateTime'])

station_flow_df = flow_df.loc[
    flow_df['Station ID'] == 5815911, ['DateTime', 'Flow River']
].set_index('DateTime')

# Truncate both indexes to midnight so that samples and flow records of the
# same day line up. The quality table is copied first: assigning the index on
# the object held in stations_dict would alter that stored table and make
# this cell fail when it is run a second time.
station_flow_df.index = station_flow_df.index.normalize()
station_df = station_df.copy()
station_df.index = station_df.index.normalize()

# Left merge: every quality sample is kept, flow is NaN where no record of
# the same day exists.
station_df = station_df.merge(
    station_flow_df,
    left_index=True,
    right_index=True,
    how='left',
)

In [None]:
# Share of missing values per parameter, as a fraction of the rows.
station_df.isna().mean()

In [None]:
# Count how often each gap between consecutive sampling timestamps occurs.
sampling_gaps = station_df.index.to_series().diff()
sampling_gaps.value_counts()

In [None]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [None]:
# Draw one interactive line chart per parameter of station 305; the title
# reports the span of dates for which the parameter has values.
for column in station_df.columns:

    sampled_dates = station_df[column].dropna().index
    if sampled_dates.empty:
        # A column without any value (possible for the left-merged flow) has
        # no date range: min()/max() would be NaT and NaT.date() raises.
        continue
    first_date, last_date = sampled_dates.min(), sampled_dates.max()

    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 305 - Range: {first_date.date()} - {last_date.date()}",
        labels={'DateTime': 'DateTime', column: column},
    )

    fig.update_layout(
        xaxis_title='Date',
        yaxis_title=column,
        font=dict(size=18),
    )

    fig.show()

##### Boxplots

In [None]:
# Yearly distribution of each parameter: one box per calendar year.
for column in station_df.columns:

    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 305",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Store Info

In [None]:
# Store the summary statistics of station 305 in surface_info_df.
station_id = 305

# Most frequent gap between consecutive sampling timestamps. It depends only
# on the index, so it is computed once instead of once per column; NaN when
# the table has fewer than two rows.
gap_counts = station_df.index.to_series().diff().value_counts()
modal_gap_days = gap_counts.index[0].days if not gap_counts.empty else np.nan

for column in station_df.columns:
    valid = station_df[column].dropna()

    surface_info_df.loc['N Samples', (station_id, column)] = valid.shape[0]
    # Stored as a fraction of the rows (0-1), despite the row label.
    surface_info_df.loc['% Missing Values', (station_id, column)] = station_df[column].isna().mean()
    surface_info_df.loc['Frequency (days)', (station_id, column)] = modal_gap_days

    if valid.empty:
        # No valid sample (possible for the merged flow column): min()/max()
        # would be NaT, which cannot be formatted.
        continue
    surface_info_df.loc['Start Date', (station_id, column)] = valid.index.min().strftime("%Y-%m-%d")
    surface_info_df.loc['End Date', (station_id, column)] = valid.index.max().strftime("%Y-%m-%d")

In [None]:
# Blank out DOC values above 17 so they are treated as missing.
station_df['DOC'] = station_df['DOC'].mask(station_df['DOC'] > 17)

##### Trend

In [None]:
# Aggregate to one median value per month, then fill the empty months by
# interpolating along the time index.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of
# 'ME'; switch once the minimum supported pandas version allows it.
station_df = station_df.resample('M').median().interpolate(method='time')

In [None]:
# STL decomposition and linear trend test for every parameter of the
# station. station_df is expected to hold monthly values at this point.
for column in station_df.columns:

    series = station_df[column].dropna()
    if series.empty:
        # Nothing to decompose for a parameter without any valid value.
        continue

    # Start at the first sample whose calendar month equals the month of the
    # last sample, so the series begins and ends in the same month.
    final_month = series.index.max().month
    series = series.loc[series.index[series.index.month == final_month][0]:]

    # Seasonal-trend decomposition with a period of 12 samples.
    trend = STL(series, period=12).fit().trend

    # Ordinary least squares of the values against the sample position.
    # NOTE(review): the regression is fitted on the series itself, not on
    # the STL trend — confirm this is intended.
    X = sm.add_constant(np.arange(series.shape[0]))
    results = sm.OLS(series, X).fit()
    line = pd.Series(results.predict(X), index=series.index)

    # Read the coefficients by position through NumPy: integer lookups on a
    # label-indexed pandas Series are deprecated.
    slope = np.asarray(results.params)[1]
    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - Slope: {slope}")
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=series.index, y=series, mode='lines', name='Original')
    )
    fig.add_trace(
        go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend')
    )
    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode='lines',
            name='Linear Regression',
            line=dict(dash='dash', color='black'),
        )
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(size=18),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

#### 325 - Havel-Pichelsdorfer Gemünd

In [None]:
# Select the wide table of surface-water station 325 for the cells below.
station_df = stations_dict[325]

In [None]:
# Attach the daily river flow of gauge 5803200 to the quality samples of the
# station, matching on the calendar day.
flow_df['DateTime'] = pd.to_datetime(flow_df['DateTime'])

station_flow_df = flow_df.loc[
    flow_df['Station ID'] == 5803200, ['DateTime', 'Flow River']
].set_index('DateTime')

# Truncate both indexes to midnight so that samples and flow records of the
# same day line up. The quality table is copied first: assigning the index on
# the object held in stations_dict would alter that stored table and make
# this cell fail when it is run a second time.
station_flow_df.index = station_flow_df.index.normalize()
station_df = station_df.copy()
station_df.index = station_df.index.normalize()

# Left merge: every quality sample is kept, flow is NaN where no record of
# the same day exists.
station_df = station_df.merge(
    station_flow_df,
    left_index=True,
    right_index=True,
    how='left',
)

In [None]:
# Share of missing values per parameter, as a fraction of the rows.
station_df.isna().mean()

In [None]:
# Count how often each gap between consecutive sampling timestamps occurs.
sampling_gaps = station_df.index.to_series().diff()
sampling_gaps.value_counts()

In [None]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [None]:
# Draw one interactive line chart per parameter of station 325.
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 325",
        labels={'DateTime': 'DateTime', column: column},
    )
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title=column,
        font={'size': 18},
    )
    fig.show()

##### Boxplots

In [None]:
# Yearly distribution of each parameter: one box per calendar year.
for column in station_df.columns:

    column_df = station_df[column]
    sample_years = column_df.index.year

    fig = go.Figure(
        data=[
            go.Box(y=column_df[sample_years == year], name=year)
            for year in sample_years.unique()
        ]
    )
    fig.update_layout(
        title=f"{column} at station 325",
        xaxis_title="Year",
        yaxis_title=column,
    )

    fig.show()

##### Store Info

In [None]:
# Store the summary statistics of station 325 in surface_info_df.
station_id = 325

# Most frequent gap between consecutive sampling timestamps. It depends only
# on the index, so it is computed once instead of once per column; NaN when
# the table has fewer than two rows.
gap_counts = station_df.index.to_series().diff().value_counts()
modal_gap_days = gap_counts.index[0].days if not gap_counts.empty else np.nan

for column in station_df.columns:
    valid = station_df[column].dropna()

    surface_info_df.loc['N Samples', (station_id, column)] = valid.shape[0]
    # Stored as a fraction of the rows (0-1), despite the row label.
    surface_info_df.loc['% Missing Values', (station_id, column)] = station_df[column].isna().mean()
    surface_info_df.loc['Frequency (days)', (station_id, column)] = modal_gap_days

    if valid.empty:
        # No valid sample (possible for the merged flow column): min()/max()
        # would be NaT, which cannot be formatted.
        continue
    surface_info_df.loc['Start Date', (station_id, column)] = valid.index.min().strftime("%Y-%m-%d")
    surface_info_df.loc['End Date', (station_id, column)] = valid.index.max().strftime("%Y-%m-%d")

In [None]:
# Keep DOC only inside the closed range [4.5, 15]; values outside it are
# blanked out so they are treated as missing.
station_df['DOC'] = station_df['DOC'].where(station_df['DOC'].between(4.5, 15))

##### Trend

In [None]:
# Aggregate to one median value per month, then fill the empty months by
# interpolating along the time index.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of
# 'ME'; switch once the minimum supported pandas version allows it.
station_df = station_df.resample('M').median().interpolate(method='time')

In [None]:
# STL decomposition and linear trend test for every parameter of the
# station. station_df is expected to hold monthly values at this point.
for column in station_df.columns:

    series = station_df[column].dropna()
    if series.empty:
        # Nothing to decompose for a parameter without any valid value.
        continue

    # Start at the first sample whose calendar month equals the month of the
    # last sample, so the series begins and ends in the same month.
    final_month = series.index.max().month
    series = series.loc[series.index[series.index.month == final_month][0]:]

    # Seasonal-trend decomposition with a period of 12 samples.
    trend = STL(series, period=12).fit().trend

    # Ordinary least squares of the values against the sample position.
    # NOTE(review): the regression is fitted on the series itself, not on
    # the STL trend — confirm this is intended.
    X = sm.add_constant(np.arange(series.shape[0]))
    results = sm.OLS(series, X).fit()
    line = pd.Series(results.predict(X), index=series.index)

    # Read the coefficients by position through NumPy: integer lookups on a
    # label-indexed pandas Series are deprecated.
    slope = np.asarray(results.params)[1]
    p_value = np.asarray(results.pvalues)[1]
    print(f"{column} - Slope: {slope}")
    print(f"{column} - P-value: {p_value}")

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=series.index, y=series, mode='lines', name='Original')
    )
    fig.add_trace(
        go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend')
    )
    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode='lines',
            name='Linear Regression',
            line=dict(dash='dash', color='black'),
        )
    )
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(size=18),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()

### Store Results

In [None]:
# %%script false --no-raise-error
# NOTE(review): the commented-out cell magic above was presumably used to
# skip this export; uncomment it to do so.

# Export both summary tables as Excel workbooks. The target folder is
# created first so that the export does not fail on a fresh checkout.
os.makedirs(data_info_folder, exist_ok=True)
surface_info_df.to_excel(os.path.join(data_info_folder, "surface_water_info.xlsx"))
ground_info_df.to_excel(os.path.join(data_info_folder, "ground_water_info.xlsx"))

### DOC vs TOC per station

In [None]:
# Relate TOC to DOC at every surface-water station: scatter of the samples
# plus the least-squares line and its equation.
for station_id, station_df in stations_dict.items():

    # Let plotly express fit the OLS trendline; this figure is only used to
    # obtain the fit results and is never shown.
    fit_fig = px.scatter(
        station_df,
        x='DOC',
        y='TOC',
        trendline='ols',
        trendline_color_override='red',
        trendline_scope='overall',
    )
    results = px.get_trendline_results(fit_fig)
    fit_params = np.asarray(results.iloc[0]['px_fit_results'].params)
    intercept, slope = fit_params[0], fit_params[1]

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=station_df['DOC'],
            y=station_df['TOC'],
            mode='markers',
            name='Data',
            marker=dict(size=8, color='blue', opacity=0.7),
        )
    )

    # Equation of the fitted line, placed in the lower-right corner.
    fig.add_annotation(
        x=0.9,
        y=0.1,
        xref='paper',
        yref='paper',
        text=f"y = {slope:.2f}x + {intercept:.2f}",
        showarrow=False,
        font=dict(size=18, color='red'),
    )

    # The fitted line is drawn over the fixed DOC range 2-14.
    x = np.linspace(2, 14, 100)

    fig.add_trace(
        go.Scatter(
            x=x,
            y=slope * x + intercept,
            mode='lines',
            name="Linear Regression",
            line=dict(color='red', width=2),
        )
    )

    # Station 105 gets its legend in the top-left corner, every other
    # station in the top-right corner.
    if station_id == 105:
        legend_anchor, legend_x = "left", 0.01
    else:
        legend_anchor, legend_x = "right", 0.99

    fig.update_layout(
        xaxis_title='DOC',
        yaxis_title='TOC',
        font=dict(size=18),
        title=f"DOC vs TOC at station {station_id}",
        legend=dict(yanchor="top", y=0.99, xanchor=legend_anchor, x=legend_x),
    )

    # No width/height is passed to show(): renderers that honour them read
    # them as pixels, so the former 20x10 request produced an unusable image.
    fig.show()

### DOC vs Ammonium

In [None]:
# Scatter DOC against Ammonium for every surface-water station on one figure,
# together with a per-station OLS regression line (DOC ~ const + Ammonium).
colors = ['blue', 'red', 'green']

fig = go.Figure()

for position, (station_id, station_df) in enumerate(stations_dict.items()):

    # NOTE: this replaces the index of the frame stored in stations_dict
    # itself, so the conversion is visible to later cells as well.
    station_df.index = pd.to_datetime(station_df.index)

    # Monthly medians of the two variables of interest; gaps between
    # observations are filled by time-weighted interpolation. Selecting the
    # columns first keeps unrelated columns out of the aggregation.
    # ('M' is kept for compatibility; recent pandas prefers the alias 'ME'.)
    df = station_df[['Ammonium', 'DOC']].resample('M').median()
    df = df.interpolate(method='time')

    # rows that are still incomplete (e.g. before the first observation)
    df.dropna(inplace=True)

    # a regression line needs at least two points
    if len(df) < 2:
        print(f"Station {station_id} - not enough data for a regression, skipped")
        continue

    # compute linear regression and plot the line
    X = sm.add_constant(df['Ammonium'])
    y = df['DOC']

    results = sm.OLS(y, X).fit()

    line = pd.Series(results.predict(X), index=df.index)

    # params/pvalues are Series labelled by regressor name; label access
    # avoids the deprecated positional `series[1]` lookup.
    slope = results.params['Ammonium']
    p_value = results.pvalues['Ammonium']

    print(f"Station {station_id} - Slope: {slope}")
    print(f"Station {station_id} - P-value: {p_value}")

    # Walk the palette from its last entry backwards (same order the colours
    # were handed out before) and wrap around instead of failing when there
    # are more stations than colours.
    color = colors[-1 - position % len(colors)]

    fig.add_trace(
        go.Scatter(
            x=df['Ammonium'],
            y=df['DOC'],
            mode='markers',
            name=f"Station {station_id}",
            marker=dict(
                size=8,
                opacity=0.7,
                color=color
            )
        )
    )

    fig.add_trace(
        go.Scatter(
            x=df['Ammonium'],
            y=line,
            mode='lines',
            name=f"Linear Regression Station {station_id}",
            line=dict(
                dash='dash',
                color=color
            )
        )
    )


fig.update_layout(
    xaxis_title='Ammonium',
    yaxis_title='DOC',
    font=dict(
        size=18,
    ),
    title='Surface Water',
    # legend=dict(
    #     yanchor="top",
    #     y=0.99,
    #     xanchor="right",
    #     x=0.99
    # )
)

fig.show()

# UVA254 Raw Analysis

In [None]:
# Keep only the rows of the ground-water table whose parameter is the
# UV (254) measurement, as an independent copy.
is_uva254 = ts_gw_df["Parameter"] == "UV-Adsorption (254)"
ts_uva254_df = ts_gw_df[is_uva254].copy()

In [None]:
# Display the filtered UV (254) table
ts_uva254_df

In [None]:
# Parse the YYYY-MM-DD date strings; values that do not match become NaT.
parsed_dates = pd.to_datetime(
    ts_uva254_df["DateTime"],
    errors="coerce",
    format="%Y-%m-%d",
)
ts_uva254_df["DateTime"] = parsed_dates

In [None]:
# Derive calendar year and month columns from the timestamp
timestamps = ts_uva254_df['DateTime'].dt
ts_uva254_df['Year'] = timestamps.year
ts_uva254_df['Month'] = timestamps.month

In [None]:
# Number of UV (254) rows per station
station_ids = ts_uva254_df['Station ID']
counts = station_ids.value_counts()

In [None]:
# UV (254) time series, one line per station.
# The time column of this table is "DateTime" (it is the column parsed and
# used everywhere else in this section); the previous x="Date" did not match it.
fig = px.line(
    ts_uva254_df,
    x="DateTime",
    y="Value",
    color="Station ID",
)

fig.update_layout(
    title={
        "text": "UV-Adsorption (254)",
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis_title="Date",
    yaxis_title="Value",
)

fig.show()

### Station 7285

In [None]:
# Independent copy of the UV (254) rows that belong to station 7285
is_station_7285 = ts_uva254_df['Station ID'] == 7285
station_7285_df = ts_uva254_df[is_station_7285].copy()

In [None]:
def _season_from_month(month):
    """Return the season label for a month number.

    Months 12, 1, 2 map to "Winter", 3-5 to "Spring", 6-8 to "Summer";
    every other value falls through to "Autumn".
    """
    if month in (12, 1, 2):
        return "Winter"
    if month in (3, 4, 5):
        return "Spring"
    if month in (6, 7, 8):
        return "Summer"
    return "Autumn"


# Label every measurement of station 7285 with its season
station_7285_df["Season"] = station_7285_df["Month"].apply(_season_from_month)

In [None]:
# plot station 7285 with seasons as hue
# Line chart of the station 7285 values, one coloured line per season
season_plot_title = dict(
    text="UV-Adsorption (254) at station 7285",
    x=0.5,
    xanchor="center",
)

fig = px.line(station_7285_df, x="DateTime", y="Value", color="Season")
fig.update_layout(
    title=season_plot_title,
    xaxis_title="DateTime",
    yaxis_title="Value",
)
fig.show()

In [None]:
# Mean value and number of measurements per year at station 7285
yearly_stats = station_7285_df.groupby(["Year"]).agg({"Value": ["mean", "count"]})
mean_station_7285_df = yearly_stats.reset_index().copy()

In [None]:
# Display the aggregated statistics for station 7285
mean_station_7285_df

In [None]:
# Mean value and number of measurements per season at station 7285
# (reuses the name of the yearly table)
seasonal_stats = station_7285_df.groupby(["Season"]).agg({"Value": ["mean", "count"]})
mean_station_7285_df = seasonal_stats.reset_index().copy()

In [None]:
# Display the aggregated statistics for station 7285
mean_station_7285_df

In [None]:
# Most of the measurements fall in autumn and spring, i.e. sampling is roughly semi-annual.

In [None]:
# Two-column series (timestamp, value) for station 7285
ts = station_7285_df[['DateTime', 'Value']].copy()

# Additive decomposition with a period of two observations.
# The frame only has the columns 'DateTime' and 'Value', so the index must be
# built from 'DateTime' (set_index('Date') raised a KeyError).
# NOTE(review): seasonal_decompose does not accept missing values - confirm
# that 'Value' has no NaN for this station.
result_7285 = smt.seasonal_decompose(
    ts.set_index('DateTime'), model="additive", period=2
)

In [None]:
# Compare the trend component of the decomposition with the raw series
fig = go.Figure()

# trend component returned by seasonal_decompose (period=2)
fig.add_trace(
    go.Scatter(
        x=result_7285.trend.index,
        y=result_7285.trend,
        mode="lines+markers",
        name="MA period=2",
        line=dict(color="blue"),
    )
)

# Raw measurements; the time column of `ts` is 'DateTime' (there is no
# 'Date' column, so ts['Date'] raised a KeyError).
fig.add_trace(
    go.Scatter(
        x=ts['DateTime'],
        y=ts['Value'],
        mode="lines+markers",
        name="Original",
        line=dict(color="red"),
    )
)

fig.show()

### Station 5130