In [60]:
import os

import pandas as pd
import numpy as np

import seaborn as sns
import plotly.graph_objects as go

import statsmodels.tsa.seasonal as smt
import statsmodels.api as sm

from sklearn.preprocessing import MinMaxScaler

from plotly.subplots import make_subplots

from prophet import Prophet

from sklearn.metrics import (
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)

from scipy.stats import pearsonr

In [61]:
data_folder = os.path.join("..", "..", "data", "berlin")

clean_data_folder = os.path.join(data_folder, "clean_data")
projections_folder = os.path.join(data_folder, "projections")

my_projections_folder = os.path.join(projections_folder, "found_by_me")

# Load Clean Data

In [62]:
surface_df = pd.read_excel(
    os.path.join(clean_data_folder, "surface.xlsx")
)
ground_df = pd.read_excel(
    os.path.join(clean_data_folder, "ground.xlsx")
)

flow_river_projections = pd.read_excel(
    os.path.join(my_projections_folder, "flow_river.xlsx")
)

air_temp_projections = pd.read_excel(
    os.path.join(my_projections_folder, "air_temp.xlsx")
)

precip_projections = pd.read_excel(
    os.path.join(my_projections_folder, "precip.xlsx")
)

In [63]:
surface_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1521 entries, 0 to 1520
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   DateTime                       1521 non-null   datetime64[ns]
 1   Air Temperature (°C)           1521 non-null   float64       
 2   Ammonium (mg/l)                1521 non-null   float64       
 3   BOD (mg/l)                     875 non-null    float64       
 4   Coliform Bacteria (MPN/100ml)  1521 non-null   float64       
 5   Conductivity (µS/cm)           1521 non-null   float64       
 6   DOC (mg/l)                     1521 non-null   float64       
 7   Dissolved Oxygen (mg/l)        1521 non-null   float64       
 8   E.Coli (MPN/100ml)             1521 non-null   float64       
 9   Enterococcus (MPN/100ml)       754 non-null    float64       
 10  Nitrate (mg/l)                 1521 non-null   float64       
 11  TOC (mg/l)       

In [64]:
ground_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   DateTime                 594 non-null    datetime64[ns]
 1   Air Temperature (°C)     594 non-null    float64       
 2   Ammonium (mg/l)          594 non-null    float64       
 3   Conductivity (µS/cm)     594 non-null    float64       
 4   Nitrate (mg/l)           258 non-null    float64       
 5   UVA254 (1/m)             594 non-null    float64       
 6   Water Temperature (°C)   594 non-null    float64       
 7   pH                       594 non-null    float64       
 8   Station                  594 non-null    int64         
 9   Cumulated Rainfall (mm)  594 non-null    float64       
dtypes: datetime64[ns](1), float64(8), int64(1)
memory usage: 46.5 KB


# Bacteria Inspection

In [None]:
bacteria_info_df = pd.DataFrame(
    index=pd.Index(["Min Value", "Max Value"], name="Info"),
    columns=pd.MultiIndex.from_product(
        [list(stations_dict.keys()), bacteria_columns],
        names=["Station ID", "Parameter"],
    ),
)

In [None]:
### Bacteria Inspection

fig = go.Figure()

for column in bacteria_columns:
    # Compute % of 0s, min and max for every year.
    min_values = np.array([])
    max_values = np.array([])

    for year in station_df.index.year.unique():
        year_df = station_df[station_df.index.year == year]

        min_value = year_df[column].min()
        max_value = year_df[column].max()

        min_values = np.append(min_values, min_value)
        max_values = np.append(max_values, max_value)

        zero_percentage = (
            year_df[year_df[column] == 0].shape[0]
            / year_df.shape[0]
            * 100
        )

        print(f"{column} - Year: % of 0s: {zero_percentage}")

    # create scatter with min and max values
    fig.add_trace(
        go.Scatter(
            x=station_df.index.year.unique(),
            y=min_values,
            mode="lines+markers",
            name=column + " Min Values",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=station_df.index.year.unique(),
            y=max_values,
            mode="lines+markers",
            name=column + " Max Values",
        )
    )

    min_values = min_values[np.isfinite(min_values)]
    max_values = max_values[np.isfinite(max_values)]

    min_mean = np.mean(min_values)
    min_std = np.std(min_values)

    max_mean = np.mean(max_values)
    max_std = np.std(max_values)

    bacteria_info_df.loc[
        "Min Value", (105, column)
    ] = f"{min_mean:.2f} ± {min_std:.2f}"
    bacteria_info_df.loc[
        "Max Value", (105, column)
    ] = f"{max_mean:.2f} ± {max_std:.2f}"

fig.update_layout(
    title="Min and Max values for Station 105",
    xaxis_title="Year",
    yaxis_title="MPN/100ml",
    font=dict(
        size=18,
    ),
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

fig.write_html(
    os.path.join(
        data_folder,
        "plots",
        "min_max_surface_bacteria",
        f"station105.html",
    )
)

fig.show()

# Trend Analysis

In [65]:
diff_columns = np.array(["DateTime", "Station"])

## Surface

In [66]:
for station_id in surface_df["Station"].unique():
    station_df = surface_df[surface_df["Station"] == station_id].copy()

    for column in station_df.columns.difference(diff_columns):
        df = station_df[["DateTime", column]].copy()

        df.set_index("DateTime", inplace=True)

        df.dropna(inplace=True)

        date_range = df.index
        date_range = date_range.min(), date_range.max()

        # make sure that the dataframe starts and finishes in the same month
        start_index = df[df.index.month == date_range[1].month].index[0]

        # Slice the dataframe to start from the found index
        df = df.loc[start_index:]

        # ===== Prophet =====

        df.index.name = "ds"

        df = df.reset_index()

        df.rename(columns={column: "y"}, inplace=True)

        # using prophet

        model = Prophet()
        model.fit(df)
        # Make predictions for both columns
        future = model.make_future_dataframe(periods=0)
        forecast = model.predict(future)

        # Merging forecasted data with your original data
        forecasting_final = pd.merge(
            forecast,
            df,
            how="inner",
            on="ds",
        )

        # compute linear regression on trend
        X = np.arange(df.shape[0])
        X = sm.add_constant(X)
        y = df["y"].copy()

        model = sm.OLS(y, X)
        results = model.fit()

        # plot the line of the linear regression
        line = pd.Series(results.predict(X), index=df['ds'])

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=df['ds'],
                y=df["y"],
                mode="lines",
                name="Original",
            )
        )

        fig.add_trace(
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["trend"],
                mode="lines",
                name="Trend",
            )
        )

        slope = results.params.iloc[1]
        print(f"{column} - Slope: {slope}")

        p_value = results.pvalues.iloc[1]
        print(f"{column} - P-value: {p_value}")

        fig.add_trace(
            go.Scatter(
                x=line.index,
                y=line,
                mode="lines",
                name=f"Linear Regression",
                line=dict(dash="dash", color="black"),
            ),
        )

        start_date = df['ds'].min()
        end_date = df['ds'].max()

        fig.update_layout(
            xaxis_title="Date",
            yaxis_title=column,
            font=dict(
                size=18,
            ),
            title=f"{station_id} - {column} - {start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')}",
        )

        fig.show()

18:26:27 - cmdstanpy - INFO - Chain [1] start processing
18:26:27 - cmdstanpy - INFO - Chain [1] done processing


Air Temperature (°C) - Slope: 0.010388430442687066
Air Temperature (°C) - P-value: 0.002433672823393708


18:26:27 - cmdstanpy - INFO - Chain [1] start processing
18:26:27 - cmdstanpy - INFO - Chain [1] done processing


Ammonium (mg/l) - Slope: -0.00011507691916459575
Ammonium (mg/l) - P-value: 7.813091778464309e-07


18:26:27 - cmdstanpy - INFO - Chain [1] start processing
18:26:27 - cmdstanpy - INFO - Chain [1] done processing


BOD (mg/l) - Slope: -0.0013787111049296482
BOD (mg/l) - P-value: 0.21632130516202094


18:26:27 - cmdstanpy - INFO - Chain [1] start processing
18:26:27 - cmdstanpy - INFO - Chain [1] done processing


Coliform Bacteria (MPN/100ml) - Slope: -0.44389538276409585
Coliform Bacteria (MPN/100ml) - P-value: 0.0005991211312683302


18:26:27 - cmdstanpy - INFO - Chain [1] start processing
18:26:27 - cmdstanpy - INFO - Chain [1] done processing


Conductivity (µS/cm) - Slope: 0.47246448090225807
Conductivity (µS/cm) - P-value: 1.660211195060166e-70


18:26:27 - cmdstanpy - INFO - Chain [1] start processing
18:26:27 - cmdstanpy - INFO - Chain [1] done processing


Cumulated Rainfall (mm) - Slope: -0.00013906610657911733
Cumulated Rainfall (mm) - P-value: 0.4351001329567995


18:26:27 - cmdstanpy - INFO - Chain [1] start processing
18:26:27 - cmdstanpy - INFO - Chain [1] done processing


DOC (mg/l) - Slope: -0.00074772559240662
DOC (mg/l) - P-value: 0.05949461136014941


18:26:28 - cmdstanpy - INFO - Chain [1] start processing
18:26:28 - cmdstanpy - INFO - Chain [1] done processing


Dissolved Oxygen (mg/l) - Slope: -0.0026345527490950903
Dissolved Oxygen (mg/l) - P-value: 0.003770761157600995


18:26:28 - cmdstanpy - INFO - Chain [1] start processing
18:26:28 - cmdstanpy - INFO - Chain [1] done processing


E.Coli (MPN/100ml) - Slope: -0.7203677574378157
E.Coli (MPN/100ml) - P-value: 1.2827159166693552e-19


18:26:28 - cmdstanpy - INFO - Chain [1] start processing
18:26:28 - cmdstanpy - INFO - Chain [1] done processing


Enterococcus (MPN/100ml) - Slope: -0.17122619826997812
Enterococcus (MPN/100ml) - P-value: 4.8938307435926795e-54


18:26:28 - cmdstanpy - INFO - Chain [1] start processing
18:26:28 - cmdstanpy - INFO - Chain [1] done processing


Flow River Rate (m³/s) - Slope: -0.007542668617042322
Flow River Rate (m³/s) - P-value: 0.39531158970801494


18:26:28 - cmdstanpy - INFO - Chain [1] start processing
18:26:28 - cmdstanpy - INFO - Chain [1] done processing


Nitrate (mg/l) - Slope: -0.0019251909851896423
Nitrate (mg/l) - P-value: 8.090324038030192e-13


18:26:28 - cmdstanpy - INFO - Chain [1] start processing
18:26:28 - cmdstanpy - INFO - Chain [1] done processing


TOC (mg/l) - Slope: 0.001568500037749012
TOC (mg/l) - P-value: 0.028488141849711195


18:26:28 - cmdstanpy - INFO - Chain [1] start processing
18:26:28 - cmdstanpy - INFO - Chain [1] done processing


Water Temperature (°C) - Slope: 0.006172719163606798
Water Temperature (°C) - P-value: 0.0633087230144728


18:26:29 - cmdstanpy - INFO - Chain [1] start processing
18:26:29 - cmdstanpy - INFO - Chain [1] done processing


pH - Slope: -0.0004597400088549388
pH - P-value: 0.0005664528286910426


18:26:29 - cmdstanpy - INFO - Chain [1] start processing
18:26:29 - cmdstanpy - INFO - Chain [1] done processing


Air Temperature (°C) - Slope: 0.0016814115046457723
Air Temperature (°C) - P-value: 0.41877503327327914


18:26:29 - cmdstanpy - INFO - Chain [1] start processing
18:26:29 - cmdstanpy - INFO - Chain [1] done processing


Ammonium (mg/l) - Slope: -0.0004649191426428239
Ammonium (mg/l) - P-value: 3.393512254669159e-29


18:26:29 - cmdstanpy - INFO - Chain [1] start processing
18:26:29 - cmdstanpy - INFO - Chain [1] done processing


BOD (mg/l) - Slope: -0.005006804268309002
BOD (mg/l) - P-value: 5.744090822984654e-17


18:26:29 - cmdstanpy - INFO - Chain [1] start processing
18:26:29 - cmdstanpy - INFO - Chain [1] done processing


Coliform Bacteria (MPN/100ml) - Slope: -1.6645264645015487
Coliform Bacteria (MPN/100ml) - P-value: 1.056210828398055e-30


18:26:30 - cmdstanpy - INFO - Chain [1] start processing
18:26:30 - cmdstanpy - INFO - Chain [1] done processing


Conductivity (µS/cm) - Slope: 0.024941042349968995
Conductivity (µS/cm) - P-value: 0.06656053369030164


18:26:30 - cmdstanpy - INFO - Chain [1] start processing
18:26:30 - cmdstanpy - INFO - Chain [1] done processing


Cumulated Rainfall (mm) - Slope: -0.0001428630357821253
Cumulated Rainfall (mm) - P-value: 0.14001398694196676


18:26:30 - cmdstanpy - INFO - Chain [1] start processing
18:26:30 - cmdstanpy - INFO - Chain [1] done processing


DOC (mg/l) - Slope: 0.001327527406964717
DOC (mg/l) - P-value: 2.3527427875949215e-06


18:26:30 - cmdstanpy - INFO - Chain [1] start processing
18:26:30 - cmdstanpy - INFO - Chain [1] done processing


Dissolved Oxygen (mg/l) - Slope: -0.002588972488691719
Dissolved Oxygen (mg/l) - P-value: 0.00010891803173775114


18:26:31 - cmdstanpy - INFO - Chain [1] start processing
18:26:31 - cmdstanpy - INFO - Chain [1] done processing


E.Coli (MPN/100ml) - Slope: -1.4894228837337196
E.Coli (MPN/100ml) - P-value: 2.960782315080725e-32


18:26:31 - cmdstanpy - INFO - Chain [1] start processing
18:26:31 - cmdstanpy - INFO - Chain [1] done processing


Enterococcus (MPN/100ml) - Slope: -0.17339120712899941
Enterococcus (MPN/100ml) - P-value: 2.068355008748598e-54


18:26:31 - cmdstanpy - INFO - Chain [1] start processing
18:26:31 - cmdstanpy - INFO - Chain [1] done processing


Flow River Rate (m³/s) - Slope: -0.013196839413570907
Flow River Rate (m³/s) - P-value: 0.14645593298415527


18:26:31 - cmdstanpy - INFO - Chain [1] start processing
18:26:31 - cmdstanpy - INFO - Chain [1] done processing


Nitrate (mg/l) - Slope: -0.000475098330214073
Nitrate (mg/l) - P-value: 5.7018218301421106e-08


18:26:32 - cmdstanpy - INFO - Chain [1] start processing
18:26:32 - cmdstanpy - INFO - Chain [1] done processing


TOC (mg/l) - Slope: 0.003539512407941551
TOC (mg/l) - P-value: 2.952990714627697e-07


18:26:32 - cmdstanpy - INFO - Chain [1] start processing
18:26:32 - cmdstanpy - INFO - Chain [1] done processing


Water Temperature (°C) - Slope: 0.0010572622056606632
Water Temperature (°C) - P-value: 0.5952387192040904


18:26:32 - cmdstanpy - INFO - Chain [1] start processing
18:26:32 - cmdstanpy - INFO - Chain [1] done processing


pH - Slope: -4.679034920155305e-05
pH - P-value: 0.593263042454521


18:26:32 - cmdstanpy - INFO - Chain [1] start processing
18:26:32 - cmdstanpy - INFO - Chain [1] done processing


Air Temperature (°C) - Slope: 0.0014587088815852115
Air Temperature (°C) - P-value: 0.44605271994428564


18:26:33 - cmdstanpy - INFO - Chain [1] start processing
18:26:33 - cmdstanpy - INFO - Chain [1] done processing


Ammonium (mg/l) - Slope: -0.002732697448204437
Ammonium (mg/l) - P-value: 7.213939837976501e-65


18:26:33 - cmdstanpy - INFO - Chain [1] start processing
18:26:33 - cmdstanpy - INFO - Chain [1] done processing


BOD (mg/l) - Slope: -0.0012621948684427768
BOD (mg/l) - P-value: 0.002078584000599702


18:26:33 - cmdstanpy - INFO - Chain [1] start processing
18:26:33 - cmdstanpy - INFO - Chain [1] done processing


Coliform Bacteria (MPN/100ml) - Slope: -1.5924656223998357
Coliform Bacteria (MPN/100ml) - P-value: 9.73224809945897e-08


18:26:33 - cmdstanpy - INFO - Chain [1] start processing
18:26:33 - cmdstanpy - INFO - Chain [1] done processing


Conductivity (µS/cm) - Slope: 0.39316157291142767
Conductivity (µS/cm) - P-value: 2.9682418261558855e-56


18:26:34 - cmdstanpy - INFO - Chain [1] start processing
18:26:34 - cmdstanpy - INFO - Chain [1] done processing


Cumulated Rainfall (mm) - Slope: -0.0001428630357821253
Cumulated Rainfall (mm) - P-value: 0.14001398694196676


18:26:34 - cmdstanpy - INFO - Chain [1] start processing
18:26:34 - cmdstanpy - INFO - Chain [1] done processing


DOC (mg/l) - Slope: -9.258528368491388e-05
DOC (mg/l) - P-value: 0.7325509749017451


18:26:34 - cmdstanpy - INFO - Chain [1] start processing
18:26:34 - cmdstanpy - INFO - Chain [1] done processing


Dissolved Oxygen (mg/l) - Slope: 0.000572081569030218
Dissolved Oxygen (mg/l) - P-value: 0.4040657201321499


18:26:34 - cmdstanpy - INFO - Chain [1] start processing
18:26:34 - cmdstanpy - INFO - Chain [1] done processing


E.Coli (MPN/100ml) - Slope: -3.042150742187634
E.Coli (MPN/100ml) - P-value: 7.027928633004088e-56


18:26:35 - cmdstanpy - INFO - Chain [1] start processing
18:26:35 - cmdstanpy - INFO - Chain [1] done processing


Enterococcus (MPN/100ml) - Slope: -12.198216232133317
Enterococcus (MPN/100ml) - P-value: 2.686235662955361e-61


18:26:35 - cmdstanpy - INFO - Chain [1] start processing
18:26:35 - cmdstanpy - INFO - Chain [1] done processing


Flow River Rate (m³/s) - Slope: -0.038997918525367166
Flow River Rate (m³/s) - P-value: 5.060323863047589e-05


18:26:35 - cmdstanpy - INFO - Chain [1] start processing
18:26:35 - cmdstanpy - INFO - Chain [1] done processing


Nitrate (mg/l) - Slope: -0.00022403837243826713
Nitrate (mg/l) - P-value: 0.14405875348412434


18:26:35 - cmdstanpy - INFO - Chain [1] start processing
18:26:35 - cmdstanpy - INFO - Chain [1] done processing


TOC (mg/l) - Slope: 0.001914693519884808
TOC (mg/l) - P-value: 0.010040681334182225


18:26:35 - cmdstanpy - INFO - Chain [1] start processing
18:26:35 - cmdstanpy - INFO - Chain [1] done processing


Water Temperature (°C) - Slope: -0.0015405125740060998
Water Temperature (°C) - P-value: 0.40976428295950085


18:26:36 - cmdstanpy - INFO - Chain [1] start processing
18:26:36 - cmdstanpy - INFO - Chain [1] done processing


pH - Slope: -0.00013586623682659488
pH - P-value: 0.062767359656555


## Ground

In [67]:
for station_id in ground_df["Station"].unique():
    station_df = ground_df[ground_df["Station"] == station_id].copy()

    for column in station_df.columns.difference(diff_columns):
        df = station_df[["DateTime", column]].copy()

        df.set_index("DateTime", inplace=True)

        df.dropna(inplace=True)
        
        if df.shape[0] == 0:
            continue

        date_range = df.index
        date_range = date_range.min(), date_range.max()

        # make sure that the dataframe starts and finishes in the same month
        start_index = df[df.index.month == date_range[1].month].index[0]

        # Slice the dataframe to start from the found index
        df = df.loc[start_index:]

        # ===== Prophet =====

        df.index.name = "ds"

        df = df.reset_index()

        df.rename(columns={column: "y"}, inplace=True)

        # using prophet

        model = Prophet()
        model.fit(df)
        # Make predictions for both columns
        future = model.make_future_dataframe(periods=0)
        forecast = model.predict(future)

        # Merging forecasted data with your original data
        forecasting_final = pd.merge(
            forecast,
            df,
            how="inner",
            on="ds",
        )

        # compute linear regression on trend
        X = np.arange(df.shape[0])
        X = sm.add_constant(X)
        y = df["y"].copy()

        model = sm.OLS(y, X)
        results = model.fit()

        # plot the line of the linear regression
        line = pd.Series(results.predict(X), index=df['ds'])

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=df['ds'],
                y=df["y"],
                mode="lines",
                name="Original",
            )
        )

        fig.add_trace(
            go.Scatter(
                x=forecasting_final["ds"],
                y=forecasting_final["trend"],
                mode="lines",
                name="Trend",
            )
        )

        slope = results.params.iloc[1]
        print(f"{column} - Slope: {slope}")

        p_value = results.pvalues.iloc[1]
        print(f"{column} - P-value: {p_value}")

        fig.add_trace(
            go.Scatter(
                x=line.index,
                y=line,
                mode="lines",
                name=f"Linear Regression",
                line=dict(dash="dash", color="black"),
            ),
        )

        start_date = df['ds'].min()
        end_date = df['ds'].max()

        fig.update_layout(
            xaxis_title="Date",
            yaxis_title=column,
            font=dict(
                size=18,
            ),
            title=f"{station_id} - {column} - {start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')}",
        )

        fig.show()

18:26:36 - cmdstanpy - INFO - Chain [1] start processing
18:26:36 - cmdstanpy - INFO - Chain [1] done processing


Air Temperature (°C) - Slope: 0.01139780998844154
Air Temperature (°C) - P-value: 0.0024813687795161815


18:26:36 - cmdstanpy - INFO - Chain [1] start processing
18:26:36 - cmdstanpy - INFO - Chain [1] done processing


Ammonium (mg/l) - Slope: -4.776673662079784e-05
Ammonium (mg/l) - P-value: 0.5898559455432497


18:26:36 - cmdstanpy - INFO - Chain [1] start processing
18:26:36 - cmdstanpy - INFO - Chain [1] done processing


Conductivity (µS/cm) - Slope: -1.307313623222716
Conductivity (µS/cm) - P-value: 2.9126161591718553e-63


18:26:36 - cmdstanpy - INFO - Chain [1] start processing
18:26:36 - cmdstanpy - INFO - Chain [1] done processing


Cumulated Rainfall (mm) - Slope: -0.0002897007460739931
Cumulated Rainfall (mm) - P-value: 0.288083750085126


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


Nitrate (mg/l) - Slope: -0.0025885367304283474
Nitrate (mg/l) - P-value: 5.810626922788203e-18


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


UVA254 (1/m) - Slope: 0.012160599182156647
UVA254 (1/m) - P-value: 7.117995167749097e-33


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


Water Temperature (°C) - Slope: 0.002824719452584054
Water Temperature (°C) - P-value: 1.4866121686982806e-23


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


pH - Slope: -0.0016585636373092154
pH - P-value: 7.445569716831995e-38


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


Air Temperature (°C) - Slope: 0.00665050553241363
Air Temperature (°C) - P-value: 0.027773850810925177


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


Ammonium (mg/l) - Slope: 0.00010772131460316627
Ammonium (mg/l) - P-value: 0.001441560743218243


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


Conductivity (µS/cm) - Slope: -1.9188996660478055
Conductivity (µS/cm) - P-value: 1.142101286256184e-69


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


Cumulated Rainfall (mm) - Slope: -6.807334624228897e-05
Cumulated Rainfall (mm) - P-value: 0.754746781538563


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:37 - cmdstanpy - INFO - Chain [1] done processing


UVA254 (1/m) - Slope: 0.018862886856923153
UVA254 (1/m) - P-value: 3.204407814081926e-60


18:26:37 - cmdstanpy - INFO - Chain [1] start processing
18:26:38 - cmdstanpy - INFO - Chain [1] done processing


Water Temperature (°C) - Slope: 0.0010749329077905847
Water Temperature (°C) - P-value: 7.311863701559384e-23


18:26:38 - cmdstanpy - INFO - Chain [1] start processing
18:26:38 - cmdstanpy - INFO - Chain [1] done processing


pH - Slope: 0.0002045135311643432
pH - P-value: 0.016075265402993543


# Projections Comparison

## Flow River

In [68]:
column = 'Flow River Rate (m³/s)'

In [71]:
for station_id in surface_df["Station"].unique():
    station_df = surface_df[surface_df["Station"] == station_id].copy()

    df = station_df[["DateTime", column]].copy()
    
    df.dropna(inplace=True)
    
    projection_df = flow_river_projections[flow_river_projections["Station"] == station_id].copy()
    
    rcp45 = projection_df[projection_df["Type"] == "rcp45"]
    rcp85 = projection_df[projection_df["Type"] == "rcp85"]
    
    df.set_index("DateTime", inplace=True)
    
    rcp45.set_index("DateTime", inplace=True)
    rcp85.set_index("DateTime", inplace=True)
    
    # === Trend Computation ===
    
    # 1. Extract trend with Prophet
    
    # 1.1. Historical data
    
    df.index.name = "ds"

    df = df.reset_index()

    df.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(df)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    hist_forecasting_final = pd.merge(
        forecast,
        df,
        how="inner",
        on="ds",
    )
    
    # 1.2. RCP 4.5
    
    rcp45.index.name = "ds"

    rcp45 = rcp45.reset_index()

    rcp45.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(rcp45)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    rcp45_forecasting_final = pd.merge(
        forecast,
        rcp45,
        how="inner",
        on="ds",
    )
    
    # 1.3. RCP 8.5
    
    rcp85.index.name = "ds"

    rcp85 = rcp85.reset_index()

    rcp85.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(rcp85)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    rcp85_forecasting_final = pd.merge(
        forecast,
        rcp85,
        how="inner",
        on="ds",
    )
    
    # 2. Compute linear regression on the trend
    
    # 2.1. Historical data
    
    X = np.arange(df.shape[0])
    X = sm.add_constant(X)
    y = df["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    hist_line = pd.Series(results.predict(X), index=df['ds'])
    
    hist_slope = results.params.iloc[1]
    hist_p_value = results.pvalues.iloc[1]
    
    # 2.2. RCP 4.5
    
    X = np.arange(rcp45.shape[0])
    X = sm.add_constant(X)
    y = rcp45["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    rcp45_line = pd.Series(results.predict(X), index=rcp45['ds'])
    
    rcp45_slope = results.params.iloc[1]
    rcp45_p_value = results.pvalues.iloc[1]
    
    # 2.3. RCP 8.5
    
    X = np.arange(rcp85.shape[0])
    X = sm.add_constant(X)
    y = rcp85["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    rcp85_line = pd.Series(results.predict(X), index=rcp85['ds'])
    
    rcp85_slope = results.params.iloc[1]
    rcp85_p_value = results.pvalues.iloc[1]
    
    
    fig = go.Figure()
    
    fig.add_trace(
        go.Scatter(
            x=df['ds'],
            y=df['y'],
            mode="lines",
            name="Historical",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp45['ds'],
            y=rcp45['y'],
            mode="lines",
            name="RCP 4.5",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp85['ds'],
            y=rcp85['y'],
            mode="lines",
            name="RCP 8.5",
        )
    )
    
    
    fig.add_trace(
        go.Scatter(
            x=hist_forecasting_final["ds"],
            y=hist_forecasting_final["trend"],
            mode="lines",
            name="Historical Trend",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp45_forecasting_final["ds"],
            y=rcp45_forecasting_final["trend"],
            mode="lines",
            name="RCP 4.5 Trend",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp85_forecasting_final["ds"],
            y=rcp85_forecasting_final["trend"],
            mode="lines",
            name="RCP 8.5 Trend",
        )
    )
    
    # add box with regression slope and p-value for all the scenarios
    
    fig.add_annotation(
        x=0.95,
        y=0.99,
        xref="paper",
        yref="paper",
        text=f"Historical - Slope: {hist_slope:.3f}, P-value: {hist_p_value:.2f}",
        showarrow=False,
    )
    
    fig.add_annotation(
        x=0.95,
        y=0.93,
        xref="paper",
        yref="paper",
        text=f"RCP 4.5 - Slope: {rcp45_slope:.3f}, P-value: {rcp45_p_value:.2f}",
        showarrow=False,
    )
    
    fig.add_annotation(
        x=0.95,
        y=0.87,
        xref="paper",
        yref="paper",
        text=f"RCP 8.5 - Slope: {rcp85_slope:.3f}, P-value: {rcp85_p_value:.2f}",
        showarrow=False,
    )
    
    
    
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        title=f"{station_id} - {column}",
    )
    
    fig.show()

18:27:53 - cmdstanpy - INFO - Chain [1] start processing
18:27:54 - cmdstanpy - INFO - Chain [1] done processing
18:27:54 - cmdstanpy - INFO - Chain [1] start processing
18:27:54 - cmdstanpy - INFO - Chain [1] done processing
18:27:54 - cmdstanpy - INFO - Chain [1] start processing
18:27:54 - cmdstanpy - INFO - Chain [1] done processing


18:27:54 - cmdstanpy - INFO - Chain [1] start processing
18:27:54 - cmdstanpy - INFO - Chain [1] done processing
18:27:54 - cmdstanpy - INFO - Chain [1] start processing
18:27:54 - cmdstanpy - INFO - Chain [1] done processing
18:27:55 - cmdstanpy - INFO - Chain [1] start processing
18:27:55 - cmdstanpy - INFO - Chain [1] done processing


18:27:55 - cmdstanpy - INFO - Chain [1] start processing
18:27:55 - cmdstanpy - INFO - Chain [1] done processing
18:27:55 - cmdstanpy - INFO - Chain [1] start processing
18:27:55 - cmdstanpy - INFO - Chain [1] done processing
18:27:55 - cmdstanpy - INFO - Chain [1] start processing
18:27:55 - cmdstanpy - INFO - Chain [1] done processing


## Air Temperature

In [46]:
column = 'Air Temperature (°C)'

In [55]:
for station_id in surface_df["Station"].unique():
    station_df = surface_df[surface_df["Station"] == station_id].copy()

    df = station_df[["DateTime", column]].copy()
    
    projection_df = air_temp_projections[air_temp_projections["Station"] == station_id].copy()
    
    rcp45 = projection_df[projection_df["Type"] == "rcp45"]
    rcp85 = projection_df[projection_df["Type"] == "rcp85"]
    
    df.set_index("DateTime", inplace=True)
    
    rcp45.set_index("DateTime", inplace=True)
    rcp85.set_index("DateTime", inplace=True)
    
    # === Trend Computation ===
    
    # 1. Extract trend with Prophet
    
    # 1.1. Historical data
    
    df.index.name = "ds"

    df = df.reset_index()

    df.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(df)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    hist_forecasting_final = pd.merge(
        forecast,
        df,
        how="inner",
        on="ds",
    )
    
    # 1.2. RCP 4.5
    
    rcp45.index.name = "ds"

    rcp45 = rcp45.reset_index()

    rcp45.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(rcp45)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    rcp45_forecasting_final = pd.merge(
        forecast,
        rcp45,
        how="inner",
        on="ds",
    )
    
    # 1.3. RCP 8.5
    
    rcp85.index.name = "ds"

    rcp85 = rcp85.reset_index()

    rcp85.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(rcp85)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    rcp85_forecasting_final = pd.merge(
        forecast,
        rcp85,
        how="inner",
        on="ds",
    )
    
    # 2. Compute linear regression on the trend
    
    # 2.1. Historical data
    
    X = np.arange(df.shape[0])
    X = sm.add_constant(X)
    y = df["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    hist_line = pd.Series(results.predict(X), index=df['ds'])
    
    hist_slope = results.params.iloc[1]
    hist_p_value = results.pvalues.iloc[1]
    
    # 2.2. RCP 4.5
    
    X = np.arange(rcp45.shape[0])
    X = sm.add_constant(X)
    y = rcp45["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    rcp45_line = pd.Series(results.predict(X), index=rcp45['ds'])
    
    rcp45_slope = results.params.iloc[1]
    rcp45_p_value = results.pvalues.iloc[1]
    
    # 2.3. RCP 8.5
    
    X = np.arange(rcp85.shape[0])
    X = sm.add_constant(X)
    y = rcp85["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    rcp85_line = pd.Series(results.predict(X), index=rcp85['ds'])
    
    rcp85_slope = results.params.iloc[1]
    rcp85_p_value = results.pvalues.iloc[1]
    
    
    fig = go.Figure()
    
    fig.add_trace(
        go.Scatter(
            x=df['ds'],
            y=df['y'],
            mode="lines",
            name="Historical",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp45['ds'],
            y=rcp45['y'],
            mode="lines",
            name="RCP 4.5",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp85['ds'],
            y=rcp85['y'],
            mode="lines",
            name="RCP 8.5",
        )
    )
    
    
    fig.add_trace(
        go.Scatter(
            x=hist_forecasting_final["ds"],
            y=hist_forecasting_final["trend"],
            mode="lines",
            name="Historical Trend",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp45_forecasting_final["ds"],
            y=rcp45_forecasting_final["trend"],
            mode="lines",
            name="RCP 4.5 Trend",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp85_forecasting_final["ds"],
            y=rcp85_forecasting_final["trend"],
            mode="lines",
            name="RCP 8.5 Trend",
        )
    )
    
    # add box with regression slope and p-value for all the scenarios
    
    fig.add_annotation(
        x=0.95,
        y=0.99,
        xref="paper",
        yref="paper",
        text=f"Historical - Slope: {hist_slope:.3f}, P-value: {hist_p_value:.2f}",
        showarrow=False,
    )
    
    fig.add_annotation(
        x=0.95,
        y=0.93,
        xref="paper",
        yref="paper",
        text=f"RCP 4.5 - Slope: {rcp45_slope:.3f}, P-value: {rcp45_p_value:.2f}",
        showarrow=False,
    )
    
    fig.add_annotation(
        x=0.95,
        y=0.87,
        xref="paper",
        yref="paper",
        text=f"RCP 8.5 - Slope: {rcp85_slope:.3f}, P-value: {rcp85_p_value:.2f}",
        showarrow=False,
    )
    
    
    
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        title=f"{station_id} - {column}",
    )
    
    fig.show()

18:18:22 - cmdstanpy - INFO - Chain [1] start processing
18:18:22 - cmdstanpy - INFO - Chain [1] done processing
18:18:22 - cmdstanpy - INFO - Chain [1] start processing
18:18:22 - cmdstanpy - INFO - Chain [1] done processing
18:18:22 - cmdstanpy - INFO - Chain [1] start processing
18:18:22 - cmdstanpy - INFO - Chain [1] done processing


18:18:23 - cmdstanpy - INFO - Chain [1] start processing
18:18:23 - cmdstanpy - INFO - Chain [1] done processing
18:18:23 - cmdstanpy - INFO - Chain [1] start processing
18:18:23 - cmdstanpy - INFO - Chain [1] done processing
18:18:23 - cmdstanpy - INFO - Chain [1] start processing
18:18:23 - cmdstanpy - INFO - Chain [1] done processing


18:18:24 - cmdstanpy - INFO - Chain [1] start processing
18:18:24 - cmdstanpy - INFO - Chain [1] done processing
18:18:24 - cmdstanpy - INFO - Chain [1] start processing
18:18:24 - cmdstanpy - INFO - Chain [1] done processing
18:18:24 - cmdstanpy - INFO - Chain [1] start processing
18:18:24 - cmdstanpy - INFO - Chain [1] done processing


In [None]:
# Se si fa dal 2000 in poi, si vede che la temperatura aumenta

# Precipitation

In [39]:
column = 'Cumulated Rainfall (mm)'

In [41]:
for station_id in surface_df["Station"].unique():
    station_df = surface_df[surface_df["Station"] == station_id].copy()

    df = station_df[["DateTime", column]].copy()
    
    projection_df = precip_projections[precip_projections["Station"] == station_id].copy()
    
    rcp45 = projection_df[projection_df["Type"] == "rcp45"]
    rcp85 = projection_df[projection_df["Type"] == "rcp85"]
    
    # === Trend Computation ===
    
    # 1. Extract trend with Prophet
    
    # 1.1. Historical data
    
    df.index.name = "ds"

    df = df.reset_index()

    df.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(df)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    hist_forecasting_final = pd.merge(
        forecast,
        df,
        how="inner",
        on="ds",
    )
    
    # 1.2. RCP 4.5
    
    rcp45.index.name = "ds"

    rcp45 = rcp45.reset_index()

    rcp45.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(rcp45)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    rcp45_forecasting_final = pd.merge(
        forecast,
        rcp45,
        how="inner",
        on="ds",
    )
    
    # 1.3. RCP 8.5
    
    rcp85.index.name = "ds"

    rcp85 = rcp85.reset_index()

    rcp85.rename(columns={column: "y"}, inplace=True)

    model = Prophet()
    model.fit(rcp85)
    # Make predictions for both columns
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merging forecasted data with your original data
    rcp85_forecasting_final = pd.merge(
        forecast,
        rcp85,
        how="inner",
        on="ds",
    )
    
    # 2. Compute linear regression on the trend
    
    # 2.1. Historical data
    
    X = np.arange(df.shape[0])
    X = sm.add_constant(X)
    y = df["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    hist_line = pd.Series(results.predict(X), index=df['ds'])
    
    hist_slope = results.params.iloc[1]
    hist_p_value = results.pvalues.iloc[1]
    
    # 2.2. RCP 4.5
    
    X = np.arange(rcp45.shape[0])
    X = sm.add_constant(X)
    y = rcp45["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    rcp45_line = pd.Series(results.predict(X), index=rcp45['ds'])
    
    rcp45_slope = results.params.iloc[1]
    rcp45_p_value = results.pvalues.iloc[1]
    
    # 2.3. RCP 8.5
    
    X = np.arange(rcp85.shape[0])
    X = sm.add_constant(X)
    y = rcp85["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # plot the line of the linear regression
    rcp85_line = pd.Series(results.predict(X), index=rcp85['ds'])
    
    rcp85_slope = results.params.iloc[1]
    rcp85_p_value = results.pvalues.iloc[1]
    
    
    fig = go.Figure()
    
    fig.add_trace(
        go.Scatter(
            x=df['DateTime'],
            y=df[column],
            mode="lines",
            name="Historical",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp45['DateTime'],
            y=rcp45[column],
            mode="lines",
            name="RCP 4.5",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp85['DateTime'],
            y=rcp85[column],
            mode="lines",
            name="RCP 8.5",
        )
    )
    
    
    fig.add_trace(
        go.Scatter(
            x=hist_forecasting_final["ds"],
            y=hist_forecasting_final["trend"],
            mode="lines",
            name="Historical Trend",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp45_forecasting_final["ds"],
            y=rcp45_forecasting_final["trend"],
            mode="lines",
            name="RCP 4.5 Trend",
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=rcp85_forecasting_final["ds"],
            y=rcp85_forecasting_final["trend"],
            mode="lines",
            name="RCP 8.5 Trend",
        )
    )
    
    # add box with regression slope and p-value for all the scenarios
    
    fig.add_annotation(
        x=0.05,
        y=0.95,
        xref="paper",
        yref="paper",
        text=f"Historical - Slope: {hist_slope:.2f}, P-value: {hist_p_value:.2f}",
        showarrow=False,
    )
    
    fig.add_annotation(
        x=0.05,
        y=0.90,
        xref="paper",
        yref="paper",
        text=f"RCP 4.5 - Slope: {rcp45_slope:.2f}, P-value: {rcp45_p_value:.2f}",
        showarrow=False,
    )
    
    fig.add_annotation(
        x=0.05,
        y=0.85,
        xref="paper",
        yref="paper",
        text=f"RCP 8.5 - Slope: {rcp85_slope:.2f}, P-value: {rcp85_p_value:.2f}",
        showarrow=False,
    )
    
    
    
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        title=f"{station_id} - {column}",
    )
    
    fig.show()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



DateParseError: day is out of range for month: 0, at position 0