#### In this notebook, a statistical analysis is performed across the different station sites in order to assess how similar they are and to study whether data from one station can be integrated into another.

More specifically, we want to study if the Rainfall measurements available only for the Airport site can be integrated into the Surface and Ground Water stations.

In [None]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go
import matplotlib.pyplot as plt

from scipy import stats
import pingouin as pg
import scikit_posthocs as sp


import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.seasonal import STL
from statsmodels.stats import anova

# Load Data

In [None]:
# Root folder of the Berlin dataset, given relative to the working directory.
data_folder = os.path.join("..", "..", "data", "berlin")

# Folder holding the spreadsheets read by this notebook (and the metrics file it writes).
integration_analysis_folder = os.path.join(data_folder, "integration_analysis")

In [None]:
# Meteorological table; its temperature series is plotted as "Airport" further down.
meteo_df = pd.read_excel(os.path.join(integration_analysis_folder, "meteo.xlsx"))

# Surface-water (sw) and ground-water (gw) station tables.
sw_df = pd.read_excel(os.path.join(integration_analysis_folder, "sw.xlsx"))
gw_df = pd.read_excel(os.path.join(integration_analysis_folder, "gw.xlsx"))

# Add Meteo Data to Surface and Ground Water

In [None]:
# Preview the first rows of the meteo table to check its columns.
meteo_df.head()

In [None]:
# Index the meteo table by timestamp so it can be aligned with the stations by date.
# Guarded and non in-place: re-running this cell must not raise a KeyError once
# "DateTime" has already been moved into the index.
if "DateTime" in meteo_df.columns:
    meteo_df = meteo_df.set_index("DateTime")

In [None]:
# Preview the first rows of the surface-water table to check its columns.
sw_df.head()

## Surface Water

### Build Metrics Table

The chosen metrics are: Pearson Coefficient, RMSE, Cosine Similarity and Cointegration Test p-value.

In [None]:
from plotly.subplots import make_subplots
import pymannkendall as mk
from prophet import Prophet
from statsmodels.tsa.stattools import coint

In [None]:
# Visual check before any statistics: overlay the airport air temperature on the
# air temperature of each surface-water station, one figure per station,
# restricted to the dates that all sites share.

# Dates present at the airport and at every surface-water station.
common_dates = meteo_df.index
for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    common_dates = common_dates.intersection(station_df["DateTime"])

# Layout shared by all the figures of this cell.
temperature_layout = dict(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    font=dict(size=18),
    title="Temperature",
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

for station_id in sw_df["Station"].unique():
    airport_common = meteo_df.loc[common_dates]

    station_df = sw_df[sw_df["Station"] == station_id]
    station_df = station_df[station_df["DateTime"].isin(common_dates)]

    airport_trace = go.Scatter(
        x=airport_common.index,
        y=airport_common["Temperature Mean (°C)"],
        mode="lines",
        name="Airport",
        line=dict(color="blue"),
    )
    station_trace = go.Scatter(
        x=station_df["DateTime"],
        y=station_df["Air Temperature (°C)"],
        mode="lines",
        name=f"Station {station_id}",
    )

    fig = go.Figure()
    fig.add_trace(airport_trace)
    fig.add_trace(station_trace)
    fig.update_layout(**temperature_layout)
    fig.show()

In [None]:
# Remove the seasonal component estimated by STL (period=12) from each series
# and overlay the de-seasonalised airport and station temperatures.

# Dates present at the airport and at every surface-water station.
common_dates = meteo_df.index
for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    common_dates = common_dates.intersection(station_df["DateTime"])

fig = go.Figure()

# Airport: original series minus its STL seasonal component.
airport_temp = meteo_df.loc[common_dates]["Temperature Mean (°C)"]
stl = STL(airport_temp, period=12).fit()
deseason_meteo_df = airport_temp - stl.seasonal

fig.add_trace(
    go.Scatter(
        x=deseason_meteo_df.index,
        y=deseason_meteo_df,
        mode="lines",
        name="Airport",
        line=dict(color="blue"),
    )
)

# Stations: same treatment, one trace per station.
for station_id in sw_df["Station"].unique():
    on_common_dates = (sw_df["Station"] == station_id) & sw_df["DateTime"].isin(common_dates)
    station_df = sw_df[on_common_dates].set_index("DateTime")

    station_temp = station_df["Air Temperature (°C)"]
    stl = STL(station_temp, period=12).fit()
    deseason_station_df = station_temp - stl.seasonal

    fig.add_trace(
        go.Scatter(
            x=deseason_station_df.index,
            y=deseason_station_df,
            mode="lines",
            name=f"Station {station_id}",
        )
    )

# Kept as the last expression so the notebook renders the figure.
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    font=dict(size=18),
    title="Temperature",
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

In [None]:
# Mann-Kendall test for trend detection, run on the airport series first and
# then on each station series (all restricted to the common dates).

series_to_test = [("Airport", meteo_df.loc[common_dates]["Temperature Mean (°C)"])]

for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    station_df = station_df[station_df["DateTime"].isin(common_dates)]
    series_to_test.append((f"Station {station_id}", station_df["Air Temperature (°C)"]))

for label, series in series_to_test:
    print(label)
    result = mk.original_test(series)
    print(result)
    print("\n")

In [None]:
# STL decomposition (period=12) of the airport and station temperatures, drawn on
# four stacked panels: original series, trend, seasonal component and residual.

# Dates present at the airport and at every surface-water station.
common_dates = meteo_df.index
for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    common_dates = common_dates.intersection(station_df["DateTime"])

fig = make_subplots(rows=4, cols=1)

# --- Airport ---
airport_temp = meteo_df.loc[common_dates]["Temperature Mean (°C)"]
stl = STL(airport_temp, period=12).fit()
deseason_meteo_df = airport_temp - stl.seasonal

# (trace-name prefix, series) in panel order: rows 1 to 4.
airport_panels = [
    ("", airport_temp),
    ("Trend ", stl.trend),
    ("Seasonal ", stl.seasonal),
    ("Residual ", stl.resid),
]
for row, (prefix, component) in enumerate(airport_panels, start=1):
    fig.add_trace(
        go.Scatter(x=component.index, y=component, mode="lines", name=f"{prefix}Airport"),
        row=row,
        col=1,
    )

# --- Stations ---
for station_id in sw_df["Station"].unique():
    on_common_dates = (sw_df["Station"] == station_id) & sw_df["DateTime"].isin(common_dates)
    station_df = sw_df[on_common_dates].set_index("DateTime")

    station_temp = station_df["Air Temperature (°C)"]
    stl = STL(station_temp, period=12).fit()

    station_panels = [
        ("", station_temp),
        ("Trend ", stl.trend),
        ("Seasonal ", stl.seasonal),
        ("Residual ", stl.resid),
    ]
    for row, (prefix, component) in enumerate(station_panels, start=1):
        fig.add_trace(
            go.Scatter(
                x=component.index,
                y=component,
                mode="lines",
                name=f"{prefix}Station {station_id}",
            ),
            row=row,
            col=1,
        )

fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    font=dict(size=18),
    title="Temperature",
)

fig.show()

In [None]:
# Compare the trend that Prophet estimates for the airport temperature with the
# trend it estimates for each surface-water station (common dates only).
# Each site contributes three lines: the trend and its lower/upper bounds (dashed).


def _fit_prophet_trend(frame, value_column):
    """Fit Prophet on one temperature series and return its in-sample forecast.

    ``frame`` must expose "DateTime" (as a column, or as the index name) and
    ``value_column``; both are renamed to the "ds"/"y" columns Prophet expects.
    ``periods=0`` means no future dates are added: the model is evaluated only
    on the dates it was fitted on.
    """
    df = frame.reset_index().rename(columns={"DateTime": "ds", value_column: "y"})
    df = df[["ds", "y"]]

    model = Prophet()
    model.fit(df)

    future = model.make_future_dataframe(periods=0)
    return model.predict(future)


def _add_prophet_trend(fig, forecast, label, color=None):
    """Add the trend of ``forecast`` and its lower/upper bounds to ``fig``.

    The trend is a solid line named ``label``; the bounds are dashed lines named
    "<label> Lower" and "<label> Upper". ``color`` (optional) is applied to all
    three lines; when omitted, plotly chooses the colours.
    """
    for column, suffix, dash in (
        ("trend", "", None),
        ("trend_lower", " Lower", "dash"),
        ("trend_upper", " Upper", "dash"),
    ):
        line = {key: value for key, value in (("color", color), ("dash", dash)) if value is not None}
        fig.add_trace(
            go.Scatter(
                x=forecast["ds"],
                y=forecast[column],
                mode="lines",
                name=f"{label}{suffix}",
                line=line or None,
            )
        )


# Dates present at the airport and at every surface-water station.
common_dates = meteo_df.index
for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    common_dates = common_dates.intersection(station_df["DateTime"])

fig = go.Figure()

# Airport trend (blue).
forecast = _fit_prophet_trend(meteo_df.loc[common_dates], "Temperature Mean (°C)")
_add_prophet_trend(fig, forecast, "Airport", color="blue")

# Station trends (default colours).
for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    station_df = station_df[station_df["DateTime"].isin(common_dates)]

    forecast = _fit_prophet_trend(station_df, "Air Temperature (°C)")
    _add_prophet_trend(fig, forecast, f"Station {station_id}")

fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    font=dict(size=18),
    title="Temperature",
)

fig.show()

In [None]:
# Empty results table: one row per similarity metric, one column group per
# airport/station pair, each split by the part of the series being compared.
# The column groups are derived from sw_df the same way the cells that fill the
# table build their keys (f"Airport - {station_id}"), so the two always agree.
station_pairs = [f"Airport - {station_id}" for station_id in sw_df["Station"].unique()]

metrics_df = pd.DataFrame(
    index=pd.Index(
        ["Pearson Coeff.", "RMSE", "Cosine Similarity", "Engle-Granger Coint. p-value"]
    ),
    columns=pd.MultiIndex.from_product(
        [station_pairs, ["Time Series", "Trend", "Seasonal"]]
    ),
)

In [None]:
# Pearson correlation, RMSE and cosine similarity between the airport series and
# each station series (original, non-decomposed data), stored in the
# "Time Series" columns of metrics_df.

from sklearn.metrics import mean_squared_error
from sklearn.metrics import pairwise
from scipy import stats
from statsmodels.tsa.stattools import grangercausalitytests

for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    station_df = station_df[station_df["DateTime"].isin(common_dates)]

    # NOTE(review): the two series are paired by position, not by date — this
    # assumes the station rows follow the same order as common_dates; confirm.
    airport_temp = meteo_df.loc[common_dates]["Temperature Mean (°C)"]
    station_temp = station_df["Air Temperature (°C)"]

    print(f"Station {station_id}")

    pearson = stats.pearsonr(airport_temp, station_temp)
    rmse = np.sqrt(mean_squared_error(airport_temp, station_temp))
    cosine = pairwise.cosine_similarity(
        airport_temp.values.reshape(1, -1),
        station_temp.values.reshape(1, -1),
    )

    # Two-column frame for the Granger causality test (maximum lag 1).
    # The returned value is not used further in this cell.
    df = pd.DataFrame(
        {
            "Airport": airport_temp.values,
            f"Station {station_id}": station_temp.values,
        }
    )
    granger = grangercausalitytests(df, 1)

    print()
    print(f"Station {station_id} Pearson: {pearson}")
    print(f"Station {station_id} RMSE: {rmse}")
    print(f"Station {station_id} Cosine: {cosine}")

    time_series_column = (f"Airport - {station_id}", "Time Series")
    metrics_df.loc["Pearson Coeff.", time_series_column] = pearson[0].round(2)
    metrics_df.loc["RMSE", time_series_column] = rmse.round(2)
    metrics_df.loc["Cosine Similarity", time_series_column] = cosine[0][0].round(2)

    print("\n")
    


In [None]:
# Engle-Granger cointegration test with the airport series as first argument and
# the station series as second; the p-value (second element of the result) goes
# into the "Time Series" columns of metrics_df.
airport_temp = meteo_df.loc[common_dates]["Temperature Mean (°C)"]

for station_id in sw_df["Station"].unique():
    on_common_dates = (sw_df["Station"] == station_id) & sw_df["DateTime"].isin(common_dates)
    station_df = sw_df[on_common_dates]

    print(f"Station {station_id}")

    result = coint(airport_temp, station_df["Air Temperature (°C)"])
    p_value = result[1]

    metrics_df.loc[
        "Engle-Granger Coint. p-value", (f"Airport - {station_id}", "Time Series")
    ] = p_value.round(4)

    print(result)
    print("\n")

In [None]:
# Same cointegration test with the argument order swapped (station first,
# airport second). The result is only printed, not stored in metrics_df.
airport_temp = meteo_df.loc[common_dates]["Temperature Mean (°C)"]

for station_id in sw_df["Station"].unique():
    on_common_dates = (sw_df["Station"] == station_id) & sw_df["DateTime"].isin(common_dates)
    station_df = sw_df[on_common_dates]

    print(f"Station {station_id}")

    result = coint(station_df["Air Temperature (°C)"], airport_temp)

    print(result)
    print("\n")

In [None]:
from pmdarima.arima import auto_arima

In [None]:
# find the best ARIMA model for the airport and the stations
# (seasonal search with m=12; each selected model's summary is printed)
print("Airport")
# NOTE(review): the airport search caps p and q at 1 and sets start_Q=max_Q=12,
# while the station searches below use max_p=max_q=12 and do not set the seasonal
# bounds — confirm that this asymmetry is intended.
model = auto_arima(
    meteo_df.loc[common_dates]["Temperature Mean (°C)"],
    seasonal=True,
    m=12,
    start_p=1,
    start_q=1,
    max_p=1,
    max_q=1,
    start_P=2,
    max_P=2,
    start_Q=12,
    max_Q=12,
    error_action="ignore",
)

print(model.summary())

for station_id in sw_df["Station"].unique():
        
    station_df = sw_df[sw_df["Station"] == station_id]
    
    # keep only the dates shared with the airport and the other stations
    station_df = station_df[station_df["DateTime"].isin(common_dates)]
    
    # index by timestamp so the series passed to auto_arima is date-indexed
    station_df.set_index("DateTime", inplace=True)
    
    print(f"Station {station_id}")
    
    model = auto_arima(
        station_df["Air Temperature (°C)"],
        seasonal=True,
        m=12,
        start_p=1,
        start_q=1,
        max_p=12,
        max_q=12,
        error_action="ignore",
    )
    print(model.summary())
        
    print("\n")

In [None]:
# Compare the STL trend and the STL seasonal component (period=12) of the airport
# with those of each station: one figure per station and component, plus the
# Pearson / RMSE / cosine / cointegration metrics stored in metrics_df.

# Dates present at the airport and at every surface-water station.
common_dates = meteo_df.index
for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    common_dates = common_dates.intersection(station_df["DateTime"])

# Airport decomposition, computed once and reused for every station.
meteo_stl = STL(meteo_df.loc[common_dates]["Temperature Mean (°C)"], period=12).fit()

# One entry per component, processed in this order (all trend figures first,
# then all seasonal figures):
# (STL attribute, trace-name prefix, printed tag, metrics_df column, figure title)
component_specs = [
    ("trend", "Trend", "TREND", "Trend", "Trend Temperature"),
    ("seasonal", "Season", "SEASONAL", "Seasonal", "Seasonal Temperature"),
]

for attribute, trace_prefix, printed_tag, metrics_column, figure_title in component_specs:
    airport_component = getattr(meteo_stl, attribute)

    for station_id in sw_df["Station"].unique():
        on_common_dates = (sw_df["Station"] == station_id) & sw_df["DateTime"].isin(common_dates)
        station_df = sw_df[on_common_dates].set_index("DateTime")

        stl = STL(station_df["Air Temperature (°C)"], period=12).fit()
        station_component = getattr(stl, attribute)

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=airport_component.index,
                y=airport_component,
                mode="lines",
                name=f"{trace_prefix} Airport",
            ),
        )
        fig.add_trace(
            go.Scatter(
                x=station_component.index,
                y=station_component,
                mode="lines",
                name=f"{trace_prefix} Station {station_id}",
            ),
        )

        print(f"Station {station_id} {printed_tag}")

        pearson = stats.pearsonr(airport_component, station_component)
        rmse = np.sqrt(mean_squared_error(airport_component, station_component))
        cosine = pairwise.cosine_similarity(
            airport_component.values.reshape(1, -1),
            station_component.values.reshape(1, -1),
        )
        result = coint(airport_component.values, station_component.values)

        print(f"Station {station_id} Pearson: {pearson}")
        print(f"Station {station_id} RMSE: {rmse}")
        print(f"Station {station_id} Cosine: {cosine}")

        column_group = f"Airport - {station_id}"
        for metric_name, metric_value in (
            ("Pearson Coeff.", pearson[0].round(2)),
            ("RMSE", rmse.round(2)),
            ("Cosine Similarity", cosine[0][0].round(2)),
            ("Engle-Granger Coint. p-value", result[1].round(4)),
        ):
            metrics_df.loc[metric_name, (column_group, metrics_column)] = metric_value

        print("\n")

        fig.update_layout(
            xaxis_title="Date",
            yaxis_title="Temperature (°C)",
            font=dict(size=18),
            title=figure_title,
        )

        fig.show()

In [None]:
# Display the completed metrics table.
metrics_df

In [None]:
# Save the metrics table as an Excel file in the integration analysis folder.
metrics_df.to_excel(os.path.join(integration_analysis_folder, "metrics.xlsx"))

### Statistical Tests to integrate Rainfall

In [None]:
# Build the wide table used by the ANOVA-type tests: one row per common date,
# one column per site ("Airport" plus one "Station <id>" column per station).
#
# H0: The means of the air temperature are equal
# H1: At least one mean is different

# Dates present at the airport and at every surface-water station.
common_dates = meteo_df.index
for station_id in sw_df["Station"].unique():
    station_df = sw_df[sw_df["Station"] == station_id]
    common_dates = common_dates.intersection(station_df["DateTime"])

# Start from an empty frame indexed by the common dates; the column assignments
# below align on that index, so dates outside common_dates are dropped.
anova_df = pd.DataFrame(index=pd.Index(common_dates, name="DateTime"))

anova_df["Airport"] = meteo_df.loc[common_dates]["Temperature Mean (°C)"]

for station_id in sw_df["Station"].unique():
    # Non in-place set_index: avoids mutating a slice of sw_df.
    station_df = sw_df[sw_df["Station"] == station_id].set_index("DateTime")

    anova_df[f"Station {station_id}"] = station_df["Air Temperature (°C)"]

# The STL fits that used to run here were never used and have been removed; the
# tests below operate on the original (non-decomposed) temperatures.

# Bring DateTime back as a regular column for melt() and the .dt accessors used later.
anova_df = anova_df.reset_index()

In [None]:
# Display the wide per-site temperature table.
anova_df

In [None]:
# Long-format table for the group-comparison tests: one row per (date, site),
# with the site name in melt's default "variable" column and the reading in
# "Temperature".
value_vars = anova_df.columns.difference(["DateTime"])

rm_anova_df = anova_df.melt(
    id_vars=["DateTime"],
    value_vars=value_vars,
    value_name="Temperature",
)

In [None]:
# rm_anova_df['DateTime'] = rm_anova_df['DateTime'].apply(lambda x: x.timestamp())

#### Assuming iid samples

##### Normality Tests

Data is not normally distributed

In [None]:
# Estimate one Yeo-Johnson lambda per site (second element of the value returned
# by stats.yeojohnson when no lambda is given), then average them so that all
# sites can be transformed with the same lambda.
lambdas = [
    stats.yeojohnson(anova_df[station])[1]
    for station in anova_df.columns.difference(["DateTime"])
]

mean_lambda = np.mean(lambdas)

In [None]:
# Histograms: for each site, the Yeo-Johnson transformed temperatures (blue,
# using the shared mean lambda) next to the original temperatures (red).

for station in anova_df.columns.difference(["DateTime"]):
    original = anova_df[station]
    data = stats.yeojohnson(original, lmbda=mean_lambda)

    fig = go.Figure(
        data=[
            go.Histogram(x=data, name="Transformed", marker=dict(color="blue")),
            go.Histogram(x=original, name="Original", marker=dict(color="red")),
        ]
    )

    fig.update_layout(
        xaxis_title="Temperature",
        yaxis_title="Frequency",
        font=dict(size=18),
        title=f"Station {station} Temperature Distribution",
        legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    )

    fig.show()

In [None]:
# Normality check per site: Q-Q plot of the transformed data against a normal
# distribution, plus Shapiro-Wilk tests on the transformed and on the raw data.

for station in anova_df.columns.difference(["DateTime"]):
    raw = anova_df[station]
    data = stats.yeojohnson(raw, lmbda=mean_lambda)

    stats.probplot(data, dist="norm", plot=plt)

    statistic, p_value = stats.shapiro(data)
    print(f"TRANSFORMED: Station {station} - Shapiro-Wilk Test - Statistic: {statistic}, P-value: {p_value}")

    statistic, p_value = stats.shapiro(raw)
    print(f"RAW: Station {station} - Shapiro-Wilk Test - Statistic: {statistic}, P-value: {p_value}")

    plt.title(f"Q-Q plot for station {station}")
    plt.show()

##### Assuming Normality

In [None]:
# Bartlett test for homogeneity of variances across the sites (one sample per column).
site_columns = anova_df.columns.difference(["DateTime"])
site_samples = [anova_df[station] for station in site_columns]

statistic, p_value = stats.bartlett(*site_samples)

print(f"Barlett Test - Statistic: {statistic}, P-value: {p_value}")

In [None]:
# the variances are not homogeneous, so we need to perform a Welch's ANOVA

In [None]:
# perform Welch's ANOVA
# ("Temperature" is the dependent variable; "variable", the site name produced by
# melt, is the between-groups factor)
anova_results = pg.welch_anova(data=rm_anova_df, dv="Temperature", between="variable")

print(anova_results)

In [None]:
# perform post-hoc test (Games-Howell)
# pairwise comparisons between the sites on the same long-format table
posthoc_results = pg.pairwise_gameshowell(data=rm_anova_df, dv="Temperature", between="variable")

# left as the last expression so the notebook displays the table
posthoc_results

In [None]:
# The Airport is not significantly different from the stations, so we can integrate the data
# Station 325 is significantly different from the other SW stations

##### Non-Normality Case

In [None]:
# Levene test for homogeneity of variance across the sites (one sample per column).
site_columns = anova_df.columns.difference(["DateTime"])
site_samples = [anova_df[station] for station in site_columns]

statistic, p_value = stats.levene(*site_samples)

print(f"Levene test - p-value: {p_value}")

In [None]:
# the p-value is less than 0.05, so we reject the null hypothesis that the variances are equal

In [None]:
# perform Welch's one-way test
# (same call and inputs as the Welch's ANOVA in the "Assuming Normality" section)
welch_test = pg.welch_anova(
    data=rm_anova_df,
    dv="Temperature",
    between="variable",
)

print(welch_test)

In [None]:
# the p-value is less than 0.05, so we reject the null hypothesis that the means are equal

In [None]:
# perform Conover's post-hoc test
# (values are taken from "Temperature"; groups are the sites in "variable")
posthoc = sp.posthoc_conover(
    rm_anova_df,
    val_col="Temperature",
    group_col="variable",
)

print(posthoc)

In [None]:
# The Airport is not significantly different from the stations, so we can integrate the data
# Station 325 is significantly different from the other SW stations

In [None]:
# Repeated-measures ANOVA: every date is treated as a "subject" that is measured
# at each site ("variable" is the within-subject factor).
# AnovaRM(...) only builds the model; .fit() is what actually runs the test and
# returns the results, so it must be called before displaying rm_anova.
rm_anova = anova.AnovaRM(
    data=rm_anova_df,
    depvar="Temperature",
    subject="DateTime",
    within=["variable"],
).fit()

In [None]:
# Display the repeated-measures ANOVA object created in the previous cell.
rm_anova

#### Accounting for time dependence

In [None]:
# Fit a linear mixed-effects model of Temperature on DateTime, with the rows
# grouped by site (the "variable" column), and print its summary.
# NOTE(review): DateTime is passed to the formula as-is; it may need converting to
# a numeric value first (see the commented-out timestamp conversion in an earlier
# cell) — confirm how the formula interface handles it.

model = smf.mixedlm("Temperature ~ DateTime", rm_anova_df, groups=rm_anova_df["variable"])

result = model.fit()

print(result.summary())

In [None]:
# perform the repeated measures ANOVA test
# (pingouin version: dates are the subjects, the site in "variable" is the
# within-subject factor)
rm_anova = pg.rm_anova(
    data=rm_anova_df,
    dv="Temperature",
    within="variable",
    subject="DateTime",
)

print(rm_anova)

In [None]:
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

In [None]:
# Enable automatic conversion between pandas DataFrames and R data frames
pandas2ri.activate()

# Import the required R packages (done in the next cell)


In [None]:
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

# Enable automatic conversion between pandas DataFrames and R data frames.
pandas2ri.activate()

# Import the required R packages.
# R's "stats" package is bound to `r_stats`, NOT `stats`: this notebook uses
# `stats` for scipy.stats (e.g. stats.pearsonr in the Ground Water section), and
# rebinding that name here would break those later cells on a top-to-bottom run.
base = importr('base')
r_stats = importr('stats')
nlme = importr('nlme')

# Convert the long-format pandas table (rm_anova_df) into an R data frame.
r_data = pandas2ri.py2rpy(rm_anova_df)

In [None]:
# Print the converted R data frame to check the conversion.
print(r_data)

In [None]:


# Fit the same GLS model under several residual correlation structures (via R's
# nlme package) and compare the fits by AIC and BIC.

# AIC() and BIC() are taken from R's "stats" package. It is imported here under
# its own name so that this cell neither depends on nor overwrites the Python
# name `stats`.
r_stats = importr("stats")

# Base model: temperature explained by site, date and their interaction.
formula = ro.Formula('Temperature ~ factor(variable) * factor(DateTime)')

# Candidate correlation structures. The grouping column is "variable" (the site
# name created by melt): rm_anova_df has no "station" column, so grouping on
# "station" would fail inside R.
cor_structures = {
    "Hyp0": nlme.corCompSymm(form=ro.Formula('~ 1')),
    "Hyp1": nlme.corCompSymm(form=ro.Formula('~ 1 | variable')),
    "Hyp2": nlme.corAR1(form=ro.Formula('~ 1 | variable')),
    # NOTE(review): corARMA is called without p/q orders — confirm the intended orders.
    "Hyp3": nlme.corARMA(form=ro.Formula('~ 1 | variable')),
    "Hyp4": nlme.corSymm(form=ro.Formula('~ 1 | variable'))
}

# Fit one model per structure; AIC/BIC are computed once and reused below.
results = {}
aic_values = {}
bic_values = {}
for name, cor_struct in cor_structures.items():
    model = nlme.gls(
        formula,
        data=r_data,
        correlation=cor_struct,
        # a separate variance parameter for each date
        weights=nlme.varIdent(form=ro.Formula('~ 1 | DateTime'))
    )
    results[name] = model
    aic_values[name] = r_stats.AIC(model)[0]
    bic_values[name] = r_stats.BIC(model)[0]
    print(f'{name} - AIC: {aic_values[name]}, BIC: {bic_values[name]}')

# Model comparison: the lowest criterion value wins.
best_aic_model = min(aic_values, key=aic_values.get)
best_bic_model = min(bic_values, key=bic_values.get)

print(f'Best model by AIC: {best_aic_model}')
print(f'Best model by BIC: {best_bic_model}')


### Interaction Plot

In [None]:
# Interaction plot: one panel per calendar month and one line per site, showing
# that month's temperature across the years.

months = np.sort(anova_df["DateTime"].dt.month.unique())

# Month names, used as panel titles.
months_name = [pd.to_datetime(f"{month}-01-2021").strftime("%B") for month in months]

fig = make_subplots(
    rows=4,
    cols=3,
    subplot_titles=months_name,
    vertical_spacing=0.08,  # Reduce vertical spacing
    horizontal_spacing=0.05,  # Reduce horizontal spacing
)

colors = ["blue", "red", "green", "orange"]

# Sites that already have a legend entry: each site is listed only once.
legend_added = set()

for month in months:
    data = anova_df[anova_df["DateTime"].dt.month == month]

    # The grid has 4 rows and 3 columns and is filled row by row:
    # January..March on row 1, April..June on row 2, and so on.
    row_offset, col_offset = divmod(month - 1, 3)
    row = row_offset + 1
    col = col_offset + 1

    for i, station in enumerate(data.columns.difference(["DateTime"])):
        showlegend = station not in legend_added
        legend_added.add(station)

        fig.add_trace(
            go.Scatter(
                x=data["DateTime"].dt.year,
                y=data[station],
                mode="lines",
                name=f"{station}",
                line=dict(color=colors[i]),
                showlegend=showlegend,
            ),
            row=row,
            col=col,
        )

# Shared axis labels, placed as annotations in paper coordinates.
fig.add_annotation(
    x=0.5,
    y=-0.08,
    showarrow=False,
    text="Year",
    xref="paper",
    yref="paper",
    font=dict(size=20),
)

fig.add_annotation(
    x=-0.08,
    y=0.5,
    showarrow=False,
    text="Temperature (°C)",
    textangle=-90,
    xref="paper",
    yref="paper",
    font=dict(size=20),
)

fig.update_layout(
    height=1000,
    width=1200,
    title=dict(
        text="Temperature by Station and Month of the Year ",
        y=0.98,  # Vertical position
        x=0.5,  # Horizontal position
        xanchor="center",
        yanchor="top",
    ),
    font=dict(size=18),
    margin=dict(t=80, l=80, b=100, r=40),  # Adjust margins
)

fig.show()

## Ground Water

In [None]:
# Compare the air temperature of the airport with the ground-water stations:
# overlay all the series on one figure.

# Per-station lookup used by the Ground Water cells: station id -> that station's
# rows indexed by timestamp. No cell of this notebook defines it, so it is built
# here from gw_df; without it a top-to-bottom run stops with a NameError.
# The "Station" column is dropped because it is constant within each group.
# NOTE(review): assumes gw_df carries the "Air Temperature (°C)" column that the
# cells below read, and that "DateTime" holds timestamps — confirm against gw.xlsx.
gw_stations_dict = {
    station_id: station_rows.drop(columns="Station").set_index("DateTime").sort_index()
    for station_id, station_rows in gw_df.groupby("Station")
}

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=meteo_df.index,
        y=meteo_df["Temperature Mean (°C)"],
        mode="lines",
        name="Airport",
        line=dict(color="blue"),
    )
)

for station_id, station_df in gw_stations_dict.items():
    fig.add_trace(
        go.Scatter(
            x=station_df.index,
            y=station_df["Air Temperature (°C)"],
            mode="lines",
            name=f"Station {station_id}",
        )
    )

# Kept as the last expression so the notebook renders the figure.
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    font=dict(
        size=18,
    ),
    title="Temperature",
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

In [None]:
# For each ground-water station: resample to monthly medians, keep the months
# with a temperature reading, compute the Pearson correlation with the airport on
# those months, and plot both series with the correlation as an annotation.

for station_id in gw_stations_dict.keys():
    monthly_df = gw_stations_dict[station_id].resample("M").median()

    # Months for which the station has a (non-NaN) air temperature.
    dates = monthly_df["Air Temperature (°C)"].dropna().index

    station_df = monthly_df.loc[dates]
    airport_df = meteo_df.loc[dates].copy()

    airport_temp = airport_df["Temperature Mean (°C)"]
    station_temp = station_df["Air Temperature (°C)"]

    corr, _ = stats.pearsonr(airport_temp, station_temp)

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=airport_df.index,
            y=airport_temp,
            mode="lines",
            name="Airport",
            line=dict(color="blue"),
        )
    )

    # The station is drawn twice: once as markers (named), once as an unnamed line.
    fig.add_trace(
        go.Scatter(
            x=station_df.index,
            y=station_temp,
            mode="markers",
            name=f"Station {station_id}",
        )
    )
    fig.add_trace(
        go.Scatter(
            x=station_df.index,
            y=station_temp,
            mode="lines",
        )
    )

    # Correlation shown in the top-left corner of the plot.
    fig.add_annotation(
        x=0.01,
        y=0.95,
        xref="paper",
        yref="paper",
        text=f"Pearson Correlation: {corr:.2f}",
        showarrow=False,
        font=dict(size=18),
    )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Air Temperature (°C)",
        font=dict(size=18),
    )

    fig.show()

In [None]:
# Scatter plot of the station air temperature against the airport air temperature
# (monthly medians), with an OLS fit and the bisector y = x for reference.
for station_id in gw_stations_dict.keys():
    station_df = gw_stations_dict[station_id]

    station_df = station_df.resample("M").median()

    # Months for which the station has a (non-NaN) air temperature.
    dates = station_df["Air Temperature (°C)"].dropna().index

    station_df = station_df.loc[dates]

    airport_df = meteo_df.loc[dates].copy()

    # OLS: station temperature regressed on airport temperature plus an intercept.
    X = sm.add_constant(airport_df["Temperature Mean (°C)"].copy())
    y = station_df["Air Temperature (°C)"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # Fitted values, indexed like X. They used to be re-indexed on `df`, a frame
    # left over from an unrelated earlier cell, which turned them into NaN.
    line = results.predict(X)

    # Position 0 is the intercept added by add_constant, position 1 the airport
    # temperature. .iloc makes the positional access explicit.
    slope = results.params.iloc[1]
    p_value = results.pvalues.iloc[1]

    print(f"Station {station_id} - Slope: {slope}")
    print(f"Station {station_id} - P-value: {p_value}")

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=airport_df["Temperature Mean (°C)"],
            y=station_df["Air Temperature (°C)"],
            mode="markers",
            name="Data",
            marker=dict(size=8, color="blue", opacity=0.7),
        )
    )

    # Fitted regression line (previously computed but never drawn).
    fig.add_trace(
        go.Scatter(
            x=airport_df["Temperature Mean (°C)"],
            y=line,
            mode="lines",
            name="OLS fit",
            line=dict(color="green", width=2),
        )
    )

    # Bisector: points on it have identical station and airport temperatures.
    fig.add_trace(
        go.Scatter(
            x=[-10, 40],
            y=[-10, 40],
            mode="lines",
            name="Bisector",
            line=dict(color="red", width=2, dash="dash"),
        )
    )

    fig.update_layout(
        xaxis_title="Airport",
        yaxis_title=f"Station {station_id}",
        font=dict(
            size=18,
        ),
        title="Air Temperature",
        legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    )

    fig.show()

In [None]:
# List the meteo columns available for integration into the station tables.
meteo_df.columns.to_list()

In [None]:
# Reset gw_df's row index so that row labels are unique.
# The previous index is kept as an "index" column, which the last cell drops.
gw_df.reset_index(inplace=True)

In [None]:
# The correlation is high between the airport and the stations,
# so we can add the airport data variables to the stations.

# Add the airport rainfall to the ground-water stations, matching rows by date.
# A date-keyed lookup replaces the previous positional assignment of whole date
# ranges, which raised on a length mismatch (or silently shifted values) whenever
# a station's dates did not match the airport's one-to-one inside the overlap.
airport_rainfall = meteo_df["Cumulated Rainfall (mm)"]

# map() needs unique keys: keep the first reading if a timestamp is repeated.
airport_rainfall = airport_rainfall[~airport_rainfall.index.duplicated(keep="first")]

# Rows whose date is not in the airport index (including dates outside the
# airport's range) get NaN, as before.
# NOTE(review): assumes gw_df["DateTime"] and the meteo index use identical
# timestamps for the same period — confirm with the NaN count in the next cell.
gw_df["Cumulated Rainfall (mm)"] = gw_df["DateTime"].map(airport_rainfall)

In [None]:
# Count the missing values per column, including the newly added rainfall column.
gw_df.isna().sum()

In [None]:
# Remove the helper "index" column created by the earlier reset_index.
# errors="ignore" and the non in-place form keep the cell safe to re-run after the
# column has already been dropped.
gw_df = gw_df.drop(columns="index", errors="ignore")