# Clean Data Analysis

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

import networkx as nx

import numpy as np
from scipy import stats
from sklearn.metrics import pairwise
from sklearn.metrics import mean_squared_error

import statsmodels.tsa.seasonal as smt
from statsmodels.tsa.api import VAR

from statsmodels.tsa.stattools import adfuller, grangercausalitytests, kpss

import pymannkendall as mk

In [None]:
# Notebook-wide matplotlib defaults: 20x10 figures and an 18 pt base font.
plt.rcParams.update(
    {
        "figure.figsize": [20, 10],
        "font.size": 18,
    }
)

# Load Data

In [None]:
# Root of the Tarragona data, two directory levels above this notebook.
data_folder = os.path.join("..", "..", "data", "tarragona")

# Sub-folders of the data root, one name per variable.
(
    clean_data_folder,
    correlation_folder,
    variation_folder,
    trend_folder,
) = (
    os.path.join(data_folder, subfolder)
    for subfolder in (
        "clean_data",
        "correlation_timeseries",
        "daily_variation_by_year",
        "trend",
    )
)

# The `paper plots` folder sits two levels up, next to `data`.
paper_plot_folder = os.path.join("..", "..", "paper plots")

In [None]:
# Load the cleaned workbook of each station (`<station>.xlsx` inside
# `clean_data_folder`) into its own DataFrame.
tortosa_df, guiamets_df, mequinenza_df, xerta_df = (
    pd.read_excel(os.path.join(clean_data_folder, f"{station}.xlsx"))
    for station in ("tortosa", "guiamets", "mequinenza", "xerta")
)

In [None]:
# Use the same time period for all stations so their series can be compared
# (and later combined) row by row.
station_frames = (tortosa_df, guiamets_df, mequinenza_df, xerta_df)

# The shared window runs from the latest first timestamp to the earliest last
# timestamp among the four stations.
min_date = max(frame["DateTime"].min() for frame in station_frames)
max_date = min(frame["DateTime"].max() for frame in station_frames)

# `.between` is inclusive on both ends. `.copy()` detaches every filtered
# frame from the frame it was selected from, so columns added to it later do
# not trigger pandas' SettingWithCopyWarning.
tortosa_df, guiamets_df, mequinenza_df, xerta_df = (
    frame[frame["DateTime"].between(min_date, max_date)].copy()
    for frame in station_frames
)

# Drop the reference to the unfiltered frames.
del station_frames

# Compare common variables

Common variables are:
* cumulated_rainfall_24h
* watertemperature
* conductivity
* turbidity

The idea is to combine every variable in a single dataset, which in this case is the Xerta dataset, firstly by comparing the redundant variables between each site and secondly by merging the selected variables into the Xerta df.

To compare common variables, the same time period must be used.

## Cumulated Rainfall

In [None]:
# Column compared across stations in this section.
common_variable = "cumulated_rainfall_24h"

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled - the `%%script false` magic hands the body to
# the `false` command instead of running it as Python.
# NOTE(review): if it is ever re-enabled, the mask below combines boolean
# Series taken from two different frames, so it relies on tortosa_df and
# guiamets_df sharing the same row index - confirm before use.
# get common datetimes where both tortosa_df and guiamets_df have positive values
common_datetimes = tortosa_df[
    (tortosa_df[common_variable] > 0)
    & (guiamets_df[common_variable] > 0)
]["DateTime"]

# keep only those datetimes in each frame, in chronological order
tortosa_df = tortosa_df[
    tortosa_df["DateTime"].isin(common_datetimes)
].sort_values("DateTime")

guiamets_df = guiamets_df[
    guiamets_df["DateTime"].isin(common_datetimes)
].sort_values("DateTime")


In [None]:
# Quick look at the Tortosa and Guiamets series on shared axes.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Tortosa", tortosa_df),
    ("Guiamets", guiamets_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )
# Mequinenza is not plotted here (its trace was commented out).

### Pearson

#### Tortosa - Mequinenza

In [None]:
# Pearson correlation between the Tortosa and Mequinenza series; the cell
# displays the full result (coefficient and p-value).
tortosa_values = tortosa_df[common_variable]
mequinenza_values = mequinenza_df[common_variable]
stats.pearsonr(tortosa_values, mequinenza_values)

#### Tortosa - Guiamets

In [None]:
# Keep only the Tortosa-Guiamets correlation coefficient (the p-value is
# discarded); `pears` is printed on the annotated plot of this section.
tortosa_values = tortosa_df[common_variable]
guiamets_values = guiamets_df[common_variable]
pears = stats.pearsonr(tortosa_values, guiamets_values)[0]

#### Mequinenza - Guiamets

In [None]:
# Pearson correlation between the Mequinenza and Guiamets series; the cell
# displays the full result (coefficient and p-value).
mequinenza_values = mequinenza_df[common_variable]
guiamets_values = guiamets_df[common_variable]
stats.pearsonr(mequinenza_values, guiamets_values)

### Cosine Similarity

#### Tortosa - Mequinenza

In [None]:
# cosine_similarity works on 2-D inputs, so each series is reshaped into a
# single row; the displayed result is a 1x1 array.
tortosa_row = tortosa_df[common_variable].to_numpy().reshape(1, -1)
mequinenza_row = mequinenza_df[common_variable].to_numpy().reshape(1, -1)
pairwise.cosine_similarity(tortosa_row, mequinenza_row)

#### Tortosa - Guiamets

In [None]:
# cosine_similarity works on 2-D inputs, so each series is reshaped into a
# single row; the displayed result is a 1x1 array.
tortosa_row = tortosa_df[common_variable].to_numpy().reshape(1, -1)
guiamets_row = guiamets_df[common_variable].to_numpy().reshape(1, -1)
pairwise.cosine_similarity(tortosa_row, guiamets_row)

#### Mequinenza - Guiamets

In [None]:
# cosine_similarity works on 2-D inputs, so each series is reshaped into a
# single row; the displayed result is a 1x1 array.
mequinenza_row = mequinenza_df[common_variable].to_numpy().reshape(1, -1)
guiamets_row = guiamets_df[common_variable].to_numpy().reshape(1, -1)
pairwise.cosine_similarity(mequinenza_row, guiamets_row)

### RMSE

#### Tortosa - Mequinenza

In [None]:
# Root-mean-square error between the Tortosa and Mequinenza series.
tortosa_values = tortosa_df[common_variable]
mse = mean_squared_error(tortosa_values, mequinenza_df[common_variable])
rmse = np.sqrt(mse)

# Displayed value: the RMSE divided by the standard deviation of Tortosa.
rmse / np.std(tortosa_values)

#### Tortosa - Guiamets

In [None]:
# Root-mean-square error between the Tortosa and Guiamets series; `rmse` is
# printed on the annotated plot of this section.
tortosa_values = tortosa_df[common_variable]
mse = mean_squared_error(tortosa_values, guiamets_df[common_variable])
rmse = np.sqrt(mse)

# Displayed value: the RMSE divided by the standard deviation of Tortosa.
rmse / np.std(tortosa_values)

### Kendall-Tau

#### Tortosa - Mequinenza

In [None]:
# Kendall's tau between the Tortosa and Mequinenza series.
tortosa_values = tortosa_df[common_variable]
mequinenza_values = mequinenza_df[common_variable]
stats.kendalltau(tortosa_values, mequinenza_values)

#### Tortosa - Guiamets

In [None]:
# Kendall's tau between the Tortosa and Guiamets series.
tortosa_values = tortosa_df[common_variable]
guiamets_values = guiamets_df[common_variable]
stats.kendalltau(tortosa_values, guiamets_values)

#### Mequinenza - Guiamets

In [None]:
# Kendall's tau between the Mequinenza and Guiamets series.
mequinenza_values = mequinenza_df[common_variable]
guiamets_values = guiamets_df[common_variable]
stats.kendalltau(mequinenza_values, guiamets_values)

### Plot

In [None]:
# Overlay the Tortosa and Guiamets rainfall series and annotate the figure
# with the `pears` and `rmse` values computed earlier in this section.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Tortosa", tortosa_df),
    ("Guiamets", guiamets_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )
# Mequinenza is not plotted here (its trace was commented out).

# Metrics box, anchored at the first Tortosa timestamp and y = 85.
metric_lines = [
    f"Pearson Coefficient = {pears:.2f}",
    f"RMSD = {rmse:.2f}",
]
text_string = "\n".join(metric_lines)
props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
ax.text(
    tortosa_df["DateTime"].iloc[0],
    85,
    text_string,
    fontsize=12,
    bbox=props,
)

ax.set_xlabel("Year")
ax.set_ylabel("Daily Cumulated Rainfall (mm)")
ax.set_title("Daily Cumulated Rainfall: Tortosa vs Guiamets")

plt.show()

## Water Temperature

In [None]:
# Column compared across stations in this section.
common_variable = "watertemperature"

In [None]:
# Pearson correlation between Tortosa and Xerta, keeping both the
# coefficient (`value`) and its p-value.
pearson_result = stats.pearsonr(
    tortosa_df[common_variable],
    xerta_df[common_variable],
)
value, p_value = pearson_result

In [None]:
# Overlay the Xerta and Tortosa water-temperature series.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Xerta", xerta_df),
    ("Tortosa", tortosa_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )

# Correlation summary text. It is built but not drawn on this figure
# (the text box that used it was commented out).
text_string = "\n".join(
    (
        f"Pearson correlation = {value:.4f}",
        f"P-value = {p_value:.4f}",
    )
)

ax.set_xlabel("Year")
ax.set_ylabel("Water temperature (°C)")
ax.set_title("Water temperature: Xerta vs Tortosa")
plt.show()

### Pearson

In [None]:
# Keep only the Tortosa-Xerta correlation coefficient (the p-value is
# discarded); `pears` is printed on the annotated plot of this section.
tortosa_values = tortosa_df[common_variable]
xerta_values = xerta_df[common_variable]
pears = stats.pearsonr(tortosa_values, xerta_values)[0]

### Cosine Similarity

In [None]:
# cosine_similarity works on 2-D inputs, so each series is reshaped into a
# single row; the displayed result is a 1x1 array.
tortosa_row = tortosa_df[common_variable].to_numpy().reshape(1, -1)
xerta_row = xerta_df[common_variable].to_numpy().reshape(1, -1)
pairwise.cosine_similarity(tortosa_row, xerta_row)

### RMSE

In [None]:
# Root-mean-square error between the Tortosa and Xerta series, left in the
# variable's own units (no division by the standard deviation here).
mse = mean_squared_error(
    tortosa_df[common_variable],
    xerta_df[common_variable],
)
rmse = np.sqrt(mse)

### Kendall-Tau

In [None]:
# Kendall's tau between the Tortosa and Xerta series.
tortosa_values = tortosa_df[common_variable]
xerta_values = xerta_df[common_variable]
stats.kendalltau(tortosa_values, xerta_values)

### Plot

In [None]:
# Overlay the Xerta and Tortosa water-temperature series and annotate the
# figure with the `pears` and `rmse` values computed earlier in this section.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Xerta", xerta_df),
    ("Tortosa", tortosa_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )

# Metrics box, anchored at the 61st Tortosa timestamp and y = 29.
metric_lines = [
    f"Pearson Coefficient = {pears:.3f}",
    f"RMSD = {rmse:.2f}",
]
text_string = "\n".join(metric_lines)
props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
ax.text(
    tortosa_df["DateTime"].iloc[60],
    29,
    text_string,
    fontsize=12,
    bbox=props,
)

ax.set_xlabel("Year")
ax.set_ylabel("Water temperature (°C)")
ax.set_title("Water temperature: Xerta vs Tortosa")
plt.show()

## Conductivity

In [None]:
# Column compared across stations in this section.
common_variable = "conductivity"

In [None]:
# Overlay the Xerta and Tortosa conductivity series.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Xerta", xerta_df),
    ("Tortosa", tortosa_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )

ax.set_xlabel("Year")
ax.set_ylabel("Conductivity (µS/cm)")
ax.set_title("Conductivity: Xerta vs Tortosa")
plt.show()

### Pearson

In [None]:
# Keep only the Tortosa-Xerta correlation coefficient (the p-value is
# discarded); `pears` is printed on the annotated plot of this section.
tortosa_values = tortosa_df[common_variable]
xerta_values = xerta_df[common_variable]
pears = stats.pearsonr(tortosa_values, xerta_values)[0]

### Cosine Similarity

In [None]:
# cosine_similarity works on 2-D inputs, so each series is reshaped into a
# single row; the displayed result is a 1x1 array.
tortosa_row = tortosa_df[common_variable].to_numpy().reshape(1, -1)
xerta_row = xerta_df[common_variable].to_numpy().reshape(1, -1)
pairwise.cosine_similarity(tortosa_row, xerta_row)

### RMSE

In [None]:
# Root-mean-square error between the Tortosa and Xerta series, left in the
# variable's own units (no division by the standard deviation here).
mse = mean_squared_error(
    tortosa_df[common_variable],
    xerta_df[common_variable],
)
rmse = np.sqrt(mse)

### Kendall-Tau

In [None]:
# Kendall's tau between the Tortosa and Xerta series.
tortosa_values = tortosa_df[common_variable]
xerta_values = xerta_df[common_variable]
stats.kendalltau(tortosa_values, xerta_values)

### Plot

In [None]:
# Overlay the Xerta and Tortosa conductivity series and annotate the figure
# with the `pears` and `rmse` values computed earlier in this section.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Xerta", xerta_df),
    ("Tortosa", tortosa_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )

# Metrics box, anchored at the first Tortosa timestamp and y = 1800.
metric_lines = [
    f"Pearson Coefficient = {pears:.3f}",
    f"RMSD = {rmse:.2f}",
]
text_string = "\n".join(metric_lines)
props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
ax.text(
    tortosa_df["DateTime"].iloc[0],
    1800,
    text_string,
    fontsize=12,
    bbox=props,
)

ax.set_xlabel("Year")
ax.set_ylabel("Conductivity (µS/cm)")
ax.set_title("Conductivity: Xerta vs Tortosa")
plt.show()

## Turbidity

In [None]:
# Column compared across stations in this section.
common_variable = "turbidity"

In [None]:
# Overlay the Xerta and Tortosa turbidity series.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Xerta", xerta_df),
    ("Tortosa", tortosa_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )

ax.set_xlabel("Year")
ax.set_ylabel("Turbidity (NTU)")
ax.set_title("Turbidity: Xerta vs Tortosa")
plt.show()

### Pearson

In [None]:
# Keep only the Tortosa-Xerta correlation coefficient (the p-value is
# discarded); `pears` is printed on the annotated plot of this section.
tortosa_values = tortosa_df[common_variable]
xerta_values = xerta_df[common_variable]
pears = stats.pearsonr(tortosa_values, xerta_values)[0]

### Cosine Similarity

In [None]:
# cosine_similarity works on 2-D inputs, so each series is reshaped into a
# single row; the displayed result is a 1x1 array.
tortosa_row = tortosa_df[common_variable].to_numpy().reshape(1, -1)
xerta_row = xerta_df[common_variable].to_numpy().reshape(1, -1)
pairwise.cosine_similarity(tortosa_row, xerta_row)

### RMSE

In [None]:
# Root-mean-square error between the Tortosa and Xerta series, left in the
# variable's own units (no division by the standard deviation here).
mse = mean_squared_error(
    tortosa_df[common_variable],
    xerta_df[common_variable],
)
rmse = np.sqrt(mse)

### Kendall-Tau

In [None]:
# Kendall's tau between the Tortosa and Xerta series.
tortosa_values = tortosa_df[common_variable]
xerta_values = xerta_df[common_variable]
stats.kendalltau(tortosa_values, xerta_values)

### Plot

In [None]:
# Overlay the Xerta and Tortosa turbidity series and annotate the figure
# with the `pears` and `rmse` values computed earlier in this section.
fig, ax = plt.subplots()
for station_label, station_df in (
    ("Xerta", xerta_df),
    ("Tortosa", tortosa_df),
):
    sns.lineplot(
        data=station_df,
        x="DateTime",
        y=common_variable,
        label=station_label,
        ax=ax,
    )

# Metrics box, anchored at the first Tortosa timestamp and y = 143.
metric_lines = [
    f"Pearson Coefficient = {pears:.3f}",
    f"RMSD = {rmse:.2f}",
]
text_string = "\n".join(metric_lines)
props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
ax.text(
    tortosa_df["DateTime"].iloc[0],
    143,
    text_string,
    fontsize=12,
    bbox=props,
)

ax.set_xlabel("Year")
ax.set_ylabel("Turbidity (NTU)")
ax.set_title("Turbidity: Xerta vs Tortosa")
plt.show()

## Compute Correlation Matrix

In [None]:
# Column name -> two-line display label (name, then unit) used to relabel
# the correlation matrices.
common_variables = dict(
    conductivity="Conductivity\n(µS/cm)",
    turbidity="Turbidity\n(NTU)",
    watertemperature="Water Temperature\n(°C)",
)

In [None]:
# Correlation matrix of the shared variables at each station, with rows and
# columns relabelled to the display names in `common_variables`.
shared_columns = list(common_variables)
tortosa_corr, xerta_corr = (
    station_df[shared_columns]
    .corr()
    .rename(columns=common_variables, index=common_variables)
    for station_df in (tortosa_df, xerta_df)
)

# Side-by-side heatmaps, one panel per station.
fig, ax = plt.subplots(1, 2)
for panel, station_label, corr_matrix in zip(
    ax,
    ("Tortosa", "Xerta"),
    (tortosa_corr, xerta_corr),
):
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=panel)
    panel.set_title(station_label)

plt.show()

# Build unique dataset

In [None]:
# Build the combined dataset on top of the Xerta frame.
# Water temperature and conductivity are better in the Xerta dataset, so
# they are not merged from Tortosa. Rainfall is taken from Tortosa because it
# is the station closest to Xerta.
#
# `.assign` returns a new frame instead of writing columns into `xerta_df`,
# which is itself a filtered selection of the loaded data. This avoids
# pandas' SettingWithCopyWarning and any in-place mutation.
# The values are attached by position (plain NumPy arrays), so all frames
# must have the same number of rows.
# NOTE(review): rows are assumed to be in the same DateTime order in every
# station frame - confirm.
xerta_df = xerta_df.assign(
    cumulated_rainfall_24h=tortosa_df["cumulated_rainfall_24h"].to_numpy(),
    environment_temperature=guiamets_df[
        "environmental_temperature"
    ].to_numpy(),
    flowriver=tortosa_df["flowriver"].to_numpy(),
).rename(
    # Raw column names -> readable names.
    columns={
        "cumulated_rainfall_24h": "Daily Cumulated Rainfall",
        "watertemperature": "Water Temperature",
        "environment_temperature": "Air Temperature",
        "flowriver": "Flow River",
        "conductivity": "Conductivity",
        "dissolvedoxygen": "Dissolved Oxygen",
        "nitrate": "Nitrate",
        "redoxpotential": "Redox Potential",
        "turbidity": "Turbidity",
        "Ammonium": "Ammonium",
        "ABS254": "Absorbance 254nm",
    }
)

In [None]:
# drop first 3 rows of xerta_df since they are the only rows for august 2012
# NOTE: the drop is positional and not idempotent - running this cell again
# removes three more rows, so execute it exactly once per kernel session.
xerta_df = xerta_df.iloc[3:]

In [None]:
# Use the timestamps as the row index from here on.
xerta_df = xerta_df.set_index("DateTime")

In [None]:
# Append the unit of measurement to every column name.
units_by_column = {
    "Daily Cumulated Rainfall": "Daily Cumulated Rainfall (L/m²)",
    "Water Temperature": "Water Temperature (°C)",
    "Air Temperature": "Air Temperature (°C)",
    "Flow River": "Flow River (m³/s)",
    "Conductivity": "Conductivity (µS/cm)",
    "Dissolved Oxygen": "Dissolved Oxygen (mg/L)",
    "Nitrate": "Nitrate (mg/L)",
    "Redox Potential": "Redox Potential (mV)",
    "Turbidity": "Turbidity (NTU)",
    "Ammonium": "Ammonium (mg/L)",
    # The absorbance column gets a short name instead of a unit.
    "Absorbance 254nm": "UVA254",
}
xerta_df = xerta_df.rename(columns=units_by_column)

In [None]:
# Persist the combined dataset next to the cleaned station files.
full_dataset_path = os.path.join(clean_data_folder, "full_dataset.xlsx")
xerta_df.to_excel(full_dataset_path)

In [None]:
# Empty summary table: one row per statistic, one column per dataset column.
# It is filled in by the next cell.
info_rows = [
    "N Samples",
    "% Missing Values",
    "Frequency (days)",
    "Mean",
    "Std",
    "Start Date",
    "End Date",
]
info_df = pd.DataFrame(
    index=pd.Index(info_rows, name="Info"),
    columns=xerta_df.columns,
)


In [None]:
# Store the per-column summary statistics in info_df.

# The sampling step is read from the shared index, so it is the same for
# every column: take the most frequent gap between consecutive timestamps
# once, outside the loop.
frequency_days = (
    xerta_df.index.to_series().diff().value_counts().index[0].days
)

for column in xerta_df.columns:
    observed = xerta_df[column].dropna()

    # First and last dates that carry an actual value for this column.
    start_date = observed.index.min().strftime("%Y-%m-%d")
    end_date = observed.index.max().strftime("%Y-%m-%d")

    # Missing share, mean and std are computed inside that window only, so
    # leading/trailing gaps are not counted as missing values.
    df = xerta_df[column][start_date:end_date]

    info_df.loc["N Samples", column] = observed.shape[0]
    info_df.loc["% Missing Values", column] = (
        df.isna().sum() / df.shape[0] * 100
    )
    info_df.loc["Frequency (days)", column] = frequency_days

    info_df.loc["Mean", column] = df.mean()
    info_df.loc["Std", column] = df.std()

    info_df.loc["Start Date", column] = start_date
    info_df.loc["End Date", column] = end_date

In [None]:
# Display the filled summary table.
info_df

In [None]:
# Persist the summary table next to the cleaned station files.
info_path = os.path.join(clean_data_folder, "info.xlsx")
info_df.to_excel(info_path)

# Overall timeseries decomposition

In [None]:
# First and last timestamp of the combined dataset.
min_date, max_date = xerta_df.index.min(), xerta_df.index.max()

print("Min date:", min_date)
print("Max date:", max_date)

In [None]:
# Length of the dataset in years (a year is taken as 365 days).
SECONDS_PER_YEAR = 60 * 60 * 24 * 365
time_diff = xerta_df.index.max() - xerta_df.index.min()
time_diff.total_seconds() / SECONDS_PER_YEAR

In [None]:
# STL decomposition (period of 365 samples) of every column, drawn as four
# stacked panels: observed, trend, seasonal and residual.
for feature in xerta_df.columns.difference(["DateTime"]):
    result = smt.STL(xerta_df[feature], period=365).fit()
    fig, axs = plt.subplots(4, 1, figsize=(40, 20))
    components = (
        result.observed,
        result.trend,
        result.seasonal,
        result.resid,
    )
    for component, panel in zip(components, axs):
        sns.lineplot(data=component, ax=panel)
    fig.suptitle(feature)

# Yearly Seasonal Decomposition

Every variable seems to have a yearly seasonal component, so the analysis is performed year-by-year.

In [None]:
# Tag every row with its calendar month and year, then average each column
# within every (year, month) pair.
xerta_df = xerta_df.assign(
    month=xerta_df.index.month,
    year=xerta_df.index.year,
)
monthly_average = xerta_df.groupby(["year", "month"]).mean()

In [None]:
# Turn the (year, month) index levels back into ordinary columns.
monthly_average = monthly_average.reset_index()

In [None]:
# Give each monthly row a timestamp on the first day of its month and use
# it as the index; the helper year/month columns are no longer needed.
month_starts = pd.to_datetime(
    monthly_average[["year", "month"]].assign(day=1)
)
monthly_average = (
    monthly_average.assign(DateTime=month_starts)
    .drop(columns=["year", "month"])
    .set_index("DateTime")
)

In [None]:
# From here on `xerta_df` is the monthly-average table. No copy is made:
# both names refer to the same object.
xerta_df = monthly_average

In [None]:
# Move DateTime back to a regular column (it is the merge key further down).
# Because the call is in place, `monthly_average` changes too - it is the
# same object as `xerta_df`.
xerta_df.reset_index(inplace=True)

In [None]:
# Display the monthly-average table.
xerta_df

In [None]:
# add solar radiation data
solar_radiation_path = os.path.join(clean_data_folder, "solar_radiation.xlsx")
solar_radiation_df = pd.read_excel(solar_radiation_path)

In [None]:
# Display the solar radiation table as loaded.
solar_radiation_df

In [None]:
def _to_month_start(timestamp):
    """Return `timestamp` with its day-of-month replaced by 1."""
    return timestamp.replace(day=1)


# Snap every solar-radiation timestamp to the first day of its month so it
# can be matched with the monthly averages.
solar_radiation_df["DateTime"] = solar_radiation_df["DateTime"].map(
    _to_month_start
)

In [None]:
# Attach the solar radiation columns to the monthly table. The left join
# keeps every row of xerta_df, matched on DateTime.
xerta_df = xerta_df.merge(solar_radiation_df, how="left", on="DateTime")

In [None]:
# Display the monthly table after the solar radiation merge.
xerta_df

In [None]:
# Use the month-start timestamps as the row index again.
xerta_df = xerta_df.set_index("DateTime")

In [None]:
# Accent colour as 0-255 channels, and the same colour scaled to 0-1 floats.
rgb = (200, 2, 110)
color = tuple(channel / 255 for channel in rgb)

In [None]:
# Larger base font for the matplotlib figures that follow.
plt.rcParams["font.size"] = 30

In [None]:
# Decompose every monthly series with STL (seasonal period of 12 samples)
# and keep the trend component split by calendar year:
#     trend_results[year][feature] -> trend values that fall in that year.
years = xerta_df.index.year.unique()
trend_results = {year: {} for year in years}

for feature in xerta_df.columns.difference(["DateTime"]):
    result = smt.STL(xerta_df[feature], period=12).fit()
    trend = result.trend.dropna()
    for year in years:
        trend_results[year][feature] = trend[trend.index.year == year]

    # Interactive figure: the monthly averages (thin black line) with the
    # STL trend drawn on top (thick magenta line).
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=xerta_df.index,
            y=xerta_df[feature],
            mode="lines",
            name="Monthly Average",
            line=dict(color="black", width=1.0),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=trend.index,
            y=trend,
            mode="lines",
            # The trace is the trend component returned by STL, so the
            # legend names it as such rather than as a moving average.
            name="Trend (STL)",
            line=dict(color="rgb(200,2,110)", width=4.0),
        )
    )

    fig.update_layout(
        title=f"{feature}",
        xaxis_title="Year",
        yaxis_title=feature,
    )

    fig.show()

In [None]:
# Rows are features and columns are years; each cell holds that year's slice
# of the feature's trend (a Series).
trends_df = pd.DataFrame(trend_results)

In [None]:
# Display the per-year trend table.
trends_df

## Linear Estimation with insights

In [None]:
import statsmodels.api as sm

# For every feature and year, fit an OLS line (intercept + slope) through
# that year's trend values, using the Julian date of each point as x.
# linear_estimations[feature][year] holds the fitted results object.
linear_estimations = {}

for feature, row in trends_df.iterrows():
    fits_by_year = {}
    for year in trends_df.columns:
        yearly_trend = row[year]
        design = sm.add_constant(yearly_trend.index.to_julian_date())
        result = sm.OLS(yearly_trend.values, design).fit()
        fits_by_year[year] = result
    linear_estimations[feature] = fits_by_year

In [None]:
# Slope (second fitted parameter) and its standard error for every feature
# and year, both rounded to 5 decimals.
slopes = {
    feature: {
        year: fit.params[1].round(5) for year, fit in fits_by_year.items()
    }
    for feature, fits_by_year in linear_estimations.items()
}
ses = {
    feature: {
        year: fit.bse[1].round(5) for year, fit in fits_by_year.items()
    }
    for feature, fits_by_year in linear_estimations.items()
}

slopes_df = pd.DataFrame(slopes)
ses_df = pd.DataFrame(ses)

# One table whose entries are strings of the form 'slope +/- se'.
slope_se_df = pd.DataFrame(
    {
        feature: slopes_df[feature].map(str)
        + " +/- "
        + ses_df[feature].map(str)
        for feature in slopes_df
    }
)

slope_se_df.to_excel(os.path.join(clean_data_folder, "slope_se.xlsx"))

In [None]:
# One figure per feature: the fitted yearly slope with error bars of
# two standard errors, against a dashed zero line.
for feature, fits_by_year in linear_estimations.items():
    fig, ax = plt.subplots(figsize=(20, 10))

    # Collect intercept/slope and their standard errors, one row per year.
    feature_data = {}
    for year, result in fits_by_year.items():
        intercept, slope = result.params
        beta_0_se, beta_1_se = result.bse
        feature_data[year] = {
            "intercept": intercept,
            "slope": slope,
            "intercept_se": beta_0_se,
            "slope_se": beta_1_se,
        }
    feature_df = pd.DataFrame(feature_data).T
    feature_df["year"] = feature_df.index

    # Error bars only (no connecting line); the markers and the dashed
    # connecting line come from the lineplot below.
    ax.errorbar(
        feature_df["year"],
        feature_df["slope"],
        yerr=feature_df["slope_se"] * 2,
        color="red",
        linestyle="None",
    )
    sns.lineplot(
        data=feature_df,
        x="year",
        y="slope",
        ax=ax,
        marker="o",
        markersize=10,
        linestyle="--",
    )

    # Reference line at zero slope.
    ax.axhline(0, color="red", linestyle="--")

    # One tick per year.
    ax.set_xticks(feature_df["year"])
    ax.set_xticklabels(feature_df["year"])
    ax.set_xlabel("Year")
    ax.set_ylabel("Average Variation (unit/day)")
    ax.set_title(feature + " - Daily Variation per Year")

    plt.show()

## Daily time period

In [None]:
# plot each trend of each variable for each year
from seaborn import color_palette
import matplotlib.ticker as ticker

# One colour per column of trends_df. The palette depends only on the
# columns, so it is built once instead of once per plotted line.
column_colors = dict(
    zip(
        trends_df.columns,
        color_palette("Set3", trends_df.columns.size),
    )
)

for index, row in trends_df.iterrows():
    # Dark-themed figure: black background, white ticks and labels.
    fig, ax = plt.subplots(figsize=(30, 20))
    fig.set_facecolor("black")
    ax.set_facecolor("black")
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.yaxis.label.set_color("white")
    ax.yaxis.set_major_locator(ticker.MaxNLocator(20))
    ax.tick_params(axis="y", colors="white")
    ax.tick_params(axis="x", colors="white")

    for column, color in column_colors.items():
        series = row[column]
        # x is the day of the year, so every point is plotted as-is
        # (no per-month averaging).
        x_values = series.index.dayofyear
        y_values = series.values

        sns.lineplot(
            x=x_values,
            y=y_values,
            linewidth=2.0,
            color=color,
            ax=ax,
        )

        # At each end of the line draw a short horizontal dashed stub and
        # the column label: (x, y, stub length, extra label offset).
        line_ends = (
            (x_values[0], y_values[0], -1, -10),  # start of the line
            (x_values[-1], y_values[-1], 1, 0),  # end of the line
        )
        for x_start, y_start, stub, label_offset in line_ends:
            x_end = x_start + stub
            ax.plot(
                [x_start, x_end],
                [y_start, y_start],
                "--",
                color=color,
                linewidth=2.0,
            )
            ax.text(
                x_end + label_offset,
                y_start,
                str(column),
                color=color,
                fontsize=14,
                weight="bold",
                va="center",
            )

    plt.xlabel("Day", color="white")
    plt.ylabel("Trend", color="white")
    plt.title(str(index), fontsize=20, weight="bold", color="white")

## Monthly time period

In [None]:
# plot each trend of each variable for each year
from seaborn import color_palette
import matplotlib.ticker as ticker

MONTH_NAMES = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# One colour per column of trends_df. The palette depends only on the
# columns, so it is built once instead of once per plotted line.
column_colors = dict(
    zip(
        trends_df.columns,
        color_palette("Set3", trends_df.columns.size),
    )
)

for index, row in trends_df.iterrows():
    # Dark-themed figure: black background, white ticks and labels.
    fig, ax = plt.subplots(figsize=(30, 20))
    fig.set_facecolor("black")
    ax.set_facecolor("black")
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.set_xticks(range(1, 13))
    ax.set_xticklabels(MONTH_NAMES)
    ax.yaxis.label.set_color("white")
    ax.yaxis.set_major_locator(ticker.MaxNLocator(20))
    ax.tick_params(axis="y", colors="white")
    ax.tick_params(axis="x", colors="white")

    for column, color in column_colors.items():
        series = row[column]
        months = series.index.month
        # Mean per calendar month; used for the height of the end stubs.
        monthly_means = series.groupby(months).mean().values

        sns.lineplot(
            # with month on x, seaborn draws the mean of every month
            x=months,
            y=series.values,
            linewidth=2.0,
            color=color,
            marker="o",
            markersize=10,
            ax=ax,
        )

        # At each end of the line draw a short horizontal dashed stub and
        # the column label: (x, y, stub length, extra label offset).
        line_ends = (
            (months[0], monthly_means[0], -0.5, -0.3),  # start of the line
            (months[-1], monthly_means[-1], 0.5, 0),  # end of the line
        )
        for x_start, y_start, stub, label_offset in line_ends:
            x_end = x_start + stub
            ax.plot(
                [x_start, x_end],
                [y_start, y_start],
                "--",
                color=color,
                linewidth=2.0,
            )
            ax.text(
                x_end + label_offset,
                y_start,
                str(column),
                color=color,
                fontsize=14,
                weight="bold",
                va="center",
            )

    plt.xlabel("Month", color="white")
    plt.ylabel("Trend", color="white")
    plt.title(str(index), fontsize=20, weight="bold", color="white")

## Combine Trends

In [None]:
# Accent colour scaled to the 0-1 floats matplotlib expects.
rgb = (200, 2, 110)
color = tuple(channel / 255 for channel in rgb)

# One figure per feature: the per-year trend pieces (thick, accent colour)
# drawn over the underlying series (thin, semi-transparent black).
for feature, row in trends_df.iterrows():
    fig, ax = plt.subplots(figsize=(20, 10))
    for column in trends_df.columns:
        sns.lineplot(
            data=row[column],
            linewidth=4.0,
            color=color,
            ax=ax,
        )
    sns.lineplot(
        data=xerta_df[feature],
        linewidth=1.0,
        color="black",
        alpha=0.5,
        ax=ax,
    )

    ax.set_title(f"{feature} - Trend")
    ax.set_xlabel("Year")
    ax.set_ylabel(feature)

    plt.show()

## Statistical Tests on trends

### Kruskal-Wallis Test on Trends

In [None]:
# Pairwise year comparison of the trends: for every feature, a year x year
# table whose off-diagonal cells hold the Kruskal-Wallis (statistic, p-value)
# tuple for that pair of years. Diagonal cells are left empty.
pairwise_kw_results = {}
year_labels = trends_df.columns

for feature, row in trends_df.iterrows():
    pair_table = pd.DataFrame(index=year_labels, columns=year_labels)
    for year1 in year_labels:
        for year2 in year_labels:
            if year1 == year2:
                continue
            kw_result = stats.kruskal(row[year1], row[year2])
            pair_table.loc[year1, year2] = tuple(kw_result)
    pairwise_kw_results[feature] = pair_table

In [None]:
# Inspect the pairwise (statistic, p-value) table for water temperature.
pairwise_kw_results["Water Temperature (°C)"]

In [None]:
# Overall Kruskal-Wallis H-test per feature, using every year's trend as one
# group; each entry is the (statistic, p-value) tuple.
overall_kw_results = {
    feature: tuple(
        stats.kruskal(*(row[year] for year in trends_df.columns))
    )
    for feature, row in trends_df.iterrows()
}

In [None]:
# One row per feature; column 0 is the Kruskal-Wallis statistic and column 1
# the p-value.
overall_kw_results_df = pd.DataFrame(overall_kw_results).T

### Dunn Test

In [None]:
import scikit_posthocs as sp

#### Trend

In [None]:
# Dunn post-hoc test (Holm-adjusted) between the yearly trend groups of each
# feature; the result table is relabelled with the years.
trend_dunn_results = {}

for feature, row in trends_df.iterrows():
    dunn_table = sp.posthoc_dunn(row.to_list(), p_adjust="holm")
    dunn_table.index = trends_df.columns
    dunn_table.columns = trends_df.columns
    trend_dunn_results[feature] = dunn_table

In [None]:
# One heatmap of the Dunn-test table per feature.
for feature, df in trend_dunn_results.items():
    fig, ax = plt.subplots(figsize=(30, 15))

    # seaborn leaves blank every cell where the mask is True, so this hides
    # the pairs with a value <= 0.05 and draws only those above 0.05.
    # NOTE(review): an earlier comment described this as a mask for values
    # > 0.05, i.e. the opposite - confirm which view is intended.
    mask = df <= 0.05

    sns.heatmap(
        df,
        annot=True,
        cmap="coolwarm",
        center=0,
        mask=mask,
        ax=ax,
    )
    ax.set_title(feature)

#### Data

In [None]:
# Dunn post-hoc test (Holm-adjusted) on the data themselves: for every
# feature, the values are split into one group per year.
data_dunn_results = {}

for feature in xerta_df.columns.difference(["DateTime"]):
    yearly_groups = [
        xerta_df[feature][xerta_df.index.year == year] for year in years
    ]
    dunn_table = sp.posthoc_dunn(yearly_groups, p_adjust="holm")
    dunn_table.index = years
    dunn_table.columns = years
    data_dunn_results[feature] = dunn_table

In [None]:
# One heatmap of the Dunn-test table per feature.
for feature, df in data_dunn_results.items():
    fig, ax = plt.subplots(figsize=(30, 15))

    # seaborn leaves blank every cell where the mask is True, so this hides
    # the pairs with a value <= 0.05 and draws only those above 0.05.
    # NOTE(review): an earlier comment described this as a mask for values
    # > 0.05, i.e. the opposite - confirm which view is intended.
    mask = df <= 0.05

    sns.heatmap(
        df,
        annot=True,
        cmap="coolwarm",
        center=0,
        mask=mask,
        ax=ax,
    )
    ax.set_title(feature)

# Monthly Seasonal Decomposition

For completeness, the analysis is also performed month-by-month.

In [None]:
# Split the STL trend of every feature by (year, month):
#     trend_results[feature][(year, month)] -> trend values in that month.
trend_results = {}

for feature in xerta_df.columns.difference(["DateTime"]):
    # By this point xerta_df holds one row per month (it was replaced by the
    # monthly averages), so one seasonal cycle is 12 samples - the same
    # period the yearly decomposition uses. A period of 365 only fits daily
    # rows.
    # NOTE(review): with monthly rows each (year, month) group contains a
    # single value - confirm this section is meant to run on monthly data.
    result = smt.STL(xerta_df[feature], period=12).fit()
    trend = result.trend.dropna()
    trend_results[feature] = {}
    for year in years:
        in_year = trend.index.year == year
        # Months that actually occur in this year of the trend.
        months = trend[in_year].index.month.unique()
        for month in months:
            trend_results[feature][(year, month)] = trend[
                in_year & (trend.index.month == month)
            ]

In [None]:
# Columns are features and rows are (year, month) pairs; each cell holds the
# slice of the feature's trend for that month.
trends_df = pd.DataFrame(trend_results)

In [None]:
# Display the per-month trend table.
trends_df

## Daily time period

In [None]:
# One figure per row of trends_df; every column of that row is drawn as a line
# against the day of the year, with a dashed stub and the column label at both
# ends of the line.
from seaborn import color_palette
import matplotlib.ticker as ticker

# One colour per column. The palette does not depend on the row or the line
# being drawn, so it is built once instead of once per line.
palette = color_palette("Set3", trends_df.columns.size)

for index, row in trends_df.iterrows():
    fig, ax = plt.subplots(figsize=(30, 20))
    fig.set_facecolor("black")
    ax.set_facecolor("black")
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.yaxis.label.set_color("white")
    ax.yaxis.set_major_locator(ticker.MaxNLocator(20))
    ax.tick_params(axis="y", colors="white")
    ax.tick_params(axis="x", colors="white")

    for column in trends_df.columns:
        color = palette[trends_df.columns.get_loc(column)]
        series = row[column]
        days = series.index.dayofyear
        values = series.values

        # dayofyear plots every point (no per-month averaging)
        sns.lineplot(
            x=days,
            y=values,
            linewidth=2.0,
            color=color,
            ax=ax,
        )

        # (position in the series, stub direction, extra label offset):
        # the start label is pushed 10 days further left so it does not
        # overlap the stub; the end label sits right at the stub tip.
        for position, stub, label_shift in ((0, -1, -10), (-1, 1, 0)):
            x_anchor = days[position]
            y_anchor = values[position]
            x_tip = x_anchor + stub
            ax.plot(
                [x_anchor, x_tip],
                [y_anchor, y_anchor],
                "--",
                color=color,
                linewidth=2.0,
            )
            ax.text(
                x_tip + label_shift,
                y_anchor,
                str(column),
                color=color,
                fontsize=14,
                weight="bold",
                va="center",
            )

    # The x axis holds the day of the year here (the monthly variant of this
    # plot lives in the next section), so it is labelled accordingly.
    plt.xlabel("Day of Year", color="white")
    plt.ylabel("Trend", color="white")
    plt.title(str(index), fontsize=20, weight="bold", color="white")

## Monthly time period

In [None]:
# One figure per row of trends_df; every column of that row is drawn against
# the calendar month (seaborn averages the points that share a month), with a
# dashed stub and the column label at both ends of the line.
from seaborn import color_palette
import matplotlib.ticker as ticker

month_names = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

for index, row in trends_df.iterrows():
    fig, ax = plt.subplots(figsize=(30, 20))
    fig.set_facecolor("black")
    ax.set_facecolor("black")
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.set_xticks(range(1, 13))
    ax.set_xticklabels(month_names)
    ax.yaxis.label.set_color("white")
    ax.yaxis.set_major_locator(ticker.MaxNLocator(20))
    ax.tick_params(axis="y", colors="white")
    ax.tick_params(axis="x", colors="white")

    for column in trends_df.columns:
        color = color_palette("Set3", trends_df.columns.size)[
            trends_df.columns.get_loc(column)
        ]
        series = row[column]
        months = series.index.month

        sns.lineplot(
            x=months,
            y=series.values,
            linewidth=2.0,
            color=color,
            marker="o",
            markersize=10,
            ax=ax,
        )

        # The plotted points are monthly means, so the stubs are anchored on
        # the first / last monthly mean rather than on a raw value.
        monthly_means = series.groupby(series.index.month).mean().values

        # (position, stub direction, extra label offset): start first, then end
        for position, stub, label_shift in ((0, -0.5, -0.3), (-1, 0.5, 0)):
            x_anchor = months[position]
            y_anchor = monthly_means[position]
            x_tip = x_anchor + stub
            ax.plot(
                [x_anchor, x_tip],
                [y_anchor, y_anchor],
                "--",
                color=color,
                linewidth=2.0,
            )
            ax.text(
                x_tip + label_shift,
                y_anchor,
                str(column),
                color=color,
                fontsize=14,
                weight="bold",
                va="center",
            )

    plt.xlabel("Month", color="white")
    plt.ylabel("Trend", color="white")
    plt.title(str(index), fontsize=20, weight="bold", color="white")

## Combine Trends

In [None]:
# Overlay, per row of trends_df, every stored trend piece (red) on the
# original series of that feature (blue).
for feature, row in trends_df.iterrows():
    fig, ax = plt.subplots(figsize=(20, 10))
    for period in trends_df.columns:
        sns.lineplot(data=row[period], color="red", linewidth=2.0, ax=ax)
    sns.lineplot(data=xerta_df[feature], color="blue", linewidth=2.0, ax=ax)

    ax.set_title(feature)

## Statistical Tests on trends

### Kruskal-Wallis Test on Trends

In [None]:
# Kruskal-Wallis H-test between every ordered pair of trends_df columns,
# computed separately for each row. Each off-diagonal cell holds the tuple
# (statistic, p-value); the diagonal is left empty (NaN).
pairwise_kw_results = {}
group_labels = trends_df.columns
for feature, row in trends_df.iterrows():
    pair_table = pd.DataFrame(index=group_labels, columns=group_labels)
    for first in group_labels:
        for second in group_labels:
            if first == second:
                continue
            stat, p = stats.kruskal(row[first], row[second])
            pair_table.loc[first, second] = (stat, p)
    pairwise_kw_results[feature] = pair_table

In [None]:
pairwise_kw_results["Water Temperature"]

In [None]:
# Single Kruskal-Wallis H-test per row of trends_df, using all of its columns
# as the groups; stores (statistic, p-value).
overall_kw_results = {}
for feature, row in trends_df.iterrows():
    groups = [row[label] for label in trends_df.columns]
    stat, p = stats.kruskal(*groups)
    overall_kw_results[feature] = (stat, p)

In [None]:
overall_kw_results_df = pd.DataFrame(overall_kw_results).T

### Dunn Test

In [None]:
import scikit_posthocs as sp

#### Trend

In [None]:
# Dunn's post-hoc test (Holm-adjusted p-values) on the trend samples of every
# row of trends_df; the result is stored per row label.
trend_dunn_results = {}

group_labels = trends_df.columns
for feature, row in trends_df.iterrows():
    pvalue_matrix = sp.posthoc_dunn(row.to_list(), p_adjust="holm")
    # relabel both axes of the returned matrix with the column labels of trends_df
    pvalue_matrix.index = group_labels
    pvalue_matrix.columns = group_labels
    trend_dunn_results[feature] = pvalue_matrix

In [None]:
trend_dunn_results["ABS254"]

In [None]:
# One annotated heatmap of Dunn p-values per entry of trend_dunn_results.
for feature, pvalues in trend_dunn_results.items():
    plt.figure(figsize=(30, 15))

    # seaborn leaves cells blank where the mask is True, so cells with
    # p <= 0.05 are hidden and only p-values above 0.05 are drawn.
    hidden_cells = pvalues.le(0.05)

    sns.heatmap(
        pvalues,
        mask=hidden_cells,
        annot=True,
        cmap="coolwarm",
        center=0,
    )
    plt.title(feature)

#### Data

In [None]:
# Dunn's post-hoc test (Holm-adjusted) on the raw measurements, one sample per
# year in `years`, for every column of xerta_df except "DateTime".
data_dunn_results = {}
for feature in xerta_df.columns.difference(["DateTime"]):
    # split the data for each year
    yearly_samples = [
        xerta_df[feature][xerta_df.index.year == year] for year in years
    ]
    result_df = sp.posthoc_dunn(yearly_samples, p_adjust="holm")
    # label rows and columns of the p-value matrix with the years
    result_df.index = years
    result_df.columns = years
    data_dunn_results[feature] = result_df

In [None]:
# data_dunn_results is keyed by the xerta_df column names; the absorbance
# column is called "UVA254" throughout this notebook ("ABS254" is not a key).
data_dunn_results["UVA254"]

In [None]:
# One annotated heatmap of Dunn p-values per entry of data_dunn_results.
for feature, pvalues in data_dunn_results.items():
    plt.figure(figsize=(30, 15))

    # seaborn leaves cells blank where the mask is True, so cells with
    # p <= 0.05 are hidden and only p-values above 0.05 are drawn.
    hidden_cells = pvalues.le(0.05)

    sns.heatmap(
        pvalues,
        mask=hidden_cells,
        annot=True,
        cmap="coolwarm",
        center=0,
    )
    plt.title(feature)

# Decomposition of diff timeseries

In [None]:
xerta_diff_df = xerta_df.diff()

In [None]:
# STL decomposition (period=365) of every first-differenced series, plotted as
# four stacked panels: observed, trend, seasonal, residual.
for feature in xerta_diff_df.columns.difference(["DateTime"]):
    # DataFrame.diff() leaves NaN in the first row; drop missing values so the
    # decomposition is fitted on a gap-free series.
    diff_series = xerta_diff_df[feature].dropna()
    result = smt.STL(diff_series, period=365).fit()

    fig, axs = plt.subplots(4, 1, figsize=(40, 20))
    components = (result.observed, result.trend, result.seasonal, result.resid)
    for ax, component in zip(axs, components):
        sns.lineplot(data=component, ax=ax)
    fig.suptitle(feature)

# Year by Year Correlation

In [None]:
xerta_df

In [None]:
# Relabel the rainfall column in mm (1 L/m² of rain corresponds to 1 mm).
# NOTE: xerta_df is modified in place, so every cell below this one sees the
# column as "Daily Cumulated Rainfall (mm)", not under its original name.
xerta_df.rename(
    columns={
        "Daily Cumulated Rainfall (L/m²)": "Daily Cumulated Rainfall (mm)",
    },
    inplace=True,
)

In [None]:
# Year-by-year Spearman correlation between all variables.
# correlation_results[year] is a square DataFrame labelled with variable names.
correlation_results = {}

# Index.difference() returns the names sorted, while spearmanr() returns the
# matrix in the column order of the data it receives. The data is therefore
# selected in exactly this order, so every row/column of the matrix carries
# the name of the variable it was computed from.
variable_names = xerta_df.columns.difference(["DateTime"]).to_list()

for year in xerta_df.index.year.unique():
    year_df = xerta_df.loc[xerta_df.index.year == year, variable_names]
    corr_matrix = pd.DataFrame(
        stats.spearmanr(year_df).correlation,
        columns=variable_names,
        index=variable_names,
    )
    correlation_results[year] = corr_matrix

In [None]:
# One annotated correlation heatmap per year, colours centred on 0.
for year, yearly_corr in correlation_results.items():
    fig, ax = plt.subplots(figsize=(30, 15))
    sns.heatmap(yearly_corr, annot=True, cmap="coolwarm", center=0, ax=ax)
    ax.set_title(year)

# Correlation Coefficients Plots

In [None]:
# For every variable, draw one figure per other variable showing how their
# yearly Spearman correlation evolves over `years`.
for variable in xerta_df.columns.difference(["DateTime"]):
    # other variable -> list of correlations with `variable`, one per year
    other_variables = {}

    for year, yearly_corr in correlation_results.items():
        # column of `variable` without its self-correlation
        for other_variable, value in yearly_corr[variable].drop(variable).items():
            other_variables.setdefault(other_variable, []).append(value)

    for other_variable, values in other_variables.items():
        plt.figure(figsize=(30, 15))
        sns.lineplot(
            x=years,
            y=values,
            label=other_variable,
            linestyle="--",
            linewidth=2.0,
            marker="o",
            markersize=10,
        )
        # reference line at zero correlation
        plt.axhline(0, color="red", linestyle="--")
        plt.xlabel("Year")
        plt.ylabel("Spearman Correlation Coefficient")
        plt.title(variable + " - " + "Spearman Correlation by Year")
        # one tick per year
        plt.xticks(years)
        plt.show()

In [None]:
raw_data_folder = os.path.join(data_folder, "raw_data")

In [None]:
# xerta_df = pd.read_excel(
#     os.path.join(raw_data_folder, "raw_full_dataset.xlsx"), index_col=0
# )

In [None]:
# Single figure: yearly Spearman correlation of UVA254 with every variable
# that is not listed in skip_variables.
variable = "UVA254"

# variables left out of the figure
skip_variables = [
    "pH",
    "Ammonium (mg/L)",
    "Conductivity (µS/cm)",
    "Dissolved Oxygen (mg/L)",
    "Nitrate (mg/L)",
    "Redox Potential (mV)",
    "Turbidity (NTU)",
]

# other variable -> list of correlations with `variable`, one per year
other_variables = {}
for year, df in correlation_results.items():
    # column of `variable` without its self-correlation
    correlation = df[variable].drop(variable)

    for other_variable, value in correlation.items():
        if other_variable in skip_variables:
            continue
        other_variables.setdefault(other_variable, []).append(value)

plt.figure(figsize=(30, 15))

# One colour per plotted line. Sizing the palette from the data (instead of a
# fixed 5 colours consumed with pop) keeps the cell working when the number of
# plotted variables changes.
colors = sns.color_palette("Set1", max(len(other_variables), 1))

for (other_variable, values), line_color in zip(other_variables.items(), colors):
    sns.lineplot(
        x=years,
        y=values,
        label=other_variable,
        marker="o",
        markersize=10,
        linewidth=2.0,
        linestyle="--",
        color=line_color,
    )
# reference line at zero correlation
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Year")
plt.ylabel("Spearman Correlation Coefficient")
plt.title(variable + " - " + "Spearman Correlation by Year")
# one tick per year
plt.xticks(years)

plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Plot UVA254 and solar radiation on one axis.

plt.figure(figsize=(30, 15))

# Min-max scale each series on its own so both share the same 0-1 range.
scaled_uv = MinMaxScaler().fit_transform(xerta_df["UVA254"].values.reshape(-1, 1))
scaled_solar = MinMaxScaler().fit_transform(xerta_df["Solar Radiation (W/m^2)"].values.reshape(-1, 1))

plt.plot(xerta_df.index, scaled_uv, label="UVA254")
plt.plot(xerta_df.index, scaled_solar, label="Solar Radiation")

# The labels passed to plt.plot are only displayed once a legend is drawn;
# without it the two lines cannot be told apart.
plt.legend()

plt.title("UVA254 vs Solar Radiation")
plt.xlabel("Year")
plt.ylabel("UVA254 and Solar Radiation")

plt.show()

In [None]:
# Correlation between the day-to-day changes of UVA254 and solar radiation
# (Series.corr with its default method, applied to the first differences).
xerta_diff_df = xerta_df.diff()

uv_change = xerta_diff_df["UVA254"]
solar_change = xerta_diff_df["Solar Radiation (W/m^2)"]
corr = uv_change.corr(solar_change)

print("Correlation between UVA254 and Solar Radiation:", corr)

# Plots for the paper

In [None]:
# Scatter plot of UVA254 against Ammonium with an OLS best-fit line, saved as
# a PNG for the paper, followed by the correlation between the two variables.

scatter_folder = os.path.join(paper_plot_folder, 'Tarragona', 'Scatters')
# make sure the output directory exists before the image is written
os.makedirs(scatter_folder, exist_ok=True)

fig = px.scatter(
    xerta_df,
    x="Ammonium (mg/L)",
    y="UVA254",
    trendline="ols",
    trendline_color_override="red",
)

# to fix the y axis to the 0-0.7 range, uncomment:
# fig.update_yaxes(range=[0, 0.7])

fig.update_layout(
    title="UVA254 vs Ammonium",
    xaxis_title="Ammonium (mg/l)",
    yaxis_title="UVA254 (1/m)",
)

fig.write_image(
    os.path.join(scatter_folder, "UVA254_vs_Ammonium.png"),
    scale=6
)

# correlation between UVA254 and Ammonium (Series.corr default method)
corr = xerta_df["UVA254"].corr(xerta_df["Ammonium (mg/L)"])

print("Correlation between UVA254 and Ammonium:", corr)

In [None]:
from plotly.subplots import make_subplots

from prophet import Prophet

In [None]:
xerta_df.columns

In [None]:
# one plot for each variable
# do it just for the 305 station since it is the one used in the paper

trend_folder = os.path.join(paper_plot_folder, 'Tarragona', 'Trends', 'Single')

# sort the columns into climatic, water quality and DOC as last one
climatic_variables = ['Air Temperature (°C)', 'Cumulated Rainfall (mm)']
water_quality_variables = ['Ammonium (mg/l)', 'Conductivity (µS/cm)', 'Dissolved Oxygen (mg/l)', 'Flow River Rate (m³/s)', 'Nitrate (mg/l)', 'pH', 'Water Temperature (°C)']
doc_variable = ['UVA254 (1/m)']

columns = climatic_variables + water_quality_variables + doc_variable

colors = px.colors.qualitative.Plotly

color = 'rgb(200,2,110)'

color_mapping = {
    'Air Temperature (°C)': colors[0],
    'Cumulated Rainfall (mm)': colors[1],
    'Ammonium (mg/l)': colors[2],
    'Conductivity (µS/cm)': colors[3],
    'Dissolved Oxygen (mg/l)': colors[4],
    'Flow River Rate (m³/s)': colors[5],
    'Nitrate (mg/l)': colors[6],
    'pH': colors[7],
    'Water Temperature (°C)': colors[8],
    'UVA254 (1/m)': colors[9]
}

station_df = xerta_df.copy()

station_df.rename(
    columns={
        "UVA254": "UVA254 (1/m)",
        "Ammonium (mg/L)": "Ammonium (mg/l)",
        "Daily Cumulated Rainfall (mm)": "Cumulated Rainfall (mm)",
        "Flow River (m³/s)": "Flow River Rate (m³/s)",
        "Dissolved Oxygen (mg/L)": "Dissolved Oxygen (mg/l)",
        "Nitrate (mg/L)": "Nitrate (mg/l)",
        
    },
    inplace=True,
)

# One figure per variable: the measured series plus the trend component of a
# Prophet model fitted on it (yearly seasonality only), saved as
# <trend_folder>/<column>.png.

# Fixed y-axis ranges; variables not listed here keep plotly's automatic range.
y_ranges = {
    'Ammonium (mg/l)': [0, 0.7],
    'Air Temperature (°C)': [-10, 30],
    'Cumulated Rainfall (mm)': [0, 8],
    'pH': [7.6, 8.8],
    'Water Temperature (°C)': [0, 30],
    'Dissolved Oxygen (mg/l)': [4, 18],
}

# make sure the output directory exists before any image is written
os.makedirs(trend_folder, exist_ok=True)

# x ticks on 1 January every 4 years; identical for every figure, so they are
# computed once outside the loop
start_year = station_df.index.year.min()
end_year = station_df.index.year.max()
tick_years = range(start_year, end_year + 1, 4)
tickvals = [pd.Timestamp(f'{year}-01-01') for year in tick_years]
ticktext = [str(year) for year in tick_years]

for i, column in enumerate(columns):

    fig = go.Figure()

    # Prophet expects the timestamps in 'ds' and the values in 'y'
    df = pd.DataFrame({
        'ds': station_df.index,
        'y': station_df[column]
    })

    model = Prophet(weekly_seasonality=False, daily_seasonality=False)
    model.fit(df)
    # periods=0: predict on the observed dates only, no forecast horizon
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # measured series
    fig.add_trace(
        go.Scatter(
            x=station_df.index,
            y=station_df[column],
            mode='lines',
            name=column,
            line=dict(
                color=color_mapping[column],
                width=1.5
            ),
            showlegend=False
        ),
    )

    # fitted trend
    fig.add_trace(
        go.Scatter(
            x=forecast['ds'],
            y=forecast['trend'],
            mode='lines',
            name='Trend',
            line=dict(color=color, width=1),
            showlegend=False,
            legendrank=np.inf
        ),
    )

    if column in y_ranges:
        fig.update_yaxes(range=y_ranges[column])

    fig.update_xaxes(
        tickvals=tickvals,
        ticktext=ticktext,
        title_text="Time"
    )

    fig.update_yaxes(title_text=column)

    fig.update_layout(
        title=dict(
            text='Tarragona',
            x=0.5,
            xanchor='center',
            yanchor='top',
        ),
        legend=dict(
            traceorder='normal',
        ),
    )

    # "/" in a column name would be read as a path separator
    column_ = column.replace("/", "_")

    fig.write_image(
        os.path.join(trend_folder, f"{column_}.png"),
        scale=10,
    )


In [None]:
trend_folder = os.path.join(paper_plot_folder, 'Tarragona', 'Trends')

climatic_variables = ['Air Temperature (°C)', 'Cumulated Rainfall (mm)']
water_quality_variables = ['Ammonium (mg/l)', 'Conductivity (µS/cm)', 'Dissolved Oxygen (mg/l)', 'Flow River Rate (m³/s)', 'Nitrate (mg/l)', 'pH', 'Water Temperature (°C)']
doc_variable = ['UVA254 (1/m)']

columns = climatic_variables + water_quality_variables + doc_variable

colors = px.colors.qualitative.Plotly

color = 'rgb(200,2,110)'

color_mapping = {
    'Air Temperature (°C)': colors[0],
    'Cumulated Rainfall (mm)': colors[1],
    'Ammonium (mg/l)': colors[2],
    'Conductivity (µS/cm)': colors[3],
    'Dissolved Oxygen (mg/l)': colors[4],
    'Flow River Rate (m³/s)': colors[5],
    'Nitrate (mg/l)': colors[6],
    'pH': colors[7],
    'Water Temperature (°C)': colors[8],
    'UVA254 (1/m)': colors[9]
}

# Copy of xerta_df with the column labels used in the paper figures.
# The rainfall column of xerta_df was already renamed to
# "Daily Cumulated Rainfall (mm)" in the "Year by Year Correlation" section,
# so the original "(L/m²)" key alone no longer matches and
# station_df['Cumulated Rainfall (mm)'] below would raise a KeyError.
# Both spellings are mapped; rename() ignores keys that are not present.
station_df = xerta_df.rename(
    columns={
        "UVA254": "UVA254 (1/m)",
        "Ammonium (mg/L)": "Ammonium (mg/l)",
        "Daily Cumulated Rainfall (L/m²)": "Cumulated Rainfall (mm)",
        "Daily Cumulated Rainfall (mm)": "Cumulated Rainfall (mm)",
        "Flow River (m³/s)": "Flow River Rate (m³/s)",
        "Dissolved Oxygen (mg/L)": "Dissolved Oxygen (mg/l)",
        "Nitrate (mg/L)": "Nitrate (mg/l)",
    },
)
    
fig = make_subplots(
    len(columns),
    1,
    shared_xaxes=True,
    subplot_titles=columns,
    vertical_spacing=0.02
)

trend_show = True

# Fixed y-axis ranges per variable; variables not listed keep the automatic range.
fixed_y_ranges = {
    'Ammonium (mg/l)': [0, 0.7],
    'Air Temperature (°C)': [-10, 30],
    'Cumulated Rainfall (mm)': [0, 8],
    'pH': [7.6, 8.8],
    'Water Temperature (°C)': [0, 30],
    'Dissolved Oxygen (mg/l)': [4, 18],
}

# One subplot row per variable: measured series plus the trend component of a
# Prophet model fitted on it (yearly seasonality only).
for i, column in enumerate(columns):
    subplot_row = i + 1

    # Prophet expects the timestamps in 'ds' and the values in 'y'
    df = pd.DataFrame({
        'ds': station_df.index,
        'y': station_df[column]
    })

    model = Prophet(weekly_seasonality=False, daily_seasonality=False)
    model.fit(df)
    # periods=0: predict on the observed dates only
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    measured_trace = go.Scatter(
        x=station_df.index,
        y=station_df[column],
        mode='lines',
        name=column,
        line=dict(
            color=color_mapping[column],
            width=1.5
        ),
        legendrank=i,
        showlegend=False
    )
    fig.add_trace(measured_trace, row=subplot_row, col=1)

    trend_trace = go.Scatter(
        x=forecast['ds'],
        y=forecast['trend'],
        mode='lines',
        name='Trend',
        line=dict(color=color, width=1),
        showlegend=False,  # Change to trend_show to show the legend
        legendrank=np.inf
    )
    fig.add_trace(trend_trace, row=subplot_row, col=1)

    # the flag is cleared after the first variable
    trend_show = False

    y_range = fixed_y_ranges.get(column)
    if y_range is not None:
        fig.update_yaxes(range=y_range, row=subplot_row, col=1)
    
fig.update_xaxes(title_text="Time", row=len(columns), col=1)
    
start_year = station_df.index.year.min()
end_year = station_df.index.year.max()
tickvals = [pd.Timestamp(f'{year}-01-01') for year in range(start_year, end_year + 1, 3)]
ticktext = [str(year) for year in range(start_year, end_year + 1, 3)]   

fig.update_xaxes(
    tickvals=tickvals,
    ticktext=ticktext,
) 
    
fig.update_layout(
    title=dict(
        text="Tarragona",
        x=0.5,
        xanchor='center',
        yanchor='top',
        y=0.99,
        font=dict(size=10),
    ),
    font=dict(size=8),
    legend=dict(
        traceorder='normal',
    ),
    margin=dict(
        l=30,  # Left margin
        r=30,  # Right margin
        t=50,  # Top margin
        b=150  # Bottom margin (increase to add blank space)
    )
    )

#reduce the font size of the subplot titles
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=8)

fig.write_image(
    os.path.join(trend_folder, "trends.png"),
    scale=10,
    width=400,
    height=220 * len(columns)
    
)

In [None]:
from scipy.stats import pearsonr

# Year-by-year Pearson correlation between air temperature, ammonium and UVA254.
columns = ['Air Temperature (°C)', 'Ammonium (mg/l)', 'UVA254 (1/m)']

# independent copy of the three columns, relabelled as in the paper
station_df = xerta_df[['Air Temperature (°C)', 'Ammonium (mg/L)', 'UVA254']].rename(
    columns={
        "UVA254": "UVA254 (1/m)",
        "Ammonium (mg/L)": "Ammonium (mg/l)",
    },
)

# Result table: rows are (variable, year), columns are the other variable;
# each filled cell holds the object returned by pearsonr.
all_years = sorted(station_df.index.year.unique())
correlation_results = pd.DataFrame(
    index=pd.MultiIndex.from_product([columns, all_years]),
    columns=columns
)

# min-max normalise every column
for column in columns:
    station_df[column] = MinMaxScaler().fit_transform(station_df[[column]])

for year in station_df.index.year.unique():
    year_df = station_df.loc[station_df.index.year == year, columns]

    for column in columns:
        for column2 in columns:
            if column != column2:
                correlation_results.loc[(column, year), column2] = pearsonr(
                    year_df[column], year_df[column2]
                )

In [None]:
# One figure per variable: its yearly Pearson coefficient against each of the
# other variables, saved as <column>_correlation.png.

correlation_folder = os.path.join(paper_plot_folder, 'Tarragona', 'Correlations')

color = 'rgb(200,2,110)'

zero_line_style = dict(color="black", width=1, dash="dashdot")

for column in columns:

    fig = go.Figure()

    for column2 in (name for name in columns if name != column):
        # all years of `column` against `column2`
        correlation = correlation_results.loc[(column, slice(None)), column2]
        years = correlation.index.get_level_values(1)

        fig.add_trace(
            go.Scatter(
                x=years,
                # first element of the stored pearsonr result: the coefficient
                y=correlation.apply(lambda x: x[0]),
                mode='lines+markers',
                name=column2
            )
        )

    fig.update_layout(
        title=f"{column} vs Other Parameters",
        xaxis_title="Year",
        yaxis_title="Pearson Correlation Coefficient",
        legend=dict(x=0.01, y=0.99)
    )

    fig.update_yaxes(range=[-1, 1])

    # horizontal reference line at zero correlation
    fig.add_shape(
        type="line",
        x0=years.min(),
        y0=0,
        x1=years.max(),
        y1=0,
        line=zero_line_style
    )

    # "/" in a column name would be read as a path separator
    column_ = column.replace('/', '_')

    fig.write_image(
        os.path.join(correlation_folder, f"{column_}_correlation.png"),
        scale=6
    )

In [None]:
xerta_df.columns

In [None]:
# Yearly frequency of daily-rainfall classes.
# Two classes are defined: 'Low' for (0, 2] mm and 'High' for (2, inf) mm.
# pd.cut uses right-closed bins and include_lowest is not set, so a value of
# exactly 0 falls outside both bins and gets a missing Class.
# NOTE(review): confirm that leaving 0 mm days unclassified is intended.
rainfall_df = xerta_df[['Daily Cumulated Rainfall (mm)']].assign(
    Class=lambda frame: pd.cut(
        frame['Daily Cumulated Rainfall (mm)'],
        bins=[0, 2, np.inf],
        labels=['Low', 'High'],
    ),
    Year=lambda frame: frame.index.year,
    Month=lambda frame: frame.index.month,
)

# number of days per (Year, Class)
pivot_table = rainfall_df.pivot_table(index='Year', columns='Class', aggfunc='size', fill_value=0)

# one line per class
fig = go.Figure()

for column in pivot_table.columns:
    class_trace = go.Scatter(
        x=pivot_table.index,
        y=pivot_table[column],
        mode='lines+markers',
        name=column
    )
    fig.add_trace(class_trace)

fig.update_layout(
    title="Rainfall Classes Frequency Over Time",
    xaxis_title="Year",
    yaxis_title="Frequency"
)

fig.show()

## Granger Causality

In [None]:
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, kpss

def check_stationarity(dataframe, max_diff=2):
    """
    Difference a time series until it looks stationary.

    The series is treated as non-stationary when the ADF test fails to reject
    its unit-root null (p > 0.05) or the KPSS test rejects its stationarity
    null (p < 0.05). In that case it is first-differenced and tested again.

    Parameters
    ----------
    dataframe : pandas Series or single-column DataFrame
        The series to check; it is passed as-is to adfuller and kpss.
    max_diff : int, default 2
        Maximum number of differencing steps. The result of the last allowed
        differencing step is returned without being tested again.

    Returns
    -------
    The input with missing values removed, differenced 0 to max_diff times.
    """
    # Missing values are removed before the first test as well, not only after
    # differencing, so the tests never receive NaNs.
    series = dataframe.dropna()
    for attempt in range(1, max_diff + 1):
        adf_pvalue = adfuller(series)[1]
        kpss_pvalue = kpss(series)[1]
        non_stationary = adf_pvalue > 0.05 or kpss_pvalue < 0.05
        if not non_stationary:
            break
        print(f'Data is non-stationary. Differencing the data. Attempt: {attempt}')
        # diff() leaves a NaN in the first row; drop it straight away
        series = series.diff().dropna()
    return series

def grangers_causation_matrix_multivariate(data, maxlag=12, test='ssr_chi2test', verbose=False):
    """
    Build a Granger-causality p-value matrix from a single fitted VAR model.

    Each column of ``data`` is first passed through ``check_stationarity``,
    rows that are incomplete afterwards are dropped, and a VAR model is
    fitted with BIC-based lag selection. For every ordered pair of distinct
    variables the p-value of the VAR causality test is stored.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per variable.
    maxlag : int, default 12
        Currently unused: the lag order is chosen by ``VAR.fit(ic='bic')``.
        Kept so existing callers keep working.
    test : str, default 'ssr_chi2test'
        Currently unused; kept so existing callers keep working.
    verbose : bool, default False
        If True, print the p-value of every tested pair.

    Returns
    -------
    tuple
        ``(matrix, summary, irf)`` where ``matrix`` is a DataFrame whose rows
        (suffix ``_y``) are the effects and whose columns (suffix ``_x``) are
        the causes, ``summary`` is the fitted model summary and ``irf`` is
        the impulse response analysis over 12 periods.
    """
    variables = data.columns
    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)

    new_data = pd.DataFrame()
    # Ensure each series is stationary (each column is differenced on its own)
    for column in data.columns:
        new_data[column] = check_stationarity(data[[column]])

    # columns may have lost a different number of leading rows
    new_data = new_data.dropna()

    # Fit the VAR model with automatic lag selection
    model = VAR(new_data)
    model_fitted = model.fit(ic='bic')

    # Create Granger causality matrix: row r is the effect (Y), column c is
    # the cause (X)
    for r in variables:
        for c in variables:
            if c != r:
                # test_causality takes (caused, causing). The arguments are
                # named explicitly because passing (c, r) positionally tests
                # the opposite direction from the one the matrix labels claim.
                test_result = model_fitted.test_causality(caused=r, causing=c)
                p_value = round(test_result.pvalue, 4)
                df.loc[r, c] = p_value
                if verbose:
                    print(f'Y = {r}, X = {c}, P-Value = {p_value}')
            else:
                # a variable is never reported as causing itself
                df.loc[r, c] = 1

    # Rename columns and indexes for clarity
    df.columns = [var + '_x' for var in variables]
    df.index = [var + '_y' for var in variables]

    # Also return the model summary and impulse response functions
    return df, model_fitted.summary(), model_fitted.irf(12)


In [None]:
# Inspect the column labels of xerta_df before they are sorted and renamed.
xerta_df.columns

In [None]:
# Put the columns in sorted label order.
xerta_df = xerta_df.loc[:, sorted(xerta_df.columns)]

In [None]:
# Relabel the variables, then discard the ones that are left out of the
# analysis that follows.
xerta_df = xerta_df.rename(
    columns={
        "UVA254": "UVA254 (1/m)",
        "Ammonium (mg/L)": "Ammonium (mg/l)",
        "Daily Cumulated Rainfall (L/m²)": "Cumulated Rainfall (mm)",
        "Flow River (m³/s)": "Flow River Rate (m³/s)",
        "Dissolved Oxygen (mg/L)": "Dissolved Oxygen (mg/l)",
        "Nitrate (mg/L)": "Nitrate (mg/l)",
    }
).drop(
    columns=[
        "Redox Potential (mV)",
        "Turbidity (NTU)",
        "Solar Radiation (W/m^2)",
    ]
)

In [None]:
# Run the multivariate Granger analysis on all remaining xerta_df columns.
# NOTE(review): maxlag is accepted by the function but is not used in its
# visible body; the lag order comes from the BIC selection inside it.
causality_matrix, summary, irf = grangers_causation_matrix_multivariate(xerta_df, maxlag=1)

# text summary of the fitted VAR model
print(summary)

# Plot the orthogonalized impulse responses.
# NOTE(review): despite its name, `ax` is used as a figure here
# (set_size_inches is called on it) - confirm and consider renaming.
ax = irf.plot(orth=True)
ax.set_size_inches(40, 20)

plt.show()
    

In [None]:
# Display the p-value matrix (rows carry the `_y` suffix, columns the `_x` suffix).
causality_matrix

In [None]:
# Create the output folder first so the export does not fail on a fresh checkout.
granger_folder = os.path.join(data_folder, 'granger')
os.makedirs(granger_folder, exist_ok=True)

causality_matrix.to_excel(os.path.join(granger_folder, 'causality_matrix.xlsx'))

In [None]:
matrix = causality_matrix.copy()

# Remove only the trailing _x / _y markers. An unanchored replace would also
# strip "_x" / "_y" from the middle of a variable name.
matrix.columns = matrix.columns.str.replace(r'_x$', '', regex=True)
matrix.index = matrix.index.str.replace(r'_y$', '', regex=True)

# Create a directed graph
G = nx.DiGraph()

# Add nodes for all causes (columns) and effects (rows). After the suffixes
# are removed a variable has the same label in both roles, so the second call
# updates the attribute of a node that already exists.
G.add_nodes_from(matrix.columns, bipartite=0)  # Causes
G.add_nodes_from(matrix.index, bipartite=1)    # Effects

# Add an edge cause -> effect for significant Granger causality (p-value < 0.05)
threshold = 0.05
for cause in matrix.columns:
    for effect in matrix.index:
        if matrix.loc[effect, cause] < threshold:
            G.add_edge(cause, effect)

plt.figure(figsize=(12, 6))
pos = nx.circular_layout(G)
nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=3000, edge_color='black',
        arrows=True, arrowsize=20, font_size=12, font_color='darkblue')
plt.title("Granger Causality Graph - Tarragona")

# Create the output folder first so savefig does not fail on a fresh checkout.
granger_plot_folder = os.path.join(paper_plot_folder, 'Tarragona')
os.makedirs(granger_plot_folder, exist_ok=True)

plt.savefig(
    os.path.join(granger_plot_folder, "granger_causality.png"),
    bbox_inches='tight',
    dpi=600
)

## Statistical Tests on trend

In [None]:
# Empty results table: one row per xerta_df column, one column per statistic
# (ADF, Mann-Kendall and linear-regression slope).
statistics_df = pd.DataFrame(
    columns=[
        'ADF p-value',
        'ADF result',
        'MK p-value',
        'MK result',
        'Slope',
        'Slope p-value',
    ],
    index=xerta_df.columns,
)

In [None]:
# For every variable: fit Prophet to extract trend and yearly seasonality,
# fit a linear regression over time, run ADF / KPSS / Mann-Kendall tests,
# store the results in statistics_df and plot the series.
# NOTE(review): `Prophet` and `sm` are not imported in this cell; they are
# presumably imported in an earlier cell - confirm.
for column in xerta_df.columns:
    # work on a NaN-free copy of the current variable only
    df = xerta_df[[column]].dropna()

    # A column without any valid observation cannot be analysed; without this
    # guard the `.index[0]` lookup below raises IndexError.
    if df.empty:
        print(f"{column} - no valid observations, skipping")
        continue

    date_range = df.index
    date_range = date_range.min(), date_range.max()

    # make sure that the dataframe starts and finishes in the same month:
    # find the first timestamp whose month equals the month of the last one
    start_index = df[df.index.month == date_range[1].month].index[0]

    # Slice the dataframe to start from the found index
    df = df.loc[start_index:]

    # ===== Prophet =====

    # Prophet expects the columns "ds" (timestamp) and "y" (value)
    df = df.rename_axis("ds").reset_index().rename(columns={column: "y"})

    model = Prophet()
    model.fit(df)
    # periods=0: predict only over the historical dates, no forecast horizon
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Merge the Prophet components with the original observations
    forecasting_final = pd.merge(
        forecast,
        df,
        how="inner",
        on="ds",
    )

    # Linear regression of the raw series on the sample position (0..n-1),
    # so the slope is expressed per observation step.
    X = np.arange(df.shape[0])
    X = sm.add_constant(X)
    y = df["y"].copy()

    model = sm.OLS(y, X)
    results = model.fit()

    # fitted regression line, indexed by date for plotting
    line = pd.Series(results.predict(X), index=df['ds'])

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df['ds'],
            y=df["y"],
            mode="lines",
            name="Original",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=forecasting_final["ds"],
            y=forecasting_final["trend"],
            mode="lines",
            name="Trend",
        )
    )

    # perform Augmented Dickey-Fuller test
    adf_result = adfuller(df["y"], autolag="AIC")
    # perform KPSS test
    kpss_result = kpss(df["y"])

    # Perform Mann-Kendall test on the series with the yearly seasonal
    # component removed. Both operands come from the merged frame so that
    # every observation is paired with the seasonal value of the same date,
    # instead of relying on two separate frames sharing the same row order.
    mk_result = mk.original_test(
        forecasting_final["y"] - forecasting_final["yearly"]
    )

    print()
    print(f"{column} - Augmented Dickey-Fuller Test")
    print(f"ADF P-value: {adf_result[1]:.4f}")
    print(f"Lag used: {adf_result[2]}")
    if adf_result[1] > 0.05:
        print("Unit root present, data is non-stationary")
    print()

    print(f"{column} - KPSS Test")
    print(f"KPSS P-value: {kpss_result[1]:.4f}")
    if kpss_result[1] < 0.05:
        print("Unit root present, data is non-stationary")
    print()

    # the two tests agree when both say non-stationary or both say stationary
    if (adf_result[1] > 0.05 and kpss_result[1] < 0.05) or (adf_result[1] < 0.05 and kpss_result[1] > 0.05):
        print("=== Consistency between tests! ===")
        print()

    print(f"{column} - Mann-Kendall Test")
    print(f"Monotonic Trend: {mk_result.trend}")
    print(f"p-value: {mk_result.p:.4f}")
    print()
    # position 1 is the time regressor (position 0 is the constant)
    slope = results.params.iloc[1]
    print(f"{column} - Slope: {slope}")

    p_value = results.pvalues.iloc[1]
    print(f"{column} - P-value: {p_value}")

    statistics_df.loc[column, 'ADF p-value'] = adf_result[1]
    statistics_df.loc[column, 'ADF result'] = 'Stationary' if adf_result[1] < 0.05 else 'Non-Stationary'

    statistics_df.loc[column, 'MK p-value'] = mk_result.p
    statistics_df.loc[column, 'MK result'] = mk_result.trend

    # store the slope
    statistics_df.loc[column, 'Slope'] = slope
    statistics_df.loc[column, 'Slope p-value'] = p_value

    fig.add_trace(
        go.Scatter(
            x=line.index,
            y=line,
            mode="lines",
            name="Linear Regression",
            line=dict(dash="dash", color="black"),
        ),
    )

    start_date = df['ds'].min()
    end_date = df['ds'].max()

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title=column,
        font=dict(
            size=18,
        ),
        title=f"{column} - {start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')} - Slope: {slope:.4f}",
    )

    fig.show()

In [None]:
# Create the output folder first so the export does not fail on a fresh checkout.
statistics_folder = os.path.join(data_folder, 'statistics')
os.makedirs(statistics_folder, exist_ok=True)

statistics_df.to_excel(os.path.join(statistics_folder, 'statistics.xlsx'))