# Preprocess Daily Raw Data

In [None]:
import os

import random
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from scipy import stats

from sklearn.metrics import mean_squared_error

In [None]:
plt.rcParams.update({"font.size": 14})

In [None]:
random.seed(42)
np.random.seed(42)

## Load Data

In [None]:
data_folder = os.path.join("..", "data", "tarragona")
raw_data_folder = os.path.join(data_folder, "raw")
clean_data_folder = os.path.join(data_folder, "clean")

In [None]:
# One (initially empty) feature dictionary per monitoring location.
datasets_dict = {
    location: {}
    for location in ("TORTOSA", "GUIAMETS", "MEQUINENZA", "XERTA")
}

In [None]:
# Load every raw file into datasets_dict[location][feature_name].
# The location is the first "_"-separated token of the file name and the
# feature name is everything between it and the last two tokens.
# Entries are visited in sorted order because os.listdir() returns them in
# arbitrary order, and the insertion order of the features drives the order
# of every later loop, plot and merge.
for file in sorted(os.listdir(raw_data_folder)):
    # Ignore anything that is not a data file before parsing its name.
    if not file.endswith((".csv", ".xlsx")):
        continue

    name_parts = file.split("_")
    location = name_parts[0]
    feature_name = "_".join(name_parts[1:-2])
    file_path = os.path.join(raw_data_folder, file)

    if file.endswith(".csv"):
        # The CSV exports use ";" as separator and "," as decimal mark.
        datasets_dict[location][feature_name] = pd.read_csv(
            filepath_or_buffer=file_path,
            sep=";",
            decimal=",",
            header=0,
            encoding="utf-8",
        )
    else:
        datasets_dict[location][feature_name] = pd.read_excel(
            file_path,
            header=0,
        )

In [None]:
tortosa_dfs = datasets_dict["TORTOSA"]
guiamets_dfs = datasets_dict["GUIAMETS"]
mequinenza_dfs = datasets_dict["MEQUINENZA"]
xerta_dfs = datasets_dict["XERTA"]

# Tortosa Preprocessing

In [None]:
tortosa_dfs.keys()

In [None]:
# The water temperature has two datasets, but the excel one has no missing values
tortosa_dfs["watertemperature"].isna().sum() / tortosa_dfs[
    "watertemperature"
].shape[0]

In [None]:
tortosa_dfs["water_temperature"].isna().sum() / tortosa_dfs[
    "water_temperature"
].shape[0]

In [None]:
tortosa_dfs.pop("water_temperature")

In [None]:
# Check cumulated rainfall data since it is the only csv file
tortosa_dfs["cumulated_rainfall_24h"]

In [None]:
# The "fecha" column is the one to take into account: it is equal to the
# "Fecha acumulado" column of the same dataframe, but it has no missing
# values.
mask = (
    tortosa_dfs["cumulated_rainfall_24h"]["Fecha acumulado"]
    == tortosa_dfs["cumulated_rainfall_24h"]["fecha"]
)
# Display the rows where the two date columns do not match.
tortosa_dfs["cumulated_rainfall_24h"][~mask]

In [None]:
tortosa_dfs["cumulated_rainfall_24h"].isna().sum() / tortosa_dfs[
    "cumulated_rainfall_24h"
].shape[0]

In [None]:
# Keep only the date and the accumulated value, using the same column names
# as the other features.
rainfall_columns = {"fecha": "DateTime", "Acumulado": "Average"}
tortosa_dfs["cumulated_rainfall_24h"] = (
    tortosa_dfs["cumulated_rainfall_24h"]
    .loc[:, list(rainfall_columns)]
    .rename(columns=rainfall_columns)
)

In [None]:
tortosa_dfs["conductivity"].isna().sum() / tortosa_dfs[
    "conductivity"
].shape[0]

In [None]:
tortosa_dfs["flowriver"].isna().sum() / tortosa_dfs["flowriver"].shape[
    0
]

In [None]:
tortosa_dfs["turbidity"].isna().sum() / tortosa_dfs["turbidity"].shape[
    0
]

In [None]:
# Translate the Spanish headers to English, in place, for every Tortosa
# frame except the rainfall one.
spanish_to_english = {
    "Fecha": "DateTime",
    "Promedio": "Average",
    "Máximo": "Maximum",
    "Mínimo": "Minimum",
}

for feature, df in tortosa_dfs.items():
    if feature == "cumulated_rainfall_24h":
        continue
    df.rename(columns=spanish_to_english, inplace=True)

In [None]:
# Parse the dates and force every other column to numeric; values that
# cannot be parsed become NaN.
for df in tortosa_dfs.values():
    df["DateTime"] = pd.to_datetime(df["DateTime"])
    value_columns = df.columns.difference(["DateTime"])
    df[value_columns] = df[value_columns].apply(
        pd.to_numeric, errors="coerce"
    )

In [None]:
# Total number of NaN cells in each dataset
for feature, df in tortosa_dfs.items():
    nan_total = df.isna().sum().sum()
    print(f"{feature}: {nan_total}")

In [None]:
# For the moment, discard every row that contains a missing value
tortosa_dfs.update(
    {name: frame.dropna() for name, frame in tortosa_dfs.items()}
)

## Outliers Detection and Missing Values

In [None]:
# One boxplot per feature, side by side.
fig, axs = plt.subplots(1, len(tortosa_dfs), figsize=(30, 10))

for i, (feature, df) in enumerate(tortosa_dfs.items()):
    ax = axs[i]

    if feature != "cumulated_rainfall_24h":
        sns.boxplot(data=df, y="Average", ax=ax)
    else:
        # For rainfall only the strictly positive values are plotted
        gtz_df = df[df["Average"] > 0]
        sns.boxplot(data=gtz_df, y="Average", ax=ax)
        # share of values equal to 0, then the total number of rows
        print(df[df["Average"] == 0].shape[0] / df.shape[0])
        print(df.shape[0])

    # no y-axis label: the feature name goes in the title
    ax.set_ylabel("")
    ax.set_title(feature)

### Inspect Data

#### Histograms

In [None]:
# Distribution of the daily average of every feature (histogram + KDE).
# No legend is requested: the histogram has no labelled artist, so a
# plt.legend() call would only emit a warning.
for feature, df in tortosa_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df["Average"], kde=True)
    plt.title(feature)
    plt.show()

#### Boxplots

In [None]:
# One standalone boxplot of the daily average per feature.
for feature, df in tortosa_dfs.items():
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=df, y="Average", ax=ax)
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Daily average of every feature over time.
for feature, df in tortosa_dfs.items():
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(
        data=df, x="DateTime", y="Average", label="Average", ax=ax
    )
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data - Feature-wise

How I determined acceptable ranges of values for each feature

* Turbidity: [DataStream](https://datastream.org/en-ca/guidebook/turbidity#:~:text=Turbidity%20values%20less%20than%2010,be%20more%20than%20100%20NTU.) says that high levels of turbidity are > 100 NTU. [In-Situ](https://in-situ.com/us/faq/water-quality-information/what-are-typical-turbidity-values-in-natural-environments#:~:text=Turbidity%20values%20in%20natural%20environments%20can%20range%20from%20as%20low,a%20major%20storm%20runoff%20event.) says that levels of turbidity > 100 NTU are unsafe for most aquatic life. [Wikipedia](https://en.wikipedia.org/wiki/Ebro) says that the Ebro river hosts a rich ecosystem. Given these considerations, **I decided to treat values between 0 and 150 NTU as valid**.

* Daily Cumulated Rainfall: This [link](https://weather-and-climate.com/average-monthly-precipitation-Rainfall,tarragona-catalonia-es,Spain) says that the average monthly amount of precipitation ranges from 20 mm to 80 mm. In the worst-case scenario (80 mm in a month, which occurs in October), and considering that [here](https://weatherspark.com/y/45958/Average-Weather-in-Tarragona-Spain-Year-Round) it says that it rains on ~ 6 days in October on average, we can assume that in a single day it can rain up to ~ **13 mm. I decided to take this value as a threshold.**

* Flow River: [Wikipedia](https://en.wikipedia.org/wiki/Ebro#Flow_and_floods) confirms the domain of the plot, so no cleaning is required.

* Water Temperature: from the plot it is clear that at the beginning of the time series there is an unusual spike. Therefore, **only the values of that spike are removed.**

* Conductivity: same as water temperature.

#### Turbidity

In [None]:
turb_df = tortosa_dfs["turbidity"].copy()

In [None]:
threshold = 150

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(data=turb_df, x="DateTime", y="Average", label="Average")
plt.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
plt.legend()
plt.show()

In [None]:
turb_df = turb_df[turb_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=turb_df, x="DateTime", y="Average", label="Average")

In [None]:
tortosa_dfs["turbidity"] = turb_df

#### Daily Cumulated Rainfall

UPDATE: no removal of data since I found out that those values are in a valid range.

In [None]:
rain_df = tortosa_dfs["cumulated_rainfall_24h"].copy()

In [None]:
threshold = 13

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(data=rain_df, x="DateTime", y="Average", label="Average")
plt.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
plt.legend()
plt.show()

In [None]:
# rain_df = rain_df[rain_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=rain_df, x="DateTime", y="Average", label="Average")

In [None]:
tortosa_dfs["cumulated_rainfall_24h"] = rain_df

#### Water Temperature

In [None]:
water_df = tortosa_dfs["watertemperature"].copy()

In [None]:
threshold = 35

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(data=water_df, x="DateTime", y="Average", label="Average")
plt.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
plt.legend()
plt.show()

In [None]:
water_df = water_df[water_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=water_df, x="DateTime", y="Average", label="Average")

In [None]:
tortosa_dfs["watertemperature"] = water_df

#### Conductivity

In [None]:
cond_df = tortosa_dfs["conductivity"].copy()

In [None]:
upper_threshold = 2500

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(data=cond_df, x="DateTime", y="Average", label="Average")
plt.axhline(
    y=upper_threshold, color="r", linestyle="--", label="Threshold"
)
plt.legend()
plt.show()

In [None]:
cond_df = cond_df[cond_df["Average"] < upper_threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=cond_df, x="DateTime", y="Average", label="Average")

In [None]:
lower_threshold = 250

In [None]:
cond_df = cond_df[cond_df["Average"] > lower_threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=cond_df, x="DateTime", y="Average", label="Average")

In [None]:
tortosa_dfs["conductivity"] = cond_df

### Show uncovered days

In [None]:
# Put every feature on a continuous daily calendar spanning its own first
# and last date; days without an observation become NaN rows.
for feature, df in tortosa_dfs.items():
    indexed = df.set_index("DateTime")
    all_dates = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq="D"
    )
    tortosa_dfs[feature] = (
        indexed.reindex(all_dates, fill_value=None)
        .reset_index()
        .rename(columns={"index": "DateTime"})
    )

In [None]:
# Per-column NaN counts for every feature, separated by a dashed rule.
for feature, df in tortosa_dfs.items():
    print(feature, "", df.isna().sum(), "", "-" * 50, sep="\n")

In [None]:
# For every Tortosa feature: flag the missing days, plot the series with the
# missing days and the validity thresholds, and report the percentage of
# missing values and the number of values outside the thresholds.

# feature -> (axis label, lower threshold, upper threshold).
# None means that there is no bound on that side.
feature_plot_settings = {
    "cumulated_rainfall_24h": ("Daily Cumulated Rainfall (mm)", None, None),
    "conductivity": ("Conductivity (µS/cm)", 250, 2500),
    "flowriver": ("Flow River (m³/s)", None, None),
    "turbidity": ("Turbidity (NTU)", None, 150),
    "watertemperature": ("Water Temperature (°C)", None, 35),
}

missing_values_df = {}

for feature, df in tortosa_dfs.items():
    df["is_missing"] = df["Average"].isna()

    # Unknown features get an empty label and no thresholds.
    label, lower_threshold, upper_threshold = feature_plot_settings.get(
        feature, ("", None, None)
    )

    missing_values_perc = (df["is_missing"].sum() / df.shape[0]) * 100
    missing_values_perc = missing_values_perc.round(2)

    missing_values_df[feature] = missing_values_perc

    plt.figure(figsize=(20, 10))
    sns.lineplot(
        data=df, x="DateTime", y="Average", label="Observed Values"
    )

    # Get current axis
    ax = plt.gca()

    # One major tick per year, labelled with the year only
    years = mdates.YearLocator()
    years_fmt = mdates.DateFormatter("%Y")
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(years_fmt)

    # One faint vertical line per missing day
    for date in df[df["is_missing"]]["DateTime"]:
        plt.axvline(
            x=date, ymin=0.01, ymax=0.99, color="grey", alpha=0.1
        )

    # Plot an empty line with a label for the legend
    plt.plot([], [], color="grey", alpha=0.1, label="Missing values")

    # Compare against None explicitly so that a threshold of 0 is not
    # silently ignored.
    if lower_threshold is not None:
        plt.axhline(
            y=lower_threshold,
            color="purple",
            linestyle="--",
            label=f"Lower Threshold: {lower_threshold}",
        )

    if upper_threshold is not None:
        plt.axhline(
            y=upper_threshold,
            color="r",
            linestyle="--",
            label=f"Upper Threshold: {upper_threshold}",
        )

    plt.title(
        label + " - " + str(missing_values_perc) + "% of missing values"
    )

    plt.ylabel(label)

    plt.legend(loc="upper right")

    plt.show()

    print(feature)
    print("N samples:", df.shape[0])

    # compute the number of suspicious values (outside the thresholds)
    n_sus = 0
    if lower_threshold is not None:
        n_sus += df[df["Average"] < lower_threshold].shape[0]

    if upper_threshold is not None:
        n_sus += df[df["Average"] > upper_threshold].shape[0]

    print("N suspicious:", n_sus)


missing_values_df = pd.DataFrame(
    missing_values_df.items(), columns=["Feature", "Missing values (%)"]
)

In [None]:
missing_values_df

### Combine Datasets

In [None]:
# For each dataframe keep only the date, the value and the missing flag,
# naming the value column after the feature and the flag
# "<feature>_is_missing".
# rename() is applied to the selection and returns a new frame, which avoids
# the SettingWithCopyWarning raised by an in-place rename on a column slice.
for feature, df in tortosa_dfs.items():
    tortosa_dfs[feature] = df[["DateTime", "Average", "is_missing"]].rename(
        columns={
            "Average": feature,
            "is_missing": feature + "_is_missing",
        }
    )

In [None]:
# Combine all Tortosa datasets into a single dataframe, joined on DateTime

from functools import reduce

# Get a list of all dataframes
dfs = list(tortosa_dfs.values())

# Use reduce to merge all dataframes pairwise, left to right.
# NOTE(review): pd.merge is called without `how`, so its default join type
# applies -- confirm that dropping dates not shared by every feature is
# intended.
tortosa_df = reduce(
    lambda left, right: pd.merge(left, right, on="DateTime"), dfs
)

### Fill NaNs

For non-seasonal data, a linear interpolation is performed.
For seasonal data, the interpolation is performed by first removing the seasonal component and then performing a linear interpolation on the trend + residuals.

In [None]:
# Work on a DateTime-indexed copy so that the merged frame stays untouched
df_fill = tortosa_df.copy().set_index("DateTime")

In [None]:
# Time-weighted interpolation, filling in both directions, then DateTime
# goes back to being a regular column.
df_fill = df_fill.interpolate(
    method="time", limit_direction="both"
).reset_index()

In [None]:
# Plot every feature after imputation on top of the original series and
# report the percentage of values that are still missing.
for feature in tortosa_dfs.keys():
    # NaNs left after interpolation. Kept in a local Series: writing it to
    # the "<feature>_is_missing" column would erase the flags that record
    # which rows were imputed.
    still_missing = df_fill[feature].isna()

    plt.figure(figsize=(20, 10))
    sns.lineplot(
        data=df_fill,
        x="DateTime",
        y=feature,
        label=feature + " imputed",
    )
    sns.lineplot(
        data=tortosa_df,
        x="DateTime",
        y=feature,
        label=feature,
        alpha=0.5,
    )

    missing_values_perc = (still_missing.sum() / df_fill.shape[0]) * 100
    missing_values_perc = missing_values_perc.round(2)

    plt.title(
        feature
        + " - "
        + str(missing_values_perc)
        + "% of missing values"
    )
    plt.legend()
    plt.show()

In [None]:
tortosa_df = df_fill

# Guiamets Preprocessing

In [None]:
guiamets_dfs.keys()

In [None]:
# Percentage of missing values
guiamets_dfs["cumulated_rainfall_24h"].isna().sum() / guiamets_dfs[
    "cumulated_rainfall_24h"
].shape[0]

In [None]:
guiamets_dfs["environmental_temperature"].isna().sum() / guiamets_dfs[
    "environmental_temperature"
].shape[0]

In [None]:
guiamets_dfs["cumulated_rainfall_24h"].columns.to_list()

In [None]:
# Drop, in place, the columns that are not used afterwards.
# NOTE(review): the column names contain the U+FFFD replacement character
# where an accented vowel is expected -- presumably the raw headers are
# mis-encoded; confirm against the raw files before changing these strings.
guiamets_dfs["cumulated_rainfall_24h"].drop(
    columns=["Fecha m�ximo", "M�ximo", "Fecha acumulado"], inplace=True
)
guiamets_dfs["environmental_temperature"].drop(
    columns=["Fecha m�ximo", "Fecha m�nimo"], inplace=True
)

In [None]:
# Rename the Spanish headers to the DateTime / Average / Minimum / Maximum
# names, in place.
guiamets_dfs["cumulated_rainfall_24h"].rename(
    columns={"fecha": "DateTime", "Acumulado": "Average"}, inplace=True
)

# NOTE(review): "M�nimo" / "M�ximo" contain the U+FFFD replacement
# character -- presumably matching mis-encoded raw headers; confirm.
guiamets_dfs["environmental_temperature"].rename(
    columns={
        "fecha": "DateTime",
        "Media": "Average",
        "M�nimo": "Minimum",
        "M�ximo": "Maximum",
    },
    inplace=True,
)

In [None]:
# Parse the dates and force every other column to numeric; values that
# cannot be parsed become NaN.
for df in guiamets_dfs.values():
    df["DateTime"] = pd.to_datetime(df["DateTime"])
    value_columns = df.columns.difference(["DateTime"])
    df[value_columns] = df[value_columns].apply(
        pd.to_numeric, errors="coerce"
    )

In [None]:
# Total number of NaN cells in each dataset
for feature, df in guiamets_dfs.items():
    nan_total = df.isna().sum().sum()
    print(f"{feature}: {nan_total}")

In [None]:
# For the moment, discard every row that contains a missing value
guiamets_dfs.update(
    {name: frame.dropna() for name, frame in guiamets_dfs.items()}
)

## Outliers Detection and Missing Values

### Inspect Data

#### Histograms

In [None]:
# Distribution of the daily average of every feature (histogram + KDE).
# No legend is requested: the histogram has no labelled artist, so a
# plt.legend() call would only emit a warning.
for feature, df in guiamets_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df["Average"], kde=True)
    plt.title(feature)
    plt.show()

#### Boxplots

In [None]:
# One standalone boxplot of the daily average per feature.
for feature, df in guiamets_dfs.items():
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=df, y="Average", ax=ax)
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Daily average of every feature over time.
for feature, df in guiamets_dfs.items():
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(
        data=df, x="DateTime", y="Average", label="Average", ax=ax
    )
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data - Feature-wise

How I determined acceptable ranges of values for each feature

* Daily Cumulated Rainfall: This [link](https://weather-and-climate.com/average-monthly-precipitation-Rainfall,tarragona-catalonia-es,Spain) says that the average monthly amount of precipitation ranges from 20 mm to 80 mm. In the worst-case scenario (80 mm in a month, which occurs in October), and considering that [here](https://weatherspark.com/y/45958/Average-Weather-in-Tarragona-Spain-Year-Round) it says that it rains on ~ 6 days in October on average, we can assume that in a single day it can rain up to ~ **13 mm. I decided to take this value as a threshold.**

* Air Temperature: the range of values is valid. Therefore, **no removal is necessary.**

#### Daily Cumulated Rainfall

UPDATE: no removal of data since I found out that those values are in a valid range.

In [None]:
rain_df = guiamets_dfs["cumulated_rainfall_24h"].copy()

In [None]:
threshold = 13

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(data=rain_df, x="DateTime", y="Average", label="Average")
plt.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
plt.legend()
plt.show()

In [None]:
# rain_df = rain_df[rain_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=rain_df, x="DateTime", y="Average", label="Average")

In [None]:
guiamets_dfs["cumulated_rainfall_24h"] = rain_df

### Show uncovered days

In [None]:
# Put every feature on a continuous daily calendar spanning its own first
# and last date; days without an observation become NaN rows.
for feature, df in guiamets_dfs.items():
    indexed = df.set_index("DateTime")
    all_dates = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq="D"
    )
    guiamets_dfs[feature] = (
        indexed.reindex(all_dates, fill_value=None)
        .reset_index()
        .rename(columns={"index": "DateTime"})
    )

In [None]:
# Per-column NaN counts for every feature, separated by a dashed rule.
for feature, df in guiamets_dfs.items():
    print(feature, "", df.isna().sum(), "", "-" * 50, sep="\n")

In [None]:
# For every Guiamets feature: flag the missing days, plot the series with
# the missing days highlighted, and report the percentage of missing values
# and the number of values outside the thresholds.

# feature -> axis label
feature_labels = {
    "cumulated_rainfall_24h": "Daily Cumulated Rainfall (mm)",
    "environmental_temperature": "Air Temperature (°C)",
}

missing_values_df = {}

for feature, df in guiamets_dfs.items():
    df["is_missing"] = df["Average"].isna()

    # No validity thresholds are set for the Guiamets features. They are
    # reset here on purpose: otherwise the count below would reuse whatever
    # values an earlier cell left in these variables.
    lower_threshold = None
    upper_threshold = None

    label = feature_labels.get(feature, "")

    missing_values_perc = (df["is_missing"].sum() / df.shape[0]) * 100
    missing_values_perc = missing_values_perc.round(2)

    missing_values_df[feature] = missing_values_perc

    plt.figure(figsize=(20, 10))
    sns.lineplot(
        data=df, x="DateTime", y="Average", label="Observed Values"
    )

    # Get current axis
    ax = plt.gca()

    # One major tick per year, labelled with the year only
    years = mdates.YearLocator()
    years_fmt = mdates.DateFormatter("%Y")
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(years_fmt)

    # One faint vertical line per missing day
    for date in df[df["is_missing"]]["DateTime"]:
        plt.axvline(
            x=date, ymin=0.01, ymax=0.99, color="grey", alpha=0.1
        )

    # Plot an empty line with a label for the legend
    plt.plot([], [], color="grey", alpha=0.1, label="Missing values")

    plt.title(
        label + " - " + str(missing_values_perc) + "% of missing values"
    )

    plt.ylabel(label)

    plt.legend(loc="upper right")
    plt.show()

    print(feature)
    print("N samples:", df.shape[0])

    # compute the number of suspicious values (outside the thresholds)
    n_sus = 0
    if lower_threshold is not None:
        n_sus += df[df["Average"] < lower_threshold].shape[0]

    if upper_threshold is not None:
        n_sus += df[df["Average"] > upper_threshold].shape[0]

    print("N suspicious:", n_sus)

missing_values_df = pd.DataFrame(
    missing_values_df.items(), columns=["Feature", "Missing values (%)"]
)

In [None]:
missing_values_df

### Combine Datasets

In [None]:
# For each dataframe keep only the date, the value and the missing flag,
# naming the value column after the feature and the flag
# "<feature>_is_missing".
# rename() is applied to the selection and returns a new frame, which avoids
# the SettingWithCopyWarning raised by an in-place rename on a column slice.
for feature, df in guiamets_dfs.items():
    guiamets_dfs[feature] = df[["DateTime", "Average", "is_missing"]].rename(
        columns={
            "Average": feature,
            "is_missing": feature + "_is_missing",
        }
    )

In [None]:
# Combine all Guiamets datasets into a single dataframe, joined on DateTime

from functools import reduce

# Get a list of all dataframes
dfs = list(guiamets_dfs.values())

# Use reduce to merge all dataframes pairwise, left to right.
# NOTE(review): pd.merge is called without `how`, so its default join type
# applies -- confirm that dropping dates not shared by every feature is
# intended.
guiamets_df = reduce(
    lambda left, right: pd.merge(left, right, on="DateTime"), dfs
)

### Fill NaNs

In [None]:
# Fill the missing values: time-weighted interpolation in both directions on
# a DateTime-indexed copy, then DateTime goes back to being a regular column.
df_fill = (
    guiamets_df.copy()
    .set_index("DateTime")
    .interpolate(method="time", limit_direction="both")
    .reset_index()
)

In [None]:
# Plot every feature after imputation on top of the original series and
# report the percentage of values that are still missing.
for feature in guiamets_dfs.keys():
    # NaNs left after interpolation. Kept in a local Series: writing it to
    # the "<feature>_is_missing" column would erase the flags that record
    # which rows were imputed.
    still_missing = df_fill[feature].isna()

    plt.figure(figsize=(20, 10))
    sns.lineplot(
        data=df_fill,
        x="DateTime",
        y=feature,
        label=feature + " imputed",
    )
    sns.lineplot(
        data=guiamets_df,
        x="DateTime",
        y=feature,
        label=feature,
        alpha=0.5,
    )

    missing_values_perc = (still_missing.sum() / df_fill.shape[0]) * 100
    missing_values_perc = missing_values_perc.round(2)

    plt.title(
        feature
        + " - "
        + str(missing_values_perc)
        + "% of missing values"
    )
    plt.legend()
    plt.show()

In [None]:
guiamets_df = df_fill

# Mequinenza Preprocessing

In [None]:
mequinenza_dfs.keys()

In [None]:
mequinenza_dfs["cumulated_rainfall_24h"].isna().sum() / mequinenza_dfs[
    "cumulated_rainfall_24h"
].shape[0]

In [None]:
# Drop, in place, the columns that are not used afterwards.
# NOTE(review): the column names contain the U+FFFD replacement character
# where an accented vowel is expected -- presumably the raw headers are
# mis-encoded; confirm against the raw file before changing these strings.
mequinenza_dfs["cumulated_rainfall_24h"].drop(
    columns=["Fecha m�ximo", "M�ximo", "Fecha acumulado"], inplace=True
)

In [None]:
mequinenza_dfs["cumulated_rainfall_24h"].rename(
    columns={"fecha": "DateTime", "Acumulado": "Average"}, inplace=True
)

In [None]:
# Parse the dates and force every other column to numeric; values that
# cannot be parsed become NaN. The frame is modified in place.
meq_rain = mequinenza_dfs["cumulated_rainfall_24h"]
meq_rain["DateTime"] = pd.to_datetime(meq_rain["DateTime"])

value_columns = meq_rain.columns.difference(["DateTime"])
meq_rain[value_columns] = meq_rain[value_columns].apply(
    pd.to_numeric, errors="coerce"
)

In [None]:
# Total number of NaN cells in each dataset
for feature, df in mequinenza_dfs.items():
    nan_total = df.isna().sum().sum()
    print(f"{feature}: {nan_total}")

In [None]:
# For the moment, discard every row that contains a missing value
mequinenza_dfs.update(
    {name: frame.dropna() for name, frame in mequinenza_dfs.items()}
)

## Outliers Detection and Missing Values

### Inspect Data

#### Histograms

In [None]:
# Distribution of the daily average of every feature (histogram + KDE).
# No legend is requested: the histogram has no labelled artist, so a
# plt.legend() call would only emit a warning.
for feature, df in mequinenza_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df["Average"], kde=True)
    plt.title(feature)
    plt.show()

#### Boxplots

In [None]:
# One standalone boxplot of the daily average per feature.
for feature, df in mequinenza_dfs.items():
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=df, y="Average", ax=ax)
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Daily average of every feature over time.
for feature, df in mequinenza_dfs.items():
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(
        data=df, x="DateTime", y="Average", label="Average", ax=ax
    )
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data - Feature-wise

How I determined acceptable ranges of values for each feature

* Daily Cumulated Rainfall: This [link](https://weather-and-climate.com/average-monthly-precipitation-Rainfall,tarragona-catalonia-es,Spain) says that the average monthly amount of precipitation ranges from 20 mm to 80 mm. In the worst-case scenario (80 mm in a month, which occurs in October), and considering that [here](https://weatherspark.com/y/45958/Average-Weather-in-Tarragona-Spain-Year-Round) it says that it rains on ~ 6 days in October on average, we can assume that in a single day it can rain up to ~ **13 mm. I decided to take this value as a threshold.**

#### Daily Cumulated Rainfall

UPDATE: no removal of data since I found out that those values are in a valid range.

In [None]:
rain_df = mequinenza_dfs["cumulated_rainfall_24h"].copy()

In [None]:
threshold = 13

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(data=rain_df, x="DateTime", y="Average", label="Average")
plt.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
plt.legend()
plt.show()

In [None]:
# rain_df = rain_df[rain_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=rain_df, x="DateTime", y="Average", label="Average")

In [None]:
mequinenza_dfs["cumulated_rainfall_24h"] = rain_df

### Show uncovered days

In [None]:
# Put every feature on a continuous daily calendar spanning its own first
# and last date; days without an observation become NaN rows.
for feature, df in mequinenza_dfs.items():
    indexed = df.set_index("DateTime")
    all_dates = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq="D"
    )
    mequinenza_dfs[feature] = (
        indexed.reindex(all_dates, fill_value=None)
        .reset_index()
        .rename(columns={"index": "DateTime"})
    )

In [None]:
# Per-column NaN counts for every feature, separated by a dashed rule.
for feature, df in mequinenza_dfs.items():
    print(feature, "", df.isna().sum(), "", "-" * 50, sep="\n")

In [None]:
# For every Mequinenza feature: flag the missing days, plot the series with
# the missing days highlighted, and report the percentage of missing values
# and the number of values outside the thresholds.

# feature -> axis label
feature_labels = {
    "cumulated_rainfall_24h": "Daily Cumulated Rainfall (mm)",
}

missing_values_df = {}

for feature, df in mequinenza_dfs.items():
    df["is_missing"] = df["Average"].isna()

    # No validity thresholds are set for the Mequinenza features. They are
    # reset here on purpose: otherwise the count below would reuse whatever
    # values an earlier cell left in these variables.
    lower_threshold = None
    upper_threshold = None

    label = feature_labels.get(feature, "")

    missing_values_perc = (df["is_missing"].sum() / df.shape[0]) * 100
    missing_values_perc = missing_values_perc.round(2)

    missing_values_df[feature] = missing_values_perc

    plt.figure(figsize=(20, 10))
    sns.lineplot(
        data=df, x="DateTime", y="Average", label="Observed Values"
    )

    # Get current axis
    ax = plt.gca()

    # One major tick per year, labelled with the year only
    years = mdates.YearLocator()
    years_fmt = mdates.DateFormatter("%Y")
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(years_fmt)

    # One faint vertical line per missing day
    for date in df[df["is_missing"]]["DateTime"]:
        plt.axvline(
            x=date, ymin=0.01, ymax=0.99, color="grey", alpha=0.1
        )

    # Plot an empty line with a label for the legend
    plt.plot([], [], color="grey", alpha=0.1, label="Missing values")

    plt.title(
        label + " - " + str(missing_values_perc) + "% of missing values"
    )

    plt.ylabel(label)

    plt.legend(loc="upper right")
    plt.show()

    print(feature)
    print("N samples:", df.shape[0])

    # compute the number of suspicious values (outside the thresholds)
    n_sus = 0
    if lower_threshold is not None:
        n_sus += df[df["Average"] < lower_threshold].shape[0]

    if upper_threshold is not None:
        n_sus += df[df["Average"] > upper_threshold].shape[0]

    print("N suspicious:", n_sus)

missing_values_df = pd.DataFrame(
    missing_values_df.items(), columns=["Feature", "Missing values (%)"]
)

### Combine Datasets

In [None]:
# For each dataframe keep only the date, the value and the missing flag,
# naming the value column after the feature and the flag
# "<feature>_is_missing".
# rename() is applied to the selection and returns a new frame, which avoids
# the SettingWithCopyWarning raised by an in-place rename on a column slice.
for feature, df in mequinenza_dfs.items():
    mequinenza_dfs[feature] = df[
        ["DateTime", "Average", "is_missing"]
    ].rename(
        columns={
            "Average": feature,
            "is_missing": feature + "_is_missing",
        }
    )

In [None]:
# Combine all Mequinenza datasets into a single dataframe, joined on DateTime

from functools import reduce

# Get a list of all dataframes
dfs = list(mequinenza_dfs.values())

# Use reduce to merge all dataframes pairwise, left to right.
# NOTE(review): pd.merge is called without `how`, so its default join type
# applies -- confirm that dropping dates not shared by every feature is
# intended.
mequinenza_df = reduce(
    lambda left, right: pd.merge(left, right, on="DateTime"), dfs
)

### Fill NaNs

In [None]:
# Time-weighted interpolation needs DateTime as the index; it is moved back
# to a regular column once the gaps are filled in both directions.
df_fill = (
    mequinenza_df.copy()
    .set_index("DateTime")
    .interpolate(method="time", limit_direction="both")
    .reset_index()
)

In [None]:
# Overlay the interpolated series on the original one, feature by feature
for feature in mequinenza_dfs:
    flag_column = feature + "_is_missing"
    # flag whatever is still NaN after the interpolation
    df_fill[flag_column] = df_fill[feature].isna()

    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(
        data=df_fill, x="DateTime", y=feature, label=feature + " imputed"
    )
    sns.lineplot(
        data=mequinenza_df,
        x="DateTime",
        y=feature,
        label=feature,
        alpha=0.5,
        ax=ax,
    )

    missing_values_perc = (
        df_fill[flag_column].sum() / df_fill.shape[0]
    ) * 100
    missing_values_perc = missing_values_perc.round(2)

    ax.set_title(f"{feature} - {missing_values_perc}% of missing values")
    ax.legend()
    plt.show()

In [None]:
mequinenza_df = df_fill

# Xerta Preprocessing

In [None]:
# Store the conductivity frame under a lower-case key; the comparison section
# later reads "conductivity" from both the Tortosa and the Xerta data.
# NOTE(review): keys() is not the last expression of the cell, so its result
# is presumably not displayed - confirm whether it is still needed.
xerta_dfs.keys()
xerta_dfs["conductivity"] = xerta_dfs.pop("Conductivity")

In [None]:
# Per-feature report: share of missing values per column and the column names
rule = "-" * 100
for feature, df in xerta_dfs.items():
    missing_ratio = df.isna().sum() / df.shape[0]

    # end="\n\n" reproduces the blank line printed after every section
    print("Feature:", feature, end="\n\n")
    print("% missing values:", end="\n\n")
    print(missing_ratio, end="\n\n")
    print("Column names:", df.columns.to_list(), end="\n\n")
    print(rule, end="\n\n")

In [None]:
# Spanish column headers of the raw files and their English names
spanish_to_english = {
    "Fecha": "DateTime",
    "Promedio": "Average",
    "Máximo": "Maximum",
    "Mínimo": "Minimum",
}

for df in xerta_dfs.values():
    # renamed in place because the loop does not write back to the dict
    df.rename(columns=spanish_to_english, inplace=True)

    df["DateTime"] = pd.to_datetime(df["DateTime"])

    # every other column is a measurement: unparsable entries become NaN
    value_columns = df.columns.difference(["DateTime"])
    df[value_columns] = df[value_columns].apply(
        pd.to_numeric, errors="coerce"
    )

In [None]:
# Total number of NaN cells in each feature's dataframe
for feature, df in xerta_dfs.items():
    nan_cells = df.isna().sum().sum()
    print(feature, nan_cells, sep=": ")

In [None]:
# For the moment, rows containing any NaN are discarded.
# update() keeps the same dict object, so other references to it stay valid.
xerta_dfs.update(
    {feature: df.dropna() for feature, df in xerta_dfs.items()}
)

## Outliers Detection and Missing Values

### Inspect Data

#### Histograms

In [None]:
# Distribution (histogram + KDE) of the daily average of every Xerta feature
for feature, df in xerta_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df["Average"], kde=True)
    plt.title(feature)
    # No plt.legend() here: the histogram has no labelled artists, so the
    # call drew nothing and only emitted a matplotlib warning.
    plt.show()

#### Boxplots

In [None]:
# One boxplot of the daily average per feature
for feature, df in xerta_dfs.items():
    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(data=df, y="Average")
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Daily average of every feature over time (only the Average column is drawn)
for feature, df in xerta_dfs.items():
    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(data=df, x="DateTime", y="Average", label="Average")
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data - Feature-wise

How I determined acceptable ranges of values for each feature

* Nitrate: the EU [analyzed](https://water.jrc.ec.europa.eu/pdf/ebro-fs.pdf) the nitrate concentration in the Ebro river showing evidence that the range of values measured is valid, with minimum measures around 5 mg/L and maximum of 39 mg/L. Therefore, **no removal is necessary**.

* ABS254: the unit of measure is needed in order to understand whether the range of values is valid. If it is measured in m^-1, then the values are in a valid domain. **We decided to take 43 as a threshold value, as it removes peak measurements that consist of a single-day measurement while the nearby days have much lower measurements.**

* pH: no outliers are visible from the plot, the range of values is in a valid domain.

* Ammonium: the EU [analyzed](https://www.eea.europa.eu/publications/topic_report_1996_4/) the average ammonium concentration for the biggest rivers in Europe, **with an average value for the Ebro river of 1 mg/L. So I decided to take 1 as the threshold for this measure.** It is also visible from the plot that there are some outliers. 

* Dissolved Oxygen: compared to other European rivers ([[1]](https://www.eea.europa.eu/help/glossary/semide-emwis-thesaurus/dissolved-oxygen) [[2]](https://www.nature.com/articles/s41558-023-01793-3.epdf?sharing_token=9FHw4vs9ayQDshsDGgw2YdRgN0jAjWel9jnR3ZoTv0N_UAFixjh8yBKAAv5SFFi5TZqeEarq8OCLvF2MOwUvnjpgszm-R5dkD1f1gBUn4ekry_rdvkaYaFttq-a3c_LSIIKRC1QfCVCpMu_ayGcOH4TMz8rleqgElh88xKQM0dBT-DGm7KbFzOvy-bkWM6Jk9T5xJFx05CGT-dZ63W2867oF1IE9pLwJuzpmyfBZaJg%3D&tracking_referrer=www.newscientist.com)), the range of values is considered valid.

* Conductivity: the range of values is valid. Therefore, **no removal is necessary.**

* Redox Potential: in [this](https://link.springer.com/chapter/10.1007/978-3-662-04080-5_1) ORP book, it is said that the range of ORP can vary from -400 to 800 mV, so the domain is considered valid.

* Turbidity: [DataStream](https://datastream.org/en-ca/guidebook/turbidity#:~:text=Turbidity%20values%20less%20than%2010,be%20more%20than%20100%20NTU.) says that high levels of turbidity are > 100 NTU. [In-Situ](https://in-situ.com/us/faq/water-quality-information/what-are-typical-turbidity-values-in-natural-environments#:~:text=Turbidity%20values%20in%20natural%20environments%20can%20range%20from%20as%20low,a%20major%20storm%20runoff%20event.) says that levels of turbidity > 100 NTU are unsafe for most aquatic life. [Wikipedia](https://in-situ.com/us/faq/water-quality-information/what-are-typical-turbidity-values-in-natural-environments#:~:text=Turbidity%20values%20in%20natural%20environments%20can%20range%20from%20as%20low,a%20major%20storm%20runoff%20event.) says that the Ebro river has a wide ecosystem. Given these considerations, **I decided to consider valid the range of values between 0 and 150 NTU**.

* Water Temperature: same as conductivity.

#### ABS254

In [None]:
abs_df = xerta_dfs["ABS254"].copy()

In [None]:
threshold = 43

In [None]:
# ABS254 series with the outlier threshold drawn as a dashed red line
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    data=abs_df, x="DateTime", y="Average", label="Average", ax=ax
)
ax.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
ax.legend()
plt.show()

In [None]:
abs_df = abs_df[abs_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=abs_df, x="DateTime", y="Average", label="Average")

In [None]:
xerta_dfs["ABS254"] = abs_df

#### Ammonium

In [None]:
ammon_df = xerta_dfs["Ammonium"].copy()

In [None]:
threshold = 1

In [None]:
# Ammonium series with the outlier threshold drawn as a dashed red line
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    data=ammon_df, x="DateTime", y="Average", label="Average", ax=ax
)
ax.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
ax.legend()
plt.show()

In [None]:
ammon_df = ammon_df[ammon_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=ammon_df, x="DateTime", y="Average", label="Average")

In [None]:
xerta_dfs["Ammonium"] = ammon_df

#### Turbidity

In [None]:
turb_df = xerta_dfs["turbidity"].copy()

In [None]:
threshold = 150

In [None]:
# Turbidity series with the outlier threshold drawn as a dashed red line
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    data=turb_df, x="DateTime", y="Average", label="Average", ax=ax
)
ax.axhline(y=threshold, color="r", linestyle="--", label="Threshold")
ax.legend()
plt.show()

In [None]:
turb_df = turb_df[turb_df["Average"] < threshold]

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(data=turb_df, x="DateTime", y="Average", label="Average")

In [None]:
xerta_dfs["turbidity"] = turb_df

### Show uncovered days

In [None]:
# Expand every feature to a gap-free daily calendar between its first and
# last observation; days without a measurement become rows of NaN.
for feature, df in xerta_dfs.items():
    indexed = df.set_index("DateTime")
    calendar = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq="D"
    )

    xerta_dfs[feature] = (
        indexed.reindex(calendar, fill_value=None)
        .reset_index()
        # the unnamed calendar index comes back as a column called "index"
        .rename(columns={"index": "DateTime"})
    )

In [None]:
# NaN count per column for every feature, separated by a dashed rule
for feature, df in xerta_dfs.items():
    print(feature, df.isna().sum(), "-" * 50, sep="\n\n")

In [None]:
# Axis label and upper outlier threshold for every known feature;
# None means that no threshold is drawn or counted for that feature.
plot_settings = {
    "nitrate": ("Nitrate (mg/L)", None),
    "ABS254": ("UVA254", 43),
    "Ammonium": ("Ammonium (mg/L)", 1),
    "conductivity": ("Conductivity (µS/cm)", None),
    "turbidity": ("Turbidity (NTU)", 150),
    "dissolvedoxygen": ("Dissolved Oxygen (mg/L)", None),
    "redoxpotential": ("Redox Potential (mV)", None),
    "watertemperature": ("Water Temperature (°C)", None),
}

# feature -> percentage of missing days (turned into a DataFrame at the end)
missing_values_df = {}

for feature, df in xerta_dfs.items():
    df["is_missing"] = df["Average"].isna()

    lower_threshold = None
    # unknown features fall back to their raw name and no threshold
    label, upper_threshold = plot_settings.get(feature, (feature, None))

    missing_values_perc = (df["is_missing"].sum() / df.shape[0]) * 100
    missing_values_perc = missing_values_perc.round(2)

    missing_values_df[feature] = missing_values_perc

    plt.figure(figsize=(20, 10))
    sns.lineplot(
        data=df, x="DateTime", y="Average", label="Observed Values"
    )

    # Get current axis
    ax = plt.gca()

    # Set major ticks format
    years = mdates.YearLocator()  # every year
    years_fmt = mdates.DateFormatter("%Y")
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(years_fmt)

    # one faint vertical line per missing day
    for date in df[df["is_missing"]]["DateTime"]:
        plt.axvline(
            x=date, ymin=0.01, ymax=0.99, color="grey", alpha=0.1
        )

    # Plot an empty line with a label for the legend
    plt.plot([], [], color="grey", alpha=0.1, label="Missing values")

    # Compare against None explicitly: a truthiness test would silently
    # ignore a legitimate threshold of 0.
    if lower_threshold is not None:
        plt.axhline(
            y=lower_threshold,
            color="purple",
            linestyle="--",
            label=f"Lower Threshold: {lower_threshold}",
        )

    if upper_threshold is not None:
        plt.axhline(
            y=upper_threshold,
            color="r",
            linestyle="--",
            label=f"Upper Threshold: {upper_threshold}",
        )

    plt.title(
        label + " - " + str(missing_values_perc) + "% of missing values"
    )

    plt.ylabel(label)

    plt.legend(loc="upper right")
    plt.show()

    print(feature)
    print("N samples:", df.shape[0])

    # compute the number of suspicious values (outside the thresholds)
    n_sus = 0
    if lower_threshold is not None:
        n_sus += df[df["Average"] < lower_threshold].shape[0]

    if upper_threshold is not None:
        n_sus += df[df["Average"] > upper_threshold].shape[0]

    print("N suspicious:", n_sus)

missing_values_df = pd.DataFrame(
    missing_values_df.items(), columns=["Feature", "Missing values (%)"]
)

In [None]:
missing_values_df

### Combine Datasets

In [None]:
# Keep only DateTime, the measurement and its missing-value flag, and name the
# last two after the feature so the per-feature frames can be merged later.
for feature, df in xerta_dfs.items():
    # Rename on the selected columns without inplace=True: an in-place rename
    # of a column slice can raise SettingWithCopyWarning, and the original
    # code also renamed "Average" on the source frame as a side effect.
    xerta_dfs[feature] = df[["DateTime", "Average", "is_missing"]].rename(
        columns={
            "Average": feature,
            "is_missing": feature + "_is_missing",
        }
    )

In [None]:
# combine all xerta datasets into a single dataframe

from functools import reduce

# One frame per feature, in dictionary order
dfs = list(xerta_dfs.values())

# Fold the list with successive merges on the shared DateTime column
xerta_df = reduce(
    lambda merged, nxt: merged.merge(nxt, on="DateTime"), dfs
)

### Fill NaNs

In [None]:
# Time-weighted interpolation needs DateTime as the index; it is moved back
# to a regular column once the gaps are filled in both directions.
df_fill = (
    xerta_df.copy()
    .set_index("DateTime")
    .interpolate(method="time", limit_direction="both")
    .reset_index()
)

In [None]:
# Overlay the interpolated series on the original one, feature by feature
for feature in xerta_dfs:
    flag_column = feature + "_is_missing"
    # flag whatever is still NaN after the interpolation
    df_fill[flag_column] = df_fill[feature].isna()

    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(
        data=df_fill, x="DateTime", y=feature, label=feature + " imputed"
    )
    sns.lineplot(
        data=xerta_df,
        x="DateTime",
        y=feature,
        label=feature,
        alpha=0.5,
        ax=ax,
    )

    missing_values_perc = (
        df_fill[flag_column].sum() / df_fill.shape[0]
    ) * 100
    missing_values_perc = missing_values_perc.round(2)

    ax.set_title(f"{feature} - {missing_values_perc}% of missing values")
    ax.legend()
    plt.show()

In [None]:
xerta_df = df_fill

In [None]:
# Remove the boolean "<feature>_is_missing" flag columns from every station
flag_marker = "_is_missing"

tortosa_df = tortosa_df[
    [col for col in tortosa_df.columns if flag_marker not in col]
]
guiamets_df = guiamets_df[
    [col for col in guiamets_df.columns if flag_marker not in col]
]
mequinenza_df = mequinenza_df[
    [col for col in mequinenza_df.columns if flag_marker not in col]
]
xerta_df = xerta_df[
    [col for col in xerta_df.columns if flag_marker not in col]
]

# Common Parameters Comparison, Unique Dataset Built and Monthly Resampling

In [None]:
# Restrict every station to the period covered by all of them:
# latest first date .. earliest last date (both ends included).
station_frames = (tortosa_df, guiamets_df, mequinenza_df, xerta_df)
min_date = max(frame["DateTime"].min() for frame in station_frames)
max_date = min(frame["DateTime"].max() for frame in station_frames)

tortosa_df = tortosa_df[tortosa_df["DateTime"].between(min_date, max_date)]
guiamets_df = guiamets_df[
    guiamets_df["DateTime"].between(min_date, max_date)
]
mequinenza_df = mequinenza_df[
    mequinenza_df["DateTime"].between(min_date, max_date)
]
xerta_df = xerta_df[xerta_df["DateTime"].between(min_date, max_date)]

## Compare common variables

Common variables are:
* cumulated_rainfall_24h
* watertemperature
* conductivity

The idea is to combine every variable in a single dataset, which in this case is the Xerta dataset, firstly by comparing the redundant variables between each site and secondly by merging the selected variables into the Xerta df.

To compare common variables, the same time period must be used.

### Cumulated Rainfall

In [None]:
common_variable = "cumulated_rainfall_24h"

In [None]:
# Rainfall at Tortosa and Guiamets on shared axes
# (the Mequinenza series is not drawn in this figure)
plt.figure()
ax = sns.lineplot(
    data=tortosa_df, x="DateTime", y=common_variable, label="Tortosa"
)
sns.lineplot(
    data=guiamets_df,
    x="DateTime",
    y=common_variable,
    label="Guiamets",
    ax=ax,
)

#### Pearson

##### Tortosa - Mequinenza

In [None]:
stats.pearsonr(
    tortosa_df[common_variable],
    mequinenza_df[common_variable],
)

##### Tortosa - Guiamets

In [None]:
pears, _ = stats.pearsonr(
    tortosa_df[common_variable],
    guiamets_df[common_variable],
)

pears

##### Mequinenza - Guiamets

In [None]:
stats.pearsonr(
    mequinenza_df[common_variable], guiamets_df[common_variable]
)

#### RMSE

##### Tortosa - Mequinenza

In [None]:
# RMSE between the two stations, shown normalised by the Tortosa value range
reference = tortosa_df[common_variable]
mse = mean_squared_error(reference, mequinenza_df[common_variable])
rmse = np.sqrt(mse)
value_range = reference.max() - reference.min()
rmse / value_range

##### Tortosa - Guiamets

In [None]:
# RMSE between the two stations, shown normalised by the Tortosa value range
reference = tortosa_df[common_variable]
mse = mean_squared_error(reference, guiamets_df[common_variable])
rmse = np.sqrt(mse)
value_range = reference.max() - reference.min()
rmse / value_range

In [None]:
# Tortosa vs Guiamets rainfall, annotated with the Pearson coefficient and
# RMSE computed in the previous cells (Mequinenza is not drawn).
plt.figure()
ax = sns.lineplot(
    data=tortosa_df, x="DateTime", y=common_variable, label="Tortosa"
)
sns.lineplot(
    data=guiamets_df,
    x="DateTime",
    y=common_variable,
    label="Guiamets",
    ax=ax,
)

props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
text_string = f"Pearson Coefficient = {pears:.2f}\nRMSD = {rmse:.2f}"

# text box anchored at the first date, y = 85 in data coordinates
ax.text(
    tortosa_df["DateTime"].iloc[0],
    85,
    text_string,
    fontsize=12,
    bbox=props,
)

ax.set_xlabel("Year")
ax.set_ylabel("Daily Cumulated Rainfall (mm)")
ax.set_title("Daily Cumulated Rainfall: Tortosa vs Guiamets")

plt.show()

### Water Temperature

In [None]:
common_variable = "watertemperature"

In [None]:
value, p_value = stats.pearsonr(
    tortosa_df[common_variable], xerta_df[common_variable]
)

In [None]:
# Water temperature at Xerta and Tortosa on shared axes.
# The correlation text that used to be assembled here was never drawn (its
# plt.text call was commented out), so it has been removed.
plt.figure()
sns.lineplot(
    x="DateTime", y=common_variable, data=xerta_df, label="Xerta"
)
sns.lineplot(
    x="DateTime", y=common_variable, data=tortosa_df, label="Tortosa"
)

plt.xlabel("Year")
plt.ylabel("Water temperature (°C)")
plt.title("Water temperature: Xerta vs Tortosa")
plt.show()

#### Pearson

In [None]:
pears, _ = stats.pearsonr(
    tortosa_df[common_variable], xerta_df[common_variable]
)

pears

#### RMSE

In [None]:
# RMSE between the two stations, shown normalised by the Tortosa value range
reference = tortosa_df[common_variable]
mse = mean_squared_error(reference, xerta_df[common_variable])
rmse = np.sqrt(mse)
value_range = reference.max() - reference.min()
rmse / value_range

In [None]:
# Xerta vs Tortosa water temperature, annotated with the Pearson coefficient
# and RMSE computed in the previous cells.
plt.figure()
ax = sns.lineplot(
    data=xerta_df, x="DateTime", y=common_variable, label="Xerta"
)
sns.lineplot(
    data=tortosa_df,
    x="DateTime",
    y=common_variable,
    label="Tortosa",
    ax=ax,
)

props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
text_string = f"Pearson Coefficient = {pears:.3f}\nRMSD = {rmse:.2f}"

# text box anchored at the 61st date, y = 29 in data coordinates
ax.text(
    tortosa_df["DateTime"].iloc[60],
    29,
    text_string,
    fontsize=12,
    bbox=props,
)

ax.set_xlabel("Year")
ax.set_ylabel("Water temperature (°C)")
ax.set_title("Water temperature: Xerta vs Tortosa")
plt.show()

### Conductivity

In [None]:
common_variable = "conductivity"

In [None]:
# Conductivity at Xerta and Tortosa on shared axes
plt.figure()
ax = sns.lineplot(
    data=xerta_df, x="DateTime", y=common_variable, label="Xerta"
)
sns.lineplot(
    data=tortosa_df,
    x="DateTime",
    y=common_variable,
    label="Tortosa",
    ax=ax,
)

ax.set_xlabel("Year")
ax.set_ylabel("Conductivity (µS/cm)")
ax.set_title("Conductivity: Xerta vs Tortosa")
plt.show()

#### Pearson

In [None]:
pears, _ = stats.pearsonr(
    tortosa_df[common_variable], xerta_df[common_variable]
)

pears

#### RMSE

In [None]:
# RMSE between the two stations, shown normalised by the Tortosa value range
reference = tortosa_df[common_variable]
mse = mean_squared_error(reference, xerta_df[common_variable])
rmse = np.sqrt(mse)
value_range = reference.max() - reference.min()
rmse / value_range

In [None]:
# Xerta vs Tortosa conductivity, annotated with the Pearson coefficient and
# RMSE computed in the previous cells.
plt.figure()
ax = sns.lineplot(
    data=xerta_df, x="DateTime", y=common_variable, label="Xerta"
)
sns.lineplot(
    data=tortosa_df,
    x="DateTime",
    y=common_variable,
    label="Tortosa",
    ax=ax,
)

props = {"boxstyle": "round", "facecolor": "wheat", "alpha": 0.5}
text_string = f"Pearson Coefficient = {pears:.3f}\nRMSD = {rmse:.2f}"

# text box anchored at the first date, y = 1800 in data coordinates
ax.text(
    tortosa_df["DateTime"].iloc[0],
    1800,
    text_string,
    fontsize=12,
    bbox=props,
)

ax.set_xlabel("Year")
ax.set_ylabel("Conductivity (µS/cm)")
ax.set_title("Conductivity: Xerta vs Tortosa")
plt.show()

## Build Unique Dataset

In [None]:
# water temperature and conductivity are better in the xerta dataset so no need to merge with tortosa

# I decided to take the rainfall from tortosa since it is the closest to the xerta station

# xerta_df is a date-filtered selection of a larger frame: work on an explicit
# copy so that adding columns does not raise SettingWithCopyWarning.
xerta_df = xerta_df.copy()

# The columns below are copied by position (.values), so every source frame
# must cover exactly the same days in the same order as xerta_df.
xerta_days = xerta_df["DateTime"].dt.normalize().values
for station_name, station_df in (
    ("tortosa", tortosa_df),
    ("guiamets", guiamets_df),
):
    station_days = station_df["DateTime"].dt.normalize().values
    if not np.array_equal(xerta_days, station_days):
        raise ValueError(
            f"{station_name} dates are not aligned with the xerta dates"
        )

xerta_df["cumulated_rainfall_24h"] = tortosa_df[
    "cumulated_rainfall_24h"
].values
xerta_df["environment_temperature"] = guiamets_df[
    "environmental_temperature"
].values
xerta_df["flowriver"] = tortosa_df["flowriver"].values

# human-readable column names
xerta_df.rename(
    columns={
        "cumulated_rainfall_24h": "Daily Cumulated Rainfall",
        "watertemperature": "Water Temperature",
        "environment_temperature": "Air Temperature",
        "flowriver": "Flow River",
        "conductivity": "Conductivity",
        "dissolvedoxygen": "Dissolved Oxygen",
        "nitrate": "Nitrate",
        "redoxpotential": "Redox Potential",
        "turbidity": "Turbidity",
        "Ammonium": "Ammonium",
        "ABS254": "Absorbance 254nm",
    },
    inplace=True,
)

In [None]:
# drop first 3 rows of xerta_df since they are the only rows for august 2012
# NOTE(review): the offset 3 is hard-coded; re-check it whenever the common
# date window computed earlier changes.
xerta_df = xerta_df.iloc[3:]

In [None]:
xerta_df.set_index("DateTime", inplace=True)

In [None]:
# add unit of measurement to the columns
# (keys are the readable names assigned in the earlier rename; note that
# "Daily Cumulated Rainfall" loses its "Daily" prefix and
# "Absorbance 254nm" becomes "UVA254 (1/m)")
xerta_df.rename(
    columns={
        "Daily Cumulated Rainfall": "Cumulated Rainfall (mm)",
        "Water Temperature": "Water Temperature (°C)",
        "Air Temperature": "Air Temperature (°C)",
        "Flow River": "Flow River Rate (m³/s)",
        "Conductivity": "Conductivity (µS/cm)",
        "Dissolved Oxygen": "Dissolved Oxygen (mg/L)",
        "Nitrate": "Nitrate (mg/L)",
        "Redox Potential": "Redox Potential (mV)",
        "Turbidity": "Turbidity (NTU)",
        "Ammonium": "Ammonium (mg/L)",
        "Absorbance 254nm": "UVA254 (1/m)",
    },
    inplace=True,
)

In [None]:
# Empty summary table: one row per statistic, one column per feature
info_rows = [
    "N Samples",
    "% Missing Values",
    "Frequency (days)",
    "Mean",
    "Std",
    "Start Date",
    "End Date",
]
info_df = pd.DataFrame(
    index=pd.Index(info_rows, name="Info"),
    columns=xerta_df.columns,
)


In [None]:
# store the information in info_df

# The sampling step depends only on the index, not on the column, so the most
# common gap between consecutive timestamps is computed once.
frequency_days = (
    xerta_df.index.to_series().diff().value_counts().index[0].days
)

for column in xerta_df.columns:
    df = xerta_df[column]
    observed = df.dropna()

    info_df.loc["N Samples", column] = observed.shape[0]
    info_df.loc["Frequency (days)", column] = frequency_days

    if observed.empty:
        # Nothing was observed: there is no date span, mean or std to report
        # (and NaT.strftime would raise), so only record 100% missing.
        info_df.loc["% Missing Values", column] = 100.0
        continue

    start_date = observed.index.min().strftime("%Y-%m-%d")
    end_date = observed.index.max().strftime("%Y-%m-%d")

    # statistics are computed between the first and the last observed day
    df = df[start_date:end_date]

    info_df.loc["% Missing Values", column] = (
        df.isna().sum() / df.shape[0] * 100
    )

    info_df.loc["Mean", column] = df.mean()
    info_df.loc["Std", column] = df.std()

    info_df.loc["Start Date", column] = start_date
    info_df.loc["End Date", column] = end_date

In [None]:
info_df

## Resample to monthly

In [None]:
xerta_df = xerta_df.resample("ME").mean()

In [None]:
xerta_df.isna().sum()

In [None]:
# One line plot per column of the monthly-resampled dataset
for feature in xerta_df.columns:
    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(
        data=xerta_df, x=xerta_df.index, y=feature, label=feature
    )
    ax.set_title(feature)
    plt.show()

In [None]:
# Write the monthly dataset to the clean-data folder, creating the folder
# first so the export does not fail on a fresh checkout.
os.makedirs(clean_data_folder, exist_ok=True)
xerta_df.to_excel(os.path.join(clean_data_folder, "xerta.xlsx"), index=True)