# Outliers Analysis

The data here is not imputed; missing values are simply dropped.

An in-depth analysis of the absorbance peaks is performed to understand whether their frequency is increasing over time.

Here the outliers for each variable of each site are analyzed in order to find some patterns between outliers of different variables. The main goal is to understand if a sample is a real outlier or an 'analytical' one due to an error in the measurement.

A single dataset is built by appending to the Xerta dataset the variables that are not measured at Xerta.

The variables are:
* Air Temperature (Guiamets)
* Daily Cumulated Rainfall (Tortosa, since it is the closest city to Xerta)
* Flow River (Tortosa)

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

In [None]:
plt.rcParams["figure.figsize"] = [20, 10]
plt.rcParams.update({"font.size": 26})

# Utils

In [None]:
def build_dict_from_folder(folder_path):
    """Load every spreadsheet found one level below ``folder_path``.

    Each sub-directory of ``folder_path`` becomes a key of the returned
    dictionary. Its value is a dictionary that maps a file's name (the text
    before the first dot) to the DataFrame ``pd.read_excel`` returns for that
    file. Entries of ``folder_path`` that are not directories are ignored.
    """
    datasets_dict = {}
    for entry in os.listdir(folder_path):
        site_path = os.path.join(folder_path, entry)
        if not os.path.isdir(site_path):
            continue
        datasets_dict[entry] = {
            file_name.split(".")[0]: pd.read_excel(
                os.path.join(site_path, file_name)
            )
            for file_name in os.listdir(site_path)
        }
    return datasets_dict

# Load Data

In [None]:
data_folder = os.path.join("..", "..", "data", "tarragona")

raw_data_folder = os.path.join(data_folder, "raw_data")

intermediate_data_folder = os.path.join(
    data_folder, "intermediate_data"
)

clean_data_folder = os.path.join(data_folder, "clean_data")

probabilities_folder = os.path.join(data_folder, "probabilities")

In [None]:
datasets_dict = build_dict_from_folder(intermediate_data_folder)

In [None]:
datasets_dict.keys()

In [None]:
tortosa_dfs = datasets_dict["TORTOSA"]
guiamets_dfs = datasets_dict["GUIAMETS"]
mequinenza_dfs = datasets_dict["MEQUINENZA"]
xerta_dfs = datasets_dict["XERTA"]

## Get Common Time Range for valid analysis

In [None]:
# Common time range shared by every feature of every site.
# Per site, the window opens at the latest "first timestamp" among its
# features and closes at the earliest "last timestamp".
min_tortosa = max(min(df["DateTime"]) for df in tortosa_dfs.values())
min_guiamets = max(min(df["DateTime"]) for df in guiamets_dfs.values())
min_mequinenza = max(min(df["DateTime"]) for df in mequinenza_dfs.values())
min_xerta = max(min(df["DateTime"]) for df in xerta_dfs.values())

max_tortosa = min(max(df["DateTime"]) for df in tortosa_dfs.values())
max_guiamets = min(max(df["DateTime"]) for df in guiamets_dfs.values())
max_mequinenza = min(max(df["DateTime"]) for df in mequinenza_dfs.values())
max_xerta = min(max(df["DateTime"]) for df in xerta_dfs.values())

# The overall window is the intersection of the per-site windows.
min_date = max(min_tortosa, min_guiamets, min_mequinenza, min_xerta)
max_date = min(max_tortosa, max_guiamets, max_mequinenza, max_xerta)

In [None]:
# Keep, for every feature table of every site, only the rows whose DateTime
# lies inside the common [min_date, max_date] window (both ends included).
for site_dfs in (tortosa_dfs, guiamets_dfs, mequinenza_dfs, xerta_dfs):
    for feature in site_dfs:
        timestamps = site_dfs[feature]["DateTime"]
        inside_window = (timestamps >= min_date) & (timestamps <= max_date)
        site_dfs[feature] = site_dfs[feature].loc[inside_window]

# Tortosa

In [None]:
# Put every Tortosa feature on a gap-free daily calendar: days absent from the
# original table appear as rows of missing values.
for feature, df in tortosa_dfs.items():
    indexed = df.set_index("DateTime")

    all_dates = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq="D"
    )

    # reset_index() names the restored column "index"; rename it back.
    df = (
        indexed.reindex(all_dates, fill_value=None)
        .reset_index()
        .rename(columns={"index": "DateTime"})
    )
    tortosa_dfs[feature] = df

In [None]:
# Build one wide Tortosa table: a "DateTime" column plus one column per
# feature holding that feature's "Average" values, inner-joined on "DateTime".
tortosa_items = list(tortosa_dfs.items())

# rename() without inplace returns a new frame, so the column selection is
# never mutated in place (in-place rename on a selection can raise
# SettingWithCopyWarning on some pandas versions).
feature, first_df = tortosa_items[0]
tortosa_df = first_df[["DateTime", "Average"]].rename(
    columns={"Average": feature}
)

for feature, df in tortosa_items[1:]:
    df = df[["DateTime", "Average"]].rename(columns={"Average": feature})
    tortosa_df = tortosa_df.merge(df, on="DateTime", how="inner")

In [None]:
tortosa_df

In [None]:
# drop turbidity since it has a lot of missing values
tortosa_df = tortosa_df.drop(columns=["turbidity"])

In [None]:
for feature in tortosa_df.columns[1:]:
    plt.figure()
    sns.lineplot(
        x="DateTime", y=feature, data=tortosa_df, label=feature
    )

In [None]:
# scale the data
from sklearn.preprocessing import MinMaxScaler

scaled_tortosa_df = tortosa_df.copy()

scaler = MinMaxScaler()
scaled_tortosa_df[scaled_tortosa_df.columns[1:]] = scaler.fit_transform(
    tortosa_df[tortosa_df.columns[1:]]
)

In [None]:
# plot every combination of features
from scipy import signal
import scipy


# Overlay every pair of min-max-scaled Tortosa features (rows after "2013")
# so that their behaviour over time can be compared visually.
# The disabled cross-correlation experiment that was kept here inside a string
# literal has been removed; it was never executed.
recent_scaled_tortosa = scaled_tortosa_df[
    scaled_tortosa_df["DateTime"] > "2013"
]
tortosa_features = list(tortosa_df.columns[1:])

for i, feature1 in enumerate(tortosa_features):
    # Only pairs (i, j) with i < j: each unordered pair is drawn once.
    for feature2 in tortosa_features[i + 1 :]:
        sns.lineplot(
            x="DateTime",
            y=feature1,
            data=recent_scaled_tortosa,
            label=feature1,
        )
        sns.lineplot(
            x="DateTime",
            y=feature2,
            data=recent_scaled_tortosa,
            label=feature2,
        )

        # The curves are scaled values over time, not a cross-correlation
        # over lags, so label the axes accordingly.
        plt.xlabel("DateTime")
        plt.ylabel("scaled value")
        plt.title(f"{feature1} vs {feature2}")
        plt.show()

# Guiamets

In [None]:
# Put every Guiamets feature on a gap-free daily calendar: days absent from
# the original table appear as rows of missing values.
for feature, df in guiamets_dfs.items():
    indexed = df.set_index("DateTime")

    all_dates = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq="D"
    )

    # reset_index() names the restored column "index"; rename it back.
    df = (
        indexed.reindex(all_dates, fill_value=None)
        .reset_index()
        .rename(columns={"index": "DateTime"})
    )
    guiamets_dfs[feature] = df

In [None]:
# Build one wide Guiamets table: a "DateTime" column plus one column per
# feature holding that feature's "Average" values, inner-joined on "DateTime".
guiamets_items = list(guiamets_dfs.items())

# rename() without inplace returns a new frame, so the column selection is
# never mutated in place (in-place rename on a selection can raise
# SettingWithCopyWarning on some pandas versions).
feature, first_df = guiamets_items[0]
guiamets_df = first_df[["DateTime", "Average"]].rename(
    columns={"Average": feature}
)

for feature, df in guiamets_items[1:]:
    df = df[["DateTime", "Average"]].rename(columns={"Average": feature})
    guiamets_df = guiamets_df.merge(df, on="DateTime", how="inner")

In [None]:
guiamets_df

In [None]:
for feature in guiamets_df.columns[1:]:
    plt.figure()
    sns.lineplot(
        x="DateTime", y=feature, data=guiamets_df, label=feature
    )

In [None]:
# scale the data
from sklearn.preprocessing import MinMaxScaler

scaled_guiamets_df = guiamets_df.copy()

scaler = MinMaxScaler()
scaled_guiamets_df[
    scaled_guiamets_df.columns[1:]
] = scaler.fit_transform(guiamets_df[guiamets_df.columns[1:]])

In [None]:
# plot every combination of features
from scipy import signal
import scipy


# Overlay every pair of min-max-scaled Guiamets features so that their
# behaviour over time can be compared visually.
# The disabled cross-correlation experiment that was kept here inside a string
# literal has been removed; it was never executed.
guiamets_features = list(guiamets_df.columns[1:])

for i, feature1 in enumerate(guiamets_features):
    # Only pairs (i, j) with i < j: each unordered pair is drawn once.
    for feature2 in guiamets_features[i + 1 :]:
        sns.lineplot(
            x="DateTime",
            y=feature1,
            data=scaled_guiamets_df,
            label=feature1,
        )
        sns.lineplot(
            x="DateTime",
            y=feature2,
            data=scaled_guiamets_df,
            label=feature2,
        )

        # The curves are scaled values over time, not a cross-correlation
        # over lags, so label the axes accordingly.
        plt.xlabel("DateTime")
        plt.ylabel("scaled value")
        plt.title(f"{feature1} vs {feature2}")
        plt.show()

# Mequinenza

In [None]:
# Print, for each Mequinenza feature, the per-column count of missing values
# and the first/last timestamp, followed by a separator line.
for feature, df in mequinenza_dfs.items():
    date_column = df["DateTime"]
    report_lines = [
        feature,
        "",
        df.isna().sum(),
        "",
        f"Min date: {date_column.min()}",
        "",
        f"Max date: {date_column.max()}",
        "",
        "-" * 100,
    ]
    print(*report_lines, sep="\n")

In [None]:
# One figure per Mequinenza feature showing its "Average" series over time.
# (Overlaying the "Maximum"/"Minimum" columns is intentionally left disabled.)
for feature, df in mequinenza_dfs.items():
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(data=df, x="DateTime", y="Average", label="Average", ax=ax)
    ax.set_title(feature)
    ax.legend()
    plt.show()

# Xerta

In [None]:
# Print, for each Xerta feature, the per-column count of missing values and
# the first/last timestamp, followed by a separator line.
for feature, df in xerta_dfs.items():
    date_column = df["DateTime"]
    report_lines = [
        feature,
        "",
        df.isna().sum(),
        "",
        f"Min date: {date_column.min()}",
        "",
        f"Max date: {date_column.max()}",
        "",
        "-" * 100,
    ]
    print(*report_lines, sep="\n")

In [None]:
# Put every Xerta feature on a gap-free daily calendar: days absent from the
# original table appear as rows of missing values.
for feature, df in xerta_dfs.items():
    indexed = df.set_index("DateTime")

    all_dates = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq="D"
    )

    # reset_index() names the restored column "index"; rename it back.
    df = (
        indexed.reindex(all_dates, fill_value=None)
        .reset_index()
        .rename(columns={"index": "DateTime"})
    )
    xerta_dfs[feature] = df

In [None]:
# Build one wide Xerta table: a "DateTime" column plus one column per feature
# holding that feature's "Average" values, inner-joined on "DateTime".
xerta_items = list(xerta_dfs.items())

# rename() without inplace returns a new frame, so the column selection is
# never mutated in place (in-place rename on a selection can raise
# SettingWithCopyWarning on some pandas versions).
feature, first_df = xerta_items[0]
xerta_df = first_df[["DateTime", "Average"]].rename(
    columns={"Average": feature}
)

for feature, df in xerta_items[1:]:
    df = df[["DateTime", "Average"]].rename(columns={"Average": feature})
    xerta_df = xerta_df.merge(df, on="DateTime", how="inner")

In [None]:
xerta_df

In [None]:
for feature in xerta_df.columns[1:]:
    plt.figure()
    sns.lineplot(x="DateTime", y=feature, data=xerta_df, label=feature)

In [None]:
sns.boxplot(data=xerta_df["ABS254"])

In [None]:
xerta_df = pd.read_excel(
    os.path.join(clean_data_folder, "full_dataset.xlsx")
)

In [None]:
xerta_df

In [None]:
# For each calendar year, draw the UVA254 series together with a dashed red
# line at that year's upper Tukey fence (Q3 + 1.5 * IQR).
for year in xerta_df["DateTime"].dt.year.unique():
    year_data = xerta_df[xerta_df["DateTime"].dt.year == year]

    quartile_1, quartile_3 = year_data["UVA254"].quantile([0.25, 0.75])
    iqr = quartile_3 - quartile_1

    # The lower fence is clipped at zero; only the upper fence is drawn.
    lower_bound = max(quartile_1 - 1.5 * iqr, 0)
    upper_bound = quartile_3 + 1.5 * iqr

    sns.lineplot(x="DateTime", y="UVA254", data=year_data)
    sns.lineplot(
        x=year_data["DateTime"],
        y=[upper_bound] * len(year_data),
        color="red",
        linestyle="--",
    )

plt.xlabel("Year")
plt.ylabel("UVA254")
plt.show()

In [None]:
# Flag as outlier every UVA254 value above its own year's upper Tukey fence
# (Q3 + 1.5 * IQR).
abs_df = xerta_df[["DateTime", "UVA254"]].copy()
abs_df["is_outlier"] = False

for year in xerta_df["DateTime"].dt.year.unique():
    in_year = abs_df["DateTime"].dt.year == year
    year_values = abs_df[in_year]["UVA254"]

    quartile_1, quartile_3 = year_values.quantile([0.25, 0.75])
    iqr = quartile_3 - quartile_1

    # The lower fence is computed but only the upper one is used for flagging.
    lower_bound = max(quartile_1 - 1.5 * iqr, 0)
    upper_bound = quartile_3 + 1.5 * iqr

    abs_df.loc[in_year, "is_outlier"] = year_values > upper_bound

In [None]:
abs_df["is_outlier"] = abs_df["is_outlier"].astype(int)

In [None]:
sns.lineplot(x="DateTime", y="UVA254", data=abs_df)
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=abs_df[abs_df["is_outlier"] == 1],
    color="red",
)
plt.xlabel("Year")
plt.ylabel("UVA254")
plt.show()

In [None]:
# Clear the outlier flag for January and February 2020 (month <= 2).
# NOTE(review): this comment used to say "first 3 months" while the condition
# covers only two months; the condition is kept as is. Confirm which is
# intended.
# "is_outlier" has already been cast to int, so write 0 rather than False to
# keep the column integer-typed for the later model fitting.
abs_df.loc[
    (abs_df["DateTime"].dt.year == 2020)
    & (abs_df["DateTime"].dt.month <= 2),
    "is_outlier",
] = 0

In [None]:
sns.lineplot(x="DateTime", y="UVA254", data=abs_df)
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=abs_df[abs_df["is_outlier"] == 1],
    color="red",
)
plt.xlabel("Year")
plt.ylabel("UVA254")
plt.title("UVA254 with outliers")
plt.show()

# Build Unique Dataset

In [None]:
tortosa_df.columns.to_list()

In [None]:
guiamets_df.columns.to_list()

In [None]:
# Attach the variables that are not measured at Xerta. Values are looked up by
# date rather than by row position: plain column assignment aligns on the row
# index, which pairs unrelated days whenever the frames do not contain exactly
# the same rows.
# NOTE(review): assumes "DateTime" values are unique in tortosa_df and
# guiamets_df and use the same daily timestamps as xerta_df. Confirm.
tortosa_by_date = tortosa_df.set_index("DateTime")
guiamets_by_date = guiamets_df.set_index("DateTime")

xerta_df["cumulated_rainfall_24h"] = xerta_df["DateTime"].map(
    tortosa_by_date["cumulated_rainfall_24h"]
)
xerta_df["flowriver"] = xerta_df["DateTime"].map(
    tortosa_by_date["flowriver"]
)
xerta_df["environmental_temperature"] = xerta_df["DateTime"].map(
    guiamets_by_date["environmental_temperature"]
)

# abs_df was derived from xerta_df, so both share the same row index.
xerta_df["is_outlier"] = abs_df["is_outlier"]

In [None]:
xerta_df.to_excel(
    os.path.join(raw_data_folder, "raw_full_dataset.xlsx"), index=False
)

# ABS Outliers Analysis

In [None]:
xerta_df["Year"] = xerta_df["DateTime"].dt.year

In [None]:
# Add a timestamp column
xerta_df["Timestamp"] = xerta_df["DateTime"].apply(
    lambda x: x.timestamp()
)

In [None]:
full_df = xerta_df.copy()

In [None]:
full_df.columns.to_list()

In [None]:
full_df.drop(
    [
        "cumulated_rainfall_24h",
        "flowriver",
        "environmental_temperature",
    ],
    inplace=True,
    axis=1,
)

In [None]:
full_df.rename(
    columns={
        "cumulated_rainfall_24h": "Daily Cumulated Rainfall (L/m\u00b2)",
        "watertemperature": "Water Temperature (\u00b0C)",
        "environmental_temperature": "Air Temperature (\u00b0C)",
        "flowriver": "Flow River (m\u00b3/s)",
        "Conductivity": "Conductivity (\u00b5S/cm)",
        "dissolvedoxygen": "Dissolved Oxygen (mg/L)",
        "nitrate": "Nitrate (mg/L)",
        "redoxpotential": "Redox Potential (mV)",
        "turbidity": "Turbidity (NTU)",
        "Ammonium": "Ammonium (mg/L)",
        "ABS254": "UVA254",
    },
    inplace=True,
)

In [None]:
df = full_df.copy()
# df.drop(columns=["Timestamp", "Year"], inplace=True)
abs_col = df.pop("UVA254")
df["UVA254"] = abs_col

In [None]:
df

In [None]:
# Summary table with one column per feature (every column except "DateTime").
result_df = pd.DataFrame(columns=df.columns.difference(["DateTime"]))

# Number of non-missing values per feature.
result_df.loc["Number of Samples"] = df.count()
# Share of missing values: missing count over the TOTAL number of rows.
# Dividing by the non-missing count overstates the percentage.
result_df.loc["`%` of Missing Values"] = df.isna().sum() / len(df) * 100

In [None]:
df.dropna(inplace=True)

In [None]:
result_df.loc[
    "Number of Samples after Dropping Missing Values"
] = df.count()

In [None]:
result_df.to_excel(
    os.path.join(raw_data_folder, "raw_full_dataset_summary.xlsx")
)

In [None]:
# Draw one boxplot per feature, side by side in a single figure.
n_features = result_df.shape[1]
fig, axs = plt.subplots(1, n_features, figsize=(30, 7.5))

for ax, feature in zip(axs, result_df.columns):
    if feature != "Daily Cumulated Rainfall (L/m\u00b2)":
        sns.boxplot(y=df[feature], ax=ax)
    else:
        # For rainfall only strictly positive values are drawn; the number of
        # zero values is counted separately.
        sns.boxplot(y=df[df[feature] > 0][feature], ax=ax)
        zero_values = df[df[feature] == 0][feature].count()
    # The feature name goes on the x axis, one word per line.
    ax.set_ylabel("")
    ax.set_xlabel(feature.replace(" ", "\n"), fontsize=15, labelpad=5)

# Add more space between the subplots
plt.subplots_adjust(wspace=0.8)
plt.show()

In [None]:
# Check for missing values and drop them
full_df.isna().sum()

In [None]:
full_df = full_df.dropna()

In [None]:
aic_dict = {}

# Timestamp

In [None]:
formula = "is_outlier ~ Timestamp"

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# UVA254 over time with the flagged outliers in red and a dashed grey line at
# the start of every year.
plt.figure(figsize=(20, 10))
sns.lineplot(x="DateTime", y="UVA254", data=full_df)

outlier_values = full_df[full_df["is_outlier"] == 1]["UVA254"]
sns.scatterplot(x=full_df["DateTime"], y=outlier_values, color="red")

year_starts = [
    pd.to_datetime(str(year))
    for year in full_df["DateTime"].dt.year.unique()
]
for year_start in year_starts:
    plt.axvline(year_start, color="gray", linestyle="--")

plt.show()

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

# Timestamp + Year

In [None]:
formula = "is_outlier ~ Timestamp + Year"

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

# Timestamp + Timestamp:Year + Year + All Features 

In [None]:
# Logistic model with the time terms plus every measured feature.
# NOTE(review): other cells of this notebook refer to these columns with the
# unit in the name (e.g. "Water Temperature (°C)", "Flow River (m³/s)");
# confirm that the names quoted in this formula exist in full_df.
formula = 'is_outlier ~ Timestamp + Timestamp:Year + Year + Q("Water Temperature") + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + pH + Conductivity + Q("Flow River") + Nitrate + Ammonium + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict[
    "Timestamp + Timestamp:Year + Year + All Features"
] = results.aic

# Timestamp:Year + C(Year) + All Features

The formula `Timestamp + Timestamp:Year + C(Year) + All Features` breaks, so the standalone `Timestamp` term is omitted here.

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Water Temperature") + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + pH + Conductivity + Q("Flow River") + Nitrate + Ammonium + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict["Timestamp:Year + C(Year) + All Features"] = results.aic

# Remove Ammonium

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Water Temperature") + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + pH + Conductivity + Q("Flow River") + Nitrate + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict["Timestamp:Year + C(Year) - Ammonium"] = results.aic

# Remove Nitrate

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Water Temperature") + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + pH + Conductivity + Q("Flow River") + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict["Timestamp:Year + C(Year) - Ammonium - Nitrate"] = results.aic

# Remove Flow River

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Water Temperature") + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + pH + Conductivity + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict[
    "Timestamp:Year + C(Year) - Ammonium - Nitrate - Flow River"
] = results.aic

# Remove pH

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Water Temperature") + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + Conductivity + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict[
    "Timestamp:Year + C(Year) - Ammonium - Nitrate - Flow River - pH"
] = results.aic

# Remove Water Temperature

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + Conductivity + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict[
    "Timestamp:Year + C(Year) - Ammonium - Nitrate - Flow River - pH - Water Temperature"
] = results.aic

# Set Years with p-value > 0.05 to 2012

In [None]:
# the years are 2014, 2015, 2016, 2018, 2019
# set these years to 2012
change_full_df = full_df.copy()
change_full_df.loc[
    change_full_df["Year"].isin([2014, 2015, 2016, 2018, 2019]), "Year"
] = 2012

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + Conductivity + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=change_full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

In [None]:
aic_dict[
    "Timestamp:Year + C(Year) - Ammonium - Nitrate - Flow River - pH - Water Temperature - Non Significant Years"
] = results.aic

# See Results and choose best AIC

In [None]:
aic_df = pd.DataFrame(aic_dict.items(), columns=["Model", "AIC"])

In [None]:
aic_df

In [None]:
# The one with all the features has almost the lowest AIC and it has the maximum likelihood
# so we will use it

In [None]:
formula = 'is_outlier ~ Timestamp:Year + C(Year) + Q("Water Temperature") + Q("Daily Cumulated Rainfall") + Q("Air Temperature") + pH + Conductivity + Q("Flow River") + Nitrate + Ammonium + Q("Dissolved Oxygen") + Turbidity + Q("Redox Potential")'

In [None]:
logit_model = sm.GLM.from_formula(
    formula=formula, data=full_df, family=sm.families.Binomial()
)

In [None]:
results = logit_model.fit(maxiter=10000)

In [None]:
print(results.summary2())

In [None]:
# Top: UVA254 with flagged outliers in red; bottom: fitted outlier probability.
# Other cells of this notebook read the absorbance from full_df["UVA254"];
# "Absorbance 254nm" looks like a stale column name, so "UVA254" is used here.
fig, axs = plt.subplots(2, 1, figsize=(20, 10))

sns.lineplot(x="DateTime", y="UVA254", data=full_df, ax=axs[0])
sns.scatterplot(
    x="DateTime",
    y="UVA254",
    data=full_df[full_df["is_outlier"] == 1],
    ax=axs[0],
    color="red",
)
sns.lineplot(
    x=full_df["DateTime"], y=results.fittedvalues.values, ax=axs[1]
)

# Mark the start of each year on both panels.
for year in full_df["DateTime"].dt.year.unique():
    for ax in axs:
        ax.axvline(
            pd.to_datetime(str(year)), color="gray", linestyle="--"
        )

plt.show()

# Trend Analysis of probabilities

In [None]:
import statsmodels.tsa.seasonal as smt

In [None]:
# Pair every timestamp with its fitted probability; the assignment aligns
# the fitted values to full_df's index.
fitted_df = full_df[["DateTime"]].assign(
    Probabilities=results.fittedvalues
)

In [None]:
# Index the probabilities by their timestamp.
fitted_df = fitted_df.set_index("DateTime")

In [None]:
# STL decomposition of the fitted probabilities; the result exposes the
# observed, trend, seasonal and residual series plotted below.
# NOTE(review): period=365 presumably assumes one sample per day (yearly
# seasonality) — confirm against the sampling frequency of full_df.
result = smt.STL(fitted_df["Probabilities"], period=365).fit()

In [None]:
# Plot the four STL series of the fitted probabilities, one per panel:
# observed, trend, seasonal and residual.
fig, axs = plt.subplots(4, 1, figsize=(40, 20))
stl_components = (
    result.observed,
    result.trend,
    result.seasonal,
    result.resid,
)
for ax, component in zip(axs, stl_components):
    sns.lineplot(data=component, ax=ax)

# Use an explicit title: `feature` is a loop variable owned by other cells and
# is unrelated to the series decomposed here (it may not even be defined).
fig.suptitle("Outlier probabilities - STL decomposition")

plt.show()

# Probabilities with other variables

In [None]:
# List the columns available in full_df.
full_df.columns.to_list()

In [None]:
# For every explanatory variable: plot its time series above the fitted
# outlier probabilities, put their Spearman correlation in the title and save
# the figure as "<feature>.png" inside `probabilities_folder`.
excluded_columns = [
    "DateTime",
    "is_outlier",
    "Year",
    "Timestamp",
    "Absorbance 254nm",
]

# Saving fails if the target folder is missing, so create it up front.
os.makedirs(probabilities_folder, exist_ok=True)

# Loop-invariant values, computed once.
fitted_probabilities = results.fittedvalues
year_starts = [
    pd.to_datetime(str(year))
    for year in full_df["DateTime"].dt.year.unique()
]

for feature in full_df.columns.difference(excluded_columns):
    fig, axs = plt.subplots(2, 1, figsize=(30, 15))

    sns.lineplot(x="DateTime", y=feature, data=full_df, ax=axs[0])
    sns.lineplot(
        x=full_df["DateTime"], y=fitted_probabilities.values, ax=axs[1]
    )

    # Mark the start of every calendar year on both panels.
    for year_start in year_starts:
        for ax in axs:
            ax.axvline(year_start, color="gray", linestyle="--")

    # Spearman correlation between the feature and the fitted probabilities.
    spearman_correlation = full_df[feature].corr(
        fitted_probabilities, method="spearman"
    )
    # Built-in round() also works when corr() returns a plain float NaN,
    # which has no `.round` method.
    axs[0].set_title(
        f"{feature} - Spearman Correlation: {round(spearman_correlation, 3)}"
    )

    fig.savefig(os.path.join(probabilities_folder, f"{feature}.png"))
    plt.show()
    # Release the figure so one open figure per feature does not pile up.
    plt.close(fig)

# Outliers Distribution

In [None]:
# Display full_df before selecting the columns used for the distributions.
full_df

In [None]:
# Copy of the columns used by the per-variable outlier distribution plots.
# NOTE(review): these names ("UVA254", units in parentheses) differ from the
# ones used by the cells above ("Absorbance 254nm", "Air Temperature", ...).
# Presumably full_df is renamed or rebuilt in between — confirm, otherwise
# this selection raises a KeyError.
projections_df = full_df[
    [
        "DateTime",
        "Year",
        "Timestamp",
        "UVA254",
        "Air Temperature (°C)",
        "Daily Cumulated Rainfall (L/m²)",
        "Flow River (m³/s)",
        "Water Temperature (°C)",
        "is_outlier",
    ]
].copy()

## Air Temperature

In [None]:
# Independent copy holding the timestamp, the air temperature and the flag.
air_temp_columns = ["DateTime", "Air Temperature (°C)", "is_outlier"]
air_temp_df = projections_df.loc[:, air_temp_columns].copy()

In [None]:
# Replace the 0/1 outlier flag with the labels shown in the plots.
outlier_labels = {0: "UVA254 Regular", 1: "UVA254 Peak"}
air_temp_df["is_outlier"] = air_temp_df["is_outlier"].map(outlier_labels)

In [None]:
# Boxplot of air temperature for regular samples vs UVA254 peaks, annotated
# with the number of samples in each group.
plt.figure(figsize=(20, 10))

# Pin colour and position to each label so they do not depend on which label
# happens to appear first in the data.
palette = {"UVA254 Regular": "g", "UVA254 Peak": "b"}

sns.boxplot(
    x="is_outlier",
    y="Air Temperature (°C)",
    data=air_temp_df,
    order=list(palette),
    palette=palette,
    hue="is_outlier",
)

# value_counts() sorts by frequency, so read the counts by label rather than
# by position; .get() also tolerates a label that never occurs.
outlier_count = air_temp_df["is_outlier"].value_counts()

props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"N° Regulars: {outlier_count.get('UVA254 Regular', 0)}",
        f"N° Peaks: {outlier_count.get('UVA254 Peak', 0)}",
    )
)

# Put the counts in a text box, positioned in axes coordinates.
plt.text(
    0.7,
    0.95,
    text_string,
    transform=plt.gca().transAxes,
    fontsize=26,
    verticalalignment="top",
    bbox=props,
)

plt.xlabel("Type")

plt.show()

In [None]:
# Kernel density estimate of air temperature for each group, with the group
# mean (dashed line) and a band of +/- one standard deviation around it.
plt.figure(figsize=(20, 10))

air_temp_column = "Air Temperature (°C)"
# Fixed colour per label; the dict order is also the drawing order.
label_colors = {"UVA254 Peak": "b", "UVA254 Regular": "g"}

for category, color in label_colors.items():
    category_values = air_temp_df.loc[
        air_temp_df["is_outlier"] == category, air_temp_column
    ]
    if category_values.empty:
        # Nothing to draw for a label that does not occur.
        continue

    sns.kdeplot(
        category_values,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = category_values.mean()
    std = category_values.std()

    # Group mean.
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band does not depend
    # on the y-limits at the time it is drawn and does not alter autoscaling.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Air Temperature (°C) vs UVA254")
plt.xlabel("Air Temperature (°C)")
plt.ylabel("Density")
plt.legend()
plt.show()

## Rainfall

In [None]:
# Independent copy holding the timestamp, the daily rainfall and the flag.
rainfall_columns = [
    "DateTime",
    "Daily Cumulated Rainfall (L/m²)",
    "is_outlier",
]
rainfall_df = projections_df.loc[:, rainfall_columns].copy()

In [None]:
# Replace the 0/1 outlier flag with the labels shown in the plots.
outlier_labels = {0: "UVA254 Regular", 1: "UVA254 Peak"}
rainfall_df["is_outlier"] = rainfall_df["is_outlier"].map(outlier_labels)

In [None]:
# Keep only the rows with strictly positive rainfall.
has_rain = rainfall_df["Daily Cumulated Rainfall (L/m²)"] > 0
rainfall_df = rainfall_df[has_rain]

In [None]:
# Boxplot of daily cumulated rainfall (values strictly between 0 and 30) for
# regular samples vs UVA254 peaks, annotated with the samples per group.
plt.figure(figsize=(20, 10))

rainfall_column = "Daily Cumulated Rainfall (L/m²)"

# Pin colour and position to each label so they do not depend on which label
# happens to appear first in the data.
palette = {"UVA254 Regular": "g", "UVA254 Peak": "b"}

# Build the plotted subset once so the annotated counts describe exactly the
# samples drawn in the boxes.
plotted_rainfall_df = rainfall_df[
    (rainfall_df[rainfall_column] > 0) & (rainfall_df[rainfall_column] < 30)
]

sns.boxplot(
    x="is_outlier",
    y=rainfall_column,
    data=plotted_rainfall_df,
    order=list(palette),
    palette=palette,
    hue="is_outlier",
)

# value_counts() sorts by frequency, so read the counts by label rather than
# by position; .get() also tolerates a label that never occurs.
outlier_count = plotted_rainfall_df["is_outlier"].value_counts()

props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"N° Regulars: {outlier_count.get('UVA254 Regular', 0)}",
        f"N° Peaks: {outlier_count.get('UVA254 Peak', 0)}",
    )
)

# Put the counts in a text box, positioned in axes coordinates.
plt.text(
    0.7,
    0.95,
    text_string,
    transform=plt.gca().transAxes,
    fontsize=26,
    verticalalignment="top",
    bbox=props,
)

plt.xlabel("Type")
plt.ylabel("Rainfall (mm)")

plt.show()

In [None]:
# Kernel density estimate of daily cumulated rainfall for each group, with the
# group mean (dashed line) and a band of +/- one standard deviation around it.
plt.figure(figsize=(20, 10))

rainfall_column = "Daily Cumulated Rainfall (L/m²)"
# Fixed colour per label; the dict order is also the drawing order.
label_colors = {"UVA254 Peak": "b", "UVA254 Regular": "g"}

for category, color in label_colors.items():
    category_values = rainfall_df.loc[
        rainfall_df["is_outlier"] == category, rainfall_column
    ]
    if category_values.empty:
        # Nothing to draw for a label that does not occur.
        continue

    sns.kdeplot(
        category_values,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = category_values.mean()
    std = category_values.std()

    # Group mean.
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band does not depend
    # on the y-limits at the time it is drawn and does not alter autoscaling.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Daily Cumulated Rainfall (mm) vs UVA254")
plt.xlabel("Daily Cumulated Rainfall (mm)")
plt.ylabel("Density")
plt.legend()
plt.show()

## Flow River

In [None]:
# Independent copy holding the timestamp, the river flow and the flag.
flow_columns = ["DateTime", "Flow River (m³/s)", "is_outlier"]
flow_df = projections_df.loc[:, flow_columns].copy()

In [None]:
# Replace the 0/1 outlier flag with the labels shown in the plots.
outlier_labels = {0: "UVA254 Regular", 1: "UVA254 Peak"}
flow_df["is_outlier"] = flow_df["is_outlier"].map(outlier_labels)

In [None]:
# Boxplot of river flow for regular samples vs UVA254 peaks, annotated with
# the number of samples in each group.
plt.figure(figsize=(20, 10))

# Pin colour and position to each label so they do not depend on which label
# happens to appear first in the data.
palette = {"UVA254 Regular": "g", "UVA254 Peak": "b"}

sns.boxplot(
    x="is_outlier",
    y="Flow River (m³/s)",
    data=flow_df,
    order=list(palette),
    palette=palette,
    hue="is_outlier",
)

# value_counts() sorts by frequency, so read the counts by label rather than
# by position; .get() also tolerates a label that never occurs.
outlier_count = flow_df["is_outlier"].value_counts()

props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"N° Regulars: {outlier_count.get('UVA254 Regular', 0)}",
        f"N° Peaks: {outlier_count.get('UVA254 Peak', 0)}",
    )
)

# Put the counts in a text box, positioned in axes coordinates.
plt.text(
    0.3,
    0.95,
    text_string,
    transform=plt.gca().transAxes,
    fontsize=26,
    verticalalignment="top",
    bbox=props,
)

plt.xlabel("Type")

plt.show()

In [None]:
# Kernel density estimate of river flow for each group, with the group mean
# (dashed line) and a band of +/- one standard deviation around it.
plt.figure(figsize=(20, 10))

flow_column = "Flow River (m³/s)"
# Fixed colour per label; the dict order is also the drawing order.
label_colors = {"UVA254 Peak": "b", "UVA254 Regular": "g"}

for category, color in label_colors.items():
    category_values = flow_df.loc[
        flow_df["is_outlier"] == category, flow_column
    ]
    if category_values.empty:
        # Nothing to draw for a label that does not occur.
        continue

    sns.kdeplot(
        category_values,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = category_values.mean()
    std = category_values.std()

    # Group mean.
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band does not depend
    # on the y-limits at the time it is drawn and does not alter autoscaling.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Flow River (m³/s) vs UVA254")
plt.xlabel("Flow River (m³/s)")
plt.ylabel("Density")
plt.legend()
plt.show()

## Water Temperature

In [None]:
# Independent copy holding the timestamp, the water temperature and the flag.
water_temp_columns = ["DateTime", "Water Temperature (°C)", "is_outlier"]
water_temp_df = projections_df.loc[:, water_temp_columns].copy()

In [None]:
# Replace the 0/1 outlier flag with the labels shown in the plots.
outlier_labels = {0: "UVA254 Regular", 1: "UVA254 Peak"}
water_temp_df["is_outlier"] = water_temp_df["is_outlier"].map(
    outlier_labels
)

In [None]:
# Boxplot of water temperature for regular samples vs UVA254 peaks, annotated
# with the number of samples in each group.
plt.figure(figsize=(20, 10))

# Pin colour and position to each label so they do not depend on which label
# happens to appear first in the data.
palette = {"UVA254 Regular": "g", "UVA254 Peak": "b"}

sns.boxplot(
    x="is_outlier",
    y="Water Temperature (°C)",
    data=water_temp_df,
    order=list(palette),
    palette=palette,
    hue="is_outlier",
)

# value_counts() sorts by frequency, so read the counts by label rather than
# by position; .get() also tolerates a label that never occurs.
outlier_count = water_temp_df["is_outlier"].value_counts()

props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)

text_string = "\n".join(
    (
        f"N° Regulars: {outlier_count.get('UVA254 Regular', 0)}",
        f"N° Peaks: {outlier_count.get('UVA254 Peak', 0)}",
    )
)

# Put the counts in a text box, positioned in axes coordinates.
plt.text(
    0.7,
    0.95,
    text_string,
    transform=plt.gca().transAxes,
    fontsize=26,
    verticalalignment="top",
    bbox=props,
)

plt.xlabel("Type")

plt.show()

In [None]:
# Kernel density estimate of water temperature for each group, with the group
# mean (dashed line) and a band of +/- one standard deviation around it.
plt.figure(figsize=(20, 10))

water_temp_column = "Water Temperature (°C)"
# Fixed colour per label; the dict order is also the drawing order.
label_colors = {"UVA254 Peak": "b", "UVA254 Regular": "g"}

for category, color in label_colors.items():
    category_values = water_temp_df.loc[
        water_temp_df["is_outlier"] == category, water_temp_column
    ]
    if category_values.empty:
        # Nothing to draw for a label that does not occur.
        continue

    sns.kdeplot(
        category_values,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = category_values.mean()
    std = category_values.std()

    # Group mean.
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band does not depend
    # on the y-limits at the time it is drawn and does not alter autoscaling.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Water Temperature (°C) vs UVA254")
plt.xlabel("Water Temperature (°C)")
plt.ylabel("Density")
plt.legend()
plt.show()