For the available projections, each calendar month is resampled to yearly values and linearly interpolated across years in order to obtain monthly projections ranging from 2000 to 2100.

A trend analysis of the Tarragona data and of the projection data is then performed in order to assess whether the trends found in the Tarragona data are consistent with the projections.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.tsa.seasonal as smt
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler

import scipy.stats as stats
import pingouin as pg
import scikit_posthocs as sp

plt.rcParams.update({"font.size": 20})

# Define Paths

In [None]:
# Every input lives under the Tarragona data directory, two levels above the
# working directory.
data_folder = os.path.join("..", "..", "data", "tarragona")
raw_data_folder = os.path.join(data_folder, "raw_data")

# Future projections: one sub-folder per processing stage.
projection_folder = os.path.join(data_folder, "future_projections")
(
    processed_projections_folder,
    interpolated_projections_folder,
) = (
    os.path.join(projection_folder, stage)
    for stage in ("processed", "interpolated")
)

# Load Data

In [None]:
# Xerta Data
# Read the raw observations workbook. The cells below rely on it providing a
# "DateTime" column plus the measurement columns that are aggregated monthly.
xerta_df = pd.read_excel(
    os.path.join(raw_data_folder, "raw_full_dataset.xlsx")
)

In [None]:
xerta_df

In [None]:
# Discard every row that contains at least one missing value.
xerta_df = xerta_df.dropna()

In [None]:
# Derive the calendar year and month that serve as grouping keys below.
# ``assign`` returns a new frame, so the columns are never written onto the
# object returned by ``dropna()`` (which can raise pandas'
# SettingWithCopyWarning).
xerta_df = xerta_df.assign(
    Year=xerta_df["DateTime"].dt.year,
    Month=xerta_df["DateTime"].dt.month,
)

In [None]:
# Collapse the observations to one row per (Year, Month).
# Every variable is averaged. Rainfall would more naturally be summed, but
# for the moment it is averaged as well, because the projections it is
# compared with are also monthly averages.
monthly_mean_columns = [
    "Flow River (m³/s)",
    "Daily Cumulated Rainfall (L/m²)",
    "Air Temperature (°C)",
    "Nitrate (mg/L)",
    "pH",
    "Ammonium (mg/L)",
    "Conductivity (µS/cm)",
    "Dissolved Oxygen (mg/L)",
    "Turbidity (NTU)",
    "Water Temperature (°C)",
    "Redox Potential (mV)",
    "UVA254",
]

xerta_monthly_df = xerta_df.groupby(["Year", "Month"]).agg(
    dict.fromkeys(monthly_mean_columns, "mean")
)

In [None]:
# Turn the (Year, Month) group index back into ordinary columns. From here on
# xerta_df holds the monthly aggregates instead of the raw observations.
xerta_df = xerta_monthly_df.reset_index()

In [None]:
xerta_df

In [None]:
# Give each monthly row a timestamp placed on the 15th of its month.
mid_month_parts = xerta_df[["Year", "Month"]].copy()
mid_month_parts["day"] = 15
xerta_df["DateTime"] = pd.to_datetime(mid_month_parts)

In [None]:
# One time-series figure per measured variable. The slice skips the two
# leading key columns (Year, Month) and the trailing DateTime column.
variable_columns = xerta_df.columns[2:-1]
for col in variable_columns:
    plt.figure(figsize=(10, 5))
    sns.lineplot(data=xerta_df, x="DateTime", y=col)
    plt.title(col)
    plt.show()

In [None]:
# Read the four processed projection workbooks.
# Air Temperature Projections
air_temp_df = pd.read_excel(
    os.path.join(processed_projections_folder, "air_temperature.xlsx")
)
# Rain Projections
rain_df = pd.read_excel(
    os.path.join(processed_projections_folder, "precipitation.xlsx")
)
# River Flow Projections
river_flow_df = pd.read_excel(
    os.path.join(processed_projections_folder, "river_discharge.xlsx")
)
# Water Temperature Projections
water_temp_df = pd.read_excel(
    os.path.join(processed_projections_folder, "water_temperature.xlsx")
)

# Air temperature, rain and river discharge get calendar keys derived from
# their "time" column.
# NOTE(review): water temperature is left untouched here, yet later cells
# filter it on "Month" and read "DateTime"/"Value" - presumably the workbook
# already provides those columns; confirm.
for projection_df in (air_temp_df, rain_df, river_flow_df):
    projection_df["Year"] = projection_df["time"].dt.year
    projection_df["Month"] = projection_df["time"].dt.month

In [None]:
air_temp_df

In [None]:
# Remove the grid-coordinate and height columns; they are not used below.
air_temp_df = air_temp_df.drop(
    columns=["y", "x", "lat", "lon", "height"]
)

In [None]:
rain_df

In [None]:
# Remove the grid-coordinate columns; they are not used below.
rain_df = rain_df.drop(columns=["y", "x", "lat", "lon"])

In [None]:
river_flow_df

In [None]:
# Remove the grid-coordinate columns; they are not used below.
river_flow_df = river_flow_df.drop(columns=["y", "x", "lat", "lon"])

In [None]:
water_temp_df

# Data Interpolation of Projections

Linear interpolation across years for each calendar month

## Air Temperature

In [None]:
# Split the air-temperature projections by scenario; each subset also keeps
# the rows labelled "historical".
rcp45_df = air_temp_df[air_temp_df["label"].isin(["rcp45", "historical"])]
rcp85_df = air_temp_df[air_temp_df["label"].isin(["rcp85", "historical"])]

In [None]:
plt.rcParams.update({"font.size": 28})

In [None]:
# Plot the historical series and both scenarios of the air-temperature
# projections in a single figure.

# Line/marker colour used for each label.
colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in air_temp_df["label"].unique():
    # One line (plus markers) is drawn per (label, year) pair, so the points
    # of different years are not connected to each other.
    for year in air_temp_df["time"].dt.year.unique():
        temp_df = air_temp_df[
            (air_temp_df["label"] == label)
            & (air_temp_df["time"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["time"],
            y=temp_df["tas_ymonmean"],
            color=colors[label],
            linewidth=3,
        )

        sns.scatterplot(
            x=temp_df["time"],
            y=temp_df["tas_ymonmean"],
            color=colors[label],
            s=150,
        )

    # Empty line whose only purpose is to add one legend entry per label.
    if label == "historical":
        plt.plot(
            [], [], color=colors[label], label="historical Copernicus"
        )
    else:
        plt.plot([], [], color=colors[label], label=label)

plt.xlabel("Time")
plt.ylabel("Air Temperature (°C)")

plt.title("Average Monthly Air Temperature at Xerta")

plt.legend()
plt.show()

In [None]:
# Plot the RCP 4.5 air-temperature projections together with the historical
# data.

# Line/marker colour used for each label.
colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp45_df["label"].unique():
    # One line is drawn per (label, year) pair, so the points of different
    # years are not connected to each other.
    for year in rcp45_df["time"].dt.year.unique():
        temp_df = rcp45_df[
            (rcp45_df["label"] == label)
            & (rcp45_df["time"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["time"],
            y=temp_df["tas_ymonmean"],
            color=colors[label],
            marker="o",
        )

    # Empty line whose only purpose is to add one legend entry per label.
    plt.plot([], [], color=colors[label], label=label)

plt.xlabel("Time")
plt.ylabel("Air Temperature (°C)")

plt.title("Average Monthly Air Temperature at Xerta")

plt.legend()
plt.show()

In [None]:
# Plot the RCP 8.5 air-temperature projections together with the historical
# data. The unused RCP 4.5 selection that used to overwrite temp_df at the
# end of every iteration has been removed.

# Line/marker colour used for each label.
colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp85_df["label"].unique():
    # One line is drawn per (label, year) pair, so the points of different
    # years are not connected to each other.
    for year in rcp85_df["time"].dt.year.unique():
        temp_df = rcp85_df[
            (rcp85_df["label"] == label)
            & (rcp85_df["time"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["time"],
            y=temp_df["tas_ymonmean"],
            color=colors[label],
            marker="o",
        )

    # Empty line whose only purpose is to add one legend entry per label.
    plt.plot([], [], color=colors[label], label=label)

plt.xlabel("Time")
plt.ylabel("Air Temperature (°C)")

plt.legend()
plt.show()

### RCP 4.5

In [None]:
# Build a yearly air-temperature series for every calendar month from
# rcp45_df (RCP 4.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp45_df[rcp45_df["Month"] == month][
        ["time", "tas_ymonmean"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("time")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

air_temp_rcp45_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = air_temp_rcp45_df[air_temp_rcp45_df["Month"] == month]
    sns.lineplot(
        x="time",
        y="tas_ymonmean",
        data=month_df,
        label=f"Month {month}",
    )

# plot vline on year 2040 and 2070
timestamp = air_temp_rcp45_df[
    air_temp_rcp45_df["time"].dt.year == 2040
]["time"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = air_temp_rcp45_df[
    air_temp_rcp45_df["time"].dt.year == 2070
]["time"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Air Temperature Projections for RCP 4.5")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

### RCP 8.5

In [None]:
# Build a yearly air-temperature series for every calendar month from
# rcp85_df (RCP 8.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp85_df[rcp85_df["Month"] == month][
        ["time", "tas_ymonmean"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("time")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

air_temp_rcp85_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = air_temp_rcp85_df[air_temp_rcp85_df["Month"] == month]
    sns.lineplot(
        x="time",
        y="tas_ymonmean",
        data=month_df,
        label=f"Month {month}",
    )

# plot vline on year 2040 and 2070
timestamp = air_temp_rcp85_df[
    air_temp_rcp85_df["time"].dt.year == 2040
]["time"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = air_temp_rcp85_df[
    air_temp_rcp85_df["time"].dt.year == 2070
]["time"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Air Temperature Projections for RCP 8.5")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

## Rainfall

In [None]:
# Split the rainfall projections by scenario; each subset also keeps the rows
# labelled "historical".
rcp45_df = rain_df[rain_df["label"].isin(["rcp45", "historical"])]
rcp85_df = rain_df[rain_df["label"].isin(["rcp85", "historical"])]

In [None]:
# plot the raw data together with the projections and the historical data

colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp45_df["label"].unique():
    for year in rcp45_df["time"].dt.year.unique():
        temp_df = rcp45_df[
            (rcp45_df["label"] == label)
            & (rcp45_df["time"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["time"],
            y=temp_df["pr_ymonmean"],
            color=colors[label],
            marker="o",
        )

    # set label
    plt.plot([], [], color=colors[label], label=label)

# sns.lineplot(x=full_df[full_df['Flow River'] < 700]['DateTime'], y=full_df[full_df['Flow River'] < 700]['Flow River'], color='black', marker='o', label='Observed')

plt.xlabel("Time")
plt.ylabel("Rainfall (mm)")

plt.title("Average Monthly Rainfall at Xerta")

plt.legend()
plt.show()

In [None]:
# plot the raw data together with the projections and the historical data

colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp85_df["label"].unique():
    for year in rcp85_df["time"].dt.year.unique():
        temp_df = rcp85_df[
            (rcp85_df["label"] == label)
            & (rcp85_df["time"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["time"],
            y=temp_df["pr_ymonmean"],
            color=colors[label],
            marker="o",
        )

    # set label
    plt.plot([], [], color=colors[label], label=label)

# sns.lineplot(x=full_df[full_df['Flow River'] < 700]['DateTime'], y=full_df[full_df['Flow River'] < 700]['Flow River'], color='black', marker='o', label='Observed')

plt.xlabel("Time")
plt.ylabel("Rainfall (mm)")

plt.legend(loc="upper left")
plt.show()

### RCP 4.5

In [None]:
# Build a yearly rainfall series for every calendar month from rcp45_df
# (RCP 4.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp45_df[rcp45_df["Month"] == month][
        ["time", "pr_ymonmean"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("time")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

rain_rcp45_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = rain_rcp45_df[rain_rcp45_df["Month"] == month]
    sns.lineplot(
        x="time", y="pr_ymonmean", data=month_df, label=f"Month {month}"
    )

# plot vline on year 2040 and 2070
timestamp = rain_rcp45_df[rain_rcp45_df["time"].dt.year == 2040][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = rain_rcp45_df[rain_rcp45_df["time"].dt.year == 2070][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Rainfall Projections for RCP 4.5")
plt.xlabel("Year")
plt.ylabel("Rainfall (mm)")
plt.show()

### RCP 8.5

In [None]:
# Build a yearly rainfall series for every calendar month from rcp85_df
# (RCP 8.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp85_df[rcp85_df["Month"] == month][
        ["time", "pr_ymonmean"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("time")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

rain_rcp85_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = rain_rcp85_df[rain_rcp85_df["Month"] == month]
    sns.lineplot(
        x="time", y="pr_ymonmean", data=month_df, label=f"Month {month}"
    )

# plot vline on year 2040 and 2070
timestamp = rain_rcp85_df[rain_rcp85_df["time"].dt.year == 2040][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = rain_rcp85_df[rain_rcp85_df["time"].dt.year == 2070][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Rainfall Projections for RCP 8.5")
plt.xlabel("Year")
plt.ylabel("Rainfall (mm)")
plt.show()

## Flow River

In [None]:
# Split the river-discharge projections by scenario; each subset also keeps
# the rows labelled "historical".
rcp45_df = river_flow_df[
    river_flow_df["label"].isin(["rcp45", "historical"])
]
rcp85_df = river_flow_df[
    river_flow_df["label"].isin(["rcp85", "historical"])
]

In [None]:
# plot the raw data together with the projections and the historical data

colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp45_df["label"].unique():
    for year in rcp45_df["time"].dt.year.unique():
        temp_df = rcp45_df[
            (rcp45_df["label"] == label)
            & (rcp45_df["time"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["time"],
            y=temp_df["rdis_ymonmean"],
            color=colors[label],
            marker="o",
        )

    # set label
    plt.plot([], [], color=colors[label], label=label)

# sns.lineplot(x=full_df[full_df['Flow River'] < 700]['DateTime'], y=full_df[full_df['Flow River'] < 700]['Flow River'], color='black', marker='o', label='Observed')

plt.xlabel("Time")
plt.ylabel("Flow River Rate (m³/s)")

plt.title("Average Monthly Flow River Rate at Xerta")

plt.legend()
plt.show()

In [None]:
# plot the raw data together with the projections and the historical data

colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp85_df["label"].unique():
    for year in rcp85_df["time"].dt.year.unique():
        temp_df = rcp85_df[
            (rcp85_df["label"] == label)
            & (rcp85_df["time"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["time"],
            y=temp_df["rdis_ymonmean"],
            color=colors[label],
            marker="o",
        )

    # set label
    plt.plot([], [], color=colors[label], label=label)

# sns.lineplot(x=full_df[full_df['Flow River'] < 700]['DateTime'], y=full_df[full_df['Flow River'] < 700]['Flow River'], color='black', marker='o', label='Observed')

plt.xlabel("Time")
plt.ylabel("Flow River Rate (m³/s)")

plt.legend(loc="upper right")
plt.show()

### RCP 4.5

In [None]:
# Build a yearly river-discharge series for every calendar month from
# rcp45_df (RCP 4.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp45_df[rcp45_df["Month"] == month][
        ["time", "rdis_ymonmean"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("time")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

flow_rcp45_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = flow_rcp45_df[flow_rcp45_df["Month"] == month]
    sns.lineplot(
        x="time",
        y="rdis_ymonmean",
        data=month_df,
        label=f"Month {month}",
    )

# plot vline on year 2040 and 2070
timestamp = flow_rcp45_df[flow_rcp45_df["time"].dt.year == 2040][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = flow_rcp45_df[flow_rcp45_df["time"].dt.year == 2070][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Flow River Rate Projections for RCP 4.5")
plt.xlabel("Year")
plt.ylabel("Flow River Rate (m³/s)")
plt.show()

### RCP 8.5

In [None]:
# Build a yearly river-discharge series for every calendar month from
# rcp85_df (RCP 8.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp85_df[rcp85_df["Month"] == month][
        ["time", "rdis_ymonmean"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("time")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

flow_rcp85_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = flow_rcp85_df[flow_rcp85_df["Month"] == month]
    sns.lineplot(
        x="time",
        y="rdis_ymonmean",
        data=month_df,
        label=f"Month {month}",
    )

# plot vline on year 2040 and 2070
timestamp = flow_rcp85_df[flow_rcp85_df["time"].dt.year == 2040][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = flow_rcp85_df[flow_rcp85_df["time"].dt.year == 2070][
    "time"
].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Flow River Rate Projections for RCP 8.5")
plt.xlabel("Year")
plt.ylabel("Flow River Rate (m³/s)")
plt.show()

## Water Temperature

In [None]:
# Split the water-temperature projections by scenario; each subset also keeps
# the rows labelled "historical".
rcp45_df = water_temp_df[
    water_temp_df["label"].isin(["rcp45", "historical"])
]
rcp85_df = water_temp_df[
    water_temp_df["label"].isin(["rcp85", "historical"])
]

In [None]:
# plot the raw data together with the projections and the historical data

colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp45_df["label"].unique():
    for year in rcp45_df["DateTime"].dt.year.unique():
        temp_df = rcp45_df[
            (rcp45_df["label"] == label)
            & (rcp45_df["DateTime"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["DateTime"],
            y=temp_df["Value"],
            color=colors[label],
            marker="o",
        )

    # set label
    plt.plot([], [], color=colors[label], label=label)

# sns.lineplot(x=full_df[full_df['Flow River'] < 700]['DateTime'], y=full_df[full_df['Flow River'] < 700]['Flow River'], color='black', marker='o', label='Observed')

plt.xlabel("Time")
plt.ylabel("Water Temperature (°C)")

plt.title("Average Monthly Water Temperature at Xerta")

plt.legend()
plt.show()

In [None]:
# plot the raw data together with the projections and the historical data

colors = {
    "historical": "blue",
    "rcp45": "green",
    "rcp85": "red",
}

plt.figure(figsize=(30, 7.5))
for label in rcp85_df["label"].unique():
    for year in rcp85_df["DateTime"].dt.year.unique():
        temp_df = rcp85_df[
            (rcp85_df["label"] == label)
            & (rcp85_df["DateTime"].dt.year == year)
        ]
        sns.lineplot(
            x=temp_df["DateTime"],
            y=temp_df["Value"],
            color=colors[label],
            marker="o",
        )

    # set label
    plt.plot([], [], color=colors[label], label=label)

# sns.lineplot(x=full_df[full_df['Flow River'] < 700]['DateTime'], y=full_df[full_df['Flow River'] < 700]['Flow River'], color='black', marker='o', label='Observed')

plt.xlabel("Time")
plt.ylabel("Water Temperature (°C)")

# set legend top left
plt.legend(loc="upper left")
plt.show()

### RCP 4.5

In [None]:
# Build a yearly water-temperature series for every calendar month from
# rcp45_df (RCP 4.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp45_df[rcp45_df["Month"] == month][
        ["DateTime", "Value"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("DateTime")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

water_temp_rcp45_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = water_temp_rcp45_df[
        water_temp_rcp45_df["Month"] == month
    ]
    sns.lineplot(
        x="DateTime",
        y="Value",
        data=month_df,
        label=f"Month {month}",
    )

# plot vline on year 2040 and 2070
timestamp = water_temp_rcp45_df[
    water_temp_rcp45_df["DateTime"].dt.year == 2040
]["DateTime"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = water_temp_rcp45_df[
    water_temp_rcp45_df["DateTime"].dt.year == 2070
]["DateTime"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Water Temperature Projections for RCP 4.5")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

### RCP 8.5

In [None]:
# Build a yearly water-temperature series for every calendar month from
# rcp85_df (RCP 8.5 plus historical rows).
# The per-month frames are collected and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration
# and starts from an empty frame, which can trigger a FutureWarning in recent
# pandas versions.
monthly_frames = []

for month in range(1, 13):
    month_df = rcp85_df[rcp85_df["Month"] == month][
        ["DateTime", "Value"]
    ]
    # Resample to one value per year, then fill the years without data by
    # linear interpolation (gaps at either end are filled as well).
    month_df = month_df.set_index("DateTime")
    month_df = month_df.resample("Y").mean()
    month_df = month_df.interpolate(
        method="linear", limit_direction="both"
    )
    month_df = month_df.reset_index()
    month_df["Month"] = month

    monthly_frames.append(month_df)

water_temp_rcp85_df = pd.concat(monthly_frames)

In [None]:
plt.figure(figsize=(20, 10))

for month in range(1, 13):
    month_df = water_temp_rcp85_df[
        water_temp_rcp85_df["Month"] == month
    ]
    sns.lineplot(
        x="DateTime",
        y="Value",
        data=month_df,
        label=f"Month {month}",
    )

# plot vline on year 2040 and 2070
timestamp = water_temp_rcp85_df[
    water_temp_rcp85_df["DateTime"].dt.year == 2040
]["DateTime"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")
timestamp = water_temp_rcp85_df[
    water_temp_rcp85_df["DateTime"].dt.year == 2070
]["DateTime"].values[0]
plt.axvline(x=timestamp, color="r", linestyle="--")

plt.title("Month-wise Water Temperature Projections for RCP 8.5")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

In [None]:
plt.rcParams.update({"font.size": 26})

# Trend Analysis

## Air Temperature

### Xerta

In [None]:
# Keep only the keys, the timestamp and the observed air temperature.
air_temperature_columns = [
    "Year",
    "Month",
    "Air Temperature (°C)",
    "DateTime",
]
temp_df = xerta_df.loc[:, air_temperature_columns]

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Air Temperature (°C)",
    data=temp_df,
    label="Historical Data",
)

In [None]:
# Classical decomposition of the observed monthly air temperature with a
# 12-sample period.
# An additive model is used, as for the RCP 4.5 / RCP 8.5 series further down,
# so that every series is decomposed the same way. statsmodels also rejects a
# multiplicative model whenever the series contains zero or negative values,
# which a temperature in °C can take. The trend is a centred moving average
# in either model, so only the seasonal and residual components differ.
result = smt.seasonal_decompose(
    x=temp_df.set_index("DateTime")["Air Temperature (°C)"],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
# Keep the three components of the decomposition of the observed series.
data_trend, data_seasonal, data_residual = (
    result.trend,
    result.seasonal,
    result.resid,
)

In [None]:
plt.figure(figsize=(20, 10))

plt.subplot(411)
plt.plot(
    temp_df.set_index("DateTime")["Air Temperature (°C)"],
    label="Original",
)
plt.legend(loc="best")
plt.subplot(412)
plt.plot(data_trend, label="Trend")
plt.legend(loc="best")
plt.subplot(413)
plt.plot(data_seasonal, label="Seasonal")
plt.legend(loc="best")
plt.subplot(414)
plt.plot(data_residual, label="Residual")
plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Air Temperature (°C)",
    data=temp_df,
    label="Historical Data",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Trend",
)

plt.title("Air Temperature Trend")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

### RCP 4.5

In [None]:
# Rebuild a monthly timestamp: keep the year carried by "time" and move it to
# the 15th of the row's calendar month, then order the rows chronologically.
air_temp_rcp45_df["DateTime"] = air_temp_rcp45_df.apply(
    lambda rec: rec["time"].replace(month=int(rec["Month"]), day=15),
    axis=1,
)
air_temp_rcp45_df = air_temp_rcp45_df.sort_values(by="DateTime")

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="tas_ymonmean",
    data=air_temp_rcp45_df,
    label="RCP 4.5",
    color="g",
)

sns.lineplot(
    x="DateTime",
    y="Air Temperature (°C)",
    data=temp_df,
    label="Xerta Data",
    color="black",
)

plt.title("Air Temperature")
plt.xlabel("Year")
plt.ylabel("Air Temperature (°C)")

plt.show()

In [None]:
# Additive classical decomposition of the RCP 4.5 monthly air-temperature
# series with a 12-sample period. extrapolate_trend="freq" asks statsmodels
# to extrapolate the moving-average trend at both ends of the series.
result = smt.seasonal_decompose(
    x=air_temp_rcp45_df.set_index("DateTime")[["tas_ymonmean"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
# Keep the three components of the RCP 4.5 decomposition.
rcp45_trend, rcp45_seasonal, rcp45_residual = (
    result.trend,
    result.seasonal,
    result.resid,
)

In [None]:
plt.figure(figsize=(20, 10))

plt.subplot(411)
plt.plot(
    air_temp_rcp45_df.set_index("DateTime")["tas_ymonmean"],
    label="Original",
)
plt.legend(loc="best")
plt.subplot(412)
plt.plot(rcp45_trend, label="Trend")
plt.legend(loc="best")
plt.subplot(413)
plt.plot(rcp45_seasonal, label="Seasonal")
plt.legend(loc="best")
plt.subplot(414)
plt.plot(rcp45_residual, label="Residual")
plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="tas_ymonmean",
    data=air_temp_rcp45_df,
    label="RCP 4.5",
)

sns.lineplot(
    x=air_temp_rcp45_df["DateTime"],
    y=rcp45_trend.values,
    label="Trend",
)

plt.title("Air Temperature Trend")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

### RCP 8.5

In [None]:
# Rebuild a monthly timestamp: keep the year carried by "time" and move it to
# the 15th of the row's calendar month, then order the rows chronologically.
air_temp_rcp85_df["DateTime"] = air_temp_rcp85_df.apply(
    lambda rec: rec["time"].replace(month=int(rec["Month"]), day=15),
    axis=1,
)
air_temp_rcp85_df = air_temp_rcp85_df.sort_values(by="DateTime")

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="tas_ymonmean",
    data=air_temp_rcp85_df,
    label="RCP 8.5",
)

sns.lineplot(
    x="DateTime",
    y="Air Temperature (°C)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

plt.xlabel("Year")
plt.ylabel("Air Temperature (°C)")

plt.show()

In [None]:
# Additive classical decomposition of the RCP 8.5 monthly air-temperature
# series with a 12-sample period. extrapolate_trend="freq" asks statsmodels
# to extrapolate the moving-average trend at both ends of the series.
result = smt.seasonal_decompose(
    x=air_temp_rcp85_df.set_index("DateTime")[["tas_ymonmean"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
# Keep the three components of the RCP 8.5 decomposition.
rcp85_trend, rcp85_seasonal, rcp85_residual = (
    result.trend,
    result.seasonal,
    result.resid,
)

In [None]:
plt.figure(figsize=(20, 10))

plt.subplot(411)
plt.plot(
    air_temp_rcp85_df.set_index("DateTime")["tas_ymonmean"],
    label="Original",
)
plt.legend(loc="best")
plt.subplot(412)
plt.plot(rcp85_trend, label="Trend")
plt.legend(loc="best")
plt.subplot(413)
plt.plot(rcp85_seasonal, label="Seasonal")
plt.legend(loc="best")
plt.subplot(414)
plt.plot(rcp85_residual, label="Residual")
plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="tas_ymonmean",
    data=air_temp_rcp85_df,
    label="RCP 8.5",
)

sns.lineplot(
    x=air_temp_rcp85_df["DateTime"],
    y=rcp85_trend.values,
    label="RCP 8.5 Trend",
)

sns.lineplot(
    x="DateTime",
    y="Air Temperature (°C)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Tarragona Trend",
    color="red",
)

plt.legend(loc="lower right")


plt.xlabel("Year")
plt.ylabel("Air Temperature (°C)")
plt.show()

### Compare Trends

Linear Regression estimation for each trend, then t-test on the slope.

#### Xerta

In [None]:
# Regress the trend of the observed series on time.
# The timestamps are converted to float and divided by 1e9 (seconds since the
# epoch, assuming the column is datetime64[ns]).
timestamps = temp_df["DateTime"].values.astype(float) / 10**9
df = pd.DataFrame({"timestamps": timestamps, "trend": data_trend.values})

# Standardise both columns (zero mean, unit variance).
# NOTE(review): with predictor and response both standardised, the fitted
# slope equals their correlation coefficient rather than a rate of change per
# unit time - keep this in mind when slopes are compared across data sets.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df[["timestamps", "trend"]] = scaled_data

regressors = sm.add_constant(df["timestamps"])
data_model = sm.OLS(df["trend"], regressors).fit()

In [None]:
print(data_model.summary())

In [None]:
data_intercept = data_model.params["const"]
data_slope = data_model.params["timestamps"]

#### RCP 4.5

In [None]:
# Regress the trend of the RCP 4.5 series on time.
# The timestamps are converted to float and divided by 1e9 (seconds since the
# epoch, assuming the column is datetime64[ns]).
timestamps = (
    air_temp_rcp45_df["DateTime"].values.astype(float) / 10**9
)
df = pd.DataFrame({"timestamps": timestamps, "trend": rcp45_trend.values})

# Standardise both columns (zero mean, unit variance).
# NOTE(review): with predictor and response both standardised, the fitted
# slope equals their correlation coefficient rather than a rate of change per
# unit time - keep this in mind when slopes are compared across data sets.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df[["timestamps", "trend"]] = scaled_data

regressors = sm.add_constant(df["timestamps"])
rcp45_model = sm.OLS(df["trend"], regressors).fit()

In [None]:
print(rcp45_model.summary())

In [None]:
rcp45_intercept = rcp45_model.params["const"]
rcp45_slope = rcp45_model.params["timestamps"]

#### RCP 8.5

In [None]:
# Regress the trend of the RCP 8.5 series on time.
# The timestamps are converted to float and divided by 1e9 (seconds since the
# epoch, assuming the column is datetime64[ns]).
timestamps = (
    air_temp_rcp85_df["DateTime"].values.astype(float) / 10**9
)
df = pd.DataFrame({"timestamps": timestamps, "trend": rcp85_trend.values})

# Standardise both columns (zero mean, unit variance).
# NOTE(review): with predictor and response both standardised, the fitted
# slope equals their correlation coefficient rather than a rate of change per
# unit time - keep this in mind when slopes are compared across data sets.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df[["timestamps", "trend"]] = scaled_data

regressors = sm.add_constant(df["timestamps"])
rcp85_model = sm.OLS(df["trend"], regressors).fit()

In [None]:
print(rcp85_model.summary())

In [None]:
rcp85_intercept = rcp85_model.params["const"]
rcp85_slope = rcp85_model.params["timestamps"]

#### Final Comparison

$t = \frac{b_1 - b_2}{\sqrt{s_{b_1}^2 + s_{b_2}^2}}$,
$df = n_1 + n_2 - 4$

In [None]:
# Draw the three fitted regression lines over a common standardised time
# axis, using each model's intercept and slope.
plt.figure(figsize=(20, 10))

xx = np.linspace(-2, 2, 100)
fitted_lines = (
    (data_intercept, data_slope, "Historical Data"),
    (rcp45_intercept, rcp45_slope, "RCP 4.5"),
    (rcp85_intercept, rcp85_slope, "RCP 8.5"),
)
for line_intercept, line_slope, line_label in fitted_lines:
    yy = line_intercept + line_slope * xx
    sns.lineplot(x=xx, y=yy, label=line_label)

plt.title("Air Temperature Trend Regression Lines")

plt.show()

##### Data vs RCP 4.5

In [None]:
# Standard error of the difference between the observed and RCP 4.5 slopes:
# the square root of the sum of the two squared slope standard errors.
data_slope_se = data_model.bse["timestamps"]
rcp45_slope_se = rcp45_model.bse["timestamps"]
se_diff = np.sqrt(data_slope_se**2 + rcp45_slope_se**2)

In [None]:
# Two-sided t-test for the difference between the observed and RCP 4.5 slopes.
t_stat = (data_slope - rcp45_slope) / se_diff
# Degrees of freedom: sum of the residual degrees of freedom of the two
# regressions being compared, i.e. (n1 - 2) + (n2 - 2) = n1 + n2 - 4.
# The RCP 4.5 model is used here (the RCP 8.5 model was referenced by
# mistake), read through the public ``df_resid`` attribute.
dof = data_model.df_resid + rcp45_model.df_resid

# Two-sided p-value from the survival function of the t-distribution.
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

##### Data vs RCP 8.5

In [None]:
# Standard error of the difference between the observed and RCP 8.5 slopes:
# the square root of the sum of the two squared slope standard errors.
data_slope_se = data_model.bse["timestamps"]
rcp85_slope_se = rcp85_model.bse["timestamps"]
se_diff = np.sqrt(data_slope_se**2 + rcp85_slope_se**2)

In [None]:
# Two-sided t-test for the difference between the observed and RCP 8.5 slopes.
t_stat = (data_slope - rcp85_slope) / se_diff
# Degrees of freedom: sum of the residual degrees of freedom of the two
# regressions being compared, i.e. (n1 - 2) + (n2 - 2) = n1 + n2 - 4.
# Read through the public ``df_resid`` attribute instead of the private
# ``_results`` object.
dof = data_model.df_resid + rcp85_model.df_resid
# Two-sided p-value from the survival function of the t-distribution.
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

## Rainfall

### Xerta

In [None]:
temp_df = xerta_df[
    ["Year", "Month", "Daily Cumulated Rainfall (L/m²)", "DateTime"]
]

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Daily Cumulated Rainfall (L/m²)",
    data=temp_df,
    label="Historical Data",
)

In [None]:
result = smt.seasonal_decompose(
    x=temp_df.set_index("DateTime")["Daily Cumulated Rainfall (L/m²)"],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
data_trend = result.trend
data_seasonal = result.seasonal
data_residual = result.resid

In [None]:
# Four stacked panels: the observed rainfall series followed by its trend,
# seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (
        411,
        temp_df.set_index("DateTime")["Daily Cumulated Rainfall (L/m²)"],
        "Original",
    ),
    (412, data_trend, "Trend"),
    (413, data_seasonal, "Seasonal"),
    (414, data_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Daily Cumulated Rainfall (L/m²)",
    data=temp_df,
    label="Historical Data",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Trend",
)

plt.title("Rainfall Trend")
plt.xlabel("Year")
plt.ylabel("Average Monthly Rainfall (mm)")
plt.show()

### RCP 4.5

In [None]:
# set the month to the time column
rain_rcp45_df["DateTime"] = rain_rcp45_df.apply(
    lambda row: row["time"].replace(month=int(row["Month"]), day=15),
    axis=1,
)
rain_rcp45_df.sort_values(by="DateTime", inplace=True)

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="pr_ymonmean",
    data=rain_rcp45_df,
    label="RCP 4.5",
    color="g",
)

sns.lineplot(
    x="DateTime",
    y="Daily Cumulated Rainfall (L/m²)",
    data=temp_df,
    label="Xerta Data",
    color="black",
)

plt.title("Rainfall")
plt.xlabel("Year")
plt.ylabel("Rainfall (mm)")

plt.show()

In [None]:
result = smt.seasonal_decompose(
    x=rain_rcp45_df.set_index("DateTime")[["pr_ymonmean"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
rcp45_trend = result.trend
rcp45_seasonal = result.seasonal
rcp45_residual = result.resid

In [None]:
# Four stacked panels: the RCP 4.5 rainfall projection followed by its trend,
# seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (411, rain_rcp45_df.set_index("DateTime")["pr_ymonmean"], "Original"),
    (412, rcp45_trend, "Trend"),
    (413, rcp45_seasonal, "Seasonal"),
    (414, rcp45_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime", y="pr_ymonmean", data=rain_rcp45_df, label="RCP 4.5"
)

sns.lineplot(
    x=rain_rcp45_df["DateTime"],
    y=rcp45_trend.values,
    label="Trend",
)

plt.title("Rainfall Trend")
plt.xlabel("Year")
plt.ylabel("Average Monthly Rainfall (mm)")
plt.show()

### RCP 8.5

In [None]:
# set the month to the time column
rain_rcp85_df["DateTime"] = rain_rcp85_df.apply(
    lambda row: row["time"].replace(month=int(row["Month"]), day=15),
    axis=1,
)
rain_rcp85_df.sort_values(by="DateTime", inplace=True)

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="pr_ymonmean",
    data=rain_rcp85_df,
    label="RCP 8.5",
)

sns.lineplot(
    x="DateTime",
    y="Daily Cumulated Rainfall (L/m²)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

plt.xlabel("Year")
plt.ylabel("Rainfall (mm)")

plt.show()

In [None]:
result = smt.seasonal_decompose(
    x=rain_rcp85_df.set_index("DateTime")[["pr_ymonmean"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
rcp85_trend = result.trend
rcp85_seasonal = result.seasonal
rcp85_residual = result.resid

In [None]:
# Four stacked panels: the RCP 8.5 rainfall projection followed by its trend,
# seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (411, rain_rcp85_df.set_index("DateTime")["pr_ymonmean"], "Original"),
    (412, rcp85_trend, "Trend"),
    (413, rcp85_seasonal, "Seasonal"),
    (414, rcp85_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime", y="pr_ymonmean", data=rain_rcp85_df, label="RCP 8.5"
)

sns.lineplot(
    x=rain_rcp85_df["DateTime"],
    y=rcp85_trend.values,
    label="RCP 8.5 Trend",
)

sns.lineplot(
    x="DateTime",
    y="Daily Cumulated Rainfall (L/m²)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Tarragona Trend",
    color="red",
)

plt.xlabel("Year")
plt.ylabel("Rainfall (mm)")
plt.show()

### Compare Trends

Linear Regression estimation for each trend, then t-test on the slope.

#### Xerta

In [None]:
# Fit a straight line to `data_trend` against the DateTime values of temp_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = temp_df["DateTime"].values.astype(float) / 10**9

df["timestamps"] = timestamps
df["trend"] = data_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
data_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(data_model.summary())

In [None]:
data_intercept = data_model.params["const"]
data_slope = data_model.params["timestamps"]

#### RCP 4.5

In [None]:
# Fit a straight line to `rcp45_trend` against the DateTime values of
# rain_rcp45_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = rain_rcp45_df["DateTime"].values.astype(float) / 10**9

df["timestamps"] = timestamps
df["trend"] = rcp45_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
rcp45_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(rcp45_model.summary())

In [None]:
rcp45_intercept = rcp45_model.params["const"]
rcp45_slope = rcp45_model.params["timestamps"]

#### RCP 8.5

In [None]:
# Fit a straight line to `rcp85_trend` against the DateTime values of
# rain_rcp85_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = rain_rcp85_df["DateTime"].values.astype(float) / 10**9

df["timestamps"] = timestamps
df["trend"] = rcp85_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
rcp85_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(rcp85_model.summary())

In [None]:
rcp85_intercept = rcp85_model.params["const"]
rcp85_slope = rcp85_model.params["timestamps"]

#### Final Comparison

$t = \frac{b_1 - b_2}{\sqrt{s_{b_1}^2 + s_{b_2}^2}}$,
$df = n_1 + n_2 - 4$

In [None]:
# Draw the three fitted regression lines over a common standardized time axis.
plt.figure(figsize=(20, 10))

xx = np.linspace(-2, 2, 100)
regression_lines = (
    ("Historical Data", data_intercept, data_slope),
    ("RCP 4.5", rcp45_intercept, rcp45_slope),
    ("RCP 8.5", rcp85_intercept, rcp85_slope),
)
for line_label, line_intercept, line_slope in regression_lines:
    # y = intercept + slope * x for the current fit
    yy = line_intercept + line_slope * xx
    sns.lineplot(x=xx, y=yy, label=line_label)

plt.title("Rainfall Trend Regression Lines")

plt.show()

##### Data vs RCP 4.5

In [None]:
# Calculate the standard error of the difference in slopes
se_diff = np.sqrt(
    data_model.bse["timestamps"] ** 2
    + rcp45_model.bse["timestamps"] ** 2
)

In [None]:
# Two-sided t-test for equality of the historical and RCP 4.5 slopes:
# t = (b1 - b2) / sqrt(se1^2 + se2^2).
t_stat = (data_slope - rcp45_slope) / se_diff
# compute the degrees of freedom for the t-distribution: the pooled residual
# degrees of freedom of the two fits being compared, (n1 - 2) + (n2 - 2).
# This has to come from the RCP 4.5 model (not the RCP 8.5 one), read through
# the public `df_resid` attribute.
dof = data_model.df_resid + rcp45_model.df_resid

# calculate the p-value (two-sided, hence the factor 2 on the upper tail)
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

##### Data vs RCP 8.5

In [None]:
# Calculate the standard error of the difference in slopes
se_diff = np.sqrt(
    data_model.bse["timestamps"] ** 2
    + rcp85_model.bse["timestamps"] ** 2
)

In [None]:
# Two-sided t-test for equality of the historical and RCP 8.5 slopes:
# t = (b1 - b2) / sqrt(se1^2 + se2^2).
t_stat = (data_slope - rcp85_slope) / se_diff
# compute the degrees of freedom for the t-distribution: pooled residual
# degrees of freedom of the two fits, read through the public `df_resid`
# attribute rather than the private `_results` object
dof = data_model.df_resid + rcp85_model.df_resid
# calculate the p-value (two-sided, hence the factor 2 on the upper tail)
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

## Flow River

### Xerta

In [None]:
temp_df = xerta_df[["Year", "Month", "Flow River (m³/s)", "DateTime"]]

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Flow River (m³/s)",
    data=temp_df,
    label="Historical Data",
)

In [None]:
result = smt.seasonal_decompose(
    x=temp_df.set_index("DateTime")["Flow River (m³/s)"],
    model="multiplicative",
    period=12,
    two_sided=False,
    extrapolate_trend="freq",
)

In [None]:
data_trend = result.trend
data_seasonal = result.seasonal
data_residual = result.resid

In [None]:
# Four stacked panels: the observed river-flow series followed by its trend,
# seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (411, temp_df.set_index("DateTime")["Flow River (m³/s)"], "Original"),
    (412, data_trend, "Trend"),
    (413, data_seasonal, "Seasonal"),
    (414, data_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Flow River (m³/s)",
    data=temp_df,
    label="Historical Data",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Trend",
)

plt.title("Flow River Trend")
plt.xlabel("Year")
plt.ylabel("Flow River Rate (m³/s)")
plt.show()

### RCP 4.5

In [None]:
# set the month to the time column
flow_rcp45_df["DateTime"] = flow_rcp45_df.apply(
    lambda row: row["time"].replace(month=int(row["Month"]), day=15),
    axis=1,
)
flow_rcp45_df.sort_values(by="DateTime", inplace=True)

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="rdis_ymonmean",
    data=flow_rcp45_df,
    label="RCP 4.5",
    color="g",
)

sns.lineplot(
    x="DateTime",
    y="Flow River (m³/s)",
    data=temp_df,
    label="Xerta Data",
    color="black",
)

plt.title("Flow River Rate")
plt.xlabel("Year")
plt.ylabel("Flow River Rate (m³/s)")

plt.show()

In [None]:
result = smt.seasonal_decompose(
    x=flow_rcp45_df.set_index("DateTime")[["rdis_ymonmean"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
rcp45_trend = result.trend
rcp45_seasonal = result.seasonal
rcp45_residual = result.resid

In [None]:
# Four stacked panels: the RCP 4.5 river-flow projection followed by its
# trend, seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (411, flow_rcp45_df.set_index("DateTime")["rdis_ymonmean"], "Original"),
    (412, rcp45_trend, "Trend"),
    (413, rcp45_seasonal, "Seasonal"),
    (414, rcp45_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
# RCP 4.5 river-flow projection overlaid with its decomposed trend.
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime", y="rdis_ymonmean", data=flow_rcp45_df, label="RCP 4.5"
)

# rcp45_trend was decomposed from flow_rcp45_df, so it must be drawn against
# that frame's DateTime column (not the air-temperature projection's).
sns.lineplot(
    x=flow_rcp45_df["DateTime"],
    y=rcp45_trend.values,
    label="Trend",
)

plt.title("Flow River Trend")
plt.xlabel("Year")
plt.ylabel("Flow River Rate (m³/s)")
plt.show()

### RCP 8.5

In [None]:
# set the month to the time column
flow_rcp85_df["DateTime"] = flow_rcp85_df.apply(
    lambda row: row["time"].replace(month=int(row["Month"]), day=15),
    axis=1,
)
flow_rcp85_df.sort_values(by="DateTime", inplace=True)

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="rdis_ymonmean",
    data=flow_rcp85_df,
    label="RCP 8.5",
)

sns.lineplot(
    x="DateTime",
    y="Flow River (m³/s)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

plt.xlabel("Year")
plt.ylabel("Flow River Rate (m³/s)")

plt.show()

In [None]:
result = smt.seasonal_decompose(
    x=flow_rcp85_df.set_index("DateTime")[["rdis_ymonmean"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
rcp85_trend = result.trend
rcp85_seasonal = result.seasonal
rcp85_residual = result.resid

In [None]:
# Four stacked panels: the RCP 8.5 river-flow projection followed by its
# trend, seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (411, flow_rcp85_df.set_index("DateTime")["rdis_ymonmean"], "Original"),
    (412, rcp85_trend, "Trend"),
    (413, rcp85_seasonal, "Seasonal"),
    (414, rcp85_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime", y="rdis_ymonmean", data=flow_rcp85_df, label="RCP 8.5"
)

sns.lineplot(
    x=flow_rcp85_df["DateTime"],
    y=rcp85_trend.values,
    label="RCP 8.5 Trend",
)

sns.lineplot(
    x="DateTime",
    y="Flow River (m³/s)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Tarragona Trend",
    color="red",
)

plt.xlabel("Year")
plt.ylabel("Flow River Rate (m³/s)")
plt.show()

### Compare Trends

Linear Regression estimation for each trend, then t-test on the slope.

#### Xerta

In [None]:
# Fit a straight line to `data_trend` against the DateTime values of temp_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = temp_df["DateTime"].values.astype(float) / 10**9

df["timestamps"] = timestamps
df["trend"] = data_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
data_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(data_model.summary())

In [None]:
data_intercept = data_model.params["const"]
data_slope = data_model.params["timestamps"]

#### RCP 4.5

In [None]:
# Fit a straight line to `rcp45_trend` against the DateTime values of
# flow_rcp45_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = flow_rcp45_df["DateTime"].values.astype(float) / 10**9

df["timestamps"] = timestamps
df["trend"] = rcp45_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
rcp45_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(rcp45_model.summary())

In [None]:
rcp45_intercept = rcp45_model.params["const"]
rcp45_slope = rcp45_model.params["timestamps"]

#### RCP 8.5

In [None]:
# Fit a straight line to `rcp85_trend` against the DateTime values of
# flow_rcp85_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = flow_rcp85_df["DateTime"].values.astype(float) / 10**9

df["timestamps"] = timestamps
df["trend"] = rcp85_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
rcp85_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(rcp85_model.summary())

In [None]:
rcp85_intercept = rcp85_model.params["const"]
rcp85_slope = rcp85_model.params["timestamps"]

#### Final Comparison

$t = \frac{b_1 - b_2}{\sqrt{s_{b_1}^2 + s_{b_2}^2}}$,
$df = n_1 + n_2 - 4$

In [None]:
# plot regression lines using intercept and slope
# The x axis is standardized time, so -2..2 spans roughly two standard
# deviations on either side of each series' mean timestamp.
plt.figure(figsize=(20, 10))

xx = np.linspace(-2, 2, 100)
yy = data_intercept + data_slope * xx
sns.lineplot(x=xx, y=yy, label="Historical Data")

yy = rcp45_intercept + rcp45_slope * xx
sns.lineplot(x=xx, y=yy, label="RCP 4.5")

yy = rcp85_intercept + rcp85_slope * xx
sns.lineplot(x=xx, y=yy, label="RCP 8.5")

# This figure belongs to the Flow River section; the title previously said
# "Air Temperature" (copy-paste from the first section).
plt.title("Flow River Trend Regression Lines")

plt.show()

##### Data vs RCP 4.5

In [None]:
# Calculate the standard error of the difference in slopes
se_diff = np.sqrt(
    data_model.bse["timestamps"] ** 2
    + rcp45_model.bse["timestamps"] ** 2
)

In [None]:
# Two-sided t-test for equality of the historical and RCP 4.5 slopes:
# t = (b1 - b2) / sqrt(se1^2 + se2^2).
t_stat = (data_slope - rcp45_slope) / se_diff
# compute the degrees of freedom for the t-distribution: the pooled residual
# degrees of freedom of the two fits being compared, (n1 - 2) + (n2 - 2).
# This has to come from the RCP 4.5 model (not the RCP 8.5 one), read through
# the public `df_resid` attribute.
dof = data_model.df_resid + rcp45_model.df_resid

# calculate the p-value (two-sided, hence the factor 2 on the upper tail)
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

##### Data vs RCP 8.5

In [None]:
# Calculate the standard error of the difference in slopes
se_diff = np.sqrt(
    data_model.bse["timestamps"] ** 2
    + rcp85_model.bse["timestamps"] ** 2
)

In [None]:
# Two-sided t-test for equality of the historical and RCP 8.5 slopes:
# t = (b1 - b2) / sqrt(se1^2 + se2^2).
t_stat = (data_slope - rcp85_slope) / se_diff
# compute the degrees of freedom for the t-distribution: pooled residual
# degrees of freedom of the two fits, read through the public `df_resid`
# attribute rather than the private `_results` object
dof = data_model.df_resid + rcp85_model.df_resid
# calculate the p-value (two-sided, hence the factor 2 on the upper tail)
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

## Water Temperature

### Xerta

In [None]:
temp_df = xerta_df[
    ["Year", "Month", "Water Temperature (°C)", "DateTime"]
]

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Water Temperature (°C)",
    data=temp_df,
    label="Historical Data",
)

In [None]:
result = smt.seasonal_decompose(
    x=temp_df.set_index("DateTime")["Water Temperature (°C)"],
    model="multiplicative",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
data_trend = result.trend
data_seasonal = result.seasonal
data_residual = result.resid

In [None]:
# Four stacked panels: the observed water-temperature series followed by its
# trend, seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (
        411,
        temp_df.set_index("DateTime")["Water Temperature (°C)"],
        "Original",
    ),
    (412, data_trend, "Trend"),
    (413, data_seasonal, "Seasonal"),
    (414, data_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Water Temperature (°C)",
    data=temp_df,
    label="Historical Data",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Trend",
)

plt.title("Water Temperature Trend")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

### RCP 4.5

In [None]:
# set the month to the time column
water_temp_rcp45_df.rename(columns={"DateTime": "time"}, inplace=True)
water_temp_rcp45_df["DateTime"] = water_temp_rcp45_df.apply(
    lambda row: row["time"].replace(month=int(row["Month"]), day=15),
    axis=1,
)
water_temp_rcp45_df.sort_values(by="DateTime", inplace=True)

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Value",
    data=water_temp_rcp45_df,
    label="RCP 4.5",
    errorbar=None,
    color="g",
)

sns.lineplot(
    x="DateTime",
    y="Water Temperature (°C)",
    data=temp_df,
    label="Xerta Data",
    color="black",
)

plt.title("Water Temperature")
plt.xlabel("Year")
plt.ylabel("Water Temperature (°C)")

plt.show()

In [None]:
result = smt.seasonal_decompose(
    x=water_temp_rcp45_df.set_index("DateTime")[["Value"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
rcp45_trend = result.trend
rcp45_seasonal = result.seasonal
rcp45_residual = result.resid

In [None]:
# Four stacked panels: the RCP 4.5 water-temperature projection followed by
# its trend, seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (411, water_temp_rcp45_df.set_index("DateTime")["Value"], "Original"),
    (412, rcp45_trend, "Trend"),
    (413, rcp45_seasonal, "Seasonal"),
    (414, rcp45_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime", y="Value", data=water_temp_rcp45_df, label="RCP 4.5"
)

sns.lineplot(
    x=water_temp_rcp45_df["DateTime"],
    y=rcp45_trend.values,
    label="Trend",
)

plt.title("Water Temperature Trend")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.show()

### RCP 8.5

In [None]:
# set the month to the time column
water_temp_rcp85_df.rename(columns={"DateTime": "time"}, inplace=True)
water_temp_rcp85_df["DateTime"] = water_temp_rcp85_df.apply(
    lambda row: row["time"].replace(month=int(row["Month"]), day=15),
    axis=1,
)
water_temp_rcp85_df.sort_values(by="DateTime", inplace=True)

In [None]:
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime",
    y="Value",
    data=water_temp_rcp85_df,
    label="RCP 8.5",
)

sns.lineplot(
    x="DateTime",
    y="Water Temperature (°C)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

plt.xlabel("Year")
plt.ylabel("Water Temperature (°C)")

plt.show()

In [None]:
result = smt.seasonal_decompose(
    x=water_temp_rcp85_df.set_index("DateTime")[["Value"]],
    model="additive",
    period=12,
    two_sided=True,
    extrapolate_trend="freq",
)

In [None]:
rcp85_trend = result.trend
rcp85_seasonal = result.seasonal
rcp85_residual = result.resid

In [None]:
# Four stacked panels: the RCP 8.5 water-temperature projection followed by
# its trend, seasonal and residual components.
plt.figure(figsize=(20, 10))

decomposition_panels = (
    (411, water_temp_rcp85_df.set_index("DateTime")["Value"], "Original"),
    (412, rcp85_trend, "Trend"),
    (413, rcp85_seasonal, "Seasonal"),
    (414, rcp85_residual, "Residual"),
)
for panel_pos, panel_data, panel_label in decomposition_panels:
    plt.subplot(panel_pos)
    plt.plot(panel_data, label=panel_label)
    plt.legend(loc="best")
plt.tight_layout()

plt.show()

In [None]:
# plot the trend
plt.figure(figsize=(20, 10))

sns.lineplot(
    x="DateTime", y="Value", data=water_temp_rcp85_df, label="RCP 8.5"
)

sns.lineplot(
    x=water_temp_rcp85_df["DateTime"],
    y=rcp85_trend.values,
    label="RCP 8.5 Trend",
)

sns.lineplot(
    x="DateTime",
    y="Water Temperature (°C)",
    data=temp_df,
    label="Tarragona Data",
    color="black",
)

sns.lineplot(
    x=temp_df["DateTime"],
    y=data_trend.values,
    label="Tarragona Trend",
    color="red",
)

plt.xlabel("Year")
plt.ylabel("Water Temperature (°C)")
plt.show()

### Compare Trends

Linear Regression estimation for each trend, then t-test on the slope.

#### Xerta

In [None]:
# Fit a straight line to `data_trend` against the DateTime values of temp_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = temp_df["DateTime"].values.astype(float) / 10**9

df["timestamps"] = timestamps
df["trend"] = data_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
data_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(data_model.summary())

In [None]:
data_intercept = data_model.params["const"]
data_slope = data_model.params["timestamps"]

#### RCP 4.5

In [None]:
# Fit a straight line to `rcp45_trend` against the DateTime values of
# water_temp_rcp45_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = (
    water_temp_rcp45_df["DateTime"].values.astype(float) / 10**9
)

df["timestamps"] = timestamps
df["trend"] = rcp45_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
rcp45_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(rcp45_model.summary())

In [None]:
rcp45_intercept = rcp45_model.params["const"]
rcp45_slope = rcp45_model.params["timestamps"]

#### RCP 8.5

In [None]:
# Fit a straight line to `rcp85_trend` against the DateTime values of
# water_temp_rcp85_df.
df = pd.DataFrame()
# DateTime values cast to float and divided by 10**9
# (presumably nanoseconds -> seconds for datetime64[ns] — TODO confirm).
timestamps = (
    water_temp_rcp85_df["DateTime"].values.astype(float) / 10**9
)

df["timestamps"] = timestamps
df["trend"] = rcp85_trend.values

# scale the data
# Both columns are standardized (zero mean, unit variance), so the fitted
# slope is unitless: it equals the Pearson correlation between time and
# trend rather than a rate in physical units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["timestamps", "trend"]])
df["timestamps"] = scaled_data[:, 0]
df["trend"] = scaled_data[:, 1]

# OLS of the standardized trend on standardized time plus an intercept.
rcp85_model = sm.OLS(
    df["trend"], sm.add_constant(df["timestamps"])
).fit()

In [None]:
print(rcp85_model.summary())

In [None]:
rcp85_intercept = rcp85_model.params["const"]
rcp85_slope = rcp85_model.params["timestamps"]

#### Final Comparison

$t = \frac{b_1 - b_2}{\sqrt{s_{b_1}^2 + s_{b_2}^2}}$,
$df = n_1 + n_2 - 4$

In [None]:
# plot regression lines using intercept and slope
# The x axis is standardized time, so -2..2 spans roughly two standard
# deviations on either side of each series' mean timestamp.
plt.figure(figsize=(20, 10))

xx = np.linspace(-2, 2, 100)
yy = data_intercept + data_slope * xx
sns.lineplot(x=xx, y=yy, label="Historical Data")

yy = rcp45_intercept + rcp45_slope * xx
sns.lineplot(x=xx, y=yy, label="RCP 4.5")

yy = rcp85_intercept + rcp85_slope * xx
sns.lineplot(x=xx, y=yy, label="RCP 8.5")

# This figure belongs to the Water Temperature section; the title previously
# said "Air Temperature" (copy-paste from the first section).
plt.title("Water Temperature Trend Regression Lines")

plt.show()

##### Data vs RCP 4.5

In [None]:
# Calculate the standard error of the difference in slopes
se_diff = np.sqrt(
    data_model.bse["timestamps"] ** 2
    + rcp45_model.bse["timestamps"] ** 2
)

In [None]:
# Two-sided t-test for equality of the historical and RCP 4.5 slopes:
# t = (b1 - b2) / sqrt(se1^2 + se2^2).
t_stat = (data_slope - rcp45_slope) / se_diff
# compute the degrees of freedom for the t-distribution: the pooled residual
# degrees of freedom of the two fits being compared, (n1 - 2) + (n2 - 2).
# This has to come from the RCP 4.5 model (not the RCP 8.5 one), read through
# the public `df_resid` attribute.
dof = data_model.df_resid + rcp45_model.df_resid

# calculate the p-value (two-sided, hence the factor 2 on the upper tail)
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

##### Data vs RCP 8.5

In [None]:
# Calculate the standard error of the difference in slopes
se_diff = np.sqrt(
    data_model.bse["timestamps"] ** 2
    + rcp85_model.bse["timestamps"] ** 2
)

In [None]:
# Two-sided t-test for equality of the historical and RCP 8.5 slopes:
# t = (b1 - b2) / sqrt(se1^2 + se2^2).
t_stat = (data_slope - rcp85_slope) / se_diff
# compute the degrees of freedom for the t-distribution: pooled residual
# degrees of freedom of the two fits, read through the public `df_resid`
# attribute rather than the private `_results` object
dof = data_model.df_resid + rcp85_model.df_resid
# calculate the p-value (two-sided, hence the factor 2 on the upper tail)
p_value = stats.t.sf(np.abs(t_stat), dof) * 2

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

# Absorbance Correlation

## Flow River Rate

In [None]:
# Take an explicit copy: a category column is assigned on `df` afterwards, and
# assigning on a bare column selection triggers pandas' SettingWithCopyWarning.
# NOTE(review): these column names carry no unit suffix, unlike
# "Flow River (m³/s)" used elsewhere in this file — confirm xerta_df's columns.
df = xerta_df[["Flow River", "Absorbance 254nm"]].copy()

In [None]:
# divide the data based on the quantiles of the Flow River
# 25%, 50%, 75%, > 75%
flow_quantiles = df["Flow River"].quantile([0.25, 0.5, 0.75])

df["Flow River Category"] = pd.cut(
    df["Flow River"],
    bins=[
        -np.inf,
        flow_quantiles[0.25],
        flow_quantiles[0.5],
        flow_quantiles[0.75],
        np.inf,
    ],
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for category in np.sort(df["Flow River Category"].unique()):
    color = colors.pop()

    category_df = df[df["Flow River Category"] == category]
    # sns.histplot(category_df['Absorbance 254nm'], label=category, kde=True, color=color)
    sns.kdeplot(
        category_df["Absorbance 254nm"],
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    # Calculate the mean and variance
    mean = category_df["Absorbance 254nm"].mean()
    variance = category_df["Absorbance 254nm"].var()

    # Plot the mean
    plt.axvline(mean, color=color, linestyle="--")

    # Plot the variance as a shaded region
    plt.fill_betweenx(
        [0, plt.gca().get_ylim()[1]],
        mean - np.sqrt(variance),
        mean + np.sqrt(variance),
        color=color,
        alpha=0.2,
    )

plt.title("Flow River vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# perform the levene test
levene_test = stats.levene(
    df[df["Flow River Category"] == "25%"]["Absorbance 254nm"],
    df[df["Flow River Category"] == "50%"]["Absorbance 254nm"],
    df[df["Flow River Category"] == "75%"]["Absorbance 254nm"],
    df[df["Flow River Category"] == ">75%"]["Absorbance 254nm"],
)

print(levene_test)

In [None]:
# data is heteroscedastic
# perform Welch's one-way test
welch_test = pg.welch_anova(
    data=df,
    dv="Absorbance 254nm",
    between="Flow River Category",
)

print(welch_test)

In [None]:
# no equal means
# perform post-hoc test using Conover test
conover_test = sp.posthoc_conover(
    a=df,
    val_col="Absorbance 254nm",
    group_col="Flow River Category",
    p_adjust="holm",
    sort=True,
)

print(conover_test)

In [None]:
# plot the post-hoc test
plt.figure(figsize=(20, 10))

sp.sign_plot(conover_test)

plt.show()

In [None]:
# it seems that the 25% and 50% categories are significantly different from the 75% and >75% categories
# but the 75% and >75% categories are not significantly different from each other
# as well as the 25% and 50% categories

In [None]:
# combine the 75% and >75% categories and the 25% and 50% categories
df["Flow River Category"] = df["Flow River Category"].apply(
    lambda x: "50%+" if x in ["75%", ">75%"] else "50%-"
)

In [None]:
# plot the data again
plt.figure(figsize=(20, 10))

colors = ["r", "g"]

for category in df["Flow River Category"].unique():
    color = colors.pop()

    category_df = df[df["Flow River Category"] == category]
    # sns.histplot(category_df['Absorbance 254nm'], label=category, kde=True, color=color)
    sns.kdeplot(
        category_df["Absorbance 254nm"],
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    # Calculate the mean and variance
    mean = category_df["Absorbance 254nm"].mean()
    variance = category_df["Absorbance 254nm"].var()

    # Plot the mean
    plt.axvline(mean, color=color, linestyle="--")

    # Plot the variance as a shaded region
    plt.fill_betweenx(
        [0, plt.gca().get_ylim()[1]],
        mean - np.sqrt(variance),
        mean + np.sqrt(variance),
        color=color,
        alpha=0.2,
    )

plt.title("Flow River vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# compare the two categories to assess if the mean of the Absorbance 254nm of
# 50%- is significantly lower than 50%+ (this is the direction tested by the
# one-sided t-test that follows: alternative="less" with 50%- as first sample)
# perform the levene test first to check for equal variances between groups
levene_test = stats.levene(
    df[df["Flow River Category"] == "50%-"]["Absorbance 254nm"],
    df[df["Flow River Category"] == "50%+"]["Absorbance 254nm"],
)

print(levene_test)

In [None]:
# data is homoscedastic
# perform the t-test for independent samples with equal variances
t_test = stats.ttest_ind(
    df[df["Flow River Category"] == "50%-"]["Absorbance 254nm"],
    df[df["Flow River Category"] == "50%+"]["Absorbance 254nm"],
    equal_var=True,
    alternative="less",
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 50%- is significantly lower than 50%+

## Rainfall

In [None]:
# Take an explicit copy: a category column is assigned on `df` afterwards, and
# assigning on a bare column selection triggers pandas' SettingWithCopyWarning.
# NOTE(review): these column names carry no unit suffix, unlike
# "Daily Cumulated Rainfall (L/m²)" used elsewhere in this file — confirm
# xerta_df's columns.
df = xerta_df[["Daily Cumulated Rainfall", "Absorbance 254nm"]].copy()

In [None]:
# divide the data based on the quantiles of the Daily Cumulated Rainfall
# 25%, 50%, 75%, > 75%
# (the variable keeps the name `flow_quantiles` from the Flow River section,
# but here it holds the rainfall quantiles)
flow_quantiles = df["Daily Cumulated Rainfall"].quantile(
    [0.25, 0.5, 0.75]
)

# NOTE(review): pd.cut requires unique bin edges by default; if two of the
# rainfall quantiles coincide (e.g. many dry days) this call raises — verify
# against the data.
df["Rainfall Category"] = pd.cut(
    df["Daily Cumulated Rainfall"],
    bins=[
        -np.inf,
        flow_quantiles[0.25],
        flow_quantiles[0.5],
        flow_quantiles[0.75],
        np.inf,
    ],
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for category in df["Rainfall Category"].unique():
    color = colors.pop()

    category_df = df[df["Rainfall Category"] == category]
    # sns.histplot(category_df['Absorbance 254nm'], label=category, kde=True, color=color)
    sns.kdeplot(
        category_df["Absorbance 254nm"],
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    # Calculate the mean and variance
    mean = category_df["Absorbance 254nm"].mean()
    variance = category_df["Absorbance 254nm"].var()

    # Plot the mean
    plt.axvline(mean, color=color, linestyle="--")

    # Plot the variance as a shaded region
    plt.fill_betweenx(
        [0, plt.gca().get_ylim()[1]],
        mean - np.sqrt(variance),
        mean + np.sqrt(variance),
        color=color,
        alpha=0.2,
    )

plt.title("Rainfall vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# rainfall bands?
band_samples = [
    df.loc[df["Rainfall Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic.
# Compare the four rainfall bands with the Kruskal-Wallis test.
band_samples = [
    df.loc[df["Rainfall Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
kruskal_test = stats.kruskal(*band_samples)

print(kruskal_test)

In [None]:
# Recorded Kruskal-Wallis outcome: equal means.
# Pairwise post-hoc comparison of the rainfall bands with Dunn's test,
# p-values adjusted with the Holm method.
dunn_test = sp.posthoc_dunn(
    df,
    val_col="Absorbance 254nm",
    group_col="Rainfall Category",
    p_adjust="holm",
)

print(dunn_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `dunn_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that all the categories are not significantly different from each other

## Air Temperature

In [None]:
# Work on an independent copy: later cells add and overwrite a category
# column on `df`, and assigning into a bare column selection of `xerta_df`
# raises pandas' SettingWithCopyWarning.
df = xerta_df[["Air Temperature", "Absorbance 254nm"]].copy()

In [None]:
# Split the air temperature readings into quartile bands:
# "25%" (up to Q1), "50%" (Q1 to median), "75%" (median to Q3), ">75%" (above Q3)
flow_quantiles = df["Air Temperature"].quantile([0.25, 0.5, 0.75])

# open-ended outer edges so no non-missing reading falls outside the bands
quartile_edges = [-np.inf, *flow_quantiles.tolist(), np.inf]

df["Air Temperature Category"] = pd.cut(
    df["Air Temperature"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Overlay the Absorbance 254nm density of every air temperature band.
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for band in df["Air Temperature Category"].unique():
    # one colour per band, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Air Temperature Category"] == band, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=band, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the band mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Air Temperature vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# air temperature bands?
band_samples = [
    df.loc[df["Air Temperature Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic.
# Compare the four air temperature bands with the Kruskal-Wallis test.
band_samples = [
    df.loc[df["Air Temperature Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
kruskal_test = stats.kruskal(*band_samples)

print(kruskal_test)

In [None]:
# Recorded Kruskal-Wallis outcome: equal means.
# Pairwise post-hoc comparison of the air temperature bands with Dunn's
# test, p-values adjusted with the Holm method.
dunn_test = sp.posthoc_dunn(
    df,
    val_col="Absorbance 254nm",
    group_col="Air Temperature Category",
    p_adjust="holm",
)

print(dunn_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `dunn_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that all the categories are not significantly different from each other
# however, combine the 25% and 50% categories and the 75% and >75% categories

In [None]:
# Merge the quartile bands into two groups split at the median:
# "75%" and ">75%" become "50%+", anything else becomes "50%-".
# Not re-runnable: once applied, no value matches the upper bands any more,
# so a second run would send every value through the "50%-" branch.
upper_bands = ("75%", ">75%")
df["Air Temperature Category"] = df["Air Temperature Category"].apply(
    lambda band: "50%+" if band in upper_bands else "50%-"
)

In [None]:
# Redraw the Absorbance 254nm densities for the two merged air
# temperature groups.
plt.figure(figsize=(20, 10))

colors = ["r", "g"]

for group in df["Air Temperature Category"].unique():
    # one colour per group, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Air Temperature Category"] == group, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=group, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the group mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Air Temperature vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Question for the two merged groups: is the mean Absorbance 254nm of
# "50%+" significantly lower than that of "50%-"?
# First check variance homogeneity with the Levene test.
low_group = df.loc[
    df["Air Temperature Category"] == "50%-", "Absorbance 254nm"
]
high_group = df.loc[
    df["Air Temperature Category"] == "50%+", "Absorbance 254nm"
]
levene_test = stats.levene(low_group, high_group)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is heteroscedastic, so the unequal-variance
# (Welch) t-test is used.
# One-sided alternative: mean of "50%-" > mean of "50%+".
low_group = df.loc[
    df["Air Temperature Category"] == "50%-", "Absorbance 254nm"
]
high_group = df.loc[
    df["Air Temperature Category"] == "50%+", "Absorbance 254nm"
]

t_test = stats.ttest_ind(
    low_group, high_group, equal_var=False, alternative="greater"
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 50%- is significantly higher than 50%+
# Low Air Temperature is associated with high Absorbance 254nm

## Water Temperature

In [None]:
# Work on an independent copy: later cells add and overwrite a category
# column on `df`, and assigning into a bare column selection of `xerta_df`
# raises pandas' SettingWithCopyWarning.
df = xerta_df[["Water Temperature", "Absorbance 254nm"]].copy()

In [None]:
# Split the water temperature readings into quartile bands:
# "25%" (up to Q1), "50%" (Q1 to median), "75%" (median to Q3), ">75%" (above Q3)
flow_quantiles = df["Water Temperature"].quantile([0.25, 0.5, 0.75])

# open-ended outer edges so no non-missing reading falls outside the bands
quartile_edges = [-np.inf, *flow_quantiles.tolist(), np.inf]

df["Water Temperature Category"] = pd.cut(
    df["Water Temperature"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Overlay the Absorbance 254nm density of every water temperature band.
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for band in df["Water Temperature Category"].unique():
    # one colour per band, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Water Temperature Category"] == band, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=band, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the band mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Water Temperature vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# water temperature bands?
band_samples = [
    df.loc[df["Water Temperature Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic.
# Compare the four water temperature bands with the Kruskal-Wallis test.
band_samples = [
    df.loc[df["Water Temperature Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
kruskal_test = stats.kruskal(*band_samples)

print(kruskal_test)

In [None]:
# Recorded Kruskal-Wallis outcome: no equal means.
# Pairwise post-hoc comparison of the water temperature bands with Dunn's
# test, p-values adjusted with the Holm method.
dunn_test = sp.posthoc_dunn(
    df,
    val_col="Absorbance 254nm",
    group_col="Water Temperature Category",
    p_adjust="holm",
)

print(dunn_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `dunn_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that the 25% category is significantly different from the 75% and >75% categories

In [None]:
# Merge the quartile bands into two groups split at the median:
# "75%" and ">75%" become "50%+", anything else becomes "50%-".
# Not re-runnable: once applied, no value matches the upper bands any more,
# so a second run would send every value through the "50%-" branch.
upper_bands = ("75%", ">75%")
water_bands = df["Water Temperature Category"]
df["Water Temperature Category"] = water_bands.apply(
    lambda band: "50%+" if band in upper_bands else "50%-"
)

In [None]:
# Redraw the Absorbance 254nm densities for the two merged water
# temperature groups.
plt.figure(figsize=(20, 10))

colors = ["r", "g"]

for group in df["Water Temperature Category"].unique():
    # one colour per group, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Water Temperature Category"] == group, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=group, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the group mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Water Temperature vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Question for the two merged groups: is the mean Absorbance 254nm of
# "50%+" significantly lower than that of "50%-"?
# First check variance homogeneity with the Levene test.
low_group = df.loc[
    df["Water Temperature Category"] == "50%-", "Absorbance 254nm"
]
high_group = df.loc[
    df["Water Temperature Category"] == "50%+", "Absorbance 254nm"
]
levene_test = stats.levene(low_group, high_group)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic, so the pooled-variance
# (Student) t-test is used.
# One-sided alternative: mean of "50%-" > mean of "50%+".
low_group = df.loc[
    df["Water Temperature Category"] == "50%-", "Absorbance 254nm"
]
high_group = df.loc[
    df["Water Temperature Category"] == "50%+", "Absorbance 254nm"
]

t_test = stats.ttest_ind(
    low_group, high_group, equal_var=True, alternative="greater"
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 50%- is significantly higher than 50%+
# Low Water Temperature is associated with high Absorbance 254nm

## Turbidity

In [None]:
# Work on an independent copy: later cells add and overwrite a category
# column on `df`, and assigning into a bare column selection of `xerta_df`
# raises pandas' SettingWithCopyWarning.
df = xerta_df[["Turbidity", "Absorbance 254nm"]].copy()

In [None]:
# Split the turbidity readings into quartile bands:
# "25%" (up to Q1), "50%" (Q1 to median), "75%" (median to Q3), ">75%" (above Q3)
flow_quantiles = df["Turbidity"].quantile([0.25, 0.5, 0.75])

# open-ended outer edges so no non-missing reading falls outside the bands
quartile_edges = [-np.inf, *flow_quantiles.tolist(), np.inf]

df["Turbidity Category"] = pd.cut(
    df["Turbidity"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Overlay the Absorbance 254nm density of every turbidity band.
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for band in df["Turbidity Category"].unique():
    # one colour per band, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Turbidity Category"] == band, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=band, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the band mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Turbidity vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# turbidity bands?
band_samples = [
    df.loc[df["Turbidity Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is heteroscedastic.
# Compare the turbidity bands with Welch's one-way ANOVA (pingouin).
welch_test = pg.welch_anova(
    dv="Absorbance 254nm", between="Turbidity Category", data=df
)

print(welch_test)

In [None]:
# Recorded Welch ANOVA outcome: no equal means.
# Pairwise post-hoc comparison of the turbidity bands with the Conover
# test, p-values adjusted with the Holm method.
# NOTE(review): Conover is a rank-based post-hoc, normally paired with
# Kruskal-Wallis; after a Welch ANOVA, Games-Howell is the usual companion
# -- confirm this choice is intended.
conover_test = sp.posthoc_conover(
    df,
    val_col="Absorbance 254nm",
    group_col="Turbidity Category",
    p_adjust="holm",
    sort=True,
)

print(conover_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `conover_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(conover_test)

plt.show()

In [None]:
# it seems that the >75% category is significantly different from the 50% and 75% categories

In [None]:
# Merge the three lower quartile bands into one group:
# "25%", "50%" and "75%" become "75%-", anything else becomes "75%+".
# Not re-runnable: once applied, no value matches the lower bands any more,
# so a second run would send every value through the "75%+" branch.
lower_bands = ("25%", "50%", "75%")
df["Turbidity Category"] = df["Turbidity Category"].apply(
    lambda band: "75%-" if band in lower_bands else "75%+"
)

In [None]:
# Redraw the Absorbance 254nm densities for the two merged turbidity
# groups.
plt.figure(figsize=(20, 10))

colors = ["r", "g"]

for group in df["Turbidity Category"].unique():
    # one colour per group, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Turbidity Category"] == group, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=group, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the group mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Turbidity vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Question for the two merged groups: is the mean Absorbance 254nm of
# "75%-" significantly lower than that of "75%+"?
# First check variance homogeneity with the Levene test.
low_group = df.loc[df["Turbidity Category"] == "75%-", "Absorbance 254nm"]
high_group = df.loc[df["Turbidity Category"] == "75%+", "Absorbance 254nm"]
levene_test = stats.levene(low_group, high_group)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is heteroscedastic, so the unequal-variance
# (Welch) t-test is used.
# One-sided alternative: mean of "75%-" < mean of "75%+".
low_group = df.loc[df["Turbidity Category"] == "75%-", "Absorbance 254nm"]
high_group = df.loc[df["Turbidity Category"] == "75%+", "Absorbance 254nm"]

t_test = stats.ttest_ind(
    low_group, high_group, equal_var=False, alternative="less"
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 75%- is significantly lower than 75%+
# Low Turbidity is associated with low Absorbance 254nm

## pH

In [None]:
# Work on an independent copy: later cells add and overwrite a category
# column on `df`, and assigning into a bare column selection of `xerta_df`
# raises pandas' SettingWithCopyWarning.
df = xerta_df[["pH", "Absorbance 254nm"]].copy()

In [None]:
# Split the pH readings into quartile bands:
# "25%" (up to Q1), "50%" (Q1 to median), "75%" (median to Q3), ">75%" (above Q3)
flow_quantiles = df["pH"].quantile([0.25, 0.5, 0.75])

# open-ended outer edges so no non-missing reading falls outside the bands
quartile_edges = [-np.inf, *flow_quantiles.tolist(), np.inf]

df["pH Category"] = pd.cut(
    df["pH"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Overlay the Absorbance 254nm density of every pH band.
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for band in df["pH Category"].unique():
    # one colour per band, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[df["pH Category"] == band, "Absorbance 254nm"]
    sns.kdeplot(
        absorbance, color=shade, label=band, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the band mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("pH vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# pH bands?
band_samples = [
    df.loc[df["pH Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic.
# Compare the four pH bands with the Kruskal-Wallis test.
band_samples = [
    df.loc[df["pH Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
kruskal_test = stats.kruskal(*band_samples)

print(kruskal_test)

In [None]:
# Recorded Kruskal-Wallis outcome: no equal means.
# Pairwise post-hoc comparison of the pH bands with Dunn's test,
# p-values adjusted with the Holm method.
dunn_test = sp.posthoc_dunn(
    df,
    val_col="Absorbance 254nm",
    group_col="pH Category",
    p_adjust="holm",
)

print(dunn_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `dunn_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that the >75% category is significantly different from the other categories

In [None]:
# Merge the three lower quartile bands into one group:
# "25%", "50%" and "75%" become "75%-", anything else becomes "75%+".
# Not re-runnable: once applied, no value matches the lower bands any more,
# so a second run would send every value through the "75%+" branch.
lower_bands = ("25%", "50%", "75%")
df["pH Category"] = df["pH Category"].apply(
    lambda band: "75%-" if band in lower_bands else "75%+"
)

In [None]:
# Redraw the Absorbance 254nm densities for the two merged pH groups.
plt.figure(figsize=(20, 10))

colors = ["r", "g"]

for group in df["pH Category"].unique():
    # one colour per group, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[df["pH Category"] == group, "Absorbance 254nm"]
    sns.kdeplot(
        absorbance, color=shade, label=group, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the group mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("pH vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Question for the two merged groups: is the mean Absorbance 254nm of
# "75%-" significantly lower than that of "75%+"?
# First check variance homogeneity with the Levene test.
low_group = df.loc[df["pH Category"] == "75%-", "Absorbance 254nm"]
high_group = df.loc[df["pH Category"] == "75%+", "Absorbance 254nm"]
levene_test = stats.levene(low_group, high_group)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic, so the pooled-variance
# (Student) t-test is used.
# One-sided alternative: mean of "75%-" < mean of "75%+".
low_group = df.loc[df["pH Category"] == "75%-", "Absorbance 254nm"]
high_group = df.loc[df["pH Category"] == "75%+", "Absorbance 254nm"]

t_test = stats.ttest_ind(
    low_group, high_group, equal_var=True, alternative="less"
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 75%- is significantly lower than 75%+
# Low pH is associated with low Absorbance 254nm

## Nitrate

In [None]:
# Work on an independent copy: later cells add and overwrite a category
# column on `df`, and assigning into a bare column selection of `xerta_df`
# raises pandas' SettingWithCopyWarning.
df = xerta_df[["Nitrate", "Absorbance 254nm"]].copy()

In [None]:
# Split the nitrate readings into quartile bands:
# "25%" (up to Q1), "50%" (Q1 to median), "75%" (median to Q3), ">75%" (above Q3)
flow_quantiles = df["Nitrate"].quantile([0.25, 0.5, 0.75])

# open-ended outer edges so no non-missing reading falls outside the bands
quartile_edges = [-np.inf, *flow_quantiles.tolist(), np.inf]

df["Nitrate Category"] = pd.cut(
    df["Nitrate"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Overlay the Absorbance 254nm density of every nitrate band.
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for band in df["Nitrate Category"].unique():
    # one colour per band, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Nitrate Category"] == band, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=band, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the band mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Nitrate vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# nitrate bands?
band_samples = [
    df.loc[df["Nitrate Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic.
# Compare the four nitrate bands with the Kruskal-Wallis test.
band_samples = [
    df.loc[df["Nitrate Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
kruskal_test = stats.kruskal(*band_samples)

print(kruskal_test)

In [None]:
# Recorded Kruskal-Wallis outcome: no equal means.
# Pairwise post-hoc comparison of the nitrate bands with Dunn's test,
# p-values adjusted with the Holm method.
dunn_test = sp.posthoc_dunn(
    df,
    val_col="Absorbance 254nm",
    group_col="Nitrate Category",
    p_adjust="holm",
)

print(dunn_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `dunn_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that the categories are not significantly different from each other

In [None]:
# Merge the three upper quartile bands into one group:
# "50%", "75%" and ">75%" become "25%+", anything else becomes "25%-".
# Not re-runnable: once applied, no value matches the upper bands any more,
# so a second run would send every value through the "25%-" branch.
upper_bands = ("50%", "75%", ">75%")
df["Nitrate Category"] = df["Nitrate Category"].apply(
    lambda band: "25%+" if band in upper_bands else "25%-"
)

In [None]:
# Redraw the Absorbance 254nm densities for the two merged nitrate groups.
plt.figure(figsize=(20, 10))

colors = ["r", "g"]

for group in df["Nitrate Category"].unique():
    # one colour per group, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Nitrate Category"] == group, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=group, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the group mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Nitrate vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Question for the two merged groups: is the mean Absorbance 254nm of
# "25%-" significantly higher than that of "25%+"? (The t-test in the next
# cell uses alternative="greater".)
# First check variance homogeneity with the Levene test.
low_group = df.loc[df["Nitrate Category"] == "25%-", "Absorbance 254nm"]
high_group = df.loc[df["Nitrate Category"] == "25%+", "Absorbance 254nm"]
levene_test = stats.levene(low_group, high_group)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic, so the pooled-variance
# (Student) t-test is used.
# One-sided alternative: mean of "25%-" > mean of "25%+".
low_group = df.loc[df["Nitrate Category"] == "25%-", "Absorbance 254nm"]
high_group = df.loc[df["Nitrate Category"] == "25%+", "Absorbance 254nm"]

t_test = stats.ttest_ind(
    low_group, high_group, equal_var=True, alternative="greater"
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 25%- is significantly higher than 25%+
# Low Nitrate is associated with high Absorbance 254nm

## Dissolved Oxygen

In [None]:
# Work on an independent copy: later cells add and overwrite a category
# column on `df`, and assigning into a bare column selection of `xerta_df`
# raises pandas' SettingWithCopyWarning.
df = xerta_df[["Dissolved Oxygen", "Absorbance 254nm"]].copy()

In [None]:
# Split the dissolved oxygen readings into quartile bands:
# "25%" (up to Q1), "50%" (Q1 to median), "75%" (median to Q3), ">75%" (above Q3)
flow_quantiles = df["Dissolved Oxygen"].quantile([0.25, 0.5, 0.75])

# open-ended outer edges so no non-missing reading falls outside the bands
quartile_edges = [-np.inf, *flow_quantiles.tolist(), np.inf]

df["Dissolved Oxygen Category"] = pd.cut(
    df["Dissolved Oxygen"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Overlay the Absorbance 254nm density of every dissolved oxygen band.
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for band in df["Dissolved Oxygen Category"].unique():
    # one colour per band, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Dissolved Oxygen Category"] == band, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=band, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the band mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Dissolved Oxygen vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# dissolved oxygen bands?
band_samples = [
    df.loc[df["Dissolved Oxygen Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic.
# Compare the four dissolved oxygen bands with the Kruskal-Wallis test.
band_samples = [
    df.loc[df["Dissolved Oxygen Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
kruskal_test = stats.kruskal(*band_samples)

print(kruskal_test)

In [None]:
# Recorded Kruskal-Wallis outcome: no equal means.
# Pairwise post-hoc comparison of the dissolved oxygen bands with Dunn's
# test, p-values adjusted with the Holm method.
dunn_test = sp.posthoc_dunn(
    df,
    val_col="Absorbance 254nm",
    group_col="Dissolved Oxygen Category",
    p_adjust="holm",
)

print(dunn_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `dunn_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that the 25% and 50% categories are significantly different from the 75% and >75% categories

In [None]:
# Merge the quartile bands into two groups split at the median:
# "75%" and ">75%" become "50%+", anything else becomes "50%-".
# Not re-runnable: once applied, no value matches the upper bands any more,
# so a second run would send every value through the "50%-" branch.
upper_bands = ("75%", ">75%")
oxygen_bands = df["Dissolved Oxygen Category"]
df["Dissolved Oxygen Category"] = oxygen_bands.apply(
    lambda band: "50%+" if band in upper_bands else "50%-"
)

In [None]:
# Redraw the Absorbance 254nm densities for the two merged dissolved
# oxygen groups.
plt.figure(figsize=(20, 10))

colors = ["r", "g"]

for group in df["Dissolved Oxygen Category"].unique():
    # one colour per group, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Dissolved Oxygen Category"] == group, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=group, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the group mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Dissolved Oxygen vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Question for the two merged groups: is the mean Absorbance 254nm of
# "50%-" significantly lower than that of "50%+"?
# First check variance homogeneity with the Levene test.
low_group = df.loc[
    df["Dissolved Oxygen Category"] == "50%-", "Absorbance 254nm"
]
high_group = df.loc[
    df["Dissolved Oxygen Category"] == "50%+", "Absorbance 254nm"
]
levene_test = stats.levene(low_group, high_group)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic, so the pooled-variance
# (Student) t-test is used.
# One-sided alternative: mean of "50%-" < mean of "50%+".
low_group = df.loc[
    df["Dissolved Oxygen Category"] == "50%-", "Absorbance 254nm"
]
high_group = df.loc[
    df["Dissolved Oxygen Category"] == "50%+", "Absorbance 254nm"
]

t_test = stats.ttest_ind(
    low_group, high_group, equal_var=True, alternative="less"
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 50%- is significantly lower than 50%+
# Low Dissolved Oxygen is associated with low Absorbance 254nm

## Ammonium

In [None]:
# Work on an independent copy: later cells add and overwrite a category
# column on `df`, and assigning into a bare column selection of `xerta_df`
# raises pandas' SettingWithCopyWarning.
df = xerta_df[["Ammonium", "Absorbance 254nm"]].copy()

In [None]:
# Split the ammonium readings into quartile bands:
# "25%" (up to Q1), "50%" (Q1 to median), "75%" (median to Q3), ">75%" (above Q3)
flow_quantiles = df["Ammonium"].quantile([0.25, 0.5, 0.75])

# open-ended outer edges so no non-missing reading falls outside the bands
quartile_edges = [-np.inf, *flow_quantiles.tolist(), np.inf]

df["Ammonium Category"] = pd.cut(
    df["Ammonium"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Overlay the Absorbance 254nm density of every ammonium band.
plt.figure(figsize=(20, 10))

colors = ["r", "g", "b", "y"]

for band in df["Ammonium Category"].unique():
    # one colour per band, consumed from the end of the list
    shade = colors.pop()

    absorbance = df.loc[
        df["Ammonium Category"] == band, "Absorbance 254nm"
    ]
    sns.kdeplot(
        absorbance, color=shade, label=band, fill=True, alpha=0.5
    )

    centre = absorbance.mean()
    spread = np.sqrt(absorbance.var())

    # dashed line at the band mean
    plt.axvline(centre, color=shade, linestyle="--")

    # shaded band of one standard deviation either side of the mean,
    # reaching up to the y-axis limit as it stands at this point
    y_top = plt.gca().get_ylim()[1]
    plt.fill_betweenx(
        [0, y_top],
        centre - spread,
        centre + spread,
        color=shade,
        alpha=0.2,
    )

plt.title("Ammonium vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene test: is the Absorbance 254nm variance equal across the four
# ammonium bands?
band_samples = [
    df.loc[df["Ammonium Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
levene_test = stats.levene(*band_samples)

print(levene_test)

In [None]:
# Recorded Levene outcome: data is homoscedastic.
# Compare the four ammonium bands with the Kruskal-Wallis test.
band_samples = [
    df.loc[df["Ammonium Category"] == band, "Absorbance 254nm"]
    for band in ("25%", "50%", "75%", ">75%")
]
kruskal_test = stats.kruskal(*band_samples)

print(kruskal_test)

In [None]:
# Recorded Kruskal-Wallis outcome: no equal means.
# Pairwise post-hoc comparison of the ammonium bands with Dunn's test,
# p-values adjusted with the Holm method.
dunn_test = sp.posthoc_dunn(
    df,
    val_col="Absorbance 254nm",
    group_col="Ammonium Category",
    p_adjust="holm",
)

print(dunn_test)

In [None]:
# plot the post-hoc test
# Opens a fresh 20x10 figure, renders the pairwise p-value table stored in
# `dunn_test` with scikit-posthocs' sign_plot, then displays it.
plt.figure(figsize=(20, 10))

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that the >75% category is significantly different from the other categories

In [None]:
# Collapse the three lower quartile bins into "75%-"; everything else
# becomes "75%+".
lower_labels = ("25%", "50%", "75%")
df["Ammonium Category"] = df["Ammonium Category"].apply(
    lambda label: "75%+" if label not in lower_labels else "75%-"
)

In [None]:
# Plot the data again, now for the two merged Ammonium groups: density of
# Absorbance 254nm with the group mean (dashed line) and a +/- 1 standard
# deviation band around it.
plt.figure(figsize=(20, 10))

# Reversed so the first category met still gets "g", as when colours were
# popped off the end of the list.
palette = ["r", "g"]
categories = df["Ammonium Category"].dropna().unique()

for category, color in zip(categories, reversed(palette)):
    absorbance = df.loc[
        df["Ammonium Category"] == category, "Absorbance 254nm"
    ]

    sns.kdeplot(
        absorbance,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = absorbance.mean()
    std = absorbance.std()

    # Dashed vertical line at the group mean
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band is not cut
    # short when a later, taller density rescales the y-axis.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Ammonium vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Goal: assess whether mean Absorbance 254nm of the "75%-" group is
# significantly lower than that of the "75%+" group.
# First step: Levene's test for equal variances between the two groups.
absorbance_low, absorbance_high = (
    df.loc[df["Ammonium Category"] == label, "Absorbance 254nm"]
    for label in ("75%-", "75%+")
)
levene_test = stats.levene(absorbance_low, absorbance_high)

print(levene_test)

In [None]:
# The two groups were judged homoscedastic, so run a pooled-variance
# independent-samples t-test. One-sided: H1 is mean("75%-") < mean("75%+").
absorbance_low, absorbance_high = (
    df.loc[df["Ammonium Category"] == label, "Absorbance 254nm"]
    for label in ("75%-", "75%+")
)
t_test = stats.ttest_ind(
    absorbance_low,
    absorbance_high,
    equal_var=True,
    alternative="less",
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 75%- is significantly lower than 75%+
# Low Ammonium is associated with low Absorbance 254nm

## Redox Potential

In [None]:
# Work on an independent copy: a category column is added to `df` below, and
# assigning to a slice of `xerta_df` triggers pandas' SettingWithCopyWarning.
df = xerta_df[["Redox Potential", "Absorbance 254nm"]].copy()

In [None]:
# Bin Redox Potential into four classes delimited by its own quartiles,
# labelled "25%", "50%", "75%" and ">75%".
# NOTE: `flow_quantiles` is a legacy name; here it holds the Redox Potential
# quartiles.
flow_quantiles = df["Redox Potential"].quantile([0.25, 0.5, 0.75])

quartile_edges = [-np.inf, *flow_quantiles, np.inf]
df["Redox Potential Category"] = pd.cut(
    df["Redox Potential"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Density of Absorbance 254nm for each Redox Potential quartile bin, with the
# group mean (dashed line) and a +/- 1 standard deviation band around it.
plt.figure(figsize=(20, 10))

# Reversed so the first category met still gets "y", as when colours were
# popped off the end of the list.
palette = ["r", "g", "b", "y"]
categories = df["Redox Potential Category"].dropna().unique()

for category, color in zip(categories, reversed(palette)):
    absorbance = df.loc[
        df["Redox Potential Category"] == category, "Absorbance 254nm"
    ]

    sns.kdeplot(
        absorbance,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = absorbance.mean()
    std = absorbance.std()

    # Dashed vertical line at the group mean
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band is not cut
    # short when a later, taller density rescales the y-axis.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Redox Potential vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# TODO: clean the data and redo this analysis

In [None]:
# data is not normally distributed

In [None]:
# Levene's test: compare the variance of Absorbance 254nm across the four
# Redox Potential quartile groups.
quartile_labels = ["25%", "50%", "75%", ">75%"]
absorbance_by_quartile = [
    df.loc[df["Redox Potential Category"] == label, "Absorbance 254nm"]
    for label in quartile_labels
]
levene_test = stats.levene(*absorbance_by_quartile)

print(levene_test)

In [None]:
# The groups were judged heteroscedastic from Levene's test, so use Welch's
# one-way ANOVA, which does not assume equal variances.
dependent_variable = "Absorbance 254nm"
grouping_factor = "Redox Potential Category"
welch_test = pg.welch_anova(
    data=df, dv=dependent_variable, between=grouping_factor
)

print(welch_test)

In [None]:
# Pairwise post-hoc comparison of the Redox Potential groups with the
# Conover test (Holm-adjusted p-values).
# NOTE(review): the original remark here read "equal means", which does not
# fit running a post-hoc test - confirm the Welch result above. Conover is a
# rank-based post-hoc, while the omnibus test used was Welch's ANOVA;
# verify that this pairing is intended.
conover_options = {
    "val_col": "Absorbance 254nm",
    "group_col": "Redox Potential Category",
    "p_adjust": "holm",
    "sort": True,
}
conover_test = sp.posthoc_conover(a=df, **conover_options)

print(conover_test)

In [None]:
# Significance heatmap of the pairwise Conover p-values
heatmap_size = (20, 10)
plt.figure(figsize=heatmap_size)

sp.sign_plot(conover_test)

plt.show()

In [None]:
# it seems that the >75% category is significantly different from the other categories

In [None]:
# Collapse the three lower quartile bins into "75%-"; everything else
# becomes "75%+".
lower_labels = ("25%", "50%", "75%")
df["Redox Potential Category"] = df["Redox Potential Category"].apply(
    lambda label: "75%+" if label not in lower_labels else "75%-"
)

In [None]:
# Plot the data again, now for the two merged Redox Potential groups: density
# of Absorbance 254nm with the group mean (dashed line) and a +/- 1 standard
# deviation band around it.
plt.figure(figsize=(20, 10))

# Reversed so the first category met still gets "g", as when colours were
# popped off the end of the list.
palette = ["r", "g"]
categories = df["Redox Potential Category"].dropna().unique()

for category, color in zip(categories, reversed(palette)):
    absorbance = df.loc[
        df["Redox Potential Category"] == category, "Absorbance 254nm"
    ]

    sns.kdeplot(
        absorbance,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = absorbance.mean()
    std = absorbance.std()

    # Dashed vertical line at the group mean
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band is not cut
    # short when a later, taller density rescales the y-axis.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Redox Potential vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Goal: assess whether mean Absorbance 254nm of the "75%-" group is
# significantly lower than that of the "75%+" group.
# First step: Levene's test for equal variances between the two groups.
absorbance_low, absorbance_high = (
    df.loc[df["Redox Potential Category"] == label, "Absorbance 254nm"]
    for label in ("75%-", "75%+")
)
levene_test = stats.levene(absorbance_low, absorbance_high)

print(levene_test)

In [None]:
# The two groups were judged homoscedastic, so run a pooled-variance
# independent-samples t-test. One-sided: H1 is mean("75%-") < mean("75%+").
absorbance_low, absorbance_high = (
    df.loc[df["Redox Potential Category"] == label, "Absorbance 254nm"]
    for label in ("75%-", "75%+")
)
t_test = stats.ttest_ind(
    absorbance_low,
    absorbance_high,
    equal_var=True,
    alternative="less",
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 75%- is significantly lower than 75%+
# Low Redox Potential is associated with low Absorbance 254nm

## Conductivity

In [None]:
# Work on an independent copy: a category column is added to `df` below, and
# assigning to a slice of `xerta_df` triggers pandas' SettingWithCopyWarning.
df = xerta_df[["Conductivity", "Absorbance 254nm"]].copy()

In [None]:
# Bin Conductivity into four classes delimited by its own quartiles,
# labelled "25%", "50%", "75%" and ">75%".
# NOTE: `flow_quantiles` is a legacy name; here it holds the Conductivity
# quartiles.
flow_quantiles = df["Conductivity"].quantile([0.25, 0.5, 0.75])

quartile_edges = [-np.inf, *flow_quantiles, np.inf]
df["Conductivity Category"] = pd.cut(
    df["Conductivity"],
    bins=quartile_edges,
    labels=["25%", "50%", "75%", ">75%"],
)

In [None]:
# Density of Absorbance 254nm for each Conductivity quartile bin, with the
# group mean (dashed line) and a +/- 1 standard deviation band around it.
plt.figure(figsize=(20, 10))

# Reversed so the first category met still gets "y", as when colours were
# popped off the end of the list.
palette = ["r", "g", "b", "y"]
categories = df["Conductivity Category"].dropna().unique()

for category, color in zip(categories, reversed(palette)):
    absorbance = df.loc[
        df["Conductivity Category"] == category, "Absorbance 254nm"
    ]

    sns.kdeplot(
        absorbance,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = absorbance.mean()
    std = absorbance.std()

    # Dashed vertical line at the group mean
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band is not cut
    # short when a later, taller density rescales the y-axis.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Conductivity vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# data is not normally distributed

In [None]:
# Levene's test: compare the variance of Absorbance 254nm across the four
# Conductivity quartile groups.
quartile_labels = ["25%", "50%", "75%", ">75%"]
absorbance_by_quartile = [
    df.loc[df["Conductivity Category"] == label, "Absorbance 254nm"]
    for label in quartile_labels
]
levene_test = stats.levene(*absorbance_by_quartile)

print(levene_test)

In [None]:
# The groups were judged homoscedastic from Levene's test, but the data is
# not normal, so use the rank-based Kruskal-Wallis test across the four
# Conductivity quartile groups.
quartile_labels = ["25%", "50%", "75%", ">75%"]
absorbance_by_quartile = [
    df.loc[df["Conductivity Category"] == label, "Absorbance 254nm"]
    for label in quartile_labels
]
kruskal_test = stats.kruskal(*absorbance_by_quartile)

print(kruskal_test)

In [None]:
# Kruskal-Wallis was read as "the groups are not all alike", so find which
# pairs differ with Dunn's post-hoc test (Holm-adjusted p-values).
dunn_options = {
    "val_col": "Absorbance 254nm",
    "group_col": "Conductivity Category",
    "p_adjust": "holm",
}
dunn_test = sp.posthoc_dunn(a=df, **dunn_options)

print(dunn_test)

In [None]:
# Significance heatmap of the pairwise Dunn p-values
heatmap_size = (20, 10)
plt.figure(figsize=heatmap_size)

sp.sign_plot(dunn_test)

plt.show()

In [None]:
# it seems that the 75% and >75% categories are significantly different from the 25% and 50% categories

In [None]:
# Collapse the two upper quartile bins into "50%+"; everything else
# becomes "50%-".
upper_labels = ("75%", ">75%")
df["Conductivity Category"] = df["Conductivity Category"].apply(
    lambda label: "50%-" if label not in upper_labels else "50%+"
)

In [None]:
# Plot the data again, now for the two merged Conductivity groups: density of
# Absorbance 254nm with the group mean (dashed line) and a +/- 1 standard
# deviation band around it.
plt.figure(figsize=(20, 10))

# Reversed so the first category met still gets "g", as when colours were
# popped off the end of the list.
palette = ["r", "g"]
categories = df["Conductivity Category"].dropna().unique()

for category, color in zip(categories, reversed(palette)):
    absorbance = df.loc[
        df["Conductivity Category"] == category, "Absorbance 254nm"
    ]

    sns.kdeplot(
        absorbance,
        color=color,
        label=category,
        fill=True,
        alpha=0.5,
    )

    mean = absorbance.mean()
    std = absorbance.std()

    # Dashed vertical line at the group mean
    plt.axvline(mean, color=color, linestyle="--")

    # axvspan always covers the full axes height, so the band is not cut
    # short when a later, taller density rescales the y-axis.
    plt.axvspan(mean - std, mean + std, color=color, alpha=0.2)

plt.title("Conductivity vs Absorbance 254nm")
plt.xlabel("Absorbance 254nm")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Goal: assess whether mean Absorbance 254nm of the "50%-" group is
# significantly higher than that of the "50%+" group.
# First step: Levene's test for equal variances between the two groups.
absorbance_low, absorbance_high = (
    df.loc[df["Conductivity Category"] == label, "Absorbance 254nm"]
    for label in ("50%-", "50%+")
)
levene_test = stats.levene(absorbance_low, absorbance_high)

print(levene_test)

In [None]:
# The two groups were judged homoscedastic, so run a pooled-variance
# independent-samples t-test. One-sided: H1 is mean("50%-") > mean("50%+").
absorbance_low, absorbance_high = (
    df.loc[df["Conductivity Category"] == label, "Absorbance 254nm"]
    for label in ("50%-", "50%+")
)
t_test = stats.ttest_ind(
    absorbance_low,
    absorbance_high,
    equal_var=True,
    alternative="greater",
)

print(t_test)

In [None]:
# reject the null hypothesis
# the mean of the Absorbance 254nm of 50%- is significantly higher than 50%+
# Low Conductivity is associated with high Absorbance 254nm

# Store Projections

In [None]:
# Write each interpolated projection to its own Excel workbook in
# `interpolated_projections_folder`.
# Each frame loses its "time" column and, where a source column name is
# given, has that column renamed to "Value". The frames are modified in
# place, as before.

# to_excel does not create missing directories
os.makedirs(interpolated_projections_folder, exist_ok=True)

# (frame, column to rename to "Value" or None, output file name)
projection_exports = [
    (air_temp_rcp45_df, "tas_ymonmean", "air_temp_rcp45.xlsx"),
    (air_temp_rcp85_df, "tas_ymonmean", "air_temp_rcp85.xlsx"),
    (rain_rcp45_df, "pr_ymonmean", "precipitation_rcp45.xlsx"),
    (rain_rcp85_df, "pr_ymonmean", "precipitation_rcp85.xlsx"),
    (flow_rcp45_df, "rdis_ymonmean", "flow_rcp45.xlsx"),
    (flow_rcp85_df, "rdis_ymonmean", "flow_rcp85.xlsx"),
    # the water temperature frames are exported without renaming any column
    (water_temp_rcp45_df, None, "water_temp_rcp45.xlsx"),
    (water_temp_rcp85_df, None, "water_temp_rcp85.xlsx"),
]

for projection_df, value_column, file_name in projection_exports:
    # errors="ignore" keeps the cell re-runnable: on a second run the "time"
    # column is already gone and a plain drop would raise KeyError.
    projection_df.drop(columns=["time"], inplace=True, errors="ignore")

    if value_column is not None:
        projection_df.rename(columns={value_column: "Value"}, inplace=True)

    projection_df.to_excel(
        os.path.join(interpolated_projections_folder, file_name)
    )