# Supply Points (Case dell'Acqua) Data Preprocessing

In [None]:
import os
import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# Paths

In [None]:
# Root of the data tree, relative to the notebook location.
data_folder = os.path.join("..", "..", "data")

# First-level sub-folders of the data tree.
(
    raw_data_folder,
    intermediate_data_folder,
    metadata_folder,
    plot_folder,
    clean_data_folder,
) = (
    os.path.join(data_folder, child)
    for child in (
        "Raw Data",
        "Intermediate Data",
        "Metadata",
        "Plots",
        "Clean Data",
    )
)

# Raw material of the 24-04-2024 meeting and its sensor exports.
reunion_folder = os.path.join(raw_data_folder, "Riunione 24-04-2024")
sensor_folder = os.path.join(reunion_folder, "Sensori")

# Load Grab Samples

In [None]:
# Laboratory (grab sample) results for the supply points.
grab_samples_path = os.path.join(
    reunion_folder, "Dati_case_Lab_Ingressi-Uscite.xlsx"
)
grab_samples_df = pd.read_excel(grab_samples_path)

In [None]:
grab_samples_df

## Fix LOD values

In [None]:
# JSON is UTF-8 by specification. Without an explicit encoding, open() falls
# back to the platform locale, which can mis-decode non-ASCII column names
# (presumably present here, e.g. accented Italian names - verify against the file).
with open(
    os.path.join(reunion_folder, "columns_types.json"), encoding="utf-8"
) as f:
    column_types = json.load(f)

In [None]:
# Split the column catalogue into its three declared groups.
metadata_columns, features_columns, targets_columns = (
    column_types[group_key]
    for group_key in (
        "metadata_columns",
        "features_columns",
        "targets_columns",
    )
)

In [None]:
# Keep only the declared columns that actually exist in the grab samples.
# Filtering the declared lists (instead of intersecting sets) keeps the
# declared order, so the resulting column order is reproducible across runs;
# dict.fromkeys drops duplicates without reordering.
available_columns = set(grab_samples_df.columns)

metadata_columns = list(
    dict.fromkeys(c for c in metadata_columns if c in available_columns)
)
features_columns = list(
    dict.fromkeys(c for c in features_columns if c in available_columns)
)
targets_columns = list(
    dict.fromkeys(c for c in targets_columns if c in available_columns)
)

In [None]:
import re


def convert_string_values(s):
    """Convert a raw laboratory cell into a numeric value.

    Rules, in order:

    * ``int``/``float`` values are returned unchanged.
    * Missing values (``None``, ``NaN``, ``NaT``...) become ``None``.
    * Strings have decimal commas turned into dots, then:

      - ``"<x"`` returns ``x / 2``;
      - ``">x"`` returns ``x``;
      - strings containing ``*`` or letters return the first number found;
      - any other string is parsed as a plain number (e.g. ``"7,5"`` -> 7.5).

    Args:
        s: Cell value; expected to be a number, a missing value or a string.

    Returns:
        The numeric value, or ``None`` when no number can be extracted.
    """
    if isinstance(s, (int, float)):
        return s
    if pd.isna(s):
        return None

    text = s.replace(",", ".")
    first_number = re.search(r"\d+\.?\d*", text)

    if "<" in text:
        # Below a reported limit: half of that limit is used.
        return float(first_number.group()) / 2 if first_number else None

    if ">" in text or "*" in text or re.search("[a-zA-Z]", text):
        return float(first_number.group()) if first_number else None

    # Plain numeric text. This used to be discarded as None, silently losing
    # numbers that the spreadsheet stored as text.
    try:
        return float(text)
    except ValueError:
        return None

In [None]:
def set_label(value):
    """Classify how a raw laboratory value was reported.

    Returns ``"NaN"`` for missing values, ``"Normal"`` for ``int``/``float``
    values, ``"Less than"`` / ``"Greater than"`` for strings containing
    ``<`` / ``>`` (``<`` is checked first), and ``"NaN"`` for any other string.
    """
    if pd.isna(value):
        return "NaN"
    if isinstance(value, (int, float)):
        return "Normal"

    for symbol, label in (("<", "Less than"), (">", "Greater than")):
        if symbol in value:
            return label

    return "NaN"

In [None]:
# Each key is the column to keep; its list holds the alternative columns whose
# values fill the gaps of the key column before being dropped.
column_mapping = {
    "Concentrazione ioni idrogeno (unità pH)": [
        "Concentr. ioni idrogeno (al prelievo) (unità pH)"
    ],
}

for final_column, original_columns in column_mapping.items():
    merged_values = grab_samples_df[final_column]
    for original_column in original_columns:
        # combine_first keeps existing values and only fills missing ones
        merged_values = merged_values.combine_first(
            grab_samples_df[original_column]
        )
    grab_samples_df[final_column] = merged_values
    grab_samples_df.drop(columns=original_columns, inplace=True)

In [None]:
# Laboratory parameters kept from the grab samples. Further down these are
# renamed to Color, Turbidity, Conductivity, Free Chlorine, pH, Temperature,
# Nitrate and TOC, i.e. the same names given to the sensor columns.
common_columns = [
    "Colore (Cu)",
    "Torbidità (NTu)",
    "Conduttività a 20°C (µS/cm)",
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)",
    "Concentrazione ioni idrogeno (unità pH)",
    "Temperatura (al prelievo) (°C)",
    "Nitrati (mg/L)",
    "TOC - carbonio organico totale (mg/L di C)",
]

In [None]:
grab_samples_df.columns.to_list()

In [None]:
# Keep only metadata, shared lab parameters and targets.
# The explicit .copy() makes the selection an independent frame, so the
# column assignments in the following cells write to a real DataFrame instead
# of a slice (which triggers pandas' SettingWithCopyWarning).
selected_columns = metadata_columns + common_columns + targets_columns
grab_samples_df = grab_samples_df[selected_columns].copy()

In [None]:
# For every measured column, store next to it how the raw value was reported
# (see set_label) in a "<column>_label" column.
for column in [*common_columns, *targets_columns]:
    label_column = f"{column}_label"
    raw_values = grab_samples_df[column]
    grab_samples_df.loc[:, label_column] = raw_values.apply(set_label)

In [None]:
# Turn the raw lab cells of both column groups into numbers, element by element.
for column_group in (common_columns, targets_columns):
    grab_samples_df[column_group] = grab_samples_df[column_group].map(
        convert_string_values
    )

In [None]:
grab_samples_df

# Load Sensor Samples

In [None]:
# One frame per workbook; everything is concatenated once at the end instead
# of growing a DataFrame inside the loop (which re-copies all rows each time).
sensor_frames = []

# sorted() makes the load order, and therefore the row order, reproducible.
for sensor_file in sorted(os.listdir(sensor_folder)):
    # the supply point name is the part of the file name before the first "-"
    supply_point = sensor_file.split("-")[0]

    # the context manager closes the workbook handle once all sheets are read
    with pd.ExcelFile(os.path.join(sensor_folder, sensor_file)) as xls:
        # header=1: the column names are on the second row of each sheet
        sheets = [
            xls.parse(sheet_name, header=1) for sheet_name in xls.sheet_names
        ]

    # first join the sheets in a single dataframe
    df = pd.concat(sheets, axis=0)
    df["Code"] = supply_point

    sensor_frames.append(df)

# then join all workbooks (an empty folder still yields an empty DataFrame)
sensor_df = (
    pd.concat(sensor_frames, axis=0) if sensor_frames else pd.DataFrame()
)

In [None]:
sensor_df

In [None]:
def _fix_supply_point_typo(name):
    """Return 'Prealpi' for the misspelled 'Preapli', any other value unchanged."""
    return "Prealpi" if name == "Preapli" else name


# change the supply point named 'Preapli' to 'Prealpi'
sensor_df["Code"] = sensor_df["Code"].map(_fix_supply_point_typo)

In [None]:
# Drop every column whose name contains "Status".
# The explicit .copy() makes the result an independent frame, so the in-place
# edits in the following cells do not trigger pandas' SettingWithCopyWarning.
is_status_column = sensor_df.columns.str.contains("Status")
sensor_df = sensor_df.loc[:, ~is_status_column].copy()

In [None]:
sensor_df

In [None]:
# Verbose export headers -> short feature names used in the rest of the notebook.
sensor_column_names = {
    "Measurement interval=900[sec] (Export-Aggregation disabled)": "DateTime",
    "COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)": "Color",
    "TOCeq - Measured value [mg/l] (Limit:0.00-22.00)": "TOC",
    "NO3eq - Measured value [mg/l] (Limit:0.00-88.00)": "Nitrate",
    "UV254t - Measured value [Abs/m] (Limit:0.00-71.00)": "Absorbance",
    "Turbidity - Measured value [FTUeq] (Limit:0.00-170.00)": "Turbidity",
    "pH - Measured value (Limit:0.00-14.00)": "pH",
    "Temperature - Measured value [C] (Limit:-5.00-100.00)": "Temperature",
    "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)": "Conductivity",
    "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00)": "Free Chlorine",
    "Flow - Measured value (Limit:0.00-1.00)": "Flow",
}

sensor_df.rename(columns=sensor_column_names, inplace=True)

In [None]:
# upper-case every supply point name and prefix it with "HOUSE_"
upper_case_codes = sensor_df["Code"].str.upper()
sensor_df["Code"] = "HOUSE_" + upper_case_codes

In [None]:
sensor_df

In [None]:
sensor_df.drop(columns=["Tag"], inplace=True)

In [None]:
# Parse the timestamp column with the fixed "YYYY-MM-DD HH:MM:SS" layout.
raw_timestamps = sensor_df["DateTime"]
sensor_df["DateTime"] = pd.to_datetime(
    raw_timestamps, format="%Y-%m-%d %H:%M:%S"
)

# Filter Grab Samples

We only keep the variables measured from the sensors to assess similarity

In [None]:
grab_samples_df.columns.to_list()

In [None]:
# Metadata columns removed from the grab samples before the comparison.
unused_metadata_columns = [
    "Campagna",
    "ZONA",
    "Rapporto di prova",
    "Punto di prelievo",
    "Analisi programmate",
]

grab_samples_df.drop(columns=unused_metadata_columns, inplace=True)

In [None]:
grab_samples_df

In [None]:
# Italian laboratory headers -> the short names also used for the sensor data.
grab_column_names = {
    "Data di prelievo": "DateTime",
    "Colore (Cu)": "Color",
    "Torbidità (NTu)": "Turbidity",
    "Conduttività a 20°C (µS/cm)": "Conductivity",
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free Chlorine",
    "Concentrazione ioni idrogeno (unità pH)": "pH",
    "Temperatura (al prelievo) (°C)": "Temperature",
    "Nitrati (mg/L)": "Nitrate",
    "TOC - carbonio organico totale (mg/L di C)": "TOC",
}

grab_samples_df.rename(columns=grab_column_names, inplace=True)

In [None]:
# Parse the sampling date with the "YYYY/MM/DD" layout.
sampling_dates = grab_samples_df["DateTime"]
grab_samples_df["DateTime"] = pd.to_datetime(sampling_dates, format="%Y/%m/%d")

In [None]:
# order the rows by sampling point code, then sample type, then date
sort_keys = ["Codice punto di prelievo", "Tipologia", "DateTime"]
grab_samples_df.sort_values(by=sort_keys, inplace=True)

In [None]:
# English names for the two identifying columns.
key_column_names = {
    "Codice punto di prelievo": "Code",
    "Tipologia": "Type",
}
grab_samples_df.rename(columns=key_column_names, inplace=True)

# Imputation 

## Grab Samples

In [None]:
grab_samples_df

In [None]:
# per (code, type) pair, print how many non-null values each column holds
for (code, type_), group in grab_samples_df.groupby(["Code", "Type"]):
    non_null_counts = group.notnull().sum()

    print(code, type_)

    print(non_null_counts)

    print("\n\n")

In [None]:
# Features drawn on every figure, in trace order.
plotted_features = [
    "Color",
    "Turbidity",
    "Conductivity",
    "Free Chlorine",
    "pH",
    "Temperature",
    "Nitrate",
    "TOC",
]

# One figure per (code, type) pair, with one line per feature.
for (code, type_), group in grab_samples_df.groupby(["Code", "Type"]):
    traces = [
        go.Scatter(
            x=group["DateTime"],
            y=group[feature_name],
            mode="lines",
            name=feature_name,
        )
        for feature_name in plotted_features
    ]

    figure_layout = go.Layout(
        title=f"{code} - {type_}",
        xaxis_title="Date",
        yaxis_title="Value",
    )

    fig = go.Figure(data=traces, layout=figure_layout)

    # fig.show()

## Sensor Samples

### Resample Dataset

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# put every supply point on a regular 15-minute grid, averaging each bin
readings_by_code = sensor_df.groupby("Code")
sensor_res_df = (
    readings_by_code.resample("15min", on="DateTime").mean().reset_index()
)

In [None]:
# Compare sensor_res_df with sensor_df: for every supply point and feature,
# compute the mean absolute error between original and resampled values on
# the shared timestamps, normalise it by the mean of the original series and
# plot both series whenever the normalised error is positive.
for code in sensor_df["Code"].unique():
    # filter each frame once per supply point instead of at every use
    house_df = sensor_df[sensor_df["Code"] == code]
    house_res_df = sensor_res_df[sensor_res_df["Code"] == code]

    for feature in sensor_df.columns.difference(["Code", "DateTime"]):
        df = house_df[["Code", "DateTime", feature]]
        res_df = house_res_df[["Code", "DateTime", feature]]

        merged_df = pd.merge(
            df,
            res_df,
            on=["Code", "DateTime"],
            suffixes=("_sensor", "_sensor_res"),
        )

        # keep only the rows where both series have a value
        not_nan_indexes = (
            merged_df[feature + "_sensor"].notna()
            & merged_df[feature + "_sensor_res"].notna()
        )
        merged_df = merged_df[not_nan_indexes]

        # mean_absolute_error raises on empty input: with no overlapping
        # values there is nothing to compare for this feature
        if merged_df.empty:
            continue

        # compute normalized mean absolute error
        mae = mean_absolute_error(
            merged_df[feature + "_sensor"],
            merged_df[feature + "_sensor_res"],
        )
        mae = mae / df[feature].mean()

        if mae > 0:
            go.Figure(
                data=[
                    go.Scatter(
                        x=df["DateTime"],
                        y=df[feature],
                        mode="lines",
                        name="Original",
                    ),
                    go.Scatter(
                        x=res_df["DateTime"],
                        y=res_df[feature],
                        mode="lines",
                        name="Resampled",
                    ),
                ],
                layout=go.Layout(
                    title=f"{code} - {feature} - MAE: {mae}",
                    xaxis_title="Date",
                    yaxis_title="Value",
                ),
            ).show()

In [None]:
sensor_df = sensor_res_df

### Remove Data Based on MM Considerations

Considerations:
- I grab sample in uscita sono refrigerati: i loro dati di temperatura non possono essere confrontati con i dati dei sensori

#### Bande Nere

Manutenzione sonde Torbidità e UV 03/06/2023-08/06/2023 – Valori Torbidità e UV254 anomali da non considerare dal 03/06/2023-08/06/2023

Manutenzione e allineamento TOC e Nitrati dal 22/12/2023 – Valori precedenti da non considerare

In [None]:
# Bande Nere - turbidity and UV probes under maintenance:
# blank Turbidity and Absorbance from 03/06/2023 to 08/06/2023 (both included)
first_day = pd.to_datetime("2023-06-03").date()
last_day = pd.to_datetime("2023-06-08").date()
reading_days = sensor_df["DateTime"].dt.date

mask = (
    (sensor_df["Code"] == "HOUSE_BANDE NERE")
    & (reading_days >= first_day)
    & (reading_days <= last_day)
)

sensor_df.loc[mask, ["Turbidity", "Absorbance"]] = np.nan

In [None]:
# Bande Nere - TOC/nitrate maintenance and alignment on 22/12/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-12-22").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_BANDE NERE") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

In [None]:
sensor_df["Code"].unique()

#### Berna

Manutenzione e allineamento TOC e Nitrati dal 20/11/2023 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo Cloro 31/01/2024 – Valori precedenti da non considerare

In [None]:
# Berna - TOC/nitrate maintenance and alignment on 20/11/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-11-20").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_BERNA") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

In [None]:
# Berna - chlorine electrode replaced on 31/01/2024:
# blank Free Chlorine up to and including that day
cutoff_day = pd.to_datetime("2024-01-31").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_BERNA") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["Free Chlorine"]] = np.nan

#### Chiostergi

Manutenzione e allineamento TOC e Nitrati dal 20/12/2023 – Valori precedenti da non considerare

In [None]:
# Chiostergi - TOC/nitrate maintenance and alignment on 20/12/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-12-20").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_CHIOSTERGI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

#### Fortunato

Manutenzione e sostituzione elettrodo Cloro 24/01/2024 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo PH 24/01/2024 – Valori da non considerare dal 04/01/2024 fino al 24/01/2024

Manutenzione e allineamento Nitrati dal 23/11/2023 – Valori precedenti da non considerare

Manutenzione e allineamento TOC dal 20/12/2023 – Valori precedenti da non considerare

In [None]:
# Fortunato - chlorine electrode replaced on 24/01/2024:
# blank Free Chlorine up to and including that day
cutoff_day = pd.to_datetime("2024-01-24").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_FORTUNATO") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["Free Chlorine"]] = np.nan

In [None]:
# Fortunato - pH electrode replaced on 24/01/2024:
# blank pH from 04/01/2024 to 24/01/2024 (both included)
first_day = pd.to_datetime("2024-01-04").date()
last_day = pd.to_datetime("2024-01-24").date()
reading_days = sensor_df["DateTime"].dt.date

mask = (
    (sensor_df["Code"] == "HOUSE_FORTUNATO")
    & (reading_days <= last_day)
    & (reading_days >= first_day)
)

sensor_df.loc[mask, ["pH"]] = np.nan

In [None]:
# Fortunato - nitrate maintenance and alignment on 23/11/2023:
# blank Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-11-23").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_FORTUNATO") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "Nitrate"] = np.nan

In [None]:
# Fortunato - TOC maintenance and alignment on 20/12/2023:
# blank TOC up to and including that day
cutoff_day = pd.to_datetime("2023-12-20").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_FORTUNATO") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "TOC"] = np.nan

#### Gramsci

Manutenzione e allineamento TOC e Nitrati dal 23/11/2023 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo Cloro 07/02/2024 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo PH 23/12/2023 – Valori da non considerare dal 27/11/2023 fino al 23/12/2023

Manutenzione e allineamento SACUV dal 23/01/2024 – Valori da non considerare dal 20/10/2023 fino al 23/01/2024

In [None]:
# Gramsci - TOC/nitrate maintenance and alignment on 23/11/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-11-23").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_GRAMSCI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

In [None]:
# Gramsci - chlorine electrode replaced on 07/02/2024:
# blank Free Chlorine up to and including that day
cutoff_day = pd.to_datetime("2024-02-07").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_GRAMSCI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "Free Chlorine"] = np.nan

In [None]:
# Gramsci - pH electrode replaced on 23/12/2023:
# blank pH from 27/11/2023 to 23/12/2023 (both included)
first_day = pd.to_datetime("2023-11-27").date()
last_day = pd.to_datetime("2023-12-23").date()
reading_days = sensor_df["DateTime"].dt.date

mask = (
    (sensor_df["Code"] == "HOUSE_GRAMSCI")
    & (reading_days <= last_day)
    & (reading_days >= first_day)
)

sensor_df.loc[mask, "pH"] = np.nan

In [None]:
# Gramsci - SACUV maintenance and alignment on 23/01/2024:
# blank Absorbance from 20/10/2023 to 23/01/2024 (both included)
first_day = pd.to_datetime("2023-10-20").date()
last_day = pd.to_datetime("2024-01-23").date()
reading_days = sensor_df["DateTime"].dt.date

mask = (
    (sensor_df["Code"] == "HOUSE_GRAMSCI")
    & (reading_days <= last_day)
    & (reading_days >= first_day)
)

sensor_df.loc[mask, "Absorbance"] = np.nan

#### Montevideo

Manutenzione e allineamento TOC e Nitrati dal 20/11/2023 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo PH 29/01/2024 – Valori segnati in rosso da non considerare e gli altri poco affidabili fino al 29/01/2024

In [None]:
# Montevideo - TOC/nitrate maintenance and alignment on 20/11/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-11-20").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_MONTEVIDEO") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

In [None]:
# Montevideo - pH electrode replaced on 29/01/2024; the maintenance notes
# describe earlier readings as invalid or unreliable:
# blank all pH up to and including that day
cutoff_day = pd.to_datetime("2024-01-29").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_MONTEVIDEO") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "pH"] = np.nan

#### Prealpi

Manutenzione e allineamento TOC e Nitrati dal 23/11/2023 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo PH 29/01/2024 – Valori segnati in rosso da non considerare e gli altri poco affidabili fino al 29/01/2024

Manutenzione Torbidità il 22/01/2024 – Valori da non considerare dal 05/09/2023 al 22/01/2024

Manutenzione Torbidità il 20/03/2024 – Valori da non considerare dal 12/02/2024 al 20/03/2024


In [None]:
# Prealpi - TOC/nitrate maintenance and alignment on 23/11/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-11-23").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_PREALPI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

In [None]:
# Prealpi - pH electrode replaced on 29/01/2024; the maintenance notes
# describe earlier readings as invalid or unreliable:
# blank all pH up to and including that day
cutoff_day = pd.to_datetime("2024-01-29").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_PREALPI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "pH"] = np.nan

In [None]:
# Prealpi - turbidity probe maintenance on 22/01/2024 and on 20/03/2024.
# Two separate windows must be discarded (both ends included):
#   05/09/2023 - 22/01/2024  and  12/02/2024 - 20/03/2024.
# The windows are OR-ed: no reading can fall in both, so AND-ing all four
# bounds selects no row at all and leaves the bad values in place.
reading_days = sensor_df["DateTime"].dt.date

first_window = (reading_days >= pd.to_datetime("2023-09-05").date()) & (
    reading_days <= pd.to_datetime("2024-01-22").date()
)
second_window = (reading_days >= pd.to_datetime("2024-02-12").date()) & (
    reading_days <= pd.to_datetime("2024-03-20").date()
)

mask = (sensor_df["Code"] == "HOUSE_PREALPI") & (first_window | second_window)

sensor_df.loc[mask, "Turbidity"] = np.nan

#### Tabacchi

Manutenzione e allineamento TOC e Nitrati dal 21/11/2023 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo Cloro 26/01/2024 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo PH 25/01/2024 – Valori segnati in rosso da non considerare e gli altri poco affidabili fino al 25/01/2024

In [None]:
# Tabacchi - TOC/nitrate maintenance and alignment on 21/11/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-11-21").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_TABACCHI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

In [None]:
# Tabacchi - chlorine electrode replaced on 26/01/2024:
# blank Free Chlorine up to and including that day
cutoff_day = pd.to_datetime("2024-01-26").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_TABACCHI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "Free Chlorine"] = np.nan

In [None]:
# Tabacchi - pH electrode replaced on 25/01/2024; the maintenance notes
# describe earlier readings as invalid or unreliable:
# blank all pH up to and including that day
cutoff_day = pd.to_datetime("2024-01-25").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_TABACCHI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "pH"] = np.nan

#### Tognazzi

Manutenzione e allineamento TOC e Nitrati dal 23/11/2023 – Valori precedenti da non considerare

Manutenzione e sostituzione elettrodo PH 17/01/2024 – Tutti i valori precedenti da non considerare assolutamente

In [None]:
# Tognazzi - TOC/nitrate maintenance and alignment on 23/11/2023:
# blank TOC and Nitrate up to and including that day
cutoff_day = pd.to_datetime("2023-11-23").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_TOGNAZZI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, ["TOC", "Nitrate"]] = np.nan

In [None]:
# Tognazzi - pH electrode replaced on 17/01/2024:
# blank all pH up to and including that day
cutoff_day = pd.to_datetime("2024-01-17").date()
reading_days = sensor_df["DateTime"].dt.date
mask = (sensor_df["Code"] == "HOUSE_TOGNAZZI") & (reading_days <= cutoff_day)

sensor_df.loc[mask, "pH"] = np.nan

### Missing Values

In [None]:
# Print, per supply point, the percentage of missing values of every feature.
for code in sensor_df["Code"].unique():
    print(code)
    for feature in sensor_df.columns.difference(["DateTime", "Code"]):
        df = sensor_df[sensor_df["Code"] == code][feature]
        missing_percentage = df.isna().sum() / df.shape[0] * 100
        print(f"{feature}: " + str(missing_percentage))
    print()

In [None]:
# Table of missing-value percentages: one row per feature, one column per
# supply point, each cell formatted as "<percentage rounded to 2 digits>%".
missing_by_code = {}

for code in sensor_df["Code"].unique():
    info_dict = {}
    for feature in sensor_df.columns.difference(["DateTime", "Code"]):
        df = sensor_df[sensor_df["Code"] == code][feature]

        perc = round(df.isna().sum() / df.shape[0] * 100, 2)
        info_dict[feature] = str(perc) + "%"

    missing_by_code[code] = pd.Series(info_dict)

missing_values_df = pd.DataFrame(missing_by_code)

In [None]:
# Save the missing-value table (feature names are kept as the index column).
missing_values_path = os.path.join(
    metadata_folder, "Riunione 24-04-2024", "Missing Values.xlsx"
)
missing_values_df.to_excel(missing_values_path, index=True)

In [None]:
# For every supply point and feature, build a figure with the series and a red
# rectangle over each run of consecutive missing values, and make sure the
# per-supply-point output folder exists.
for code in sensor_df["Code"].unique():
    for feature in sensor_df.columns.difference(["DateTime", "Code"]):
        df = sensor_df[sensor_df["Code"] == code][["DateTime", feature]].copy()
        df["is_missing"] = df[feature].isna()
        missing_values_perc = (df["is_missing"].sum() / df.shape[0]) * 100
        missing_values_perc = missing_values_perc.round(2)

        # True where the feature is missing
        mask = df[feature].isna()

        # The counter increases on the first NaN after a valid value, so every
        # run of consecutive NaNs shares one group id
        df["group"] = (mask & (~mask).shift()).cumsum()

        # First and last timestamp of every NaN run
        nan_periods = (
            df[mask]
            .groupby("group")["DateTime"]
            .agg(["min", "max"])
            .reset_index(drop=True)
        )
        nan_periods.columns = ["start_date", "end_date"]

        # Vertical extent of the rectangles: the full value range of the series
        lowest_value = df[feature].min()
        highest_value = df[feature].max()

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=df["DateTime"],
                y=df[feature],
                mode="lines",
                name=feature,
            )
        )

        for _, row in nan_periods.iterrows():
            fig.add_shape(
                type="rect",
                x0=row["start_date"],
                y0=lowest_value,
                x1=row["end_date"],
                y1=highest_value,
                fillcolor="red",
                line=dict(color="red"),
            )

        fig.update_layout(
            title=f"{code} - {feature} - Missing Values: {missing_values_perc}%",
            xaxis_title="Date",
            yaxis_title="Value",
        )

        code_plot_dir = os.path.join(
            plot_folder, "Riunione 24-04-2024", "Missing Values", code
        )
        if not os.path.exists(code_plot_dir):
            os.makedirs(code_plot_dir)

        # # save the plot
        # fig.write_image(os.path.join(code_plot_dir, f"{feature}.png"))

        # fig.show()

In [None]:
# Independent copy of the sensor data without any row that has a missing
# value; sensor_df itself is left untouched.
copy_df = sensor_df.dropna()

In [None]:
# Print, per supply point, the percentage of missing values of every feature.
for code in copy_df["Code"].unique():
    print(code)
    for feature in copy_df.columns.difference(["DateTime", "Code"]):
        df = copy_df[copy_df["Code"] == code][feature]
        missing_percentage = df.isna().sum() / df.shape[0] * 100
        print(f"{feature}: " + str(missing_percentage))
    print()

In [None]:
# Put the remaining rows back on a regular 15-minute grid per supply point;
# bins without any row come back as NaN.
complete_rows_by_code = copy_df.groupby("Code")
copy_df = (
    complete_rows_by_code.resample("15min", on="DateTime").mean().reset_index()
)

In [None]:
# Print, per supply point, the percentage of missing values of every feature.
for code in copy_df["Code"].unique():
    print(code)
    for feature in copy_df.columns.difference(["DateTime", "Code"]):
        df = copy_df[copy_df["Code"] == code][feature]
        missing_percentage = df.isna().sum() / df.shape[0] * 100
        print(f"{feature}: " + str(missing_percentage))
    print()

In [None]:
# For every supply point and feature, show the series with a red rectangle
# over each run of consecutive missing values.
for code in copy_df["Code"].unique():
    for feature in copy_df.columns.difference(["DateTime", "Code"]):
        df = copy_df[copy_df["Code"] == code][["DateTime", feature]].copy()

        df["is_missing"] = df[feature].isna()
        missing_values_perc = (df["is_missing"].sum() / df.shape[0]) * 100
        missing_values_perc = missing_values_perc.round(2)

        # True where the feature is missing
        mask = df[feature].isna()

        # The counter increases on the first NaN after a valid value, so every
        # run of consecutive NaNs shares one group id
        df["group"] = (mask & (~mask).shift()).cumsum()

        # First and last timestamp of every NaN run
        nan_periods = (
            df[mask]
            .groupby("group")["DateTime"]
            .agg(["min", "max"])
            .reset_index(drop=True)
        )
        nan_periods.columns = ["start_date", "end_date"]

        # Vertical extent of the rectangles: the full value range of the series
        lowest_value = df[feature].min()
        highest_value = df[feature].max()

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=df["DateTime"],
                y=df[feature],
                mode="lines",
                name=feature,
            )
        )

        for _, row in nan_periods.iterrows():
            fig.add_shape(
                type="rect",
                x0=row["start_date"],
                y0=lowest_value,
                x1=row["end_date"],
                y1=highest_value,
                fillcolor="red",
                line=dict(color="red"),
            )

        fig.update_layout(
            title=f"{code} - {feature} - Missing Values: {missing_values_perc}%",
            xaxis_title="Date",
            yaxis_title="Value",
        )

        fig.show()

#### Fill NANs

In [None]:
# Time-weighted interpolation needs the timestamps as index; the index is
# restored to a regular column at the end.
copy_df.set_index("DateTime", inplace=True)

value_columns = copy_df.columns.difference(["Code"])

for code in copy_df["Code"].unique():
    rows_of_code = copy_df["Code"] == code
    house_df = copy_df[rows_of_code].copy()

    # interpolate each feature of this supply point independently
    for feature in value_columns:
        house_df[feature] = house_df[feature].interpolate(method="time")

    # write the filled rows back in place of the originals
    copy_df[rows_of_code] = house_df

copy_df.reset_index(inplace=True)

In [None]:
# One column per supply point holding "<first day> - <last day>" of its data.
date_range_by_code = {}

for code in copy_df["Code"].unique():
    timestamps = copy_df[copy_df["Code"] == code]["DateTime"]

    min_date = timestamps.min().date()
    max_date = timestamps.max().date()

    date_range_by_code[code] = pd.Series(
        {"Date Range": f"{min_date} - {max_date}"}
    )

date_range_df = pd.DataFrame(date_range_by_code)

In [None]:
# Save the date ranges; the index is not written to the file.
date_range_path = os.path.join(
    metadata_folder, "Riunione 24-04-2024", "Date Range.xlsx"
)
date_range_df.to_excel(date_range_path, index=False)

In [None]:
# For every supply point and feature, plot the filled series with a red
# rectangle over each remaining run of missing values, and save it as
# <plot_folder>/Riunione 24-04-2024/Clean Data/No Removed Outliers/<code>/<feature>.png
for code in copy_df["Code"].unique():
    # Build the output folder once per supply point. exist_ok=True replaces
    # the separate exists() check, which could race with another process
    # creating the folder in between.
    output_dir = os.path.join(
        plot_folder,
        "Riunione 24-04-2024",
        "Clean Data",
        "No Removed Outliers",
        code,
    )
    os.makedirs(output_dir, exist_ok=True)

    for feature in copy_df.columns.difference(["DateTime", "Code"]):
        fig = go.Figure()

        df = copy_df[copy_df["Code"] == code][["DateTime", feature]].copy()

        df["is_missing"] = df[feature].isna()
        missing_values_perc = (df["is_missing"].sum() / df.shape[0]) * 100
        missing_values_perc = missing_values_perc.round(2)

        # Create a boolean mask to identify NaN values
        mask = df[feature].isna()

        # The counter increases on the first NaN after a valid value, so every
        # run of consecutive NaNs shares one group id
        df["group"] = (mask & (~mask).shift()).cumsum()

        # Group by the 'group' column and find the start and end dates for each group
        nan_periods = (
            df[mask]
            .groupby("group")["DateTime"]
            .agg(["min", "max"])
            .reset_index(drop=True)
        )

        # Rename the columns for better readability
        nan_periods.columns = ["start_date", "end_date"]

        fig.add_trace(
            go.Scatter(
                x=df["DateTime"],
                y=df[feature],
                mode="lines",
                name=feature,
            )
        )

        # The value range is the same for every rectangle: compute it once
        # instead of once per NaN period
        lowest_value = df[feature].min()
        highest_value = df[feature].max()

        for _, row in nan_periods.iterrows():
            fig.add_shape(
                type="rect",
                x0=row["start_date"],
                y0=lowest_value,
                x1=row["end_date"],
                y1=highest_value,
                fillcolor="red",
                line=dict(color="red"),
            )

        fig.update_layout(
            title=f"{code} - {feature} - Missing Values: {missing_values_perc}%",
            xaxis_title="Date",
            yaxis_title="Value",
        )

        fig.write_image(os.path.join(output_dir, f"{feature}.png"))

### Outliers

Outliers (or Anomalies) are detected by using the STL method, which uses LOESS.

In [None]:
from statsmodels.tsa.seasonal import STL

from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler

In [None]:
sensor_df = copy_df

In [None]:
def rolling_z_score(df, window=5):
    """Attach rolling 2-sigma threshold columns to ``df`` and return it.

    The rolling mean and standard deviation are computed over ``window``
    consecutive rows before any column is added. ``df`` is modified in place:
    it gains ``upper_threshold`` (mean + 2 * std) and ``lower_threshold``
    (mean - 2 * std), and the very same object is returned.
    """
    windowed = df.rolling(window=window)
    center = windowed.mean()
    band = 2 * windowed.std()

    df["upper_threshold"] = center + band
    df["lower_threshold"] = center - band

    return df

In [None]:
# Downsample each code's series to daily bins, keeping the median of the
# readings that fall into every bin.
daily_by_code = copy_df.groupby("Code").resample("D", on="DateTime")
copy_df = daily_by_code.median().reset_index()

In [None]:
# Fill gaps code by code with pandas' "time" interpolation, which needs
# DateTime as the index while it runs; the index is restored afterwards.
copy_df.set_index("DateTime", inplace=True)
value_columns = copy_df.columns.difference(["Code"])

for code in copy_df["Code"].unique():
    code_mask = copy_df["Code"] == code
    df = copy_df[code_mask].copy()

    for feature in value_columns:
        df[feature] = df[feature].interpolate(method="time")

    # Write the interpolated rows back over the rows of this code.
    copy_df[code_mask] = df

copy_df.reset_index(inplace=True)

In [None]:
# Plot the daily series of every feature for every code and save each figure
# as <plot_folder>/Riunione 24-04-2024/Clean Data/Daily/<code>/<feature>.png.
# plotly's write_image interprets width/height as pixels, so the 5:2 canvas is
# given in pixels (the literal 5 x 2 would request a 5 x 2 pixel image).
daily_image_width_px = 1000
daily_image_height_px = 400

for code in copy_df["Code"].unique():
    code_df = copy_df[copy_df["Code"] == code]

    code_plot_folder = os.path.join(
        plot_folder, "Riunione 24-04-2024", "Clean Data", "Daily", code
    )
    os.makedirs(code_plot_folder, exist_ok=True)

    for feature in copy_df.columns.difference(["DateTime", "Code"]):
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=code_df["DateTime"],
                y=code_df[feature],
                mode="lines",
                name="Original",
            )
        )

        fig.update_layout(
            title=f"{code} - {feature}",
            xaxis_title="Date",
            yaxis_title="Value",
        )

        fig.write_image(
            os.path.join(code_plot_folder, f"{feature}.png"),
            width=daily_image_width_px,
            height=daily_image_height_px,
        )

        fig.show()

In [None]:
# STL-based outlier detection on sensor_df: for every code and feature,
# decompose the series, draw the trend with a +/- 2-sigma band (sigma is the
# standard deviation of the STL residual) and mark the readings outside it.
for code in sensor_df["Code"].unique():
    for feature in sensor_df.columns.difference(["DateTime", "Code"]):
        df = sensor_df[sensor_df["Code"] == code][["DateTime", feature]].copy()
        df.set_index("DateTime", inplace=True)

        # period = 96 * 20 samples; presumably 20 days of 15-minute readings
        # (96 per day) -- confirm against the sensor sampling rate.
        stl = STL(df, period=96 * 20, robust=True)
        result = stl.fit()
        seasonal, trend, resid = result.seasonal, result.trend, result.resid

        resid_mean = np.mean(resid)
        resid_std = np.std(resid)

        # Residual-space bounds; computed but not drawn by this cell.
        lower_bound = resid_mean - 2 * resid_std
        upper_bound = resid_mean + 2 * resid_std

        # The plotted band is centred on the trend, not on the residual mean.
        band_low = trend - 2 * resid_std
        band_high = trend + 2 * resid_std

        outliers = df[(df[feature] < band_low) | (df[feature] > band_high)]

        fig = make_subplots(rows=1, cols=1)

        trace_specs = (
            (df.index, df[feature], "lines", "Original"),
            (trend.index, trend, "lines", "Trend"),
            (trend.index, band_low, "lines", "Lower Bound"),
            (trend.index, band_high, "lines", "Upper Bound"),
            (outliers.index, outliers[feature], "markers", "Outliers"),
        )
        for x_values, y_values, trace_mode, trace_name in trace_specs:
            fig.add_trace(
                go.Scatter(
                    x=x_values,
                    y=y_values,
                    mode=trace_mode,
                    name=trace_name,
                ),
                row=1,
                col=1,
            )

        fig.update_layout(
            title=f"{code} - {feature}",
            xaxis_title="Date",
            yaxis_title="Value",
        )

        df.reset_index(inplace=True)

        fig.show()

In [None]:
# Rolling z-score outlier detection on sensor_df: the top panel shows the
# series, its moving average, a +/- 3-sigma band and the flagged readings;
# the bottom panel shows the z-score itself.
for code in sensor_df["Code"].unique():
    for feature in sensor_df.columns.difference(["DateTime", "Code"]):
        df = sensor_df[sensor_df["Code"] == code][["DateTime", feature]].copy()
        df.set_index("DateTime", inplace=True)

        # Moving statistics over a window of 96 * 20 rows.
        feature_window = df[feature].rolling(window=96 * 20)
        df["moving_avg"] = feature_window.mean()
        df["moving_std"] = feature_window.std()

        # Distance from the moving average in units of the moving std.
        df["z_score"] = (df[feature] - df["moving_avg"]) / df["moving_std"]

        df.reset_index(inplace=True)

        lower_bound = -3
        upper_bound = 3

        outliers = df[
            (df["z_score"] < lower_bound) | (df["z_score"] > upper_bound)
        ]

        fig = make_subplots(rows=2, cols=1)

        line_specs = (
            ("Original", df[feature], 1),
            ("Moving Average", df["moving_avg"], 1),
            ("Lower Bound", df["moving_avg"] - 3 * df["moving_std"], 1),
            ("Upper Bound", df["moving_avg"] + 3 * df["moving_std"], 1),
            ("Z-Score", df["z_score"], 2),
        )
        for trace_name, y_values, subplot_row in line_specs:
            fig.add_trace(
                go.Scatter(
                    x=df["DateTime"],
                    y=y_values,
                    mode="lines",
                    name=trace_name,
                ),
                row=subplot_row,
                col=1,
            )

        fig.add_trace(
            go.Scatter(
                x=outliers["DateTime"],
                y=outliers[feature],
                mode="markers",
                name="Outliers",
            ),
            row=1,
            col=1,
        )

        fig.update_layout(
            title=f"{code} - {feature}",
            xaxis_title="Date",
            yaxis_title="Value",
        )

        fig.show()

#### Similarities

In [None]:
from scipy import stats

##### Pearson

In [None]:
# Pairwise Pearson correlation between codes, one matrix per feature.
# feature_dict maps each feature name to a DataFrame whose [code_a, code_b]
# entry (each unordered pair is filled once) is the correlation of that
# feature between the two codes over their shared timestamps.
from itertools import combinations

feature_dict = {}
codes = copy_df["Code"].unique()

for feature in copy_df.columns.difference(["DateTime", "Code"]):
    feature_df = pd.DataFrame()

    for first_code, second_code in combinations(codes, 2):
        df = copy_df.loc[copy_df["Code"] == first_code, ["DateTime", feature]]
        other_df = copy_df.loc[
            copy_df["Code"] == second_code, ["DateTime", feature]
        ]

        # The inner join keeps only timestamps present for both codes and
        # pairs the values row by row. Rows where either value is missing are
        # dropped, because a single NaN would turn the whole correlation
        # into NaN.
        paired = df.merge(
            other_df, on="DateTime", suffixes=("_first", "_second")
        ).dropna()

        if len(paired) < 2:
            # pearsonr needs at least two points; record "no result" instead
            # of aborting the whole comparison.
            correlation = np.nan
        else:
            correlation = stats.pearsonr(
                paired[f"{feature}_first"], paired[f"{feature}_second"]
            )[0]

        feature_df.loc[first_code, second_code] = correlation

    feature_dict[feature] = feature_df

In [None]:
# Save the correlation matrices to pearson.xlsx, one sheet per feature.
# The destination folder is created first so the writer does not fail on a
# fresh checkout.
comparison_folder = os.path.join(
    metadata_folder, "Riunione 24-04-2024", "comparison"
)
os.makedirs(comparison_folder, exist_ok=True)

with pd.ExcelWriter(os.path.join(comparison_folder, "pearson.xlsx")) as writer:
    for feature_name, feature_matrix in feature_dict.items():
        feature_matrix.to_excel(writer, sheet_name=feature_name)

##### MAE

In [None]:
# Pairwise mean absolute error between codes, one matrix per feature.
# feature_dict maps each feature name to a DataFrame whose [code_a, code_b]
# entry (each unordered pair is filled once) is the MAE of that feature
# between the two codes over their shared timestamps.
from itertools import combinations

# Imported here so the cell does not depend on an import made elsewhere in
# the notebook.
from sklearn.metrics import mean_absolute_error

feature_dict = {}
# Codes come from the same frame that is filtered below.
codes = copy_df["Code"].unique()

for feature in copy_df.columns.difference(["DateTime", "Code"]):
    feature_df = pd.DataFrame()

    for first_code, second_code in combinations(codes, 2):
        df = copy_df.loc[copy_df["Code"] == first_code, ["DateTime", feature]]
        other_df = copy_df.loc[
            copy_df["Code"] == second_code, ["DateTime", feature]
        ]

        # The inner join keeps only timestamps present for both codes and
        # pairs the values row by row. Rows with a missing value are dropped
        # because mean_absolute_error rejects NaN input.
        paired = df.merge(
            other_df, on="DateTime", suffixes=("_first", "_second")
        ).dropna()

        if paired.empty:
            # No overlapping, fully observed timestamps for this pair.
            mae = np.nan
        else:
            mae = mean_absolute_error(
                paired[f"{feature}_first"], paired[f"{feature}_second"]
            )

        feature_df.loc[first_code, second_code] = mae

    feature_dict[feature] = feature_df

In [None]:
# Save the MAE matrices to mae.xlsx, one sheet per feature. The destination
# folder is created first so the writer does not fail on a fresh checkout.
comparison_folder = os.path.join(
    metadata_folder, "Riunione 24-04-2024", "comparison"
)
os.makedirs(comparison_folder, exist_ok=True)

with pd.ExcelWriter(os.path.join(comparison_folder, "mae.xlsx")) as writer:
    for feature_name, feature_matrix in feature_dict.items():
        feature_matrix.to_excel(writer, sheet_name=feature_name)

##### Affinity Propagation

In [None]:
from sklearn.cluster import AffinityPropagation

In [None]:
# Cluster the houses separately for every feature with affinity propagation.
# The similarity between two houses is derived from the Wasserstein distance
# between the distributions of their (non-missing) values for that feature.
prop = AffinityPropagation(affinity="precomputed")

# List of unique house codes
house_codes = copy_df["Code"].unique()

for feature in copy_df.columns.difference(["DateTime", "Code"]):
    # Extract each house's sample once instead of re-filtering the frame for
    # every pair.
    house_samples = [
        copy_df.loc[copy_df["Code"] == house, feature].dropna()
        for house in house_codes
    ]

    # The distance is symmetric, so only the upper triangle is computed and
    # then mirrored; the diagonal stays 0.
    affinity_matrix = np.zeros((len(house_codes), len(house_codes)))
    for i, first_sample in enumerate(house_samples):
        for j in range(i + 1, len(house_samples)):
            distance = stats.wasserstein_distance(
                first_sample, house_samples[j]
            )
            affinity_matrix[i, j] = distance
            affinity_matrix[j, i] = distance

    # normalize the affinity matrix to have similarity values between 0 and 1
    # (largest distance -> 0, smallest distance -> 1)
    affinity_matrix = 1 - (affinity_matrix - affinity_matrix.min()) / (
        affinity_matrix.max() - affinity_matrix.min()
    )

    # set the nan values to 0 (they appear when all distances are equal and
    # the normalization divides by zero)
    affinity_matrix[np.isnan(affinity_matrix)] = 0

    prop.fit(affinity_matrix)

    print(f"Feature: {feature}")
    print(f"Number of clusters: {len(prop.cluster_centers_indices_)}")

    # Print cluster centers
    print("Cluster centers:")
    for center_index in prop.cluster_centers_indices_:
        print(house_codes[center_index])

    # Print houses in each cluster
    for cluster_id in range(len(prop.cluster_centers_indices_)):
        print(f"Cluster {cluster_id}:")
        for i, label in enumerate(prop.labels_):
            if label == cluster_id:
                print(house_codes[i])

    print()
    print()

# Grab vs Sensor

In [None]:
# Display the grab samples table for inspection before the comparison.
grab_samples_df

In [None]:
# Remove leading/trailing whitespace from the codes. The vectorised .str
# accessor leaves missing codes as NaN instead of raising AttributeError on
# them the way a per-element x.strip() would.
grab_samples_df["Code"] = grab_samples_df["Code"].str.strip()

In [None]:
# Display the sensor table for inspection before the comparison.
sensor_df

In [None]:
# Replace "HOUSE_BANDE NERE" with "HOUSE_BANDENERE" in the Code column of both
# sensor frames (presumably to match the spelling used by the grab samples --
# confirm against grab_samples_df).
for frame in (sensor_df, copy_df):
    frame["Code"] = frame["Code"].str.replace(
        "HOUSE_BANDE NERE", "HOUSE_BANDENERE"
    )

In [None]:
# sensor_df: 15 min sampling rate
#
# For the selected house, overlay the grab samples on the sensor series
# together with the STL trend, a +/- 3-sigma band (sigma is the standard
# deviation of the STL residual) and the sensor readings outside that band.
# Each figure is saved as
# <plot_folder>/Riunione 24-04-2024/Comparison/15min/<code>/<feature>.jpeg.

# plotly's write_image interprets width/height as pixels (further multiplied
# by `scale`), so the 8:2 canvas is given in pixels; the literal 8 x 2 would
# yield a 40 x 10 pixel image.
comparison_image_width_px = 1600
comparison_image_height_px = 400

for code in ["HOUSE_TABACCHI"]:
    code_plot_folder = os.path.join(
        plot_folder, "Riunione 24-04-2024", "Comparison", "15min", code
    )
    os.makedirs(code_plot_folder, exist_ok=True)

    for feature in grab_samples_df.columns.difference(
        ["DateTime", "Code", "Type"]
    ):
        df = sensor_df[sensor_df["Code"] == code][["DateTime", feature]].copy()
        df.set_index("DateTime", inplace=True)

        # period = 96 * 20 samples, i.e. 20 days at 96 readings per day.
        stl = STL(df, period=96 * 20, robust=True)
        result = stl.fit()
        trend, resid = result.trend, result.resid

        resid_std = np.std(resid)
        lower_band = trend - 3 * resid_std
        upper_band = trend + 3 * resid_std

        outliers = df[(df[feature] < lower_band) | (df[feature] > upper_band)]

        fig = make_subplots(rows=1, cols=1)

        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df[feature],
                mode="lines",
                name="Sensor",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=trend.index,
                y=trend,
                mode="lines",
                name="Trend",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=trend.index,
                y=lower_band,
                mode="lines",
                name="Lower Bound",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=trend.index,
                y=upper_band,
                mode="lines",
                name="Upper Bound",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=outliers.index,
                y=outliers[feature],
                mode="markers",
                name="Outliers",
            ),
            row=1,
            col=1,
        )

        # add grab samples
        grab_df = grab_samples_df[grab_samples_df["Code"] == code][
            ["DateTime", "Type", feature]
        ].copy()

        # keep only the grab samples whose timestamp also exists in the
        # sensor series
        common_dates = df[df.index.isin(grab_df["DateTime"])]

        grab_df = grab_df[grab_df["DateTime"].isin(common_dates.index)]
        grab_df.set_index("DateTime", inplace=True)

        # `sample_type` rather than `type`: the loop variable is global in a
        # notebook and would shadow the builtin for every later cell.
        for sample_type in grab_df["Type"].unique():
            typed_grab_df = grab_df[grab_df["Type"] == sample_type]
            fig.add_trace(
                go.Scatter(
                    x=typed_grab_df.index,
                    y=typed_grab_df[feature],
                    mode="markers",
                    name=f"Grab: {feature} - {sample_type}",
                    marker_symbol="x",
                    marker_size=10,
                ),
                row=1,
                col=1,
            )

        fig.update_layout(
            title=f"{code} - {feature}",
            xaxis_title="Date",
            yaxis_title="Value",
        )

        fig.write_image(
            os.path.join(code_plot_folder, f"{feature}.jpeg"),
            scale=5,
            width=comparison_image_width_px,
            height=comparison_image_height_px,
        )

        fig.show()

In [None]:
# For every house that has grab samples, overlay them on the daily sensor
# series (copy_df) together with the STL trend, a +/- 3-sigma band (sigma is
# the standard deviation of the STL residual) and the readings outside that
# band. Each figure is saved as
# <plot_folder>/Riunione 24-04-2024/Comparison/Daily/<code>/<feature>.jpeg.

# plotly's write_image interprets width/height as pixels (further multiplied
# by `scale`), so the 8:2 canvas is given in pixels; the literal 8 x 2 would
# yield a 40 x 10 pixel image.
comparison_image_width_px = 1600
comparison_image_height_px = 400

for code in grab_samples_df["Code"].unique():
    code_plot_folder = os.path.join(
        plot_folder, "Riunione 24-04-2024", "Comparison", "Daily", code
    )

    for feature in grab_samples_df.columns.difference(
        ["DateTime", "Code", "Type"]
    ):
        df = copy_df[copy_df["Code"] == code][["DateTime", feature]].copy()

        if df.empty:
            # A grab-sample code without sensor rows cannot be decomposed;
            # report it and move on instead of failing inside STL.
            print(f"No sensor data for {code}, skipping.")
            break

        df.set_index("DateTime", inplace=True)

        # period = 20 rows of the daily-resampled series.
        stl = STL(df, period=20, robust=True)
        result = stl.fit()
        trend, resid = result.trend, result.resid

        resid_std = np.std(resid)
        lower_band = trend - 3 * resid_std
        upper_band = trend + 3 * resid_std

        outliers = df[(df[feature] < lower_band) | (df[feature] > upper_band)]

        fig = make_subplots(rows=1, cols=1)

        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df[feature],
                mode="lines",
                name="Sensor",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=trend.index,
                y=trend,
                mode="lines",
                name="Trend",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=trend.index,
                y=lower_band,
                mode="lines",
                name="Lower Bound",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=trend.index,
                y=upper_band,
                mode="lines",
                name="Upper Bound",
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=outliers.index,
                y=outliers[feature],
                mode="markers",
                name="Outliers",
                marker_size=20,
            ),
            row=1,
            col=1,
        )

        # add grab samples
        grab_df = grab_samples_df[grab_samples_df["Code"] == code][
            ["DateTime", "Type", feature]
        ].copy()

        # keep only the grab samples whose timestamp also exists in the
        # sensor series
        common_dates = df[df.index.isin(grab_df["DateTime"])]

        grab_df = grab_df[grab_df["DateTime"].isin(common_dates.index)]
        grab_df.set_index("DateTime", inplace=True)

        # `sample_type` rather than `type`: the loop variable is global in a
        # notebook and would shadow the builtin for every later cell.
        for sample_type in grab_df["Type"].unique():
            typed_grab_df = grab_df[grab_df["Type"] == sample_type]
            fig.add_trace(
                go.Scatter(
                    x=typed_grab_df.index,
                    y=typed_grab_df[feature],
                    mode="markers",
                    name=f"Grab: {feature} - {sample_type}",
                    marker_symbol="x",
                    marker_size=20,
                ),
                row=1,
                col=1,
            )

        fig.update_layout(
            title=f"{code} - {feature}",
            xaxis_title="Date",
            yaxis_title="Value",
            font=dict(size=20),
        )

        os.makedirs(code_plot_folder, exist_ok=True)

        fig.write_image(
            os.path.join(code_plot_folder, f"{feature}.jpeg"),
            scale=5,
            width=comparison_image_width_px,
            height=comparison_image_height_px,
        )

        fig.show()

## Boxplots

In [None]:
# One figure per feature: a box plot of the sensor readings of every house,
# with the matching grab samples drawn on top as "x" markers (blue for
# "Ingresso", red for every other type).

# Readings at or above these values are treated as wrong and discarded.
feature_upper_limits = {
    "Absorbance": 5000,
    "Color": 6,
    "Free Chlorine": 400,
    "TOC": 3,
    "Turbidity": 2000,
}

for feature in copy_df.columns.difference(["DateTime", "Code", "Type"]):
    fig = go.Figure()

    for code in sensor_df["Code"].unique():
        df = sensor_df[sensor_df["Code"] == code][["DateTime", feature]].copy()

        df.set_index("DateTime", inplace=True)

        # remove wrong values
        if feature in feature_upper_limits:
            df = df[df[feature] < feature_upper_limits[feature]]
        elif feature == "Flow":
            df = df[df[feature] > 0]

        fig.add_trace(go.Box(y=df[feature], name=code, showlegend=False))

        if feature != "Absorbance" and feature != "Flow":
            # add grab samples
            grab_df = grab_samples_df[grab_samples_df["Code"] == code][
                ["DateTime", "Type", feature]
            ].copy()

            # keep only the grab samples whose timestamp also exists in the
            # (filtered) sensor series
            common_dates = df[df.index.isin(grab_df["DateTime"])]

            grab_df = grab_df[grab_df["DateTime"].isin(common_dates.index)]
            grab_df.set_index("DateTime", inplace=True)

            # `sample_type` rather than `type`: the loop variable is global in
            # a notebook and would shadow the builtin for every later cell.
            for sample_type in grab_df["Type"].unique():
                grab_values = grab_df[grab_df["Type"] == sample_type][feature]
                marker_color = "blue" if sample_type == "Ingresso" else "red"

                # x must be as long as y: with a single-element x only the
                # first grab sample would be paired with a position and drawn.
                fig.add_trace(
                    go.Scatter(
                        x=[code] * len(grab_values),
                        y=grab_values,
                        mode="markers",
                        name=f"Grab: {sample_type}",
                        marker=dict(symbol="x", size=10, color=marker_color),
                        showlegend=False,
                    )
                )

    # Titles plus a horizontal legend placed above the plot, right-aligned.
    fig.update_layout(
        title=f"{feature}",
        xaxis_title="House",
        yaxis_title="Value",
        font=dict(size=18),
        legend=dict(
            orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )

    # Add dummy traces for custom legend: the real grab traces are hidden from
    # the legend, so two empty traces provide one entry per colour.
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode="markers",
            marker=dict(symbol="x", size=10, color="blue"),
            name="Ingresso",
            showlegend=True,
        )
    )
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode="markers",
            marker=dict(symbol="x", size=10, color="red"),
            name="Uscita",
            showlegend=True,
        )
    )

    fig.show()

# Store Datasets

In [None]:
# Append the measurement unit to the grab-sample column names before export.
grab_column_labels = {
    "Color": "Color (CU)",
    "Turbidity": "Turbidity (NTU)",
    "Free Chlorine": "Free Chlorine (mg/L)",
    "Conductivity": "Conductivity (uS/cm)",
    "pH": "pH",
    "TOC": "TOC (mg/L)",
    "Nitrate": "Nitrate (mg/L)",
    "Temperature": "Temperature (°C)",
}
grab_samples_df.rename(columns=grab_column_labels, inplace=True)

In [None]:
# Export the cleaned grab samples. The destination folder is created first so
# the export does not fail on a fresh checkout.
clean_meeting_folder = os.path.join(clean_data_folder, "Riunione 24-04-2024")
os.makedirs(clean_meeting_folder, exist_ok=True)

grab_samples_df.to_excel(
    os.path.join(clean_meeting_folder, "Grab Samples.xlsx"),
    index=False,
)

In [None]:
# Append the measurement unit to the sensor column names before export;
# "Absorbance" and "Flow" are also given new labels.
sensor_column_labels = {
    "Color": "Color (CU)",
    "Turbidity": "Turbidity (NTU)",
    "Free Chlorine": "Free Chlorine (mg/L)",
    "Conductivity": "Conductivity (uS/cm)",
    "pH": "pH",
    "TOC": "TOC (mg/L)",
    "Nitrate": "Nitrate (mg/L)",
    "Temperature": "Temperature (°C)",
    "Absorbance": "UVA254 (1/m)",
    "Flow": "Flow Rate (m³/s)",
}
sensor_df.rename(columns=sensor_column_labels, inplace=True)

In [None]:
# Export the cleaned sensor data. The destination folder is created first so
# the export does not fail on a fresh checkout.
clean_meeting_folder = os.path.join(clean_data_folder, "Riunione 24-04-2024")
os.makedirs(clean_meeting_folder, exist_ok=True)

sensor_df.to_excel(
    os.path.join(clean_meeting_folder, "Sensor Data.xlsx"),
    index=False,
)