# Supply Points Data Analysis

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance
from scipy.stats import gaussian_kde


from pathvalidate import sanitize_filename

In [None]:
data_path = "/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano"
root_folder_path = "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/SafeCREW/soft_sensors/Soft Sensor CS2Milan"

dir_temporary_results_path = os.path.join(data_path, "temporary results")
raw_grab_samples_path = os.path.join(
    dir_temporary_results_path, "raw_grab_samples_supply_points.xlsx"
)
house_codes_path = os.path.join(data_path, "Case-Codici.xlsx")
sensor_data_folder_path = os.path.join(
    root_folder_path, "Case dell'acqua - Sensori"
)

In [None]:
raw_grab_samples_df = pd.read_excel(raw_grab_samples_path, dtype=object)

In [None]:
def count_values(series):
    num_nans = series.isna().sum()
    strings = series[
        series.astype(str).str.contains("|".join(["<", "\*", ">", "[a-zA-Z]"]))
    ].count()
    num_numbers = series[
        series.apply(lambda x: isinstance(x, (int, float)))
    ].count()
    return pd.Series(
        [num_nans, strings, num_numbers], index=["NaN", "Strings", "numbers"]
    )

In [None]:
columns = raw_grab_samples_df.columns[7:]

In [None]:
histogram = raw_grab_samples_df[columns].apply(count_values)

In [None]:
histogram.loc["Total"] = histogram.sum()

In [None]:
histogram

In [None]:
ax = histogram.T[["NaN", "Strings", "numbers"]].plot.bar(figsize=(30, 10))

for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.02))

## Convert String Values

In [None]:
import re


def convert_string_values(s):
    if isinstance(s, (int, float)):
        return s
    elif pd.isna(s):
        return None
    else:
        if "," in s:
            s = s.replace(",", ".")
        if "<" in s:
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) / 2 if number else None
        elif ">" in s:
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) if number else None
        elif "*" in s or re.search("[a-zA-Z]", s):
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) if number else None
        else:
            return None


raw_grab_samples_df[columns] = raw_grab_samples_df[columns].applymap(
    convert_string_values
)

In [None]:
%%script false --no-raise-error

plt.style.use("ggplot")

for column in columns:
    plt.figure(figsize=(20, 10))
    hist = raw_grab_samples_df[column].where(
        raw_grab_samples_df[column].apply(lambda x: isinstance(x, (int, float)))
    )
    count, bins, patches = plt.hist(
        hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
    )
    plt.title(
        column
        + " - Count: "
        + str(hist.count())
        + " / "
        + str(raw_grab_samples_df.shape[0])
    )
    plt.ylabel("Frequency")

    # Set x-ticks to bin edges and x-tick labels to intervals
    plt.xticks(
        bins[:-1],
        [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)],
        rotation="vertical",
        fontsize=8,
    )

    # Add count for every bar
    for p in patches:
        plt.annotate(
            str(int(p.get_height())), (p.get_x() * 1.005, p.get_height() * 1.02)
        )

    directory = os.path.join(dir_temporary_results_path, "histograms_all")
    if not os.path.exists(directory):
        os.makedirs(directory)

    plt.savefig(
        os.path.join(
            directory,
            sanitize_filename(column) + ".png",
        ),
        dpi=300,
    )

In [None]:
%%script false --no-raise-error

for col in columns:
    sanitized_col = sanitize_filename(col)
    # Extract unit of measure from column name
    raw_grab_samples_df.plot(
        x="Data di prelievo",
        y=col,
        legend=False,
        title=f"{sanitized_col}",
        fontsize=8,
        figsize=(40, 10),
    )
    directory = f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Grab samples data plots/{sanitized_col}"
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(f"{directory}/{sanitized_col}.png", dpi=300)

In [None]:
houses_code_df = pd.read_excel(house_codes_path)

In [None]:
houses_code_df["Casa dell'acqua"][7] = houses_code_df["Casa dell'acqua"][
    7
].rstrip()

houses_code_df.loc[4] = ["Chiostergi", "HOUSE_CHIOSTERGI"]

In [None]:
houses_code_df

In [None]:
# keep just rows of raw_grab_samples_df that have a Codice punto di prelievo that is contained in the houses_code_df Codice Punto di Prelievo
grab_samples_df = raw_grab_samples_df.merge(
    houses_code_df,
    left_on="Codice punto di prelievo",
    right_on="Codice Punto di Prelievo",
    how="inner",
)

grab_samples_df.drop(
    columns=["Casa dell'acqua", "Codice Punto di Prelievo"], inplace=True
)

In [None]:
grab_samples_df

In [None]:
histogram = grab_samples_df[columns].apply(count_values)

In [None]:
histogram.loc["Total"] = histogram.sum()

In [None]:
histogram

In [None]:
ax = histogram.T[["NaN", "Strings", "numbers"]].plot.bar(figsize=(30, 10))

for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.02))

In [None]:
%%script false --no-raise-error

plt.style.use("ggplot")

for column in columns:
    plt.figure(figsize=(20, 10))
    hist = grab_samples_df[column].where(
        grab_samples_df[column].apply(lambda x: isinstance(x, (int, float)))
    )
    count, bins, patches = plt.hist(
        hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
    )
    plt.title(
        column
        + " - Count: "
        + str(hist.count())
        + " / "
        + str(grab_samples_df.shape[0])
    )
    plt.ylabel("Frequency")

    # Set x-ticks to bin edges and x-tick labels to intervals
    plt.xticks(
        bins[:-1],
        [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)],
        rotation="vertical",
        fontsize=8,
    )

    # Add count for every bar
    for p in patches:
        plt.annotate(
            str(int(p.get_height())), (p.get_x() * 1.005, p.get_height() * 1.02)
        )

    directory = os.path.join(dir_temporary_results_path, "histograms_filtered")
    if not os.path.exists(directory):
        os.makedirs(directory)

    plt.savefig(
        os.path.join(
            directory,
            sanitize_filename(column) + ".png",
        ),
        dpi=300,
    )

In [None]:
%%script false --no-raise-error

for punto in grab_samples_df["Codice punto di prelievo"].unique():
    for col in columns:
        grab_samples_df[
            grab_samples_df["Codice punto di prelievo"] == punto
        ].plot(
            x="Data di prelievo",
            y=col,
            legend=False,
            title=f"{punto} - {col}",
            fontsize=8,
            figsize=(40, 10),
        )
        directory = f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Grab samples data plots/{punto}"
        if not os.path.exists(directory):
            os.makedirs(directory)
        filename = sanitize_filename(f"{col}.png")
        plt.savefig(os.path.join(directory, filename), dpi=300)

## Check Distributions Divergence to Exploit More Data

In [None]:
# Initialize a dictionary to store the KL divergence for each feature
kl_divergences = {}
js_divergences = {}
tv_distances = {}
w_distances = {}

# For each feature in the DataFrame
for feature in columns:
    # # Compute the probability distribution of the feature in each DataFrame
    # pdist_raw = np.histogram(raw_grab_samples_df[feature].dropna(), bins=100, density=True)[0]
    # pdist_grab = np.histogram(grab_samples_df[feature].dropna(), bins=100, density=True)[0]

    # # Add a small constant to avoid division by zero
    # pdist_raw = pdist_raw + np.finfo(np.float64).eps
    # pdist_grab = pdist_grab + np.finfo(np.float64).eps

    if (
        raw_grab_samples_df[feature].dropna().empty
        or grab_samples_df[feature].dropna().empty
        or len(grab_samples_df[feature].dropna().unique()) == 1
    ):
        continue

    kde_raw = gaussian_kde(raw_grab_samples_df[feature].dropna())
    kde_grab = gaussian_kde(grab_samples_df[feature].dropna())

    # Evaluate the KDEs on a range of values
    x = np.linspace(
        min(raw_grab_samples_df[feature].min(), grab_samples_df[feature].min()),
        max(raw_grab_samples_df[feature].max(), grab_samples_df[feature].max()),
        100,
    )
    pdist_raw = kde_raw(x)
    pdist_grab = kde_grab(x)

    # Compute the KL divergence and store it in the dictionary
    kl_divergences[feature] = stats.entropy(pdist_raw, pdist_grab)
    js_divergences[feature] = jensenshannon(pdist_raw, pdist_grab)
    tv_distances[feature] = np.sum(np.abs(pdist_raw - pdist_grab)) / 2
    w_distances[feature] = wasserstein_distance(pdist_raw, pdist_grab)

# Convert the dictionary to a DataFrame
kl_divergences_df = pd.DataFrame.from_dict(
    kl_divergences, orient="index", columns=["KL Divergence"]
)
js_divergences_df = pd.DataFrame.from_dict(
    js_divergences, orient="index", columns=["JS Divergence"]
)
tv_distances_df = pd.DataFrame.from_dict(
    tv_distances, orient="index", columns=["TV Distance"]
)
w_distances_df = pd.DataFrame.from_dict(
    w_distances, orient="index", columns=["Wasserstein Distance"]
)

In [None]:
kl_divergences_df

In [None]:
js_divergences_df

In [None]:
tv_distances_df

In [None]:
w_distances_df

In [None]:
# Create a new figure
plt.figure(figsize=(15, 10))

# Create a subplot for each metric
plt.subplot(2, 2, 1)
kl_divergences_df["KL Divergence"].plot(kind="bar")
plt.title("KL Divergence")

plt.subplot(2, 2, 2)
js_divergences_df["JS Divergence"].plot(kind="bar")
plt.title("Jensen-Shannon Divergence")

plt.subplot(2, 2, 3)
tv_distances_df["TV Distance"].plot(kind="bar")
plt.title("Total Variation Distance")

plt.subplot(2, 2, 4)
w_distances_df.plot(kind="bar")
plt.title("Wasserstein Distance")

# Adjust the layout and show the figure
plt.tight_layout()
plt.show()

In [275]:
kl_divergences = {}
js_divergences = {}
tv_distances = {}
w_distances = {}


codes = grab_samples_df["Codice punto di prelievo"].unique()

for code in codes:
    kl_divergences[code] = {}
    js_divergences[code] = {}
    tv_distances[code] = {}
    w_distances[code] = {}

    for feature in columns:
        # # Compute the probability distribution of the feature in each DataFrame
        # pdist_raw = np.histogram(raw_grab_samples_df[feature].dropna(), bins=100, density=True)[0]
        # pdist_grab = np.histogram(grab_samples_df[feature].dropna(), bins=100, density=True)[0]

        # # Add a small constant to avoid division by zero
        # pdist_raw = pdist_raw + np.finfo(np.float64).eps
        # pdist_grab = pdist_grab + np.finfo(np.float64).eps

        if (
            raw_grab_samples_df[feature].dropna().empty
            or grab_samples_df[
                grab_samples_df["Codice punto di prelievo"] == code
            ][feature]
            .dropna()
            .empty
            or len(
                grab_samples_df[
                    grab_samples_df["Codice punto di prelievo"] == code
                ][feature]
                .dropna()
                .unique()
            )
            == 1
        ):
            continue

        kde_raw = gaussian_kde(raw_grab_samples_df[feature].dropna())
        kde_grab = gaussian_kde(
            grab_samples_df[
                grab_samples_df["Codice punto di prelievo"] == code
            ][feature].dropna()
        )

        # Evaluate the KDEs on a range of values
        x = np.linspace(
            min(
                raw_grab_samples_df[feature].min(),
                grab_samples_df[
                    grab_samples_df["Codice punto di prelievo"] == code
                ][feature].min(),
            ),
            max(
                raw_grab_samples_df[feature].max(),
                grab_samples_df[
                    grab_samples_df["Codice punto di prelievo"] == code
                ][feature].max(),
            ),
            100,
        )
        pdist_raw = kde_raw(x)
        pdist_grab = kde_grab(x)

        # Compute the KL divergence and store it in the dictionary
        kl_divergences[code][feature] = stats.entropy(pdist_raw, pdist_grab)
        js_divergences[code][feature] = jensenshannon(pdist_raw, pdist_grab)
        tv_distances[code][feature] = np.sum(np.abs(pdist_raw - pdist_grab)) / 2
        w_distances[code][feature] = wasserstein_distance(pdist_raw, pdist_grab)

# Convert the dictionary to a DataFrame
kl_divergences_df = pd.DataFrame.from_dict(kl_divergences, orient="index")
js_divergences_df = pd.DataFrame.from_dict(js_divergences, orient="index")
tv_distances_df = pd.DataFrame.from_dict(tv_distances, orient="index")
w_distances_df = pd.DataFrame.from_dict(w_distances, orient="index")

In [276]:
kl_divergences_df

Unnamed: 0,Concentr. ioni idrogeno al prelievo (unità pH),Durezza totale (°F),Temperatura - °C,TOC - carbonio organico totale (mg/L di C),Carica batterica a 22°C (UFC/mL),Pseudomonas aeruginosa (UFC / 250mL),Pseudomonas aeruginosa (UFC/100 mL)
HOUSE_GASPARRI,19.304316,1.371838,0.122664,0.988372,inf,inf,
HOUSE_CIVITAVECCHIA,11.558795,8.011925,1.875037,0.63476,inf,,
HOUSE_BANDENERE,9.409422,5.219588,0.317875,0.311692,inf,,
HOUSE_S_RITA,11.275469,0.164981,0.103248,2.169435,inf,,
HOUSE_GRAMSCI,3.254346,2.112229,0.02865,inf,inf,,
HOUSE_BERNA,7.941463,0.504855,1.225931,4.423484,inf,,
HOUSE_CERMENATE,13.590131,1.925862,0.15454,1.345834,inf,,
HOUSE_MONTEVIDEO,5.965417,2.553024,0.064977,6.504665,inf,,
HOUSE_CHIOSTERGI,3.59757,3.831937,inf,0.342611,inf,,3.355111
HOUSE_MARAZZA,8.267292,3.269559,0.080332,1.46121,inf,,


In [277]:
js_divergences_df

Unnamed: 0,Concentr. ioni idrogeno al prelievo (unità pH),Durezza totale (°F),Temperatura - °C,TOC - carbonio organico totale (mg/L di C),Carica batterica a 22°C (UFC/mL),Pseudomonas aeruginosa (UFC / 250mL),Pseudomonas aeruginosa (UFC/100 mL)
HOUSE_GASPARRI,0.366656,0.300229,0.17492,0.472452,0.271922,0.407327,
HOUSE_CIVITAVECCHIA,0.315784,0.592862,0.333232,0.404329,0.271922,,
HOUSE_BANDENERE,0.296001,0.442111,0.184956,0.269417,0.271922,,
HOUSE_S_RITA,0.338229,0.127426,0.163128,0.234242,0.271922,,
HOUSE_GRAMSCI,0.276161,0.362914,0.08696,0.466616,0.271922,,
HOUSE_BERNA,0.291873,0.23416,0.271044,0.314775,0.271922,,
HOUSE_CERMENATE,0.32904,0.337629,0.204454,0.203617,0.271922,,
HOUSE_MONTEVIDEO,0.300635,0.374863,0.11512,0.370716,0.271922,,
HOUSE_CHIOSTERGI,0.294018,0.425307,0.642866,0.300132,0.271922,,0.773489
HOUSE_MARAZZA,0.354974,0.395454,0.14249,0.467747,0.271922,,


In [278]:
tv_distances_df

Unnamed: 0,Concentr. ioni idrogeno al prelievo (unità pH),Durezza totale (°F),Temperatura - °C,TOC - carbonio organico totale (mg/L di C),Carica batterica a 22°C (UFC/mL),Pseudomonas aeruginosa (UFC / 250mL),Pseudomonas aeruginosa (UFC/100 mL)
HOUSE_GASPARRI,8.6452,0.763633,0.754976,119.693431,0.026393,0.18905,
HOUSE_CIVITAVECCHIA,6.745692,1.670448,1.16178,85.824818,0.02123,,
HOUSE_BANDENERE,6.101555,1.277523,0.452906,65.038417,0.218876,,
HOUSE_S_RITA,8.40595,0.303589,0.693903,37.082324,0.054965,,
HOUSE_GRAMSCI,4.468054,0.96011,0.331139,101.16017,0.006176,,
HOUSE_BERNA,5.671988,0.615964,0.873447,60.546719,0.038949,,
HOUSE_CERMENATE,7.51003,0.87871,0.804541,31.338219,0.006226,,
HOUSE_MONTEVIDEO,5.675995,1.015276,0.426479,74.791102,0.003361,,
HOUSE_CHIOSTERGI,5.391929,1.215327,2.671775,67.813301,0.061011,,0.07595
HOUSE_MARAZZA,8.477826,1.10046,0.612969,112.917145,0.001825,,


In [300]:
w_distances_df.to_excel(
    "/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/wasserstein_distances.xlsx"
)

In [287]:
time_range = {}


codes = grab_samples_df["Codice punto di prelievo"].unique()

for code in codes:
    time_range[code] = {}

    for feature in columns:
        # # Compute the probability distribution of the feature in each DataFrame
        # pdist_raw = np.histogram(raw_grab_samples_df[feature].dropna(), bins=100, density=True)[0]
        # pdist_grab = np.histogram(grab_samples_df[feature].dropna(), bins=100, density=True)[0]

        # # Add a small constant to avoid division by zero
        # pdist_raw = pdist_raw + np.finfo(np.float64).eps
        # pdist_grab = pdist_grab + np.finfo(np.float64).eps

        if (
            raw_grab_samples_df[feature].dropna().empty
            or grab_samples_df[
                grab_samples_df["Codice punto di prelievo"] == code
            ][feature]
            .dropna()
            .empty
            or len(
                grab_samples_df[
                    grab_samples_df["Codice punto di prelievo"] == code
                ][feature]
                .dropna()
                .unique()
            )
            == 1
        ):
            continue

        temp_df = grab_samples_df[
            grab_samples_df["Codice punto di prelievo"] == code
        ][["Data di prelievo", feature]]
        temp_df.dropna(inplace=True)

        min_time = temp_df["Data di prelievo"].min()
        max_time = temp_df["Data di prelievo"].max()
        length = temp_df.shape[0]

        time_range[code][feature] = {
            "start_time": min_time,
            "end_time": max_time,
            "n_samples": length,
        }

# Convert the dictionary to a DataFrame
time_range_df = pd.DataFrame.from_dict(time_range, orient="index")

In [288]:
time_range_df

Unnamed: 0,Concentr. ioni idrogeno al prelievo (unità pH),Durezza totale (°F),Temperatura - °C,TOC - carbonio organico totale (mg/L di C),Carica batterica a 22°C (UFC/mL),Pseudomonas aeruginosa (UFC / 250mL),Pseudomonas aeruginosa (UFC/100 mL)
HOUSE_GASPARRI,"{'start_time': 2021-04-12 00:00:00, 'end_time'...","{'start_time': 2021-09-16 00:00:00, 'end_time'...","{'start_time': 2021-04-12 00:00:00, 'end_time'...","{'start_time': 2022-01-25 00:00:00, 'end_time'...","{'start_time': 2021-04-12 00:00:00, 'end_time'...","{'start_time': 2022-01-25 00:00:00, 'end_time'...",
HOUSE_CIVITAVECCHIA,"{'start_time': 2021-05-24 00:00:00, 'end_time'...","{'start_time': 2021-08-25 00:00:00, 'end_time'...","{'start_time': 2021-05-24 00:00:00, 'end_time'...","{'start_time': 2022-02-14 00:00:00, 'end_time'...","{'start_time': 2021-07-01 00:00:00, 'end_time'...",,
HOUSE_BANDENERE,"{'start_time': 2021-06-14 00:00:00, 'end_time'...","{'start_time': 2021-08-26 00:00:00, 'end_time'...","{'start_time': 2021-06-14 00:00:00, 'end_time'...","{'start_time': 2022-04-19 00:00:00, 'end_time'...","{'start_time': 2021-06-14 00:00:00, 'end_time'...",,
HOUSE_S_RITA,"{'start_time': 2021-07-27 00:00:00, 'end_time'...","{'start_time': 2021-07-27 00:00:00, 'end_time'...","{'start_time': 2021-07-27 00:00:00, 'end_time'...","{'start_time': 2022-02-07 00:00:00, 'end_time'...","{'start_time': 2021-07-27 00:00:00, 'end_time'...",,
HOUSE_GRAMSCI,"{'start_time': 2021-07-29 00:00:00, 'end_time'...","{'start_time': 2021-07-29 00:00:00, 'end_time'...","{'start_time': 2021-07-29 00:00:00, 'end_time'...","{'start_time': 2022-02-22 00:00:00, 'end_time'...","{'start_time': 2021-07-29 00:00:00, 'end_time'...",,
HOUSE_BERNA,"{'start_time': 2021-08-26 00:00:00, 'end_time'...","{'start_time': 2021-08-26 00:00:00, 'end_time'...","{'start_time': 2021-08-26 00:00:00, 'end_time'...","{'start_time': 2022-04-19 00:00:00, 'end_time'...","{'start_time': 2021-08-26 00:00:00, 'end_time'...",,
HOUSE_CERMENATE,"{'start_time': 2021-09-30 00:00:00, 'end_time'...","{'start_time': 2021-09-30 00:00:00, 'end_time'...","{'start_time': 2021-09-30 00:00:00, 'end_time'...","{'start_time': 2022-01-18 00:00:00, 'end_time'...","{'start_time': 2021-09-30 00:00:00, 'end_time'...",,
HOUSE_MONTEVIDEO,"{'start_time': 2021-10-06 00:00:00, 'end_time'...","{'start_time': 2021-10-06 00:00:00, 'end_time'...","{'start_time': 2021-10-06 00:00:00, 'end_time'...","{'start_time': 2022-04-19 00:00:00, 'end_time'...","{'start_time': 2021-10-06 00:00:00, 'end_time'...",,
HOUSE_CHIOSTERGI,"{'start_time': 2021-10-12 00:00:00, 'end_time'...","{'start_time': 2021-10-12 00:00:00, 'end_time'...","{'start_time': 2021-10-12 00:00:00, 'end_time'...","{'start_time': 2022-02-15 00:00:00, 'end_time'...","{'start_time': 2021-10-12 00:00:00, 'end_time'...",,"{'start_time': 2021-10-12 00:00:00, 'end_time'..."
HOUSE_MARAZZA,"{'start_time': 2021-10-12 00:00:00, 'end_time'...","{'start_time': 2021-10-12 00:00:00, 'end_time'...","{'start_time': 2021-10-12 00:00:00, 'end_time'...","{'start_time': 2022-05-19 00:00:00, 'end_time'...","{'start_time': 2021-10-12 00:00:00, 'end_time'...",,


### Sensors Data

In [None]:
df_list = []

for file in os.listdir(sensor_data_folder_path):
    if file.endswith(".csv"):
        temp_df = pd.read_csv(
            os.path.join(sensor_data_folder_path, file), header=1, sep=";"
        )
        location_name = file.split("_")[0]
        temp_df.insert(0, "Location", location_name)
        code = houses_code_df[
            houses_code_df["Casa dell'acqua"] == location_name
        ]["Codice Punto di Prelievo"].values[0]
        temp_df.insert(1, "Codice Punto di Prelievo", code)
        df_list.append(temp_df)

raw_sensor_data_df = pd.concat(df_list, ignore_index=True)

In [None]:
raw_sensor_data_df

In [None]:
%%script false --no-raise-error

# group by codice punto di prelievo and plot every column
for punto in raw_sensor_data_df["Codice Punto di Prelievo"].unique():
    for col in raw_sensor_data_df.columns:
        # check if column belongs to float type
        if raw_sensor_data_df[col].dtype == float:
            sanitized_col = col.split("-")[0].rstrip()
            # Extract unit of measure from column name
            unit_of_measure = (
                col.split("[")[1].split("]")[0] if "[" in col else ""
            )
            raw_sensor_data_df[
                raw_sensor_data_df["Codice Punto di Prelievo"] == punto
            ].plot(
                x="Measurement interval=900[sec] (Export-Aggregation disabled)",
                y=col,
                legend=False,
                title=f"{punto} - {sanitized_col} [{unit_of_measure}]",
                fontsize=8,
                figsize=(40, 10),
            )
            directory = f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Sensor data plots/{punto}"
            if not os.path.exists(directory):
                os.makedirs(directory)
            plt.savefig(f"{directory}/{sanitized_col}.png", dpi=300)

## Final Plots

In [None]:
sensor_df = raw_sensor_data_df.copy()
grab_df = raw_grab_samples_df.copy()

In [316]:
grab_df.columns.to_list()

['Data di prelievo',
 'Rapporto di prova',
 'Punto di prelievo',
 'Codice punto di prelievo',
 'Campagna',
 'Analisi programmate',
 'ZONA',
 'Alcalinità (mg/L)',
 'Cloro residuo libero (mg/L di Cl2)',
 'Colore (CU)',
 'Conduttività a 20°C (µS/cm)',
 'Concentrazione ioni idrogeno (unità pH)',
 'Concentr. ioni idrogeno al prelievo (unità pH)',
 'Durezza totale (°F)',
 'Indice di aggressività ottenuo per calcolo (no unità)',
 'Residuo secco a 180°C (mg/L)',
 'Solidi sospesi totali (mg/L)',
 'Temperatura - °C',
 'Torbidità (NTU)',
 'Bicarbonati - mg/L',
 'TOC - carbonio organico totale (mg/L di C)',
 'Carica batterica a 22°C (UFC/mL)',
 'Conteggio colonie a 30°C (UFC/mL)',
 'Carica batterica a 37°C (UFC/mL)',
 'Batteri coliformi a 37°C (MPN / 100 mL)',
 'Enterococchi (MPN / 100mL)',
 'Escherichia Coli (MPN / 100mL)',
 'Pseudomonas aeruginosa (UFC / 250mL)',
 'Pseudomonas aeruginosa (UFC/100 mL)',
 'source']

In [315]:
sensor_df.columns.to_list()

['Location',
 'Codice Punto di Prelievo',
 'Measurement interval=900[sec] (Export-Aggregation disabled)',
 'Status',
 'Tag',
 'COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)',
 'Status [COLORtrue - Measured value]',
 'TOCeq - Measured value [mg/l] (Limit:0.00-22.00)',
 'Status [TOCeq - Measured value]',
 'NO3eq - Measured value [mg/l] (Limit:0.00-88.00)',
 'Status [NO3eq - Measured value]',
 'UV254t - Measured value [Abs/m] (Limit:0.00-71.00)',
 'Status [UV254t - Measured value]',
 'Turbidity - Measured value [FTUeq] (Limit:0.00-170.00)',
 'Status [Turbidity - Measured value]',
 'DOCeq - Measured value [mg/l] (Limit:0.00-17.00)',
 'Status [DOCeq - Measured value]',
 'pH - Measured value (Limit:0.00-14.00)',
 'Status [pH - Measured value]',
 'Temperature - Measured value [C] (Limit:-5.00-100.00)',
 'Status [Temperature - Measured value]',
 'ORP - Measured value [mV] (Limit:-2000.00-2000.00)',
 'Status [ORP - Measured value]',
 'Conductivity - Measured value [uS/cm] (Limit:0.

In [None]:
sensor_df[
    "Measurement interval=900[sec] (Export-Aggregation disabled)"
] = pd.to_datetime(
    sensor_df["Measurement interval=900[sec] (Export-Aggregation disabled)"],
    format="mixed",
)
grab_df["Data di prelievo"] = pd.to_datetime(grab_df["Data di prelievo"])

In [None]:
column_mapping = {
    "Colore (CU)": "COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)",
    "TOC - carbonio organico totale (mg/L di C)": "TOCeq - Measured value [mg/l] (Limit:0.00-22.00)",
    "Conduttività a 20°C (µS/cm)": "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)",
    "Cloro residuo libero (mg/L di Cl2)": "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00)",
    "Concentrazione ioni idrogeno (unità pH)": "pH - Measured value (Limit:0.00-14.00)",
    "Temperatura - °C": "Temperature - Measured value [C] (Limit:-5.00-100.00)",
    "Codice punto di prelievo": "Codice Punto di Prelievo",
}

# Add a 'source' column to each dataframe
sensor_df["source"] = "sensor"
grab_df["source"] = "grab"

# Initialize an empty DataFrame for the result
result_df = pd.DataFrame()

# For each pair of columns in the mapping
for grab_col, sensor_col in column_mapping.items():
    # Merge the two dataframes on the date columns
    merged_df = pd.merge(
        sensor_df[
            [
                "Measurement interval=900[sec] (Export-Aggregation disabled)",
                sensor_col,
                "source",
            ]
        ],
        grab_df[["Data di prelievo", grab_col, "source"]],
        left_on="Measurement interval=900[sec] (Export-Aggregation disabled)",
        right_on="Data di prelievo",
        how="outer",
    )

    # Rename the columns
    merged_df.columns = [
        "Date",
        sensor_col,
        "Source_sensor",
        "Date_grab",
        grab_col,
        "Source_grab",
    ]

    # Append the merged dataframe to the result dataframe
    result_df = pd.concat([result_df, merged_df], axis=1)

# Drop the extra date column
result_df = result_df.loc[:, ~result_df.columns.duplicated()]

In [None]:
plot_data_df = pd.DataFrame()

column_mapping["Date_grab"] = "Date"
column_mapping["Source_grab"] = "Source_sensor"

# For each pair of columns in the mapping
for grab_col, sensor_col in column_mapping.items():
    # Combine the pair of columns into a single column
    plot_data_df[grab_col] = result_df[grab_col].combine_first(
        result_df[sensor_col]
    )

plot_data_df.rename(
    columns={"Source_grab": "Source", "Date_grab": "Date"}, inplace=True
)

In [None]:
plot_data_df

In [None]:
print(
    f"{plot_data_df[plot_data_df['Source'] == 'sensor']['Date'].min()} + ' - ' + {plot_data_df[plot_data_df['Source'] == 'sensor']['Date'].max()}"
)

In [None]:
print(
    f"{plot_data_df[plot_data_df['Source'] == 'grab']['Date'].min()} + ' - ' + {plot_data_df[plot_data_df['Source'] == 'grab']['Date'].max()}"
)

In [None]:
from matplotlib.lines import Line2D


codes = plot_data_df[plot_data_df["Source"] == "sensor"][
    "Codice punto di prelievo"
].unique()

for code in codes:
    for column in plot_data_df.columns[:5]:
        plt.figure(figsize=(40, 10))

        # Filter the dataframe for rows where Source is 'sensor'

        # Melt the dataframe to have 'Date', 'Codice punto di prelievo', 'Source' and 'value' columns
        sensor_data_df = plot_data_df[
            (plot_data_df["Codice punto di prelievo"] == code)
            & (plot_data_df["Source"] == "sensor")
        ]
        grab_sample_df = plot_data_df[plot_data_df["Source"] == "grab"]
        # grab_sample_df = plot_data_df[(plot_data_df['Codice punto di prelievo'] == code) & (plot_data_df['Source'] == 'grab')]

        sns.lineplot(data=sensor_data_df, x="Date", y=column, color="red")
        sns.lineplot(data=grab_sample_df, x="Date", y=column, color="blue")

        plt.title(code, fontsize=20)
        plt.xlabel("Time", fontsize=20)
        plt.ylabel(column, fontsize=20)

        custom_lines = [
            Line2D([0], [0], color="red", lw=4),
            Line2D([0], [0], color="blue", lw=4),
        ]
        plt.legend(custom_lines, ["Sensor", "Grab"])

        directory = f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Comparison Plots/{code}"
        if not os.path.exists(directory):
            os.makedirs(directory)
        filename = sanitize_filename(f"{column}.png")
        plt.savefig(os.path.join(directory, filename), dpi=300)

# Store Metadata

In [314]:
# show a table for each index summarizing the time_range_df by showing the number of samples, the start time and the end time for each feature in each house
from IPython.display import display, Markdown

for code in raw_sensor_data_df["Codice Punto di Prelievo"].unique():
    with pd.ExcelWriter(
        f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/{code}.xlsx"
    ) as writer:
        for column in time_range_df.columns:
            row = time_range_df.loc[code, column]
            if isinstance(row, dict):
                df = pd.DataFrame(list(row.items()), columns=["Key", "Value"])
                df.to_excel(writer, sheet_name=sanitize_filename(column))
            else:
                continue
        min_time = pd.to_datetime(
            raw_sensor_data_df[
                raw_sensor_data_df["Codice Punto di Prelievo"] == code
            ]["Measurement interval=900[sec] (Export-Aggregation disabled)"],
            format="mixed",
            dayfirst=True,
        ).min()
        max_time = pd.to_datetime(
            raw_sensor_data_df[
                raw_sensor_data_df["Codice Punto di Prelievo"] == code
            ]["Measurement interval=900[sec] (Export-Aggregation disabled)"],
            format="mixed",
            dayfirst=True,
        ).max()
        df = pd.DataFrame({"start_time": [min_time], "end_time": [max_time]})
        df.to_excel(writer, sheet_name="sensor")

