# Supply Points Data Analysis

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance
from scipy.stats import gaussian_kde


from pathvalidate import sanitize_filename

### Paths

In [4]:
# Machine-specific base folders: every other path in the notebook hangs off
# one of these two.
root_folder_path = "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/SafeCREW/soft_sensors/Soft Sensor CS2Milan"
data_path = "/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano"

# Inputs and intermediate results stored under the data folder.
house_codes_path = os.path.join(data_path, "Case-Codici.xlsx")
dir_temporary_results_path = os.path.join(data_path, "temporary results")
raw_grab_samples_path = os.path.join(
    dir_temporary_results_path,
    "raw_grab_samples_supply_points.xlsx",
)

# Folder holding the per-house sensor exports.
sensor_data_folder_path = os.path.join(
    root_folder_path,
    "Case dell'acqua - Sensori",
)

### Utils

In [None]:
def count_values(series):
    """Count missing, textual and plain numeric entries of a column.

    Args:
        series: pandas Series of raw cell values (numbers, strings, NaN).

    Returns:
        pandas Series indexed by ``"NaN"``, ``"Strings"`` and ``"numbers"``:

        * ``"NaN"``: entries for which ``isna()`` is true;
        * ``"Strings"``: non-missing entries whose text form contains
          ``<``, ``>``, ``*`` or an ASCII letter;
        * ``"numbers"``: non-missing entries that are ``int``/``float``
          instances.
    """
    # Raw string on purpose: "\*" inside a normal literal is an invalid
    # escape sequence. The character class matches the same inputs as the
    # alternation "<|\*|>|[a-zA-Z]".
    flagged_pattern = r"[<>*a-zA-Z]"

    num_nans = series.isna().sum()

    # na=False keeps the mask strictly boolean even if the string
    # conversion preserves missing values.
    is_flagged = series.astype(str).str.contains(flagged_pattern, na=False)
    is_number = series.apply(lambda x: isinstance(x, (int, float)))

    # count() skips missing entries, so NaN/None selected by either mask
    # never inflate the tallies.
    strings = series[is_flagged].count()
    num_numbers = series[is_number].count()

    return pd.Series(
        [num_nans, strings, num_numbers], index=["NaN", "Strings", "numbers"]
    )

In [None]:
import re


def convert_string_values(s):
    """Convert one raw cell value to a number.

    Rules, in order:

    * ``int``/``float`` values are returned unchanged;
    * missing values (``pd.isna``) become ``None``;
    * in strings, a decimal comma is read as a decimal point;
    * a string containing ``<`` yields half of the first number found
      (e.g. ``"<0,5"`` -> ``0.25``);
    * a string containing ``>``, ``*`` or a letter yields the first number
      found;
    * any other string is parsed as a plain number (e.g. ``"7,5"`` ->
      ``7.5``); previously such values were silently dropped.

    Args:
        s: raw cell value (number, string or missing).

    Returns:
        The numeric value, or ``None`` when no number can be extracted.
    """
    if isinstance(s, (int, float)):
        return s
    if pd.isna(s):
        return None

    s = s.replace(",", ".")

    has_marker = "<" in s or ">" in s or "*" in s or re.search("[a-zA-Z]", s)
    if not has_marker:
        # Plain numeric text such as "7.5"; anything else is unusable.
        try:
            return float(s)
        except ValueError:
            return None

    numbers = re.findall(r"\d+\.?\d*", s)
    if not numbers:
        return None

    value = float(numbers[0])
    # "<x" is replaced by x / 2; every other marker keeps the number as is.
    return value / 2 if "<" in s else value

# Grab Samples

### Import Dataset

In [None]:
# Load every cell as a generic Python object (dtype=object) so that no
# column is coerced to a numeric dtype at read time.
raw_grab_samples_df = pd.read_excel(raw_grab_samples_path, dtype=object)

In [None]:
# List all column names of the raw grab-sample table.
raw_grab_samples_df.columns.to_list()

In [None]:
# Columns from the 8th onward are the ones analysed below.
# NOTE(review): presumably the first seven columns are sample metadata;
# verify against the column listing.
columns = raw_grab_samples_df.columns[7:]

## All Case dell'Acqua - Overall Analysis

### NaN vs Strings vs Numbers

In [None]:
# Per analysed column: number of NaN, textual and numeric entries
# (rows "NaN", "Strings", "numbers").
histogram = raw_grab_samples_df[columns].apply(count_values)

In [None]:
# Append a "Total" row holding the column-wise sum of the three counts.
histogram.loc["Total"] = histogram.sum()

In [None]:
# Display the count table.
histogram

In [None]:
# To see Strings add 'Strings' to the list below
shown_categories = ["NaN", "numbers"]
ax = histogram.T[shown_categories].plot.bar(figsize=(30, 10))

# Label every bar with its height, placed slightly above the bar's left edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() * 1.005, bar_height * 1.02)
    ax.annotate(str(bar_height), label_position)

#### Convert String Values

In [None]:
# Convert every analysed cell to a number (or None) element by element.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated in recent pandas releases.
raw_grab_samples_df[columns] = raw_grab_samples_df[columns].apply(
    lambda column_values: column_values.map(convert_string_values)
)

### Histogram Distributions

In [None]:
# %%script false --no-raise-error

plt.style.use("ggplot")

total_rows = raw_grab_samples_df.shape[0]

# One 30-bin histogram per analysed column, built from the int/float entries.
for column in columns:
    plt.figure(figsize=(20, 10))

    column_values = raw_grab_samples_df[column]
    is_numeric = column_values.apply(lambda x: isinstance(x, (int, float)))
    hist = column_values.where(is_numeric)

    count, bins, patches = plt.hist(
        hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
    )
    plt.title(f"{column} - Count: {hist.count()} / {total_rows}")
    plt.ylabel("Frequency")

    # Put a tick on every left bin edge, labelled with the bin interval.
    interval_labels = [
        f"{low:.2f}-{high:.2f}" for low, high in zip(bins[:-1], bins[1:])
    ]
    plt.xticks(bins[:-1], interval_labels, rotation="vertical", fontsize=8)

    # Write the number of samples above every bar.
    for bar in patches:
        bar_height = bar.get_height()
        plt.annotate(
            str(int(bar_height)), (bar.get_x() * 1.005, bar_height * 1.02)
        )

    # Saving is disabled; to re-enable:
    # directory = os.path.join(dir_temporary_results_path, "histograms_all")
    # if not os.path.exists(directory):
    #     os.makedirs(directory)
    # plt.savefig(
    #     os.path.join(directory, sanitize_filename(column) + ".png"),
    #     dpi=300,
    # )

    plt.show()

### Timeseries Plots

In [None]:
# %%script false --no-raise-error

# One wide line plot per analysed column against the sampling date; the
# title is the column name made filename-safe.
for col in columns:
    sanitized_col = sanitize_filename(col)
    plot_options = dict(
        x="Data di prelievo",
        y=col,
        legend=False,
        title=f"{sanitized_col}",
        fontsize=8,
        figsize=(40, 10),
    )
    raw_grab_samples_df.plot(**plot_options)

    # Saving is disabled; to re-enable:
    # directory = f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Grab samples data plots/{sanitized_col}"
    # if not os.path.exists(directory):
    #     os.makedirs(directory)
    # plt.savefig(f"{directory}/{sanitized_col}.png", dpi=300)
    plt.show()

## Selected Case dell'Acqua - Overall Analysis

### Import Codes Mapping

In [6]:
# Table mapping each house name ("Casa dell'acqua") to its sampling-point
# code ("Codice Punto di Prelievo").
houses_code_df = pd.read_excel(house_codes_path)

In [7]:
# Remove trailing whitespace from the house name stored at index label 7.
# A single .loc call is used because chained indexing (df[col][7] = ...)
# may assign to a temporary copy and leave the frame unchanged.
houses_code_df.loc[7, "Casa dell'acqua"] = houses_code_df.loc[
    7, "Casa dell'acqua"
].rstrip()

# Replace the row at index label 4 with the Chiostergi house and its code.
houses_code_df.loc[4] = ["Chiostergi", "HOUSE_CHIOSTERGI"]

In [None]:
# Display the corrected house/code mapping.
houses_code_df

### Filter Overall Dataset

In [None]:
# Keep only the grab samples whose "Codice punto di prelievo" appears in the
# house mapping: an inner merge on the code does the filtering, after which
# the two columns brought in by the mapping are removed again.
mapping_only_columns = ["Casa dell'acqua", "Codice Punto di Prelievo"]

grab_samples_df = pd.merge(
    raw_grab_samples_df,
    houses_code_df,
    how="inner",
    left_on="Codice punto di prelievo",
    right_on="Codice Punto di Prelievo",
).drop(columns=mapping_only_columns)

In [None]:
# Display the grab samples restricted to the selected houses.
grab_samples_df

### NaN vs Strings vs Numbers 

In [None]:
# Same NaN / Strings / numbers counts as before, now on the selected houses.
histogram = grab_samples_df[columns].apply(count_values)

In [None]:
# Append a "Total" row holding the column-wise sum of the three counts.
histogram.loc["Total"] = histogram.sum()

In [None]:
# Display the count table for the selected houses.
histogram

In [None]:
# Bar chart of the NaN and numeric counts per column (selected houses).
shown_categories = ["NaN", "numbers"]
ax = histogram.T[shown_categories].plot.bar(figsize=(30, 10))

# Label every bar with its height, placed slightly above the bar's left edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() * 1.005, bar_height * 1.02)
    ax.annotate(str(bar_height), label_position)

### Check Distributions Divergence to Exploit More Data (All vs Selected Overall)

In [1]:
# Distance metrics between the distribution of each feature in the full
# grab-sample dataset and in the selected houses, keyed by feature name.
kl_divergences = {}
js_divergences = {}
tv_distances = {}
w_distances = {}

for feature in columns:
    raw_values = raw_grab_samples_df[feature].dropna().astype(float)
    grab_values = grab_samples_df[feature].dropna().astype(float)

    # A Gaussian KDE cannot be fitted on an empty or constant sample.
    if raw_values.nunique() < 2 or grab_values.nunique() < 2:
        continue

    kde_raw = gaussian_kde(raw_values)
    kde_grab = gaussian_kde(grab_values)

    # Evaluate both KDEs on a shared grid spanning the two samples.
    x = np.linspace(
        min(raw_values.min(), grab_values.min()),
        max(raw_values.max(), grab_values.max()),
        100,
    )
    pdist_raw = kde_raw(x)
    pdist_grab = kde_grab(x)

    # Turn the densities into probability vectors on the grid so that the
    # total-variation distance lies in [0, 1] regardless of the feature's
    # scale (entropy and jensenshannon normalise their inputs anyway).
    pdist_raw = pdist_raw / pdist_raw.sum()
    pdist_grab = pdist_grab / pdist_grab.sum()

    kl_divergences[feature] = stats.entropy(pdist_raw, pdist_grab)
    # NOTE: scipy's jensenshannon returns the JS *distance*, i.e. the square
    # root of the JS divergence.
    js_divergences[feature] = jensenshannon(pdist_raw, pdist_grab)
    tv_distances[feature] = np.abs(pdist_raw - pdist_grab).sum() / 2
    # wasserstein_distance expects observed values, not density values, so
    # it is computed directly on the two samples.
    w_distances[feature] = wasserstein_distance(raw_values, grab_values)

# Convert the dictionaries to one-column DataFrames indexed by feature
kl_divergences_df = pd.DataFrame.from_dict(
    kl_divergences, orient="index", columns=["KL Divergence"]
)
js_divergences_df = pd.DataFrame.from_dict(
    js_divergences, orient="index", columns=["JS Divergence"]
)
tv_distances_df = pd.DataFrame.from_dict(
    tv_distances, orient="index", columns=["TV Distance"]
)
w_distances_df = pd.DataFrame.from_dict(
    w_distances, orient="index", columns=["Wasserstein Distance"]
)

NameError: name 'columns' is not defined

## Selected Case dell'Acqua - One by One Analysis

### NaN vs Strings vs Numbers

In [None]:
# The output folder does not depend on the house: create it once.
directory = os.path.join(dir_temporary_results_path, "histograms_by_house")
os.makedirs(directory, exist_ok=True)

# One bar chart per house with the NaN and numeric counts of every column,
# saved as <house code>.png.
for code in grab_samples_df["Codice punto di prelievo"].unique():
    house_df = grab_samples_df[
        grab_samples_df["Codice punto di prelievo"] == code
    ]
    histogram = house_df[columns].apply(count_values)
    histogram.loc["Total"] = histogram.sum()

    ax = histogram.T[["NaN", "numbers"]].plot.bar(figsize=(30, 10))
    ax.set_title(code)
    for p in ax.patches:
        ax.annotate(
            str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.02)
        )

    plt.savefig(
        os.path.join(directory, sanitize_filename(code) + ".png"),
        dpi=300,
        bbox_inches="tight",
    )

    # plt.show()

### Histogram Distributions

In [None]:
# %%script false --no-raise-error

plt.style.use("ggplot")

# For every selected house, one 30-bin histogram per analysed column built
# from the int/float entries of that house.
for code in grab_samples_df["Codice punto di prelievo"].unique():
    house_df = grab_samples_df[
        grab_samples_df["Codice punto di prelievo"] == code
    ]
    house_rows = house_df.shape[0]

    for column in columns:
        plt.figure(figsize=(20, 10))

        column_values = house_df[column]
        is_numeric = column_values.apply(
            lambda x: isinstance(x, (int, float))
        )
        hist = column_values.where(is_numeric)

        count, bins, patches = plt.hist(
            hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
        )
        plt.title(f"{code} - {column} - Count: {hist.count()} / {house_rows}")
        plt.ylabel("Frequency")

        # Put a tick on every left bin edge, labelled with the bin interval.
        interval_labels = [
            f"{low:.2f}-{high:.2f}" for low, high in zip(bins[:-1], bins[1:])
        ]
        plt.xticks(bins[:-1], interval_labels, rotation="vertical", fontsize=8)

        # Write the number of samples above every bar.
        for bar in patches:
            bar_height = bar.get_height()
            plt.annotate(
                str(int(bar_height)),
                (bar.get_x() * 1.005, bar_height * 1.02),
            )

        # Saving is disabled; to re-enable:
        # directory = os.path.join(dir_temporary_results_path, "histograms_filtered")
        # if not os.path.exists(directory):
        #     os.makedirs(directory)
        # plt.savefig(
        #     os.path.join(
        #         directory,
        #         sanitize_filename(code + ' - ' + column) + ".png",
        #     ),
        #     dpi=300,
        # )

        plt.show()

In [None]:
# Display the KL divergences currently held in kl_divergences_df.
kl_divergences_df

In [None]:
# Display the Jensen-Shannon values currently held in js_divergences_df.
js_divergences_df

In [None]:
# Display the total-variation distances currently held in tv_distances_df.
tv_distances_df

In [None]:
# Display the Wasserstein distances currently held in w_distances_df.
w_distances_df

### Timeseries Plots

In [None]:
# %%script false --no-raise-error

# For every selected house, plot each analysed column against the sampling
# date and save it as <data_path>/Grab samples data plots/<house>/<column>.png
for punto in grab_samples_df["Codice punto di prelievo"].unique():
    house_df = grab_samples_df[
        grab_samples_df["Codice punto di prelievo"] == punto
    ]

    # Built from data_path so the outputs follow the configured data folder;
    # the folder is per house, so it is created once per house.
    directory = os.path.join(
        data_path, "Grab samples data plots", f"{punto}"
    )
    os.makedirs(directory, exist_ok=True)

    for col in columns:
        house_df.plot(
            x="Data di prelievo",
            y=col,
            legend=False,
            title=f"{punto} - {col}",
            fontsize=8,
            figsize=(40, 10),
        )
        filename = sanitize_filename(f"{col}.png")
        plt.savefig(os.path.join(directory, filename), dpi=300)

        plt.show()

### Check Distributions Divergence to Exploit More Data (All vs Selected One by One)

In [None]:
# Distance metrics between the distribution of each feature in the full
# grab-sample dataset and in each selected house: {code: {feature: value}}.
kl_divergences = {}
js_divergences = {}
tv_distances = {}
w_distances = {}


codes = grab_samples_df["Codice punto di prelievo"].unique()

# The reference samples do not depend on the house: extract them once.
raw_values_by_feature = {
    feature: raw_grab_samples_df[feature].dropna().astype(float)
    for feature in columns
}

for code in codes:
    kl_divergences[code] = {}
    js_divergences[code] = {}
    tv_distances[code] = {}
    w_distances[code] = {}

    house_df = grab_samples_df[
        grab_samples_df["Codice punto di prelievo"] == code
    ]

    for feature in columns:
        raw_values = raw_values_by_feature[feature]
        grab_values = house_df[feature].dropna().astype(float)

        # A Gaussian KDE cannot be fitted on an empty or constant sample.
        if raw_values.nunique() < 2 or grab_values.nunique() < 2:
            continue

        kde_raw = gaussian_kde(raw_values)
        kde_grab = gaussian_kde(grab_values)

        # Evaluate both KDEs on a shared grid spanning the two samples.
        x = np.linspace(
            min(raw_values.min(), grab_values.min()),
            max(raw_values.max(), grab_values.max()),
            100,
        )
        pdist_raw = kde_raw(x)
        pdist_grab = kde_grab(x)

        # Turn the densities into probability vectors on the grid so that
        # the total-variation distance lies in [0, 1] regardless of the
        # feature's scale (entropy and jensenshannon normalise anyway).
        pdist_raw = pdist_raw / pdist_raw.sum()
        pdist_grab = pdist_grab / pdist_grab.sum()

        kl_divergences[code][feature] = stats.entropy(pdist_raw, pdist_grab)
        # NOTE: scipy's jensenshannon returns the JS *distance*, i.e. the
        # square root of the JS divergence.
        js_divergences[code][feature] = jensenshannon(pdist_raw, pdist_grab)
        tv_distances[code][feature] = (
            np.abs(pdist_raw - pdist_grab).sum() / 2
        )
        # wasserstein_distance expects observed values, not density values,
        # so it is computed directly on the two samples.
        w_distances[code][feature] = wasserstein_distance(
            raw_values, grab_values
        )

# Convert the dictionaries to DataFrames (one row per house, one column per
# feature)
kl_divergences_df = pd.DataFrame.from_dict(kl_divergences, orient="index")
js_divergences_df = pd.DataFrame.from_dict(js_divergences, orient="index")
tv_distances_df = pd.DataFrame.from_dict(tv_distances, orient="index")
w_distances_df = pd.DataFrame.from_dict(w_distances, orient="index")

In [None]:
# Display the per-house KL divergences.
kl_divergences_df

In [None]:
# Display the per-house Jensen-Shannon values.
js_divergences_df

In [None]:
# Display the per-house total-variation distances.
tv_distances_df

In [None]:
# Display the per-house Wasserstein distances.
w_distances_df

#### Save wasserstein distances

In [None]:
# Write the Wasserstein distances into the data folder; the path is built
# from data_path instead of repeating the absolute folder.
w_distances_df.to_excel(os.path.join(data_path, "wasserstein_distances.xlsx"))

### Time Range for each Casa dell'Acqua

In [None]:
# For every selected house and feature: first/last sampling date and number
# of samples in which the feature is present.
time_range = {}


codes = grab_samples_df["Codice punto di prelievo"].unique()

for code in codes:
    time_range[code] = {}

    house_df = grab_samples_df[
        grab_samples_df["Codice punto di prelievo"] == code
    ]

    for feature in columns:
        # Rows where both the date and the feature are present. dropna()
        # returns a new frame, so no in-place edit is made on a slice of
        # grab_samples_df.
        temp_df = house_df[["Data di prelievo", feature]].dropna()

        time_range[code][feature] = {
            "start_time": temp_df["Data di prelievo"].min(),
            "end_time": temp_df["Data di prelievo"].max(),
            "n_samples": temp_df.shape[0],
        }

# One row per house, one column per feature; each cell holds the dict above
time_range_df = pd.DataFrame.from_dict(time_range, orient="index")

In [None]:
# Display the per-house, per-feature time ranges.
time_range_df

# Sensor Samples

### Load Data

In [8]:
# Load every sensor CSV export, tag its rows with the house name (taken
# from the file name up to the first "_") and with the matching
# sampling-point code, then stack everything into one frame.
df_list = []

# os.listdir returns entries in arbitrary order; sorting makes the row
# order of the concatenated frame reproducible.
for file in sorted(os.listdir(sensor_data_folder_path)):
    if not file.endswith(".csv"):
        continue

    temp_df = pd.read_csv(
        os.path.join(sensor_data_folder_path, file), header=1, sep=";"
    )
    location_name = file.split("_")[0]

    matching_codes = houses_code_df.loc[
        houses_code_df["Casa dell'acqua"] == location_name,
        "Codice Punto di Prelievo",
    ]
    if matching_codes.empty:
        # Fail with the offending file name instead of a bare IndexError.
        raise ValueError(
            f"No sampling-point code found for sensor file {file!r} "
            f"(location {location_name!r})"
        )
    code = matching_codes.values[0]

    temp_df.insert(0, "Location", location_name)
    temp_df.insert(1, "Codice Punto di Prelievo", code)
    df_list.append(temp_df)

raw_sensor_data_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Display the stacked sensor data.
raw_sensor_data_df

In [None]:
# Keep the columns whose name does not contain 'Status', then list them.
all_sensor_columns = raw_sensor_data_df.columns
is_status_column = all_sensor_columns.str.contains("Status")
sensor_columns = all_sensor_columns[~is_status_column]
sensor_columns.to_list()

['Location',
 'Codice Punto di Prelievo',
 'Measurement interval=900[sec] (Export-Aggregation disabled)',
 'Tag',
 'COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)',
 'TOCeq - Measured value [mg/l] (Limit:0.00-22.00)',
 'NO3eq - Measured value [mg/l] (Limit:0.00-88.00)',
 'UV254t - Measured value [Abs/m] (Limit:0.00-71.00)',
 'Turbidity - Measured value [FTUeq] (Limit:0.00-170.00)',
 'DOCeq - Measured value [mg/l] (Limit:0.00-17.00)',
 'pH - Measured value (Limit:0.00-14.00)',
 'Temperature - Measured value [C] (Limit:-5.00-100.00)',
 'ORP - Measured value [mV] (Limit:-2000.00-2000.00)',
 'Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)',
 'Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00)',
 'Flow - Measured value (Limit:0.00-1.00)',
 'Free Chlorine - Clean value [mg/l] (Limit:0.00-2.00)']

### NaN vs Strings vs Numbers

In [None]:
# The output folder and the analysed columns do not depend on the house:
# prepare them once.
directory = os.path.join(
    dir_temporary_results_path, "NaNvsNumbers_sensor_data"
)
os.makedirs(directory, exist_ok=True)

# Skip the first four non-Status columns and analyse the remaining ones.
value_columns = sensor_columns.to_list()[4:]

# One bar chart per house with the NaN and numeric counts of every sensor
# column, saved as <house code>.png.
for code in raw_sensor_data_df["Codice Punto di Prelievo"].unique():
    house_df = raw_sensor_data_df[
        raw_sensor_data_df["Codice Punto di Prelievo"] == code
    ]
    histogram = house_df[value_columns].apply(count_values)
    histogram.loc["Total"] = histogram.sum()

    ax = histogram.T[["NaN", "numbers"]].plot.bar(figsize=(30, 10))
    ax.set_title(code)

    for p in ax.patches:
        ax.annotate(
            str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.02)
        )

    plt.savefig(
        os.path.join(directory, sanitize_filename(code) + ".png"),
        dpi=300,
        bbox_inches="tight",
    )

    # plt.show()

### Histogram Distributions

In [None]:
# For every house, one 30-bin histogram per sensor column (skipping the
# first four non-Status columns) built from the int/float entries.
for code in raw_sensor_data_df["Codice Punto di Prelievo"].unique():
    house_df = raw_sensor_data_df[
        raw_sensor_data_df["Codice Punto di Prelievo"] == code
    ]
    house_rows = house_df.shape[0]

    for column in sensor_columns.to_list()[4:]:
        plt.figure(figsize=(20, 10))

        column_values = house_df[column]
        is_numeric = column_values.apply(
            lambda x: isinstance(x, (int, float))
        )
        hist = column_values.where(is_numeric)

        count, bins, patches = plt.hist(
            hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
        )
        plt.title(f"{code} - {column} - Count: {hist.count()} / {house_rows}")
        plt.ylabel("Frequency")

        # Put a tick on every left bin edge, labelled with the bin interval.
        interval_labels = [
            f"{low:.2f}-{high:.2f}" for low, high in zip(bins[:-1], bins[1:])
        ]
        plt.xticks(bins[:-1], interval_labels, rotation="vertical", fontsize=8)

        # Write the number of samples above every bar.
        for bar in patches:
            bar_height = bar.get_height()
            plt.annotate(
                str(int(bar_height)),
                (bar.get_x() * 1.005, bar_height * 1.02),
            )

        # Target folder of the (currently disabled) save step below.
        directory = os.path.join(
            dir_temporary_results_path, "histograms_sensor_data"
        )
        # if not os.path.exists(directory):
        #     os.makedirs(directory)

        # plt.savefig(
        #     os.path.join(
        #         directory,
        #         sanitize_filename(code + ' - ' + column) + ".png",
        #     ),
        #     dpi=300,
        # )

        plt.show()

### Timeseries Plots

In [None]:
# %%script false --no-raise-error

# For every house, plot each float column of the sensor data against the
# measurement timestamp.
for punto in raw_sensor_data_df["Codice Punto di Prelievo"].unique():
    house_df = raw_sensor_data_df[
        raw_sensor_data_df["Codice Punto di Prelievo"] == punto
    ]

    for col in raw_sensor_data_df.columns:
        # Only float columns are plotted.
        if not raw_sensor_data_df[col].dtype == float:
            continue

        # Short name: text before the first "-" of the column name.
        sanitized_col = col.split("-")[0].rstrip()
        # Unit of measure: text between "[" and "]" of the column name,
        # empty when the name has no "[".
        if "[" in col:
            unit_of_measure = col.split("[")[1].split("]")[0]
        else:
            unit_of_measure = ""

        house_df.plot(
            x="Measurement interval=900[sec] (Export-Aggregation disabled)",
            y=col,
            legend=False,
            title=f"{punto} - {sanitized_col} [{unit_of_measure}]",
            fontsize=8,
            figsize=(40, 10),
        )

        # Saving is disabled; to re-enable:
        # directory = f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Sensor data plots/{punto}"
        # if not os.path.exists(directory):
        #     os.makedirs(directory)
        # plt.savefig(f"{directory}/{sanitized_col}.png", dpi=300)

        plt.show()

# Final Plots

### Final fixes

In [None]:
# Work on copies so the changes below leave the raw frames untouched.
sensor_df = raw_sensor_data_df.copy()
grab_df = raw_grab_samples_df.copy()

In [None]:
# List the grab-sample column names.
grab_df.columns.to_list()

In [None]:
# Grab-sample columns from the 8th onward (same slice as `columns` above).
grab_columns = grab_df.columns[7:]

In [None]:
# List the sensor column names.
sensor_df.columns.to_list()

In [None]:
# Parse the timestamp columns of both frames into datetimes.
# NOTE(review): the sensor timestamps are parsed here with a strict
# day/month/year hour:minute format, while the metadata cell further down
# parses the same raw column with format="mixed" -- confirm which is needed.
sensor_timestamp_column = (
    "Measurement interval=900[sec] (Export-Aggregation disabled)"
)
sensor_df[sensor_timestamp_column] = pd.to_datetime(
    sensor_df[sensor_timestamp_column], format="%d/%m/%Y %H:%M"
)

grab_dates = grab_df["Data di prelievo"]
grab_df["Data di prelievo"] = pd.to_datetime(grab_dates)

### Map Common Columns between the two dfs

In [None]:
# For the moment the common columns are taken manually.
# Keys are grab-sample column names, values the matching sensor column names.
# The first entry is the timestamp and the last one the sampling-point code;
# the entries in between are the measured quantities.
column_mapping = {
    "Data di prelievo": "Measurement interval=900[sec] (Export-Aggregation disabled)",
    "Colore (CU)": "COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)",
    "TOC - carbonio organico totale (mg/L di C)": "TOCeq - Measured value [mg/l] (Limit:0.00-22.00)",
    "Conduttività a 20°C (µS/cm)": "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)",
    "Cloro residuo libero (mg/L di Cl2)": "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00)",
    "Concentr. ioni idrogeno al prelievo (unità pH)": "pH - Measured value (Limit:0.00-14.00)",
    "Temperatura - °C": "Temperature - Measured value [C] (Limit:-5.00-100.00)",
    "Codice punto di prelievo": "Codice Punto di Prelievo",
}

# Reverse lookup: sensor column name -> grab-sample column name.
inverse_column_mapping = dict(
    zip(column_mapping.values(), column_mapping.keys())
)

In [None]:
# Export the grab-sample and sensor column names as one-column tables.
# The paths are built from data_path instead of repeating the absolute
# folder.
grabs = pd.DataFrame(data=grab_columns, columns=["grab_columns"])
sensors = pd.DataFrame(data=sensor_columns, columns=["sensor_columns"])

grabs.to_excel(os.path.join(data_path, "grab_columns.xlsx"))
sensors.to_excel(os.path.join(data_path, "sensor_columns.xlsx"))

In [None]:
# Two-column table pairing each grab-sample column with its sensor column.
grab_sensor_pairs = column_mapping.items()
common_columns = pd.DataFrame(grab_sensor_pairs, columns=["Grab", "Sensor"])

In [None]:
# Export the grab/sensor column pairing; the path is built from data_path
# instead of repeating the absolute folder.
common_columns.to_excel(os.path.join(data_path, "common_columns.xlsx"))

In [None]:
# Give the mapped sensor columns the names of their grab-sample
# counterparts; columns without a counterpart keep their name.
sensor_df = sensor_df.rename(columns=inverse_column_mapping)

### Create unique df

In [None]:
# Tag every row with its origin before stacking the two sources.
sensor_df["Source"] = "sensor"
grab_df["Source"] = "grab"

# Columns shared by both sources (under their grab-sample names) plus the
# origin flag; everything else is left out of the unified frame.
kept_columns = list(column_mapping.keys()) + ["Source"]

# Stack the rows of the two sources. An outer merge on the timestamp would
# fuse a grab row and a sensor row that share the same timestamp into a
# single row, dropping the sensor values and duplicating rows; concat keeps
# every observation as its own row.
result_df = pd.concat(
    [grab_df[kept_columns], sensor_df[kept_columns]],
    ignore_index=True,
).sort_values("Data di prelievo", kind="stable", ignore_index=True)

plot_data_df = result_df.copy()
plot_data_df.rename(columns={"Data di prelievo": "Date"}, inplace=True)

In [None]:
# First and last timestamp of the sensor rows. The separator is plain text
# inside the f-string (the previous version printed " + ' - ' + " verbatim).
sensor_dates = plot_data_df.loc[plot_data_df["Source"] == "sensor", "Date"]
print(f"{sensor_dates.min()} - {sensor_dates.max()}")

In [None]:
# First and last timestamp of the grab rows. The separator is plain text
# inside the f-string (the previous version printed " + ' - ' + " verbatim).
grab_dates = plot_data_df.loc[plot_data_df["Source"] == "grab", "Date"]
print(f"{grab_dates.min()} - {grab_dates.max()}")

In [None]:
# Drop the rows in which every measured quantity is missing. The first
# mapping key (timestamp) and the last one (sampling-point code) are not
# part of the check.
measured_columns = list(column_mapping.keys())[1:-1]
plot_data_df = plot_data_df.dropna(subset=measured_columns, how="all")
plot_data_df

In [None]:
# Display the grab rows whose sampling-point code is in `codes`.
# NOTE(review): `codes` is whatever an earlier cell last assigned to it --
# confirm it holds the intended set of houses when this cell is run.
plot_data_df[
    (plot_data_df["Source"] == "grab")
    & (plot_data_df["Codice punto di prelievo"].isin(codes))
]

### Comparison Plots for each selected Casa dell'Acqua

In [None]:
from matplotlib.lines import Line2D


# Houses for which sensor data is available.
codes = plot_data_df[plot_data_df["Source"] == "sensor"][
    "Codice punto di prelievo"
].unique()

# Measured quantities: mapping keys without the timestamp (first) and the
# sampling-point code (last).
compared_columns = list(column_mapping.keys())[1:-1]

# Legend handles are the same for every figure.
custom_lines = [
    Line2D([0], [0], color="red", lw=4),
    Line2D([0], [0], color="blue", lw=4),
]

for code in codes:
    house_df = plot_data_df[plot_data_df["Codice punto di prelievo"] == code]
    sensor_data_df = house_df[house_df["Source"] == "sensor"]

    # if we want to exploit more data
    # grab_sample_df = plot_data_df[plot_data_df["Source"] == "grab"]
    grab_sample_df = house_df[house_df["Source"] == "grab"]

    # One output folder per house, built from data_path and created once.
    directory = os.path.join(data_path, "Comparison Plots", f"{code}")
    os.makedirs(directory, exist_ok=True)

    # One figure per quantity: sensor series in red, grab samples in blue.
    for column in compared_columns:
        plt.figure(figsize=(40, 10))

        sns.lineplot(
            data=sensor_data_df, x="Date", y=column, color="red", errorbar=None
        )
        sns.lineplot(
            data=grab_sample_df, x="Date", y=column, color="blue", errorbar=None
        )
        # sns.scatterplot(data=grab_sample_df, x="Date", y=column, color="blue", marker="x")

        plt.title(code, fontsize=20)
        plt.xlabel("Time", fontsize=20)
        plt.ylabel(column, fontsize=20)
        plt.legend(custom_lines, ["Sensor", "Grab"])

        filename = sanitize_filename(f"{column}.png")
        plt.savefig(os.path.join(directory, filename), dpi=300)
        # plt.show()

# Store Metadata

In [None]:
# show a table for each index summarizing the time_range_df by showing the number of samples, the start time and the end time for each feature in each house
for code in raw_sensor_data_df["Codice Punto di Prelievo"].unique():
    with pd.ExcelWriter(
        f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Metadata/{code}.xlsx"
    ) as writer:
        df = pd.DataFrame()
        for column in list(column_mapping.keys())[1:-1]:
            row = time_range_df.loc[code, column]
            if isinstance(row, dict):
                temp_df = pd.DataFrame(
                    index=list(row.keys()),
                    data=list(row.values()),
                    columns=[column],
                )
                df = pd.concat([df, temp_df], axis=1)
                # df.to_excel(writer, sheet_name=sanitize_filename(column))
            else:
                continue
        min_time = pd.to_datetime(
            raw_sensor_data_df[
                raw_sensor_data_df["Codice Punto di Prelievo"] == code
            ]["Measurement interval=900[sec] (Export-Aggregation disabled)"],
            format="mixed",
            dayfirst=True,
        ).min()
        max_time = pd.to_datetime(
            raw_sensor_data_df[
                raw_sensor_data_df["Codice Punto di Prelievo"] == code
            ]["Measurement interval=900[sec] (Export-Aggregation disabled)"],
            format="mixed",
            dayfirst=True,
        ).max()
        sens_df = pd.DataFrame(
            index=["start_time", "end_time"],
            data=[min_time, max_time],
            columns=["sensor"],
        )
        df = pd.concat([df, sens_df], axis=1)
        df.to_excel(writer, sheet_name=code)

# Common time interval between sensor and grab

In [None]:
# Houses for which sensor data is available.
codes = plot_data_df[plot_data_df["Source"] == "sensor"][
    "Codice punto di prelievo"
].unique()

# Measured quantities: mapping keys without the timestamp (first) and the
# sampling-point code (last).
compared_columns = list(column_mapping.keys())[1:-1]

# Legend handles are the same for every figure. Line2D comes from the
# import in the comparison-plots cell.
custom_lines = [
    Line2D([0], [0], color="red", lw=4),
    Line2D([0], [0], color="blue", lw=4),
]

for code in codes:
    # Split the house's rows by origin.
    house_df = plot_data_df[plot_data_df["Codice punto di prelievo"] == code]
    sensor_data_df = house_df[house_df["Source"] == "sensor"]
    grab_sample_df = house_df[house_df["Source"] == "grab"]
    # if we want to exploit more data
    # grab_sample_df = plot_data_df[plot_data_df["Source"] == "grab"]

    sensor_start = sensor_data_df["Date"].min()
    sensor_end = sensor_data_df["Date"].max()
    grab_start = grab_sample_df["Date"].min()
    grab_end = grab_sample_df["Date"].max()

    print(f"==== {code} ====")
    print(f"Sensor: {sensor_start} - {sensor_end}")
    print(f"Grab: {grab_start} - {grab_end}")
    print()

    # Grab samples taken within the sensor's time span, keeping only rows
    # without missing values. dropna() returns a new frame, so no in-place
    # edit is made on a slice of plot_data_df.
    grab_common_time_range_df = grab_sample_df[
        (grab_sample_df["Date"] >= sensor_start)
        & (grab_sample_df["Date"] <= sensor_end)
    ].dropna()

    # Sensor readings taken within the grab samples' time span.
    sensor_common_time_range_df = sensor_data_df[
        (sensor_data_df["Date"] >= grab_start)
        & (sensor_data_df["Date"] <= grab_end)
    ]

    # Both sources stacked, kept available for inspection.
    common_time_range_df = pd.concat(
        [grab_common_time_range_df, sensor_common_time_range_df],
        ignore_index=True,
    )

    # The per-source frames are used directly for plotting; re-splitting the
    # stacked frame by "Source" would yield the same rows.
    sensor_time_range_df = sensor_common_time_range_df
    grab_time_range_df = grab_common_time_range_df

    # One figure per quantity: sensor series in red, grab samples in blue
    # (line plus markers).
    for column in compared_columns:
        plt.figure(figsize=(40, 10))

        sns.lineplot(
            data=sensor_time_range_df,
            x="Date",
            y=column,
            color="red",
            errorbar=None,
        )

        sns.lineplot(
            data=grab_time_range_df,
            x="Date",
            y=column,
            color="blue",
            errorbar=None,
        )

        sns.scatterplot(
            data=grab_time_range_df,
            x="Date",
            y=column,
            color="blue",
        )

        plt.title(code, fontsize=20)
        plt.xlabel("Time", fontsize=20)
        plt.ylabel(column, fontsize=20)
        plt.legend(custom_lines, ["Sensor", "Grab"])

        # directory = f"/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/Common Time Range/{code}"
        # if not os.path.exists(directory):
        #     os.makedirs(directory)
        # filename = sanitize_filename(f"{column}.png")
        # plt.savefig(os.path.join(directory, filename), dpi=300)
        plt.show()