# Supply Points Data Analysis

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Root folder of the Milano data set; every other path below is derived from it.
data_path = "/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano"

# Workbook with the house names and their sampling-point codes.
house_codes_path = os.path.join(data_path, "Case-Codici.xlsx")

# Folder with intermediate results, and the raw grab-samples CSV inside it.
dir_temporary_results_path = os.path.join(data_path, "temporary results")
raw_grab_samples_path = os.path.join(
    dir_temporary_results_path,
    "raw_grab_samples_supply_points.csv",
)

In [5]:
# Load the raw grab-samples CSV into a DataFrame.
raw_grab_samples_df = pd.read_csv(raw_grab_samples_path)

In [None]:
def count_values(series):
    """Count the missing, textual and numeric entries of a column.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.

    Returns
    -------
    pandas.Series
        Three counts indexed by ``"NaN"``, ``"Strings"`` and ``"numbers"``:
        missing entries, ``str`` entries containing ``<``, ``*``, ``>`` or an
        ASCII letter, and non-missing ``int``/``float`` entries.
    """
    num_nans = series.isna().sum()

    # Raw string for the escaped asterisk: "\*" in a plain literal is an
    # invalid escape sequence and triggers a warning.
    text_pattern = "|".join(["<", r"\*", ">", "[a-zA-Z]"])

    # Only genuine str entries may count as text. Matching on the string form
    # of every value would also catch floats rendered in scientific notation
    # (e.g. "1e-05") and count them both as text and as numbers.
    is_text = series.apply(lambda value: isinstance(value, str)).astype(bool)
    has_marker = series.astype(str).str.contains(text_pattern, na=False)
    strings = series[is_text & has_marker].count()

    # .count() skips missing values, so NaN floats are not counted as numbers.
    is_number = series.apply(
        lambda value: isinstance(value, (int, float))
    ).astype(bool)
    num_numbers = series[is_number].count()

    return pd.Series(
        [num_nans, strings, num_numbers], index=["NaN", "Strings", "numbers"]
    )

  series.astype(str).str.contains("|".join(["<", "\*", ">", "[a-zA-Z]"]))


In [None]:
# Columns analysed below: every column from position 7 onward.
# NOTE(review): presumably the first 7 columns hold sample metadata rather
# than measurements — confirm against the CSV layout.
columns = raw_grab_samples_df.columns[7:]

In [None]:
# Run count_values on each selected column: the result has one column per
# input column and the rows "NaN", "Strings" and "numbers".
histogram = raw_grab_samples_df[columns].apply(count_values)

In [None]:
# Add a "Total" row holding, for each column, the sum of its counts.
histogram.loc["Total"] = histogram.sum()

In [None]:
# Show the count table as the cell output.
histogram

In [None]:
# One group of bars per analysed column, one bar per count category.
count_categories = ["NaN", "Strings", "numbers"]
ax = histogram.T[count_categories].plot.bar(figsize=(30, 10))

# Write each bar's value slightly above its top.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() * 1.005, bar_height * 1.02)
    ax.annotate(str(bar_height), label_position)

In [None]:
plt.style.use("ggplot")

# Denominator shown in every title: number of rows in the raw frame.
total_rows = raw_grab_samples_df.shape[0]

for column in columns:
    plt.figure(figsize=(20, 10))

    # Keep only entries that are int/float instances; the rest become NaN.
    column_values = raw_grab_samples_df[column]
    numeric_mask = column_values.apply(
        lambda item: isinstance(item, (int, float))
    )
    hist = column_values.where(numeric_mask)

    count, bins, patches = plt.hist(
        hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
    )
    plt.title(f"{column} - Count: {hist.count()} / {total_rows}")
    plt.ylabel("Frequency")

    # Put a tick on each bin's left edge, labelled with the bin interval.
    interval_labels = [
        f"{left:.2f}-{right:.2f}" for left, right in zip(bins[:-1], bins[1:])
    ]
    plt.xticks(bins[:-1], interval_labels, rotation="vertical", fontsize=8)

    # Write the count of each bar slightly above its top.
    for bar in patches:
        bar_height = bar.get_height()
        plt.annotate(
            str(int(bar_height)), (bar.get_x() * 1.005, bar_height * 1.02)
        )

In [None]:
# TODO: finish moving from house code path

In [None]:
# Load the house-codes workbook into a DataFrame.
houses_code_df = pd.read_excel(house_codes_path)

In [None]:
# Strip trailing whitespace from the house name in the row labelled 7.
# A single .loc assignment is used instead of chained indexing
# (df[col][row] = ...), which may write to a temporary copy and has no effect
# when pandas Copy-on-Write is enabled.
houses_code_df.loc[7, "Casa dell'acqua"] = houses_code_df.loc[
    7, "Casa dell'acqua"
].rstrip()

# Set the row labelled 4 to the Chiostergi house and its code.
houses_code_df.loc[4] = ["Chiostergi", "HOUSE_CHIOSTERGI"]

In [None]:
# Show the corrected house-codes table as the cell output.
houses_code_df

In [None]:
# Restrict the raw grab samples to the sampling points listed in
# houses_code_df: the inner join keeps only rows whose
# "Codice punto di prelievo" matches a "Codice Punto di Prelievo" of the
# house table. The two house-table columns are needed only for the match,
# so they are dropped from the result.
grab_samples_df = pd.merge(
    raw_grab_samples_df,
    houses_code_df,
    how="inner",
    left_on="Codice punto di prelievo",
    right_on="Codice Punto di Prelievo",
).drop(columns=["Casa dell'acqua", "Codice Punto di Prelievo"])

In [None]:
# Show the filtered grab samples as the cell output.
grab_samples_df

In [None]:
# Recompute the per-column counts ("NaN", "Strings", "numbers") on the
# filtered grab samples.
histogram = grab_samples_df[columns].apply(count_values)

In [None]:
# Add a "Total" row holding, for each column, the sum of its counts.
histogram.loc["Total"] = histogram.sum()

In [None]:
# One group of bars per analysed column, one bar per count category.
count_categories = ["NaN", "Strings", "numbers"]
ax = histogram.T[count_categories].plot.bar(figsize=(30, 10))

# Write each bar's value slightly above its top.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() * 1.005, bar_height * 1.02)
    ax.annotate(str(bar_height), label_position)

In [None]:
plt.style.use("ggplot")

# Denominator shown in every title: number of rows in the filtered frame.
total_rows = grab_samples_df.shape[0]

for column in columns:
    plt.figure(figsize=(20, 10))

    # Keep only entries that are int/float instances; the rest become NaN.
    column_values = grab_samples_df[column]
    numeric_mask = column_values.apply(
        lambda item: isinstance(item, (int, float))
    )
    hist = column_values.where(numeric_mask)

    count, bins, patches = plt.hist(
        hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
    )
    plt.title(f"{column} - Count: {hist.count()} / {total_rows}")
    plt.ylabel("Frequency")

    # Put a tick on each bin's left edge, labelled with the bin interval.
    interval_labels = [
        f"{left:.2f}-{right:.2f}" for left, right in zip(bins[:-1], bins[1:])
    ]
    plt.xticks(bins[:-1], interval_labels, rotation="vertical", fontsize=8)

    # Write the count of each bar slightly above its top.
    for bar in patches:
        bar_height = bar.get_height()
        plt.annotate(
            str(int(bar_height)), (bar.get_x() * 1.005, bar_height * 1.02)
        )

In [None]:
# TODO: first convert the non-numeric data to numeric, then plot

# Only float-typed columns are plotted. The dtypes do not change inside the
# loops, so the selection is computed once instead of once per sampling point.
float_columns = [
    col for col in grab_samples_df.columns if grab_samples_df[col].dtype == float
]

for punto in grab_samples_df["Codice punto di prelievo"].unique():
    # The rows of this sampling point and its output folder are the same for
    # every column, so they are built once per point.
    punto_df = grab_samples_df[
        grab_samples_df["Codice punto di prelievo"] == punto
    ]
    directory = f"/Users/massimilianoarca/Documents/SafeCREW/Soft Sensors data/Milano/Grab samples data plots/{punto}"

    for col in float_columns:
        # Name used in the title and file name: text before the first "-".
        sanitized_col = col.split("-")[0].rstrip()
        # Unit of measure: text between the first "[" and the next "]".
        unit_of_measure = (
            col.split("[")[1].split("]")[0] if "[" in col else ""
        )
        punto_df.plot(
            x="Data di prelievo",
            y=col,
            legend=False,
            title=f"{punto} - {sanitized_col} [{unit_of_measure}]",
            fontsize=8,
            figsize=(40, 10),
        )
        # exist_ok=True replaces the separate existence check.
        os.makedirs(directory, exist_ok=True)
        # savefig writes the current figure, i.e. the one just created.
        plt.savefig(f"{directory}/{sanitized_col}.png", dpi=300)

### Sensors Data

In [None]:
# NOTE(review): `sensor_data_folder_path` is not defined in any cell visible
# here — confirm where it is set before running this cell.
csv_file_names = [
    name for name in os.listdir(sensor_data_folder_path) if name.endswith(".csv")
]

df_list = []
for file in csv_file_names:
    # The sensor exports are ";"-separated with the header on the second line.
    temp_df = pd.read_csv(
        os.path.join(sensor_data_folder_path, file), header=1, sep=";"
    )

    # The location is the file-name part before the first "_".
    location_name = file.split("_")[0]
    temp_df.insert(0, "Location", location_name)

    # Look up the sampling-point code of that location in the house table
    # (first match; raises IndexError when the location is not listed).
    is_location = houses_code_df["Casa dell'acqua"] == location_name
    code = houses_code_df.loc[is_location, "Codice Punto di Prelievo"].values[0]
    temp_df.insert(1, "Codice Punto di Prelievo", code)

    df_list.append(temp_df)

# Stack all per-file frames into one frame with a fresh index.
raw_sensor_data_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Show the combined sensor data as the cell output.
raw_sensor_data_df

In [None]:
# For every sampling point, plot each float column over the measurement
# timestamp column and save the figure as a PNG in that point's folder.

# The dtypes do not change inside the loops, so the float-column selection is
# computed once instead of once per sampling point.
float_columns = [
    col
    for col in raw_sensor_data_df.columns
    if raw_sensor_data_df[col].dtype == float
]

for punto in raw_sensor_data_df["Codice Punto di Prelievo"].unique():
    # The rows of this sampling point and its output folder are the same for
    # every column, so they are built once per point.
    punto_df = raw_sensor_data_df[
        raw_sensor_data_df["Codice Punto di Prelievo"] == punto
    ]
    directory = f"/Users/massimilianoarca/Documents/SafeCREW/Soft Sensors data/Milano/Sensor data plots/{punto}"

    for col in float_columns:
        # Name used in the title and file name: text before the first "-".
        sanitized_col = col.split("-")[0].rstrip()
        # Unit of measure: text between the first "[" and the next "]".
        unit_of_measure = (
            col.split("[")[1].split("]")[0] if "[" in col else ""
        )
        punto_df.plot(
            x="Measurement interval=900[sec] (Export-Aggregation disabled)",
            y=col,
            legend=False,
            title=f"{punto} - {sanitized_col} [{unit_of_measure}]",
            fontsize=8,
            figsize=(40, 10),
        )
        # exist_ok=True replaces the separate existence check.
        os.makedirs(directory, exist_ok=True)
        # savefig writes the current figure, i.e. the one just created.
        plt.savefig(f"{directory}/{sanitized_col}.png", dpi=300)