# Supply Points (Case dell'Acqua) Data Preprocessing

In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Paths

In [3]:
# Base locations used by this notebook. The defaults are machine-specific
# absolute paths; set the SAFECREW_DIR_STORE_PATH / SAFECREW_ROOT_FOLDER_PATH
# environment variables to run the notebook elsewhere without editing it.
dir_store_path = os.environ.get(
    "SAFECREW_DIR_STORE_PATH",
    "/Users/massimilianoarca/Documents/PoliMi/Research Grant/SafeCREW/Data/Milano/temporary results",
)
root_folder_path = os.environ.get(
    "SAFECREW_ROOT_FOLDER_PATH",
    "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/SafeCREW/soft_sensors/Soft Sensor CS2Milan",
)

# Excel workbook with the supply-point grab samples (read with pd.read_excel).
grab_samples_supply_points_path = os.path.join(
    root_folder_path,
    "Case dell'acqua - Grab Samples (main)/0. Case acqua - 2010-2023.xlsx",
)
# Folder whose files are listed and read one by one as grab-sample workbooks.
all_grab_samples_path = os.path.join(
    root_folder_path, "Tutti punti - Grab Samples"
)
# Sensor data folder; not referenced in the visible part of this notebook.
sensor_data_folder_path = os.path.join(
    root_folder_path, "Case dell'acqua - Sensori"
)

## Collect all grab samples for Supply Points

### Get Name and ID of Supply Points

In [None]:
# Excel column letters of the two supply-point metadata columns. Kept under
# its own name because `column_list` is defined further down as a list of
# column labels; sharing the name would let a stale string reach code that
# expects that list.
meta_usecols = "CS, CT"

# header=4: the column labels sit on the fifth row (0-based index 4).
meta_supply_points_df = pd.read_excel(
    grab_samples_supply_points_path, usecols=meta_usecols, header=4
)

In [None]:
# Display the supply-point metadata frame for a visual check.
meta_supply_points_df

### Load all grab samples

In [None]:
# Column labels to keep from each grab-sample workbook. Only the labels a
# workbook actually contains are selected when the files are loaded, so this
# list may name columns that are missing from some files.
column_list = [
    "Data di prelievo",
    "Rapporto di prova",
    "Punto di prelievo",
    "Codice punto di prelievo",
    "Campagna",
    "Analisi programmate",
    "ZONA",
    "Alcalinità (mg/L)",
    "Alcalinità equivalente a carbonati (mg/L di CO3)",
    "Alcalinità equivalente a idrossidi (mg/L di OH)",
    "Bicarbonati (mg/L)",
    "Torbidità (NTu)",
    "Colore (Cu)",
    "Conduttività a 20°C (µS/cm)",
    "Concentrazione ioni idrogeno (unità pH)",
    "Durezza (da calcolo) (°F)",
    "Durezza totale (°F)",
    "Indice di aggressività ottenuto per calcolo (no unità)",
    "Residuo secco a 180°C (mg/L)",
    "Solidi sospesi totali (mg/L)",
    "Cloro residuo libero (mg/L di Cl2)",
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)",
    "Concentr. ioni idrogeno (al prelievo) (unità pH)",
    "Temperatura - °C",
    "Temperatura (al prelievo) (°C)",
    "Temperatura (°C)",
    "TOC - carbonio organico totale (mg/L di C)",
    "Cloroformio (µg/L)",
    "Bromodiclorometano (µg/L)",
    "Dibromoclorometano (µg/L)",
    "Bromoformio (µg/L)",
    "Sommatoria totale trialometani (µg/L)",
    "Conta delle colonie a 22°C (UFC/mL)",
    "Conteggio colonie a 30°C (UFC/mL)",
    "Conta delle colonie a 37°C (UFC/mL)",
    "Batteri coliformi a 37°C (MPN/100 mL)",
    "Enterococchi (MPN/100 mL)",
    "Escherichia coli (MPN/100 mL)",
    "Pseudomonas aeruginosa (UFC/250 mL)",
    "Pseudomonas aeruginosa (UFC/100 mL)",
    "Legionella spp (UFC/L)",
    "Legionella pneumophila (UFC/L)",
    "Legionella molecolare pneumophila",
    "Legionella molecolare spp",
]

In [None]:
# Read every workbook in the grab-samples folder, keep only the columns of
# interest, and stack the per-file frames into one frame.
grab_samples = []

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# row order of the concatenated frame reproducible between runs.
for file in sorted(os.listdir(all_grab_samples_path)):
    # Hidden files and Office lock files (e.g. ".DS_Store", "~$book.xlsx") are
    # not readable workbooks and would make read_excel fail.
    if file.startswith((".", "~$")):
        continue
    # .xlsx files have their header on row index 11, every other file on row
    # index 15. NOTE(review): presumably the other files are older-format
    # exports with a longer preamble - confirm against the data folder.
    header_row = 11 if file.endswith(".xlsx") else 15
    df = pd.read_excel(
        os.path.join(all_grab_samples_path, file), header=header_row
    )
    # Walk column_list (rather than intersecting sets) so the selected columns
    # come out in a deterministic order instead of a hash-dependent one.
    common_cols = [col for col in column_list if col in df.columns]
    df = df[common_cols]
    grab_samples.append(df)

grab_samples_df = pd.concat(grab_samples)

### Get Supply Points Grab Samples

In [None]:
# Left-join the supply-point metadata onto every grab sample by matching the
# sample's point code against the metadata column "filtro 2". Samples without
# a match are kept and get NaN in the metadata columns.
supply_points_df = pd.merge(
    left=grab_samples_df,
    right=meta_supply_points_df,
    how="left",
    left_on="Codice punto di prelievo",
    right_on="filtro 2",
)

In [None]:
# Remove exact duplicate rows. The "filtro 1" / "filtro 2" columns are not
# dropped here because a later cell filters on them.
supply_points_df = supply_points_df.drop_duplicates()

In [None]:
# Coalesce the three temperature columns into a single "Temperatura °C"
# column, row by row: the first non-missing value wins, in the priority order
# listed below. Rows where all three are missing stay NaN. The source columns
# are removed afterwards.
temperature_sources = [
    "Temperatura (°C)",
    "Temperatura - °C",
    "Temperatura (al prelievo) (°C)",
]

merged_temperature = supply_points_df[temperature_sources[0]]
for source in temperature_sources[1:]:
    merged_temperature = merged_temperature.combine_first(
        supply_points_df[source]
    )

supply_points_df["Temperatura °C"] = merged_temperature
supply_points_df = supply_points_df.drop(columns=temperature_sources)

In [None]:
# Keep only the rows where at least one of the two metadata filter columns is
# populated.
has_supply_point = (
    supply_points_df[["filtro 1", "filtro 2"]].notna().any(axis=1)
)
supply_points_df = supply_points_df[has_supply_point]

In [None]:
# Write the filtered supply-point samples to an Excel workbook, without the
# index column.
# NOTE(review): this absolute path is separate from dir_store_path, which the
# CSV export in this notebook uses - confirm that is intended.
all_supply_points_xlsx_path = "/Users/massimilianoarca/Documents/SafeCREW/Soft Sensors data/Milano/All supply points.xlsx"
supply_points_df.to_excel(all_supply_points_xlsx_path, index=False)

In [None]:
# Display the filtered supply-point samples for a visual check.
supply_points_df

In [5]:
# Excel column letters/ranges, passed as `usecols` when the supply-point
# workbook is read below. Note the name differs from `column_list` by one "s".
columns_list = "A:G, M:Y, BY, BZ:CB, CD, CF, CJ, CL:CM"

In [6]:
# Load the supply-point grab samples from the main workbook: column labels are
# on row index 4 and only the columns selected by columns_list are read.
raw_grab_samples_df = pd.read_excel(
    io=grab_samples_supply_points_path,
    usecols=columns_list,
    header=4,
)

In [7]:
def convert_to_float(x):
    """Convert a decimal-comma string such as ``"3,0"`` to a float.

    Parameters
    ----------
    x : object
        A single cell value of any type.

    Returns
    -------
    object
        ``float(x)`` with the comma read as a decimal point when ``x`` is a
        string that contains a comma and parses as a number; otherwise ``x``
        itself, unchanged (numbers, missing values, strings without a comma,
        and comma strings that are not numeric).
    """
    # Non-strings pass straight through without being compared to anything,
    # so missing-value markers such as pd.NA never hit an ambiguous truth test.
    if not isinstance(x, str) or "," not in x:
        return x
    try:
        return float(x.replace(",", "."))
    except ValueError:
        # e.g. "a,b" or "1,2,3": not a single decimal number, keep the text.
        return x

In [8]:
# Turn decimal-comma strings in the 22°C colony-count column into floats;
# all other values are left as they are.
cbt22_col = "Carica batterica a 22°C (UFC/mL)"
raw_grab_samples_df[cbt22_col] = raw_grab_samples_df[cbt22_col].apply(
    convert_to_float
)

In [9]:
# Strip leading/trailing whitespace from the sampling-point codes.
point_code_col = "Codice punto di prelievo"
raw_grab_samples_df[point_code_col] = raw_grab_samples_df[
    point_code_col
].str.strip()

In [10]:
# Display the cleaned raw grab-sample frame for a visual check.
raw_grab_samples_df

Unnamed: 0,Data di prelievo,Rapporto di prova,Punto di prelievo,Codice punto di prelievo,Campagna,Analisi programmate,ZONA,Alcalinità (mg/L),Cloro residuo libero (mg/L di Cl2),Colore (CU),...,Bicarbonati - mg/L,TOC - carbonio organico totale (mg/L di C),Carica batterica a 22°C (UFC/mL),Conteggio colonie a 30°C (UFC/mL),Carica batterica a 37°C (UFC/mL),Batteri coliformi a 37°C (MPN / 100 mL),Enterococchi (MPN / 100mL),Escherichia Coli (MPN / 100mL),Pseudomonas aeruginosa (UFC / 250mL),Pseudomonas aeruginosa (UFC/100 mL)
0,2014-07-18,3185/14,C.A. ingresso Menotti > U.di Nemi,ING_UCCNEMI,CASA_ACQUA,Analisi case dell'acqua,Case acqua,153.07,0.06,<0.01,...,,,342,,845,1,0,0,1,
1,2016-06-13,2818/16,Casa Acqua Appennini con CO2,SII00801,UTENZA,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",case acqua,95,0.01,0.77,...,94.0,,<1,,<1,<1,<1,<1,,
2,2014-01-03,16/14,Casa Acqua Lessona,SII00659,MICRO_PS,"Coliformi, E-Coli, Enterococchi, Pseudomonas",Case acqua,,0.05,,...,,,,,,0,0,0,0,
3,2016-04-22,1860/16,Casa Acqua Lessona,SII00659,MICR_UT_PS,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",case acqua,,0.03,,...,,,2,,<1,<1,<1,<1,<1,
4,2016-06-13,2819/16,Casa Acqua Via Appennini non trattata,SII00802,UTENZA,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",case acqua,122,0.02,0.77,...,122.0,,<1,,<1,<1,<1,<1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2263,2023-03-29,1554/23,Casa dell'acqua Piazza Scolari,HOUSE_SCOLARI,DKR,Casa Dkr,Case Acqua,,,,...,,,< 1,,,0,0,0,< 1,
2264,2023-03-30,1573/23,V.le Omero-NT-CA21,HOUSE_OME1,PROACQUA,Case Proacqua,Case Acqua,,,,...,,,,,,,,,,
2265,2023-03-30,1574/23,Menotti > U.di Nemi-NT-CA08,HOUSE_UCC1,PROACQUA,Case Proacqua,Case Acqua,,,,...,,,,,,,,,,
2266,2023-03-30,1575/23,Casa dell'acqua Piazza Ovidio,HOUSE_OVIDIO,DKR,Casa Dkr,Case Acqua,,,,...,,,< 1,,,0,0,0,<1,


In [12]:
# Save the raw supply-point grab samples as CSV in the storage directory,
# without the index column.
raw_grab_samples_csv_path = os.path.join(
    dir_store_path, "raw_grab_samples_supply_points.csv"
)
raw_grab_samples_df.to_csv(raw_grab_samples_csv_path, index=False)

In [150]:
def count_values(series):
    """Tally a column's cells into missing, text-flagged and numeric counts.

    Parameters
    ----------
    series : pd.Series
        One column of mixed cell values.

    Returns
    -------
    pd.Series
        Three counts indexed by ``"NaN"``, ``"Strings"`` and ``"numbers"``:
        missing cells; non-missing cells whose string form contains ``<``,
        ``*``, ``>`` or an ASCII letter; and non-missing cells that are
        ``int``/``float`` instances.
    """
    num_nans = series.isna().sum()

    # Raw string: "\*" in a normal literal is an invalid escape sequence and
    # makes Python emit a warning when the cell is compiled.
    flagged_pattern = r"<|\*|>|[a-zA-Z]"
    # Missing cells stringify to "nan" and match the letter class, but
    # .count() skips missing values, so they are not counted as strings.
    is_flagged = series.astype(str).str.contains(flagged_pattern)
    strings = series[is_flagged].count()

    # NaN is a float instance; .count() excludes it here as well.
    is_number = series.apply(lambda x: isinstance(x, (int, float)))
    num_numbers = series[is_number].count()

    return pd.Series(
        [num_nans, strings, num_numbers], index=["NaN", "Strings", "numbers"]
    )

  series.astype(str).str.contains("|".join(["<", "\*", ">", "[a-zA-Z]"]))


In [151]:
# Columns to profile: everything after the first seven columns of the frame
# (in the displayed frame those seven run from "Data di prelievo" to "ZONA").
columns = raw_grab_samples_df.columns[7:]

In [152]:
# Apply count_values to each selected column: the result has one column per
# measurement and the rows "NaN", "Strings" and "numbers".
histogram = raw_grab_samples_df.loc[:, columns].apply(count_values, axis=0)

In [153]:
# Sum only the three category rows, so re-running this cell does not fold an
# already-present "Total" row into the new total.
histogram.loc["Total"] = histogram.loc[["NaN", "Strings", "numbers"]].sum()

In [154]:
# Display the per-column value-type counts, including the "Total" row.
histogram

Unnamed: 0,Alcalinità (mg/L),Cloro residuo libero (mg/L di Cl2),Colore (CU),Conduttività a 20°C (µS/cm),Concentrazione ioni idrogeno (unità pH),Concentr. ioni idrogeno al prelievo (unità pH),Durezza totale (°F),Indice di aggressività ottenuo per calcolo (no unità),Residuo secco a 180°C (mg/L),Solidi sospesi totali (mg/L),...,Bicarbonati - mg/L,TOC - carbonio organico totale (mg/L di C),Carica batterica a 22°C (UFC/mL),Conteggio colonie a 30°C (UFC/mL),Carica batterica a 37°C (UFC/mL),Batteri coliformi a 37°C (MPN / 100 mL),Enterococchi (MPN / 100mL),Escherichia Coli (MPN / 100mL),Pseudomonas aeruginosa (UFC / 250mL),Pseudomonas aeruginosa (UFC/100 mL)
,1800,1262,1360,1136,1602,1151,633,1337,1132,2268,...,1614,2021,773,2268,1139,762,984,782,1491,1633
Strings,5,512,388,0,57,0,2,47,0,0,...,0,4,324,0,153,183,176,178,374,187
numbers,463,494,520,1132,609,1117,1633,884,1136,0,...,654,243,1171,0,976,1323,1108,1308,403,448
Total,2268,2268,2268,2268,2268,2268,2268,2268,2268,2268,...,2268,2268,2268,2268,2268,2268,2268,2268,2268,2268


In [None]:
# Grouped bar chart: one group per measurement column, one bar per category.
category_counts = histogram.T[["NaN", "Strings", "numbers"]]
ax = category_counts.plot.bar(figsize=(30, 10))

# Label every bar with its height, placed slightly above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x() * 1.005, bar_height * 1.02))

In [None]:
plt.style.use("ggplot")

# One histogram per measurement column, built from its numeric cells only.
for column in columns:
    fig, ax = plt.subplots(figsize=(20, 10))

    values = raw_grab_samples_df[column]
    # Keep int/float cells; everything else (e.g. text such as "<1") is
    # masked to NaN and dropped before plotting.
    hist = values.where(values.apply(lambda x: isinstance(x, (int, float))))
    count, bins, patches = ax.hist(
        hist.dropna(), bins=30, edgecolor="black", linewidth=1.2
    )
    ax.set_title(
        column
        + " - Count: "
        + str(hist.count())
        + " / "
        + str(raw_grab_samples_df.shape[0])
    )
    ax.set_ylabel("Frequency")

    # Put a tick on the left edge of every bin and label it with the bin's
    # interval.
    ax.set_xticks(bins[:-1])
    ax.set_xticklabels(
        [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)],
        rotation="vertical",
        fontsize=8,
    )

    # Add the count above every bar.
    for p in patches:
        ax.annotate(
            str(int(p.get_height())), (p.get_x() * 1.005, p.get_height() * 1.02)
        )

    # Render and release each figure inside the loop: otherwise every figure
    # stays open until the cell ends, which exceeds matplotlib's default
    # 20-open-figures warning threshold for this many columns.
    plt.show()
    plt.close(fig)