# Supply Points (Case dell'Acqua) Data Preprocessing

In [1]:
import os
import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
import seaborn as sns

# Paths

In [2]:
utils_folder = os.path.join("..", "..", "utils")

with open(os.path.join(utils_folder, "onedrive.txt"), "r") as f:
    cloud_data_folder = os.path.join(f.readline().strip(), "Case dell'acqua")

grab_samples_folder = os.path.join(cloud_data_folder, "Grab Samples")
sensors_folder = os.path.join(cloud_data_folder, "Sensori")

local_data_folder = os.path.join("..", "..", "data")
intermediate_data_folder = os.path.join(local_data_folder, "Intermediate Data")
clean_data_folder = os.path.join(local_data_folder, "Clean Data")
raw_data_folder = os.path.join(local_data_folder, "Raw Data")

plot_folder = os.path.join(local_data_folder, "Plots")

all_grab_samples_path = os.path.join(
    raw_data_folder, "Tutti punti - Grab Samples"
)

grab_samples_supply_points_path = os.path.join(
    raw_data_folder,
    "Case dell'acqua - Grab Samples (main)/0. Case acqua - 2010-2023.xlsx",
)

In [3]:
# Tra i grab non c'è l'ORP, mentre
# tra i sensori non c'è DOC (c'è il TOC) e L'UVA254

# Quindi in comune abbiamo:
# Color, TOC, Nitrati, Turbidity, pH, Temperature, Conductivity, Free Chlorine

# Load Grab Samples

In [4]:
grab_df = pd.DataFrame()
for filename in os.listdir(grab_samples_folder):
    if grab_df.empty:
        grab_df = pd.read_excel(os.path.join(grab_samples_folder, filename))
    else:
        df = pd.read_excel(os.path.join(grab_samples_folder, filename))
        grab_df = pd.concat([grab_df, df])

In [5]:
grab_df.head(5)

Unnamed: 0,Data di prelievo,Rapporto di prova,Punto di prelievo,Codice punto di prelievo,Alcalinità (mg/L),Alcalinità equivalente a carbonati (mg/L di CO3),Alcalinità equivalente a idrossidi (mg/L di OH),Bicarbonati (mg/L),Torbidità (NTu),Colore (Cu),...,Enterococchi (UFC/250 ml),Batteri coliformi a 37°C (UFC/250 ml),Acido Perfluoroottanoico PFOA (µg/L),Acido Perfluoroottansolfonico PFOS (µg/L),Somma di PFAS (µg/L),filtro 1,filtro 2,Numero analiti,Autorizzazione di RT \ndei risultati,Note
0,2024-06-04 00:00:00,2763/24,Ingresso Casa dell'acqua Via Tognazzi 3,100184,267.1,,,,0.3,"<1,0",...,,,"<0,01","<0,01","<0,01",Ingresso Casa dell'acqua Via Tognazzi 3,100184.0,100.0,Angela Manenti,
1,2024-06-04 00:00:00,2770/24,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205,237.3,,,,"<0,3",1.7,...,,,"<0,01","<0,01",0.01,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205.0,100.0,Angela Manenti,
2,2024-06-10 00:00:00,2849/24,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205,221.0,,,,"<0,3",1.8,...,,,"<0,01","<0,01","<0,01",Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205.0,101.0,Angela Manenti,
3,2024-06-10 00:00:00,2851/24,Ingresso Casa dell'acqua Via Valparaìso / Mont...,100900,208.7,,,,"<0,3",2.7,...,,,"<0,01","<0,01","<0,01",Ingresso Casa dell'acqua Via Valparaìso / Mont...,100900.0,101.0,Angela Manenti,
4,2024-06-17 00:00:00,2998/24,Ingresso Casa dell'acqua Piazza Antonio Gramsc...,100708,197.2,,,,"<0,3",1.2,...,,,"<0,01","<0,01","<0,01",Ingresso Casa dell'acqua Piazza Antonio Gramsc...,100708.0,101.0,Angela Manenti,


In [6]:
with open(os.path.join(utils_folder, "columns_types.json")) as f:
    column_types = json.load(f)

In [7]:
metadata_columns = column_types["metadata_columns"]
features_columns = column_types["features_columns"]
targets_columns = column_types["targets_columns"]

In [8]:
common_metadata_columns = list(
    set(metadata_columns).intersection(grab_df.columns)
)
common_features_columns = list(
    set(features_columns).intersection(grab_df.columns)
)
common_targets_columns = list(
    set(targets_columns).intersection(grab_df.columns)
)

In [9]:
# remove columns that are not in the column_types.json file
grab_df = grab_df[
    common_metadata_columns + common_features_columns + common_targets_columns
]

In [10]:
grab_df.head(5)

Unnamed: 0,Data di prelievo,Rapporto di prova,Punto di prelievo,Codice punto di prelievo,Colore (Cu),Conduttività a 20°C (µS/cm),Temperatura (al prelievo) (°C),Torbidità (NTu),Nitrati (mg/L),Cloro residuo libero (al prelievo) (mg/L di Cl2),...,Bromodiclorometano (µg/L),Enterococchi (MPN/100 mL),Somma di PFAS (µg/L),Escherichia coli (MPN/100 mL),Acido Perfluoroottansolfonico PFOS (µg/L),Cloroformio (µg/L),Conta delle colonie a 22°C (UFC/mL),Pseudomonas aeruginosa (UFC/250 mL),Dibromoclorometano (µg/L),Bromoformio (µg/L)
0,2024-06-04 00:00:00,2763/24,Ingresso Casa dell'acqua Via Tognazzi 3,100184,"<1,0",693.0,17.9,0.3,30.8,0.08,...,"<0,20",0,"<0,01",0,"<0,01",0.5,0,<1,0.3,2.1
1,2024-06-04 00:00:00,2770/24,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205,1.7,549.0,16.6,"<0,3",30.1,0.04,...,"<0,20",0,0.01,0,"<0,01",1.3,0,<1,0.2,1.3
2,2024-06-10 00:00:00,2849/24,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205,1.8,544.0,19.9,"<0,3",29.2,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",1.0,<3,<1,0.2,1.2
3,2024-06-10 00:00:00,2851/24,Ingresso Casa dell'acqua Via Valparaìso / Mont...,100900,2.7,529.0,18.3,"<0,3",28.2,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",2.5,<3,<1,"<0,20",1.4
4,2024-06-17 00:00:00,2998/24,Ingresso Casa dell'acqua Piazza Antonio Gramsc...,100708,1.2,533.0,17.9,"<0,3",31.6,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",2.1,<3,<1,"<0,20","<0,20"


## Fix LOD values

In [11]:
import re


def convert_string_values(s):
    if isinstance(s, (int, float)):
        return s
    elif pd.isna(s):
        return None
    else:
        if "," in s:
            s = s.replace(",", ".")
        if "<" in s:
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) / 2 if number else None
        elif ">" in s:
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) if number else None
        elif "*" in s or re.search("[a-zA-Z]", s):
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) if number else None
        else:
            return None

In [12]:
def set_label(value):
    if pd.isna(value):
        return "NaN"
    elif isinstance(value, (int, float)):
        return "Normal"
    elif "<" in value:
        return "Less than"
    elif ">" in value:
        return "Greater than"
    else:
        return "NaN"

In [13]:
# add TTHMs columns as the sum of the four TTHMs columns
grab_df

Unnamed: 0,Data di prelievo,Rapporto di prova,Punto di prelievo,Codice punto di prelievo,Colore (Cu),Conduttività a 20°C (µS/cm),Temperatura (al prelievo) (°C),Torbidità (NTu),Nitrati (mg/L),Cloro residuo libero (al prelievo) (mg/L di Cl2),...,Bromodiclorometano (µg/L),Enterococchi (MPN/100 mL),Somma di PFAS (µg/L),Escherichia coli (MPN/100 mL),Acido Perfluoroottansolfonico PFOS (µg/L),Cloroformio (µg/L),Conta delle colonie a 22°C (UFC/mL),Pseudomonas aeruginosa (UFC/250 mL),Dibromoclorometano (µg/L),Bromoformio (µg/L)
0,2024-06-04 00:00:00,2763/24,Ingresso Casa dell'acqua Via Tognazzi 3,100184,"<1,0",693.0,17.9,0.3,30.8,0.08,...,"<0,20",0,"<0,01",0,"<0,01",0.5,0,<1,0.3,2.1
1,2024-06-04 00:00:00,2770/24,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205,1.7,549.0,16.6,"<0,3",30.1,0.04,...,"<0,20",0,0.01,0,"<0,01",1.3,0,<1,0.2,1.3
2,2024-06-10 00:00:00,2849/24,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205,1.8,544.0,19.9,"<0,3",29.2,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",1.0,<3,<1,0.2,1.2
3,2024-06-10 00:00:00,2851/24,Ingresso Casa dell'acqua Via Valparaìso / Mont...,100900,2.7,529.0,18.3,"<0,3",28.2,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",2.5,<3,<1,"<0,20",1.4
4,2024-06-17 00:00:00,2998/24,Ingresso Casa dell'acqua Piazza Antonio Gramsc...,100708,1.2,533.0,17.9,"<0,3",31.6,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",2.1,<3,<1,"<0,20","<0,20"
5,2024-06-19 00:00:00,3047/24,Ingresso Casa dell'acqua Via Valparaìso / Mont...,100900,"<1,0",525.0,19.1,"<0,3",27.9,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",3.7,<3,<1,"<0,20",1
6,2024-06-19 00:00:00,3048/24,Ingresso Casa dell'acqua Via Tabacchi - Via Ba...,100205,"<1,0",526.0,16.8,"<0,3",29.1,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",1.2,4,<1,0.2,1.2
7,2024-06-20 00:00:00,3072/24,Ingresso Casa dell'acqua Via Berna 11/4,100408,1.3,557.0,18.1,"<0,3",30.8,0.06,...,"<0,20",0,"<0,01",0,"<0,01",1.3,110,<1,0.3,1.6
8,2024-06-20 00:00:00,3074/24,Ingresso Casa dell'acqua Piazzale Giovanni dal...,100374,2.6,519.0,18.2,"<0,3",21.1,0.07,...,"<0,20",0,"<0,01",0,"<0,01",0.7,3,<1,"<0,20",2
9,2024-06-24 00:00:00,3105/24,Ingresso Casa dell'acqua Via Valparaìso / Mont...,100900,1.4,524.0,18.1,"<0,3",27.3,"<0,04",...,"<0,20",0,"<0,01",0,"<0,01",4.9,4,<1,"<0,20",1.2


In [14]:
for column in common_features_columns + common_targets_columns:
    label_column = column + "_label"
    grab_df.loc[:, label_column] = grab_df[column].apply(set_label)

In [15]:
grab_df[common_features_columns] = grab_df[common_features_columns].map(
    convert_string_values
)

grab_df[common_targets_columns] = grab_df[common_targets_columns].map(
    convert_string_values
)

In [16]:
grab_df.columns.to_list()

['Data di prelievo',
 'Rapporto di prova',
 'Punto di prelievo',
 'Codice punto di prelievo',
 'Colore (Cu)',
 'Conduttività a 20°C (µS/cm)',
 'Temperatura (al prelievo) (°C)',
 'Torbidità (NTu)',
 'Nitrati (mg/L)',
 'Cloro residuo libero (al prelievo) (mg/L di Cl2)',
 'Concentrazione ioni idrogeno (unità pH)',
 'TOC - carbonio organico totale (mg/L di C)',
 'Batteri coliformi a 37°C (MPN/100 mL)',
 'Acido Perfluoroottanoico PFOA (µg/L)',
 'Bromodiclorometano (µg/L)',
 'Enterococchi (MPN/100 mL)',
 'Somma di PFAS (µg/L)',
 'Escherichia coli (MPN/100 mL)',
 'Acido Perfluoroottansolfonico PFOS (µg/L)',
 'Cloroformio (µg/L)',
 'Conta delle colonie a 22°C (UFC/mL)',
 'Pseudomonas aeruginosa (UFC/250 mL)',
 'Dibromoclorometano (µg/L)',
 'Bromoformio (µg/L)',
 'Colore (Cu)_label',
 'Conduttività a 20°C (µS/cm)_label',
 'Temperatura (al prelievo) (°C)_label',
 'Torbidità (NTu)_label',
 'Nitrati (mg/L)_label',
 'Cloro residuo libero (al prelievo) (mg/L di Cl2)_label',
 'Concentrazione ioni idr

# Load Sensor Samples

In [17]:
sensor_dict = {}

for sensor_file in os.listdir(sensors_folder):
    if sensor_file == ".DS_Store":
        continue

    sensor_folder = os.path.join(sensors_folder, sensor_file)
    for filename in os.listdir(sensor_folder):
        if not filename.endswith(".xlsx"):
            continue

        house_code = filename.split("_")[0]
        if house_code not in sensor_dict:
            sensor_dict[house_code] = pd.read_excel(
                os.path.join(sensor_folder, filename), header=1
            )
        else:
            df = pd.read_excel(os.path.join(sensor_folder, filename), header=1)
            sensor_dict[house_code] = pd.concat([sensor_dict[house_code], df])

In [18]:
sensor_dict["via TABACCHI"].columns.to_list()

['Measurement interval=900[sec] (Export-Aggregation disabled)',
 'Status',
 'Tag',
 'COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)',
 'Status [COLORtrue - Measured value]',
 'COLORtrue - Clean value [Hazen-eq.] (Limit:0.00-300.00)',
 'Status [COLORtrue - Clean value]',
 'TOCeq - Measured value [mg/l] (Limit:0.00-22.00)',
 'Status [TOCeq - Measured value]',
 'TOCeq - Clean value [mg/l] (Limit:0.00-22.00)',
 'Status [TOCeq - Clean value]',
 'NO3eq - Measured value [mg/l] (Limit:0.00-88.00)',
 'Status [NO3eq - Measured value]',
 'NO3eq - Clean value [mg/l] (Limit:0.00-88.00)',
 'Status [NO3eq - Clean value]',
 'UV254t - Measured value [Abs/m] (Limit:0.00-71.00)',
 'Status [UV254t - Measured value]',
 'UV254t - Clean value [Abs/m] (Limit:0.00-71.00)',
 'Status [UV254t - Clean value]',
 'Turbidity - Measured value [FTUeq] (Limit:0.00-170.00)',
 'Status [Turbidity - Measured value]',
 'DOCeq - Measured value [mg/l] (Limit:0.00-17.00)',
 'Status [DOCeq - Measured value]',
 'DOCeq

In [19]:
columns_mapping = {
    "Measurement interval=900[sec] (Export-Aggregation disabled)": "DateTime",
    "Measurement interval=999[sec] (Export-Aggregation disabled)": "DateTime",
    "Measurement interval=0[sec] (Export-Aggregation disabled)": "DateTime",
    "COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)": "Color (CU)",
    "TOCeq - Measured value [mg/l] (Limit:0.00-22.00)": "TOC (mg/l)",
    "NO3eq - Measured value [mg/l] (Limit:0.00-88.00)": "Nitrate (mg/l)",
    "UV254t - Measured value [Abs/m] (Limit:0.00-71.00)": "UVA254 (1/m)",
    "Turbidity - Measured value [FTUeq] (Limit:0.00-170.00)": "Turbidity (FTU)",
    "pH - Measured value (Limit:0.00-14.00)": "pH",
    "Temperature - Measured value [C] (Limit:-5.00-100.00)": "Temperature (°C)",
    "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)": "Conductivity (μS/cm)",
    "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00)": "Free Chlorine (mg/l)",
}


for house_code, df in sensor_dict.items():
    sensor_dict[house_code] = df.rename(columns=columns_mapping)

    # set to get unique values
    columns = set(columns_mapping.values())

    sensor_dict[house_code] = sensor_dict[house_code][list(columns)]

# Processing

## Grab Samples

In [20]:
grab_df.drop(
    columns=[
        "Codice punto di prelievo",
        "Rapporto di prova",
    ],
    inplace=True,
)

In [21]:
grab_df["Punto di prelievo"].unique()

array(["Ingresso Casa dell'acqua Via Tognazzi 3",
       "Ingresso Casa dell'acqua Via Tabacchi - Via Balill",
       "Ingresso Casa dell'acqua Via Valparaìso / Montevid",
       "Ingresso Casa dell'acqua Piazza Antonio Gramsci 2-",
       "Ingresso Casa dell'acqua Via Berna 11/4",
       "Ingresso Casa dell'acqua Piazzale Giovanni dalle B",
       "Ingresso Casa dell'acqua Piazza Prealpi 4",
       "Ingresso Casa dell'Acqua Via Chiostergi - Via Gius",
       "Ingresso Casa dell'acqua Via Valparaìso / Montevideo",
       "Ingresso Casa dell'acqua Via Tabacchi - Via Balilla 3",
       "Ingresso Casa dell'acqua Piazzale Giovanni dalle Bande Nere - Via Vinc",
       "Ingresso Casa dell'acqua Piazza Antonio Gramsci 2-10",
       "Ingresso Casa dell'Acqua Via Chiostergi - Via Giuseppe Chiostergi 13",
       "Ingresso Casa dell'acqua Piazza Fortunato - Ospedale Galezzi",
       "Ing SAFECREW Casa dell'acqua Via Valparaìso / Montevideo",
       "Ing SAFECREW Casa dell'acqua Via Tabacchi - Via

In [22]:
# change name of Punta di prelievo values to match codes
def change_name(name):
    if "Tognazzi" in name:
        return "Tognazzi"
    elif "Tabacchi" in name:
        return "Tabacchi"
    elif "Gramsci" in name:
        return "Gramsci"
    elif "Berna" in name:
        return "Berna"
    elif "Bande Nere" in name or "Piazzale Giovanni" in name:
        return "Bande Nere"
    elif "Prealpi" in name:
        return "Prealpi"
    elif "Chiostergi" in name:
        return "Chiostergi"
    elif "Montevideo" in name or "Montevid" in name:
        return "Montevideo"
    elif "Fortunato" in name:
        return "Fortunato"
    else:
        return name

In [23]:
grab_df["Punto di prelievo"] = grab_df["Punto di prelievo"].map(change_name)

In [24]:
grab_df["Data di prelievo"] = pd.to_datetime(grab_df["Data di prelievo"])

In [25]:
grab_df

Unnamed: 0,Data di prelievo,Punto di prelievo,Colore (Cu),Conduttività a 20°C (µS/cm),Temperatura (al prelievo) (°C),Torbidità (NTu),Nitrati (mg/L),Cloro residuo libero (al prelievo) (mg/L di Cl2),Concentrazione ioni idrogeno (unità pH),TOC - carbonio organico totale (mg/L di C),...,Bromodiclorometano (µg/L)_label,Enterococchi (MPN/100 mL)_label,Somma di PFAS (µg/L)_label,Escherichia coli (MPN/100 mL)_label,Acido Perfluoroottansolfonico PFOS (µg/L)_label,Cloroformio (µg/L)_label,Conta delle colonie a 22°C (UFC/mL)_label,Pseudomonas aeruginosa (UFC/250 mL)_label,Dibromoclorometano (µg/L)_label,Bromoformio (µg/L)_label
0,2024-06-04,Tognazzi,0.5,693.0,17.9,0.3,30.8,0.08,7.3,5.28,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Normal,Normal
1,2024-06-04,Tabacchi,1.7,549.0,16.6,0.15,30.1,0.04,7.5,1.5,...,Less than,Normal,Normal,Normal,Less than,Normal,Normal,Less than,Normal,Normal
2,2024-06-10,Tabacchi,1.8,544.0,19.9,0.15,29.2,0.02,7.5,1.06,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Normal,Normal
3,2024-06-10,Montevideo,2.7,529.0,18.3,0.15,28.2,0.02,7.5,0.54,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Normal
4,2024-06-17,Gramsci,1.2,533.0,17.9,0.15,31.6,0.02,7.6,0.12,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Less than
5,2024-06-19,Montevideo,0.5,525.0,19.1,0.15,27.9,0.02,7.5,0.26,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Normal
6,2024-06-19,Tabacchi,0.5,526.0,16.8,0.15,29.1,0.02,7.6,0.35,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Normal,Normal
7,2024-06-20,Berna,1.3,557.0,18.1,0.15,30.8,0.06,7.6,1.19,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Normal,Normal
8,2024-06-20,Bande Nere,2.6,519.0,18.2,0.15,21.1,0.07,7.5,0.31,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Less than,Normal
9,2024-06-24,Montevideo,1.4,524.0,18.1,0.15,27.3,0.02,7.5,0.39,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Less than,Normal


## Sensor Samples

In [26]:
sensor_dict.keys()

dict_keys(['Fortunato', 'via Tognazzi', 'via TABACCHI', 'Berna', 'Piazza Prealpi', 'Gramsci', 'Montevideo', 'Chiostergi', 'Bande Nere'])

In [27]:
# change the name of the keys to match the names in the grab_df
sensor_dict["Tabacchi"] = sensor_dict.pop("via TABACCHI")
sensor_dict["Tognazzi"] = sensor_dict.pop("via Tognazzi")
sensor_dict["Prealpi"] = sensor_dict.pop("Piazza Prealpi")

In [28]:
for code in grab_df["Punto di prelievo"].unique():
    sensor_df = sensor_dict.pop(code)
    sensor_df["DateTime"] = pd.to_datetime(sensor_df["DateTime"])
    sensor_df.set_index("DateTime", inplace=True)
    sensor_dict[code] = sensor_df

# Missing Values

## Grab

In [29]:
# compute number of missing values for each column
for code in grab_df["Punto di prelievo"].unique():
    code_df = grab_df[grab_df["Punto di prelievo"] == code]
    for column in common_features_columns + common_targets_columns:
        # count the number of missing values
        missing_values = code_df[column].isna().sum()
        if missing_values > 0:
            print(
                f"{code} has {missing_values} missing values in column {column}"
            )

Tabacchi has 2 missing values in column Colore (Cu)
Tabacchi has 4 missing values in column Nitrati (mg/L)
Tabacchi has 4 missing values in column Acido Perfluoroottanoico PFOA (µg/L)
Tabacchi has 4 missing values in column Somma di PFAS (µg/L)
Tabacchi has 4 missing values in column Acido Perfluoroottansolfonico PFOS (µg/L)
Tabacchi has 2 missing values in column Dibromoclorometano (µg/L)
Tabacchi has 3 missing values in column Bromoformio (µg/L)
Montevideo has 1 missing values in column Colore (Cu)
Montevideo has 1 missing values in column Torbidità (NTu)
Montevideo has 4 missing values in column Nitrati (mg/L)
Montevideo has 4 missing values in column Acido Perfluoroottanoico PFOA (µg/L)
Montevideo has 4 missing values in column Somma di PFAS (µg/L)
Montevideo has 4 missing values in column Acido Perfluoroottansolfonico PFOS (µg/L)
Montevideo has 1 missing values in column Conta delle colonie a 22°C (UFC/mL)
Montevideo has 3 missing values in column Bromoformio (µg/L)
Berna has 1 mi

In [30]:
grab_df

Unnamed: 0,Data di prelievo,Punto di prelievo,Colore (Cu),Conduttività a 20°C (µS/cm),Temperatura (al prelievo) (°C),Torbidità (NTu),Nitrati (mg/L),Cloro residuo libero (al prelievo) (mg/L di Cl2),Concentrazione ioni idrogeno (unità pH),TOC - carbonio organico totale (mg/L di C),...,Bromodiclorometano (µg/L)_label,Enterococchi (MPN/100 mL)_label,Somma di PFAS (µg/L)_label,Escherichia coli (MPN/100 mL)_label,Acido Perfluoroottansolfonico PFOS (µg/L)_label,Cloroformio (µg/L)_label,Conta delle colonie a 22°C (UFC/mL)_label,Pseudomonas aeruginosa (UFC/250 mL)_label,Dibromoclorometano (µg/L)_label,Bromoformio (µg/L)_label
0,2024-06-04,Tognazzi,0.5,693.0,17.9,0.3,30.8,0.08,7.3,5.28,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Normal,Normal
1,2024-06-04,Tabacchi,1.7,549.0,16.6,0.15,30.1,0.04,7.5,1.5,...,Less than,Normal,Normal,Normal,Less than,Normal,Normal,Less than,Normal,Normal
2,2024-06-10,Tabacchi,1.8,544.0,19.9,0.15,29.2,0.02,7.5,1.06,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Normal,Normal
3,2024-06-10,Montevideo,2.7,529.0,18.3,0.15,28.2,0.02,7.5,0.54,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Normal
4,2024-06-17,Gramsci,1.2,533.0,17.9,0.15,31.6,0.02,7.6,0.12,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Less than
5,2024-06-19,Montevideo,0.5,525.0,19.1,0.15,27.9,0.02,7.5,0.26,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Normal
6,2024-06-19,Tabacchi,0.5,526.0,16.8,0.15,29.1,0.02,7.6,0.35,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Normal,Normal
7,2024-06-20,Berna,1.3,557.0,18.1,0.15,30.8,0.06,7.6,1.19,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Normal,Normal
8,2024-06-20,Bande Nere,2.6,519.0,18.2,0.15,21.1,0.07,7.5,0.31,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Less than,Normal
9,2024-06-24,Montevideo,1.4,524.0,18.1,0.15,27.3,0.02,7.5,0.39,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Less than,Normal


In [31]:
# compute number of rows that have at least one missing value
for code in grab_df["Punto di prelievo"].unique():
    code_df = grab_df[grab_df["Punto di prelievo"] == code]

    missing_values = (
        code_df[common_features_columns + common_targets_columns]
        .isna()
        .any(axis=1)
        .sum()
    )
    if missing_values > 0:
        print(f"{code} has {missing_values} rows with missing values")

Tabacchi has 4 rows with missing values
Montevideo has 4 rows with missing values
Berna has 1 rows with missing values


In [32]:
# remove the Berna rows with missing values
row_index = grab_df[
    (grab_df["Punto di prelievo"] == "Berna")
    & (
        grab_df[common_features_columns + common_targets_columns]
        .isna()
        .any(axis=1)
    )
].index

grab_df.drop(row_index, inplace=True)

In [33]:
# for the moment no imputation is done

## Sensor

In [34]:
for code in sensor_dict.keys():
    sensor_df = sensor_dict[code]
    for column in sensor_df.columns:
        missing_values = sensor_df[column].isna().sum()
        if missing_values > 0:
            print(
                f"{code} has {missing_values} missing values in column {column}"
            )

Tabacchi has 6 missing values in column Color (CU)
Tabacchi has 6 missing values in column Nitrate (mg/l)
Tabacchi has 6 missing values in column pH
Tabacchi has 6 missing values in column UVA254 (1/m)
Tabacchi has 6 missing values in column Turbidity (FTU)
Tabacchi has 6 missing values in column Temperature (°C)
Tabacchi has 6 missing values in column Conductivity (μS/cm)
Tabacchi has 6 missing values in column Free Chlorine (mg/l)
Tabacchi has 6 missing values in column TOC (mg/l)
Bande Nere has 1 missing values in column Turbidity (FTU)
Fortunato has 1 missing values in column TOC (mg/l)


In [35]:
# the number of missing values is very low, so we can do implicit imputation with time interpolation
for code in sensor_dict.keys():
    sensor_df = sensor_dict.pop(code)
    sensor_df.interpolate(method="time", inplace=True)
    sensor_dict[code] = sensor_df

# Combine Historical Grab Samples

In [36]:
grab_samples = []

for file in os.listdir(all_grab_samples_path):
    if file.endswith(".xlsx"):
        df = pd.read_excel(os.path.join(all_grab_samples_path, file), header=11)
    else:
        df = pd.read_excel(os.path.join(all_grab_samples_path, file), header=15)
    common_cols = list(
        set(df.columns.to_list())
        & set(metadata_columns + features_columns + targets_columns)
    )
    df = df[common_cols]
    grab_samples.append(df)

grab_samples_df = pd.concat(grab_samples, ignore_index=True)

In [37]:
grab_samples_df

Unnamed: 0,Conduttività a 20°C (µS/cm),Colore (CU),Codice punto di prelievo,Acido Perfluoroottanoico PFOA (µg/L),Temperatura - °C,Enterococchi (MPN / 100mL),Batteri coliformi a 37°C (MPN / 100 mL),Data di prelievo,Nitrati (mg/L),Acido Perfluoroottansolfonico PFOS (µg/L),...,Conta delle colonie a 22°C (UFC/mL),Enterococchi (MPN/100 mL),Temperatura (°C),Escherichia coli (MPN/100 mL),Colore (Cu),Batteri coliformi a 37°C (MPN/100 mL),Torbidità (NTu),Pseudomonas aeruginosa (UFC/250 mL),Cloro residuo libero (al prelievo) (mg/L di Cl2),Temperatura (al prelievo) (°C)
0,657.3385,"<0,01",TEST00216,,15.8,,,2011-12-14,,,...,,,,,,,,,,
1,,,TEST00216,,,0,0,2014-01-02,,,...,,,,,,,,,,
2,687,0.67,TEST00210,,,0,0,2011-11-22,,,...,,,,,,,,,,
3,,,TEST00210,,,,,2013-06-24,,,...,,,,,,,,,,
4,,,TEST00210,,,,,2013-07-16,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76091,578.0,,015146C003,,,,,2023-05-31,,,...,,0,,0.0,<1.0,0,<0.3,,0.05,
76092,528.0,,TUBAZIONE,,,,,2023-05-31,,,...,,0,,0.0,1.0,0,<0.3,,<0.04,
76093,493.0,,TUBAZIONE,,,,,2023-05-31,,,...,,0,,0.0,<1.0,0,<0.3,,<0.04,
76094,404.0,,BRICK,,,,,2023-05-31,21.8,,...,<1,,,,1.0,,<0.3,0,<0.04,


In [38]:
column_list = "CS, CT"

meta_supply_points_df = pd.read_excel(
    grab_samples_supply_points_path, usecols=column_list, header=4
)

In [39]:
meta_supply_points_df

Unnamed: 0,filtro 1,filtro 2
0,C.A. ingresso Menotti > U.di Nemi,ING_UCCNEMI
1,Casa Acqua Appennini con CO2,SII00801
2,Casa Acqua Lessona,SII00659
3,Casa Acqua Lessona,SII00659
4,Casa Acqua Via Appennini non trattata,SII00802
...,...,...
2263,Casa dell'acqua Piazza Scolari,HOUSE_SCOLARI
2264,V.le Omero-NT-CA21,HOUSE_OME1
2265,Menotti > U.di Nemi-NT-CA08,HOUSE_UCC1
2266,Casa dell'acqua Piazza Ovidio,HOUSE_OVIDIO


In [40]:
hist_grab_df = grab_samples_df.merge(
    meta_supply_points_df,
    left_on=["Punto di prelievo", "Codice punto di prelievo"],
    right_on=["filtro 1", "filtro 2"],
    how="inner",
)

In [41]:
# supply_points_df.drop(columns=["filtro 1", "filtro 2"], inplace=True)
hist_grab_df.drop_duplicates(inplace=True)

In [42]:
# combine all value columns in the mapping to the corresponding key column
column_mapping = {
    "Temperatura (°C)": [
        "Temperatura - °C",
        "Temperatura (al prelievo) (°C)",
    ],
    "Cloro residuo libero (mg/L di Cl2)": [
        "Cloro residuo libero (al prelievo) (mg/L di Cl2)",
    ],
    "Torbidità (NTU)": [
        "Torbidità (NTu)",
    ],
    "Batteri coliformi a 37°C (MPN/100 mL)": [
        "Batteri coliformi a 37°C (MPN / 100 mL)",
    ],
    "Colore (CU)": [
        "Colore (Cu)",
    ],
    "Escherichia coli (MPN/100 mL)": [
        "Escherichia Coli (MPN / 100mL)",
    ],
    "Enterococchi (MPN/100 mL)": [
        "Enterococchi (MPN / 100mL)",
    ],
}

for final_column, original_columns in column_mapping.items():
    for original_column in original_columns:
        hist_grab_df[final_column] = hist_grab_df[final_column].combine_first(
            hist_grab_df[original_column]
        )
    hist_grab_df.drop(columns=original_columns, inplace=True)

In [43]:
hist_grab_df.columns.to_list()

['Conduttività a 20°C (µS/cm)',
 'Colore (CU)',
 'Codice punto di prelievo',
 'Acido Perfluoroottanoico PFOA (µg/L)',
 'Data di prelievo',
 'Nitrati (mg/L)',
 'Acido Perfluoroottansolfonico PFOS (µg/L)',
 'Punto di prelievo',
 'Bromodiclorometano (µg/L)',
 'Cloroformio (µg/L)',
 'Torbidità (NTU)',
 'Bromoformio (µg/L)',
 'TOC - carbonio organico totale (mg/L di C)',
 'Concentrazione ioni idrogeno (unità pH)',
 'Rapporto di prova',
 'Cloro residuo libero (mg/L di Cl2)',
 'Dibromoclorometano (µg/L)',
 'Conta delle colonie a 22°C (UFC/mL)',
 'Enterococchi (MPN/100 mL)',
 'Temperatura (°C)',
 'Escherichia coli (MPN/100 mL)',
 'Batteri coliformi a 37°C (MPN/100 mL)',
 'Pseudomonas aeruginosa (UFC/250 mL)',
 'filtro 1',
 'filtro 2']

In [44]:
metadata_columns = column_types["metadata_columns"]
features_columns = column_types["features_columns"]
targets_columns = column_types["targets_columns"]

In [45]:
hist_common_metadata_columns = list(
    set(metadata_columns).intersection(hist_grab_df.columns)
)
hist_common_features_columns = list(
    set(features_columns).intersection(hist_grab_df.columns)
)
hist_common_targets_columns = list(
    set(targets_columns).intersection(hist_grab_df.columns)
)

In [46]:
def print_columns(title, columns):
    print(f"{title}:")
    for col in columns:
        print(f"  - {col}")
    print()


print_columns(
    "Historical Common Metadata Columns", hist_common_metadata_columns
)
print_columns("Common Metadata Columns", common_metadata_columns)
print_columns(
    "Historical Common Features Columns", hist_common_features_columns
)
print_columns("Common Features Columns", common_features_columns)
print_columns("Historical Common Targets Columns", hist_common_targets_columns)
print_columns("Common Targets Columns", common_targets_columns)

Historical Common Metadata Columns:
  - Data di prelievo
  - Rapporto di prova
  - Punto di prelievo
  - Codice punto di prelievo

Common Metadata Columns:
  - Data di prelievo
  - Rapporto di prova
  - Punto di prelievo
  - Codice punto di prelievo

Historical Common Features Columns:
  - Conduttività a 20°C (µS/cm)
  - Temperatura (°C)
  - Nitrati (mg/L)
  - Colore (CU)
  - Concentrazione ioni idrogeno (unità pH)
  - Cloro residuo libero (mg/L di Cl2)
  - Torbidità (NTU)
  - TOC - carbonio organico totale (mg/L di C)

Common Features Columns:
  - Colore (Cu)
  - Conduttività a 20°C (µS/cm)
  - Temperatura (al prelievo) (°C)
  - Torbidità (NTu)
  - Nitrati (mg/L)
  - Cloro residuo libero (al prelievo) (mg/L di Cl2)
  - Concentrazione ioni idrogeno (unità pH)
  - TOC - carbonio organico totale (mg/L di C)

Historical Common Targets Columns:
  - Batteri coliformi a 37°C (MPN/100 mL)
  - Bromodiclorometano (µg/L)
  - Enterococchi (MPN/100 mL)
  - Dibromoclorometano (µg/L)
  - Escherichia

In [47]:
# do a mapping of the hist_common_features_columns to the common_features_columns (sorted)
# and the hist_common_targets_columns to the common_targets_columns (sorted)

# the mapping is done by sorting the columns and then zipping them together
mapping_features = dict(
    zip(sorted(hist_common_features_columns), sorted(common_features_columns))
)
mapping_targets = dict(
    zip(sorted(hist_common_targets_columns), sorted(common_targets_columns))
)

In [48]:
hist_grab_df.rename(columns=mapping_features, inplace=True)
hist_grab_df.rename(columns=mapping_targets, inplace=True)

In [49]:
hist_grab_df.columns.to_list()

['Conduttività a 20°C (µS/cm)',
 'Colore (Cu)',
 'Codice punto di prelievo',
 'Acido Perfluoroottanoico PFOA (µg/L)',
 'Data di prelievo',
 'Nitrati (mg/L)',
 'Acido Perfluoroottansolfonico PFOS (µg/L)',
 'Punto di prelievo',
 'Bromodiclorometano (µg/L)',
 'Cloroformio (µg/L)',
 'Torbidità (NTu)',
 'Bromoformio (µg/L)',
 'TOC - carbonio organico totale (mg/L di C)',
 'Concentrazione ioni idrogeno (unità pH)',
 'Rapporto di prova',
 'Cloro residuo libero (al prelievo) (mg/L di Cl2)',
 'Dibromoclorometano (µg/L)',
 'Conta delle colonie a 22°C (UFC/mL)',
 'Enterococchi (MPN/100 mL)',
 'Temperatura (al prelievo) (°C)',
 'Escherichia coli (MPN/100 mL)',
 'Batteri coliformi a 37°C (MPN/100 mL)',
 'Pseudomonas aeruginosa (UFC/250 mL)',
 'filtro 1',
 'filtro 2']

In [50]:
hist_grab_df.drop(
    columns=[
        "filtro 1",
        "filtro 2",
    ],
    inplace=True,
)

In [51]:
# get only the hist_grab_df rows that have the Punto di prelievo containing the grab_df Punto di prelievo
hist_grab_df = hist_grab_df[
    (
        hist_grab_df["Punto di prelievo"].str.contains(
            "|".join(grab_df["Punto di prelievo"].unique()),
            case=False,
            na=False,
        )
    )
    | (
        hist_grab_df["Codice punto di prelievo"].str.contains(
            "|".join(grab_df["Punto di prelievo"].unique()),
            case=False,
            na=False,
        )
    )
]

In [52]:
hist_grab_df

Unnamed: 0,Conduttività a 20°C (µS/cm),Colore (Cu),Codice punto di prelievo,Acido Perfluoroottanoico PFOA (µg/L),Data di prelievo,Nitrati (mg/L),Acido Perfluoroottansolfonico PFOS (µg/L),Punto di prelievo,Bromodiclorometano (µg/L),Cloroformio (µg/L),...,Concentrazione ioni idrogeno (unità pH),Rapporto di prova,Cloro residuo libero (al prelievo) (mg/L di Cl2),Dibromoclorometano (µg/L),Conta delle colonie a 22°C (UFC/mL),Enterococchi (MPN/100 mL),Temperatura (al prelievo) (°C),Escherichia coli (MPN/100 mL),Batteri coliformi a 37°C (MPN/100 mL),Pseudomonas aeruginosa (UFC/250 mL)
447768,472.0,0.42,CASAACQUA,,2020-12-10,30,,Piazza Prealpi,,,...,7.6,3829/20,<0.04,,< 1,0,8.9,0.0,0,<1
447792,712.0,0.03,CASAACQUA,,2021-01-12,31,,Casa Acqua Via Tognazzi,,,...,7.2,79/21,0.05,,< 1,0,10.6,0.0,0,
447956,556.0,0.69,CASAACQUA,,2021-01-26,24,,Via Tabacchi,,,...,7.3,317/21,<0.04,,<1,0,10.8,0.0,0,
448238,,,HOUSE_PREALPI,,2021-03-10,,,Casa dell'acqua Piazza Prealpi,,,...,,917/21,<0.04,,< 1,0,7.1,0.0,0,< 1
448400,,,CASAACQUA,,2021-04-15,,,Piazza Bande Nere,,,...,,1370/21,<0.04,,< 1,0,12.9,0.0,0,< 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465350,,,HOUSE_BANDENERE,,2023-05-17,,,Casa dell'acqua Piazzale Giovanni dalle Bande ...,,,...,,2325/23,,,< 1,0,,0.0,0,
465361,,,HOUSE_BERNA,,2023-05-17,,,Casa dell'acqua Via Berna,,,...,,2326/23,,,< 1,0,,0.0,0,
465371,,,HOUSE_MONTEVIDEO,,2023-05-17,,,Casa dell'acqua Via Montevideo,,,...,,2327/23,,,microorganismi prese,0,,0.0,0,
465450,,,HOUSE_CHIOSTERGI,,2023-05-22,,,Casa dell'Acqua Via Chiostergi,,,...,,2353/23,,,<1,0,,0.0,0,<1


In [53]:
grab_df["Punto di prelievo"].unique()

array(['Tognazzi', 'Tabacchi', 'Montevideo', 'Gramsci', 'Berna',
       'Bande Nere', 'Prealpi', 'Chiostergi', 'Fortunato'], dtype=object)

In [54]:
grab_df.columns.to_list()

['Data di prelievo',
 'Punto di prelievo',
 'Colore (Cu)',
 'Conduttività a 20°C (µS/cm)',
 'Temperatura (al prelievo) (°C)',
 'Torbidità (NTu)',
 'Nitrati (mg/L)',
 'Cloro residuo libero (al prelievo) (mg/L di Cl2)',
 'Concentrazione ioni idrogeno (unità pH)',
 'TOC - carbonio organico totale (mg/L di C)',
 'Batteri coliformi a 37°C (MPN/100 mL)',
 'Acido Perfluoroottanoico PFOA (µg/L)',
 'Bromodiclorometano (µg/L)',
 'Enterococchi (MPN/100 mL)',
 'Somma di PFAS (µg/L)',
 'Escherichia coli (MPN/100 mL)',
 'Acido Perfluoroottansolfonico PFOS (µg/L)',
 'Cloroformio (µg/L)',
 'Conta delle colonie a 22°C (UFC/mL)',
 'Pseudomonas aeruginosa (UFC/250 mL)',
 'Dibromoclorometano (µg/L)',
 'Bromoformio (µg/L)',
 'Colore (Cu)_label',
 'Conduttività a 20°C (µS/cm)_label',
 'Temperatura (al prelievo) (°C)_label',
 'Torbidità (NTu)_label',
 'Nitrati (mg/L)_label',
 'Cloro residuo libero (al prelievo) (mg/L di Cl2)_label',
 'Concentrazione ioni idrogeno (unità pH)_label',
 'TOC - carbonio organico 

In [55]:
from operator import contains


# do a function that for a value, if an item of grab_df['Punto di prelievo'].unique() is contained in the value, then change the value to the item
def change_name(value):
    for name in grab_df["Punto di prelievo"].unique():
        if contains(value, name):
            return name
    return value

In [56]:
hist_grab_df["Punto di prelievo"] = hist_grab_df["Punto di prelievo"].map(
    change_name
)

In [57]:
for column in common_features_columns + common_targets_columns:
    if column not in hist_grab_df.columns:
        continue
    label_column = column + "_label"
    hist_grab_df.loc[:, label_column] = hist_grab_df[column].apply(set_label)

In [58]:
for column in common_features_columns + common_targets_columns:
    if column not in hist_grab_df.columns:
        continue
    hist_grab_df[column] = hist_grab_df[column].map(convert_string_values)

In [59]:
grab_df.shape, hist_grab_df.shape

((33, 42), (116, 42))

In [60]:
# give me the columns that are in grab_df but not in hist_grab_df
for column in grab_df.columns:
    if column not in hist_grab_df.columns:
        print(column)

Somma di PFAS (µg/L)
Somma di PFAS (µg/L)_label


In [61]:
for column in hist_grab_df.columns:
    if column not in grab_df.columns:
        print(column)

Codice punto di prelievo
Rapporto di prova


In [62]:
hist_grab_df.drop(
    columns=[
        "Rapporto di prova",
        "Codice punto di prelievo",
    ],
    inplace=True,
)

In [63]:
grab_df = pd.concat([grab_df, hist_grab_df], ignore_index=True)

In [64]:
grab_df.shape

(149, 42)

In [65]:
grab_df.columns.to_list()

['Data di prelievo',
 'Punto di prelievo',
 'Colore (Cu)',
 'Conduttività a 20°C (µS/cm)',
 'Temperatura (al prelievo) (°C)',
 'Torbidità (NTu)',
 'Nitrati (mg/L)',
 'Cloro residuo libero (al prelievo) (mg/L di Cl2)',
 'Concentrazione ioni idrogeno (unità pH)',
 'TOC - carbonio organico totale (mg/L di C)',
 'Batteri coliformi a 37°C (MPN/100 mL)',
 'Acido Perfluoroottanoico PFOA (µg/L)',
 'Bromodiclorometano (µg/L)',
 'Enterococchi (MPN/100 mL)',
 'Somma di PFAS (µg/L)',
 'Escherichia coli (MPN/100 mL)',
 'Acido Perfluoroottansolfonico PFOS (µg/L)',
 'Cloroformio (µg/L)',
 'Conta delle colonie a 22°C (UFC/mL)',
 'Pseudomonas aeruginosa (UFC/250 mL)',
 'Dibromoclorometano (µg/L)',
 'Bromoformio (µg/L)',
 'Colore (Cu)_label',
 'Conduttività a 20°C (µS/cm)_label',
 'Temperatura (al prelievo) (°C)_label',
 'Torbidità (NTu)_label',
 'Nitrati (mg/L)_label',
 'Cloro residuo libero (al prelievo) (mg/L di Cl2)_label',
 'Concentrazione ioni idrogeno (unità pH)_label',
 'TOC - carbonio organico 

In [66]:
# %%script false --no-raise-error
# FIXME this piece of code needs to be rearranged
columns_mapping = {
    "Data di prelievo": "DateTime",
    "Punto di prelievo": "Code",
    "Colore (Cu)": "Color (CU)",
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free Chlorine (mg/L)",
    "Concentrazione ioni idrogeno (unità pH)": "pH",
    "Conduttività a 20°C (µS/cm)": "Conductivity (uS/cm)",
    "TOC - carbonio organico totale (mg/L di C)": "TOC (mg/L)",
    "Temperatura (al prelievo) (°C)": "Temperature (°C)",
    "Torbidità (NTu)": "Turbidity (NTU)",
    "Nitrati (mg/L)": "Nitrate (mg/L)",
}

grab_df.rename(columns=columns_mapping, inplace=True)

# Combine with First Batch

## Grab

In [67]:
first_batch_grab_df = pd.read_excel(
    os.path.join(clean_data_folder, "Riunione 24-04-2024", "Grab Samples.xlsx")
)

In [68]:
first_batch_grab_df.head(5)

Unnamed: 0,Type,Code,DateTime,Color (CU),Turbidity (NTU),Conductivity (uS/cm),Free Chlorine (mg/L),pH,Temperature (°C),Nitrate (mg/L),...,Cloroformio (µg/L)_label,Batteri coliformi a 37°C (MPN/100 mL)_label,Conteggio colonie a 30°C (UFC/mL)_label,Bromodiclorometano (µg/L)_label,Escherichia coli (MPN/100 mL)_label,Bromoformio (µg/L)_label,Pseudomonas aeruginosa (UFC/250 mL)_label,Dibromoclorometano (µg/L)_label,Conta delle colonie a 37°C (UFC/mL)_label,Conta delle colonie a 22°C (UFC/mL)_label
0,Ingresso,HOUSE_BANDENERE,2023-05-17,0.5,0.15,458.0,,7.77,,,...,,,,,,,,,,
1,Ingresso,HOUSE_BANDENERE,2023-07-27,2.14,0.15,508.0,0.02,7.4,,,...,,,,,,,,,,
2,Ingresso,HOUSE_BANDENERE,2023-09-18,0.5,0.15,480.0,0.02,7.5,17.6,,...,,,,,,,,,,
3,Ingresso,HOUSE_BANDENERE,2023-12-11,,,551.0,0.05,7.5,,,...,,,,,,,,,,
4,Ingresso,HOUSE_BANDENERE,2024-02-01,1.2,0.15,502.0,0.02,7.0,8.5,21.5,...,Normal,Normal,,Less than,Normal,Normal,Less than,Less than,,Normal


In [69]:
first_batch_grab_df = first_batch_grab_df[
    first_batch_grab_df["Type"] == "Ingresso"
]
first_batch_grab_df.drop(columns=["Type"], inplace=True)

In [70]:
grab_df.head(5)

Unnamed: 0,DateTime,Code,Color (CU),Conductivity (uS/cm),Temperature (°C),Turbidity (NTU),Nitrate (mg/L),Free Chlorine (mg/L),pH,TOC (mg/L),...,Bromodiclorometano (µg/L)_label,Enterococchi (MPN/100 mL)_label,Somma di PFAS (µg/L)_label,Escherichia coli (MPN/100 mL)_label,Acido Perfluoroottansolfonico PFOS (µg/L)_label,Cloroformio (µg/L)_label,Conta delle colonie a 22°C (UFC/mL)_label,Pseudomonas aeruginosa (UFC/250 mL)_label,Dibromoclorometano (µg/L)_label,Bromoformio (µg/L)_label
0,2024-06-04,Tognazzi,0.5,693.0,17.9,0.3,30.8,0.08,7.3,5.28,...,Less than,Normal,Less than,Normal,Less than,Normal,Normal,Less than,Normal,Normal
1,2024-06-04,Tabacchi,1.7,549.0,16.6,0.15,30.1,0.04,7.5,1.5,...,Less than,Normal,Normal,Normal,Less than,Normal,Normal,Less than,Normal,Normal
2,2024-06-10,Tabacchi,1.8,544.0,19.9,0.15,29.2,0.02,7.5,1.06,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Normal,Normal
3,2024-06-10,Montevideo,2.7,529.0,18.3,0.15,28.2,0.02,7.5,0.54,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Normal
4,2024-06-17,Gramsci,1.2,533.0,17.9,0.15,31.6,0.02,7.6,0.12,...,Less than,Normal,Less than,Normal,Less than,Normal,Less than,Less than,Less than,Less than


In [71]:
first_batch_grab_df["Code"].unique()

array(['HOUSE_BANDENERE', 'HOUSE_BERNA', 'HOUSE_CHIOSTERGI',
       'HOUSE_FORTUNATO', 'HOUSE_GRAMSCI', 'HOUSE_MONTEVIDEO',
       'HOUSE_PREALPI', 'HOUSE_TABACCHI', 'HOUSE_TOGNAZZI'], dtype=object)

In [72]:
grab_df["Code"].unique()

array(['Tognazzi', 'Tabacchi', 'Montevideo', 'Gramsci', 'Berna',
       'Bande Nere', 'Prealpi', 'Chiostergi', 'Fortunato'], dtype=object)

In [73]:
code_mapping = {
    "HOUSE_BANDENERE": "Bande Nere",
    "HOUSE_BERNA": "Berna",
    "HOUSE_CHIOSTERGI": "Chiostergi",
    "HOUSE_FORTUNATO": "Fortunato",
    "HOUSE_GRAMSCI": "Gramsci",
    "HOUSE_MONTEVIDEO": "Montevideo",
    "HOUSE_PREALPI": "Prealpi",
    "HOUSE_TABACCHI": "Tabacchi",
    "HOUSE_TOGNAZZI": "Tognazzi",
}

In [74]:
first_batch_grab_df["Code"] = first_batch_grab_df["Code"].map(code_mapping)

In [75]:
grab_df.shape, first_batch_grab_df.shape

((149, 42), (56, 40))

In [76]:
for column in grab_df.columns:
    if column not in first_batch_grab_df.columns:
        print(column)

Acido Perfluoroottanoico PFOA (µg/L)
Somma di PFAS (µg/L)
Acido Perfluoroottansolfonico PFOS (µg/L)
Acido Perfluoroottanoico PFOA (µg/L)_label
Somma di PFAS (µg/L)_label
Acido Perfluoroottansolfonico PFOS (µg/L)_label


In [77]:
for column in first_batch_grab_df.columns:
    if column not in grab_df.columns:
        print(column)

Conteggio colonie a 30°C (UFC/mL)
Conta delle colonie a 37°C (UFC/mL)
Conteggio colonie a 30°C (UFC/mL)_label
Conta delle colonie a 37°C (UFC/mL)_label


In [78]:
# combine the two dataframes
grab_df = pd.concat([grab_df, first_batch_grab_df])

In [79]:
grab_df

Unnamed: 0,DateTime,Code,Color (CU),Conductivity (uS/cm),Temperature (°C),Turbidity (NTU),Nitrate (mg/L),Free Chlorine (mg/L),pH,TOC (mg/L),...,Acido Perfluoroottansolfonico PFOS (µg/L)_label,Cloroformio (µg/L)_label,Conta delle colonie a 22°C (UFC/mL)_label,Pseudomonas aeruginosa (UFC/250 mL)_label,Dibromoclorometano (µg/L)_label,Bromoformio (µg/L)_label,Conteggio colonie a 30°C (UFC/mL),Conta delle colonie a 37°C (UFC/mL),Conteggio colonie a 30°C (UFC/mL)_label,Conta delle colonie a 37°C (UFC/mL)_label
0,2024-06-04,Tognazzi,0.50,693.0,17.9,0.300,30.8,0.08,7.30,5.28,...,Less than,Normal,Normal,Less than,Normal,Normal,,,,
1,2024-06-04,Tabacchi,1.70,549.0,16.6,0.150,30.1,0.04,7.50,1.50,...,Less than,Normal,Normal,Less than,Normal,Normal,,,,
2,2024-06-10,Tabacchi,1.80,544.0,19.9,0.150,29.2,0.02,7.50,1.06,...,Less than,Normal,Less than,Less than,Normal,Normal,,,,
3,2024-06-10,Montevideo,2.70,529.0,18.3,0.150,28.2,0.02,7.50,0.54,...,Less than,Normal,Less than,Less than,Less than,Normal,,,,
4,2024-06-17,Gramsci,1.20,533.0,17.9,0.150,31.6,0.02,7.60,0.12,...,Less than,Normal,Less than,Less than,Less than,Less than,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,2023-07-12,Tognazzi,1.66,688.0,19.0,0.150,,0.02,7.46,0.22,...,,,,,,,,,,
98,2023-09-19,Tognazzi,0.50,698.0,18.7,0.150,,0.05,7.40,0.10,...,,,,,,,,,,
99,2023-11-14,Tognazzi,0.50,695.0,13.8,0.249,,0.04,7.37,0.24,...,,,,,,,,,,
100,2024-02-13,Tognazzi,0.50,691.0,12.9,0.150,30.6,0.02,7.10,0.20,...,,Normal,Normal,Less than,Normal,Normal,,,,


In [80]:
grab_df.sort_values(by="DateTime", inplace=True)

## Sensors

In [81]:
# first rename the columns of the current sensors_dfs

for code in sensor_dict.keys():
    sensor_df = sensor_dict[code].copy()

    sensor_df.rename(
        columns={
            "Conductivity (μS/cm)": "Conductivity (uS/cm)",
            "TOC (mg/l)": "TOC (mg/L)",
            "Nitrate (mg/l)": "Nitrate (mg/L)",
            "Free Chlorine (mg/l)": "Free Chlorine (mg/L)",
            "Turbidity (FTU)": "Turbidity (NTU)",
        },
        inplace=True,
    )
    sensor_dict.update({code: sensor_df})

In [82]:
first_batch_sensor_df = pd.read_excel(
    os.path.join(clean_data_folder, "Riunione 24-04-2024", "Sensor Data.xlsx")
)

In [83]:
first_batch_sensor_df

Unnamed: 0,DateTime,Code,Color (CU),TOC (mg/L),Nitrate (mg/L),UVA254 (1/m),Turbidity (NTU),pH,Temperature (°C),Conductivity (uS/cm),Free Chlorine (mg/L),Flow Rate (m³/s)
0,2023-12-23 00:00:00,HOUSE_BANDENERE,1.0,0.3,22.4,0.0,0.38,8.25,11.3,522.0,0.003,1.0
1,2023-12-23 00:15:00,HOUSE_BANDENERE,1.0,0.3,22.4,0.0,0.36,8.25,11.0,522.0,0.005,1.0
2,2023-12-23 00:30:00,HOUSE_BANDENERE,1.0,0.3,22.4,0.0,0.40,8.25,11.1,522.0,0.007,1.0
3,2023-12-23 00:45:00,HOUSE_BANDENERE,1.0,0.3,22.4,0.0,0.36,8.25,11.0,522.0,0.009,1.0
4,2023-12-23 01:00:00,HOUSE_BANDENERE,1.0,0.3,22.4,0.0,0.38,8.25,11.0,522.0,0.008,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
91065,2024-05-09 13:30:00,HOUSE_TOGNAZZI,1.0,0.3,29.6,0.8,0.03,7.29,16.9,697.0,0.036,1.0
91066,2024-05-09 13:45:00,HOUSE_TOGNAZZI,1.0,0.3,29.6,0.8,0.06,7.29,16.9,697.0,0.035,1.0
91067,2024-05-09 14:00:00,HOUSE_TOGNAZZI,1.0,0.3,29.6,0.8,0.05,7.29,16.9,697.0,0.031,1.0
91068,2024-05-09 14:15:00,HOUSE_TOGNAZZI,1.0,0.3,29.6,0.8,0.06,7.29,16.9,697.0,0.025,1.0


In [84]:
first_batch_sensor_df["Code"] = first_batch_sensor_df["Code"].map(code_mapping)

In [85]:
first_batch_sensor_df.drop(columns=["Flow Rate (m³/s)"], inplace=True)

In [86]:
sensor_dict["Bande Nere"]

Unnamed: 0_level_0,Color (CU),Nitrate (mg/L),pH,UVA254 (1/m),Turbidity (NTU),Temperature (°C),Conductivity (uS/cm),Free Chlorine (mg/L),TOC (mg/L)
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-05-01 14:15:00,1,22.0,7.79,0.0,0.23,16.4,508,0.041,0.3
2024-05-01 14:30:00,1,22.0,7.79,0.0,0.13,16.3,506,0.047,0.3
2024-05-01 14:45:00,1,22.0,7.79,0.0,0.18,16.2,507,0.036,0.3
2024-05-01 15:00:00,1,22.0,7.79,0.0,0.20,16.3,506,0.026,0.3
2024-05-01 15:15:00,1,22.0,7.79,0.0,0.08,16.3,506,0.023,0.3
...,...,...,...,...,...,...,...,...,...
2024-09-09 12:45:00,1,22.5,7.96,0.1,0.37,17.5,514,0.000,0.3
2024-09-09 13:00:00,1,22.5,7.97,0.1,0.41,17.7,513,0.000,0.3
2024-09-09 13:15:00,1,22.5,7.97,0.1,0.35,17.8,513,0.000,0.3
2024-09-09 13:30:00,1,22.5,7.97,0.1,0.36,18.2,511,0.000,0.3


In [87]:
for code in first_batch_sensor_df["Code"].unique():
    df = first_batch_sensor_df[first_batch_sensor_df["Code"] == code].copy()
    df["DateTime"] = pd.to_datetime(df["DateTime"])
    df.set_index("DateTime", inplace=True)

    df.drop(columns=["Code"], inplace=True)

    sensor_df = sensor_dict.pop(code)

    sensor_df = pd.concat([sensor_df, df])
    sensor_df.sort_index(inplace=True)
    sensor_dict[code] = sensor_df

In [88]:
sensor_dict["Berna"]

Unnamed: 0_level_0,Color (CU),Nitrate (mg/L),pH,UVA254 (1/m),Turbidity (NTU),Temperature (°C),Conductivity (uS/cm),Free Chlorine (mg/L),TOC (mg/L)
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-02-01 00:00:00,1.0,28.2,7.46,0.147,0.227,13.6,518.0,0.098,0.2
2024-02-01 00:15:00,1.0,28.0,7.46,0.145,0.247,13.5,513.0,0.089,0.2
2024-02-01 00:30:00,1.0,28.0,7.46,0.129,0.193,13.5,512.0,0.081,0.2
2024-02-01 00:45:00,1.0,27.8,7.47,0.119,0.183,13.5,506.0,0.062,0.1
2024-02-01 01:00:00,1.0,27.6,7.47,0.123,0.215,13.5,502.0,0.050,0.1
...,...,...,...,...,...,...,...,...,...
2024-09-09 12:40:00,4.0,27.5,7.55,2849.000,1099.000,16.4,483.0,0.022,0.8
2024-09-09 12:56:39,4.0,27.5,7.55,2825.000,1075.000,17.0,487.0,0.011,0.8
2024-09-09 13:13:18,4.0,27.5,7.56,2825.000,1054.000,16.9,487.0,0.012,0.8
2024-09-09 13:29:57,4.0,27.5,7.58,2833.000,1078.000,16.3,486.0,0.005,0.8


# Outliers

## Sensor Samples

In [89]:
sensor_df.rename(
    columns={
        "Conductivity (μS/cm)": "Conductivity (uS/cm)",
        "TOC (mg/l)": "TOC (mg/L)",
        "Nitrate (mg/l)": "Nitrate (mg/L)",
        "Free Chlorine (mg/l)": "Free Chlorine (mg/L)",
        "Turbidity (FTU)": "Turbidity (NTU)",
    },
    inplace=True,
)

In [90]:
# Notes

# - GRAMSCI
# Turbidity selected upper threshold is 1.5
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 0.8
# UVA254 selected upper threshold is 1.5


# - BERNA
# Turbidity selected upper threshold is 1.5
# Temperature selected upper threshold is 19.5
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 0.2

# - BANDE NERE
# Turbidity selected upper threshold is 1
# Conductivity selected lower threshold is 400
# Nitrate selected lower threshold is 20
# UVA254 selected upper threshold is 0.4

# - CHIOSTREGI
# free chlorine selected upper threshold is 0.06

# - FORTUNATO
# Turbidity selected upper threshold is 1
# Conductivity selected lower threshold is 400
# Nitrate selected lower threshold is 25
# UVA254 selected upper threshold is 0.4

# - MONTEVIDEO
# Color selected upper threshold is 4
# Turbidity selected upper threshold is 1
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 1
# Nitrate selected lower threshold is 20
# TOC selected upper threshold is 1
# UVA254 selected upper threshold is 4

# - PREALPI
# Turbidity selected upper threshold is 0.7
# UVA254 selected upper threshold is 1.5

# - TABACCHI

# - TOGNAZZI
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 0.4

thresholds = {
    "Gramsci": {
        "Turbidity (NTU)": 1.5,
        "Conductivity (uS/cm)": 400,
        # "Free Chlorine (mg/l)": 0.8,
        "UVA254 (1/m)": 1.5,
    },
    "Berna": {
        "Turbidity (NTU)": 1.5,
        "Temperature (°C)": 19.5,
        "Conductivity (uS/cm)": 400,
        # "Free Chlorine (mg/L)": 0.2,
        "UVA254 (1/m)": 1.5,
    },
    "Bande Nere": {
        "Turbidity (NTU)": 1,
        "Conductivity (uS/cm)": 400,
        "Nitrate (mg/L)": 20,
        "UVA254 (1/m)": 0.4,
    },
    "Chiostergi": {
        "Free Chlorine (mg/L)": 0.06,
    },
    "Fortunato": {
        "Turbidity (NTU)": 1,
        "Conductivity (uS/cm)": 400,
        "Nitrate (mg/L)": 25,
        "UVA254 (1/m)": 0.4,
    },
    "Montevideo": {
        "Color (CU)": 4,
        "Turbidity (NTU)": 1,
        "Conductivity (uS/cm)": 400,
        "Free Chlorine (mg/L)": 1,
        "Nitrate (mg/L)": 20,
        "TOC (mg/L)": 1,
        "UVA254 (1/m)": 4,
    },
    "Prealpi": {
        "Turbidity (NTU)": 0.7,
        "UVA254 (1/m)": 1.5,
    },
    "Tabacchi": {},
    "Tognazzi": {
        "Conductivity (uS/cm)": 400,
        "Free Chlorine (mg/L)": 0.4,
    },
}

In [91]:
figsize = (30, 20)
plt.rcParams.update({"font.size": 22})

for code in sensor_dict.keys():
    sensor_df = sensor_dict[code]
    for column in sensor_df.columns:
        df = sensor_df[column].copy()

        # drop rows with duplicated index
        df = df[~df.index.duplicated(keep="first")]

        # plot the data with the thresholds for the variables that have them
        # and compare the distribution of the values with the thresholds

        if column in thresholds[code]:
            threshold = thresholds[code][column]
            fig, ax = plt.subplots(2, 2, figsize=figsize)
            sns.lineplot(x=df.index, y=df, ax=ax[0, 0])
            ax[0, 0].set_title(f"Raw Data")
            ax[0, 0].set_ylabel(column)
            ax[0, 0].set_xlabel("DateTime")
            ax[0, 0].grid()

            fig_hist = sns.histplot(
                df, bins=50, kde=True, stat="probability", ax=ax[1, 0]
            )
            ax[1, 0].set_title(f"Raw Data")
            ax[1, 0].set_ylabel("Probability")
            ax[1, 0].set_xlabel(column)
            ax[1, 0].grid()

            if column not in ["Conductivity (uS/cm)", "Nitrate (mg/L)"]:
                ax[0, 0].axhline(
                    y=threshold,
                    color="r",
                    linestyle="dashed",
                    label="Upper Threshold",
                )
                ax[0, 0].text(
                    df.index[0],
                    threshold,
                    f"Upper Threshold: {threshold}",
                    color="r",
                    va="bottom",
                )
                ax[1, 0].axvline(
                    x=threshold,
                    color="r",
                    linestyle="dashed",
                    label="Upper Threshold",
                )
                ax[1, 0].text(
                    threshold,
                    fig_hist.get_ylim()[1],
                    f"Upper Threshold: {threshold}",
                    color="r",
                    rotation=90,
                    ha="right",
                    va="top",
                )
                df = df[df <= threshold]
            else:
                ax[0, 0].axhline(
                    y=threshold,
                    color="r",
                    linestyle="dashed",
                    label="Lower Threshold",
                )
                ax[0, 0].text(
                    df.index[0],
                    threshold,
                    f"Lower Threshold: {threshold}",
                    color="r",
                    va="bottom",
                )
                ax[1, 0].axvline(
                    x=threshold,
                    color="r",
                    linestyle="dashed",
                    label="Lower Threshold",
                )
                ax[1, 0].text(
                    threshold,
                    fig_hist.get_ylim()[1],
                    f"Lower Threshold: {threshold}",
                    color="r",
                    rotation=90,
                    ha="right",
                    va="top",
                )
                df = df[df >= threshold]

            sns.lineplot(x=df.index, y=df, ax=ax[0, 1], color="g")
            ax[0, 1].set_title(f"Filtered Data")
            ax[0, 1].set_ylabel(column)
            ax[0, 1].set_xlabel("DateTime")
            ax[0, 1].grid()

            sns.histplot(
                df,
                bins=50,
                kde=True,
                stat="probability",
                ax=ax[1, 1],
                color="g",
            )
            ax[1, 1].set_title(f"Filtered Data")
            ax[1, 1].set_ylabel("Probability")
            ax[1, 1].set_xlabel(column)
            ax[1, 1].grid()
        else:
            plt.figure(figsize=figsize)
            sns.lineplot(x=df.index, y=df)
            plt.ylabel(column)
            plt.xlabel("DateTime")
            plt.grid()

        plt.suptitle(f"{code} - {column}", fontsize=30)
        plt.tight_layout()

        column_ = column.replace("/", "_")

        path = os.path.join(plot_folder, "Clean Data", "Removed Outliers", code)

        if not os.path.exists(path):
            os.makedirs(path)

        plt.savefig(
            os.path.join(
                plot_folder,
                "Clean Data",
                "Removed Outliers",
                code,
                f"{column_}.png",
            ),
            dpi=300,
        )
        plt.close()

        # plt.show()

plt.rcParams.update({"font.size": 10})

In [92]:
# remove the rows that have values outside the thresholds
for code in thresholds.keys():
    sensor_df = sensor_dict[code].copy()

    for column in thresholds[code].keys():
        threshold = thresholds[code][column]
        df = sensor_df[column].copy()

        df = (
            df[df > threshold]
            if column not in ["Conductivity (uS/cm)", "Nitrate (mg/L)"]
            else df[df < threshold]
        )

        sensor_df.loc[df.index, column] = np.nan

    sensor_df.interpolate(method="time", inplace=True)

    sensor_dict.update({code: sensor_df})

# Store Data

In [93]:
grab_df.to_excel(os.path.join(clean_data_folder, "grab.xlsx"), index=False)

In [94]:
if not os.path.exists(os.path.join(clean_data_folder, "sensors")):
    os.mkdir(os.path.join(clean_data_folder, "sensors"))

for code in sensor_dict.keys():
    sensor_dict[code].to_excel(
        os.path.join(clean_data_folder, "sensors", f"{code}.xlsx"), index=True
    )