# Supply Points (Case dell'Acqua) Data Preprocessing

In [1]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Paths

In [2]:
data_folder = os.path.join(os.path.join("..", "data"))
raw_data_folder = os.path.join(data_folder, "Raw Data")

datasets_folder = os.path.join(data_folder, "Intermediate Data")
store_folder = os.path.join(data_folder, "temporary results")

grab_samples_supply_points_path = os.path.join(
    raw_data_folder,
    "Case dell'acqua - Grab Samples (main)/0. Case acqua - 2010-2023.xlsx",
)

## Collect all grab samples for Supply Points

### Load all grab samples

In [3]:
grab_samples_df = pd.read_excel(
    os.path.join(datasets_folder, "All grab samples.xlsx"),
)

### Import column types

In [4]:
# load from json file the columns
with open(os.path.join(store_folder, "columns_types.json"), "r") as f:
    column_types = json.load(f)

all_metadata_columns = column_types["metadata_columns"]
all_feature_columns = column_types["features_columns"]
all_target_columns = column_types["targets_columns"]

all_columns = all_metadata_columns + all_feature_columns + all_target_columns

metadata_columns = list(
    set(all_metadata_columns) & set(grab_samples_df.columns)
)
feature_columns = list(set(all_feature_columns) & set(grab_samples_df.columns))
target_columns = list(set(all_target_columns) & set(grab_samples_df.columns))

### Get Name and ID of Supply Points

In [5]:
column_list = "CS, CT"

meta_supply_points_df = pd.read_excel(
    grab_samples_supply_points_path, usecols=column_list, header=4
)

In [6]:
meta_supply_points_df

Unnamed: 0,filtro 1,filtro 2
0,C.A. ingresso Menotti > U.di Nemi,ING_UCCNEMI
1,Casa Acqua Appennini con CO2,SII00801
2,Casa Acqua Lessona,SII00659
3,Casa Acqua Lessona,SII00659
4,Casa Acqua Via Appennini non trattata,SII00802
...,...,...
2263,Casa dell'acqua Piazza Scolari,HOUSE_SCOLARI
2264,V.le Omero-NT-CA21,HOUSE_OME1
2265,Menotti > U.di Nemi-NT-CA08,HOUSE_UCC1
2266,Casa dell'acqua Piazza Ovidio,HOUSE_OVIDIO


### Get Supply Points Grab Samples

In [7]:
supply_points_df = grab_samples_df.merge(
    meta_supply_points_df,
    left_on=["Punto di prelievo", "Codice punto di prelievo"],
    right_on=["filtro 1", "filtro 2"],
    how="inner",
)

In [8]:
# supply_points_df.drop(columns=["filtro 1", "filtro 2"], inplace=True)
supply_points_df.drop_duplicates(inplace=True)

In [9]:
supply_points_df

Unnamed: 0,Analisi programmate,Campagna,Codice punto di prelievo,Data di prelievo,Punto di prelievo,Rapporto di prova,ZONA,Cloro residuo libero (al prelievo) (mg/L di Cl2),Cloro residuo libero (mg/L di Cl2),Colore (CU),...,Carica batterica a 37°C (UFC/mL),Cloroformio (µg/L),Conteggio colonie a 30°C (UFC/mL),Dibromoclorometano (µg/L),Enterococchi (MPN/100 mL),Escherichia coli (MPN/100 mL),Legionella spp (UFC/L),Pseudomonas aeruginosa (UFC/250 mL),filtro 1,filtro 2
0,Analisi case dell'acqua,CASA_ACQUA,ING_UCCNEMI,2014-07-18,C.A. ingresso Menotti > U.di Nemi,3185/14,Case acqua,,0.06,0.005,...,845.0,,,,0.0,0.0,,,C.A. ingresso Menotti > U.di Nemi,ING_UCCNEMI
1,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",UTENZA,SII00801,2016-06-13,Casa Acqua Appennini con CO2,2818/16,case acqua,,0.01,0.770,...,0.5,,,,0.5,0.5,,,Casa Acqua Appennini con CO2,SII00801
3,"Coliformi, E-Coli, Enterococchi, Pseudomonas",MICRO_PS,SII00659,2014-01-03,Casa Acqua Lessona,16/14,Case acqua,,0.05,,...,,,,,0.0,0.0,,,Casa Acqua Lessona,SII00659
14,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",MICR_UT_PS,SII00659,2016-04-22,Casa Acqua Lessona,1860/16,case acqua,,0.03,,...,0.5,,,,0.5,0.5,,,Casa Acqua Lessona,SII00659
25,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",UTENZA,SII00802,2016-06-13,Casa Acqua Via Appennini non trattata,2819/16,case acqua,,0.02,0.770,...,0.5,,,,0.5,0.5,,,Casa Acqua Via Appennini non trattata,SII00802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88446,Casa Dkr,DKR,HOUSE_OVIDIO,2023-05-24,Casa dell'acqua Piazza Ovidio,2461/23,-,,,,...,,,,,0.0,0.0,,0.5,Casa dell'acqua Piazza Ovidio,HOUSE_OVIDIO
88458,Casa Dkr,DKR,HOUSE_CUOCO,2023-05-24,Casa dell'acqua Piazzale Cuoco,2462/23,-,,,,...,,,,,0.0,0.0,,0.5,Casa dell'acqua Piazzale Cuoco,HOUSE_CUOCO
88467,Casa Dkr,DKR,HOUSE_MONTE_PERALBA,2023-05-29,Casa dell'acqua Monte Peralba,2508/23,-,,,,...,,,,,0.0,0.0,,,Casa dell'acqua Monte Peralba,HOUSE_MONTE_PERALBA
88481,Casa Dkr,DKR,HOUSE_TABACCHI,2023-05-29,Casa dell'acqua Via Tabacchi,2509/23,-,,,,...,,,,,0.0,0.0,,,Casa dell'acqua Via Tabacchi,HOUSE_TABACCHI


In [10]:
supply_points_df = supply_points_df[
    (
        supply_points_df["filtro 1"].notna()
        | supply_points_df["filtro 2"].notna()
    )
]

supply_points_df.drop(
    columns=[
        "filtro 1",
        "filtro 2",
    ],
    inplace=True,
)

In [11]:
# if there are rows with the same values for Rapporto di prova, keep the one with the most complete data

# Count the number of non-NaN values in each row
supply_points_df["non_nan_count"] = supply_points_df.count(axis=1)

# Group by 'Rapporto di prova' and keep the row with the most non-NaN values
supply_points_df = supply_points_df.loc[
    supply_points_df.groupby("Rapporto di prova")["non_nan_count"].idxmax()
]

# Drop the 'non_nan_count' column as it's no longer needed
supply_points_df = supply_points_df.drop(columns="non_nan_count")

In [12]:
# drop rows that contain only nans for the columns that are not in metadata_columns
supply_points_df.dropna(
    axis=0,
    how="all",
    subset=[
        col
        for col in supply_points_df.columns.to_list()
        if col not in metadata_columns
    ],
    inplace=True,
)

# drop columns that contain only nans
supply_points_df.dropna(axis=1, how="all", inplace=True)

In [13]:
supply_points_df

Unnamed: 0,Analisi programmate,Campagna,Codice punto di prelievo,Data di prelievo,Punto di prelievo,Rapporto di prova,ZONA,Cloro residuo libero (al prelievo) (mg/L di Cl2),Cloro residuo libero (mg/L di Cl2),Colore (CU),...,Bromodiclorometano (µg/L),Bromoformio (µg/L),Carica batterica a 22°C (UFC/mL),Carica batterica a 37°C (UFC/mL),Cloroformio (µg/L),Dibromoclorometano (µg/L),Enterococchi (MPN/100 mL),Escherichia coli (MPN/100 mL),Legionella spp (UFC/L),Pseudomonas aeruginosa (UFC/250 mL)
70722,C Acq Friz,C_ACQ_FRIZ,HOUSE_ZUR2,2019-01-02,Via Zuretti-CO2-CA01,10/19,Case acqua,,,,...,,,0.0,0.0,,,,0.0,,0.0
56615,Analisi Generica,ANALISI,HOUSE_OME2,2017-03-09,V.le Omero - acqua con CO2,1003/17,Case Acqua,,,,...,,,0.5,0.5,,,,,,
83480,Casa Dkr,DKR,HOUSE_GASPARRI,2022-03-29,Casa dell'acqua piazza Cardinale Pietro Gasparri,1005/22,Case Acqua,,,,...,,,1.5,,,,0.0,0.0,,0.5
83493,Casa Dkr,DKR,HOUSE_FORTUNATO,2022-03-29,Casa dell'acqua Piazza Fortunato,1006/22,Case Acqua,,,,...,,,3.0,,,,0.0,0.0,,0.5
37513,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",UTENZA,HOUSE_LIV1,2013-03-12,Via Livigno - acqua non trattata,1009/13,Case acqua,,0.000,1.530,...,,,0.0,1.0,,,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56608,Analisi case dell'acqua,CASA_ACQUA,HOUSE_CGR2,2017-03-08,V.le Ca'Granda / Gatti - acqua con CO2,970/17,Case Acqua,,0.005,0.005,...,,,0.5,0.5,,,0.5,0.5,,
64178,Prelievi per il Progetto Pilgrimm,PG_PILGRIM,ING_TELAVIV,2018-01-08,Ingresso casa acqua L.go Tel Aviv,98/18,case acqua,,0.005,1.490,...,,,13.0,3.0,,,0.5,0.5,,
2499,Prelievi agli erogatori dell'acqua in sede ed ...,EROGATORI,SII00629,2015-03-11,Palazzo Marino - frizzante 2° piano,995/15,Erogatori,,0.000,,...,,,427.0,186.0,,,0.0,0.0,,
79390,Casa Acqua,CASA_ACQUA,HOUSE_LIV1,2021-03-18,Via Livigno-NT-CA03,997/21,Case acqua,,0.020,,...,,,0.5,,,,0.0,0.0,,0.5


In [14]:
supply_points_df["Codice punto di prelievo"] = supply_points_df[
    "Codice punto di prelievo"
].str.strip()

In [18]:
supply_points_df

Unnamed: 0,Analisi programmate,Campagna,Codice punto di prelievo,Data di prelievo,Punto di prelievo,Rapporto di prova,ZONA,Cloro residuo libero (al prelievo) (mg/L di Cl2),Cloro residuo libero (mg/L di Cl2),Colore (CU),...,Bromodiclorometano (µg/L),Bromoformio (µg/L),Carica batterica a 22°C (UFC/mL),Carica batterica a 37°C (UFC/mL),Cloroformio (µg/L),Dibromoclorometano (µg/L),Enterococchi (MPN/100 mL),Escherichia coli (MPN/100 mL),Legionella spp (UFC/L),Pseudomonas aeruginosa (UFC/250 mL)
70722,C Acq Friz,C_ACQ_FRIZ,HOUSE_ZUR2,2019-01-02,Via Zuretti-CO2-CA01,10/19,Case acqua,,,,...,,,0.0,0.0,,,,0.0,,0.0
56615,Analisi Generica,ANALISI,HOUSE_OME2,2017-03-09,V.le Omero - acqua con CO2,1003/17,Case Acqua,,,,...,,,0.5,0.5,,,,,,
83480,Casa Dkr,DKR,HOUSE_GASPARRI,2022-03-29,Casa dell'acqua piazza Cardinale Pietro Gasparri,1005/22,Case Acqua,,,,...,,,1.5,,,,0.0,0.0,,0.5
83493,Casa Dkr,DKR,HOUSE_FORTUNATO,2022-03-29,Casa dell'acqua Piazza Fortunato,1006/22,Case Acqua,,,,...,,,3.0,,,,0.0,0.0,,0.5
37513,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",UTENZA,HOUSE_LIV1,2013-03-12,Via Livigno - acqua non trattata,1009/13,Case acqua,,0.000,1.530,...,,,0.0,1.0,,,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56608,Analisi case dell'acqua,CASA_ACQUA,HOUSE_CGR2,2017-03-08,V.le Ca'Granda / Gatti - acqua con CO2,970/17,Case Acqua,,0.005,0.005,...,,,0.5,0.5,,,0.5,0.5,,
64178,Prelievi per il Progetto Pilgrimm,PG_PILGRIM,ING_TELAVIV,2018-01-08,Ingresso casa acqua L.go Tel Aviv,98/18,case acqua,,0.005,1.490,...,,,13.0,3.0,,,0.5,0.5,,
2499,Prelievi agli erogatori dell'acqua in sede ed ...,EROGATORI,SII00629,2015-03-11,Palazzo Marino - frizzante 2° piano,995/15,Erogatori,,0.000,,...,,,427.0,186.0,,,0.0,0.0,,
79390,Casa Acqua,CASA_ACQUA,HOUSE_LIV1,2021-03-18,Via Livigno-NT-CA03,997/21,Case acqua,,0.020,,...,,,0.5,,,,0.0,0.0,,0.5


In [16]:
supply_points_df.to_excel(
    os.path.join(datasets_folder, "All grab samples - supply points.xlsx"),
    index=False,
)