# All Grab Samples Data Preprocessing

In [16]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Paths

In [17]:
data_folder = os.path.join(os.path.join("..", "data"))
raw_data_folder = os.path.join(data_folder, "Raw Data")

datasets_folder = os.path.join(data_folder, "Intermediate Data")
store_folder = os.path.join(data_folder, "temporary results")

grab_samples_supply_points_path = os.path.join(
    raw_data_folder,
    "Case dell'acqua - Grab Samples (main)/0. Case acqua - 2010-2023.xlsx",
)
all_grab_samples_path = os.path.join(
    raw_data_folder, "Tutti punti - Grab Samples"
)

## Collect all grab samples

### Load all grab samples

In [18]:
# Load the columns
with open(os.path.join(store_folder, "columns_types.json"), "r") as f:
    column_types = json.load(f)

all_metadata_columns = column_types["metadata_columns"]
all_features_columns = column_types["features_columns"]
all_target_columns = column_types["targets_columns"]

all_columns = all_metadata_columns + all_features_columns + all_target_columns

In [19]:
grab_samples = []

for file in os.listdir(all_grab_samples_path):
    if file.endswith(".xlsx"):
        df = pd.read_excel(os.path.join(all_grab_samples_path, file), header=11)
    else:
        df = pd.read_excel(os.path.join(all_grab_samples_path, file), header=15)
    common_cols = list(set(df.columns.to_list()) & set(all_columns))
    df = df[common_cols]
    grab_samples.append(df)

grab_samples_df = pd.concat(grab_samples, ignore_index=True)

### Fix Redundant Columns

In [20]:
# combine all value columns in the mapping to the corresponding key column
column_mapping = {
    "Temperatura (°C)": [
        "Temperatura - °C",
    ],
    "Concentr. ioni idrogeno al prelievo (unità pH)": [
        "Concentr. ioni idrogeno (al prelievo) (unità pH)",
    ],
    "Torbidità (NTU)": [
        "Torbidità (NTu)",
    ],
    "Carica batterica a 37°C (UFC/mL)": [
        "Conta delle colonie a 37°C (UFC/mL)",
    ],
    "Batteri coliformi a 37°C (MPN/100 mL)": [
        "Batteri coliformi a 37°C (MPN / 100 mL)",
    ],
    "Colore (CU)": [
        "Colore (Cu)",
    ],
    "Escherichia coli (MPN/100 mL)": [
        "Escherichia Coli (MPN / 100mL)",
    ],
    "Enterococchi (MPN/100 mL)": [
        "Enterococchi (MPN / 100mL)",
    ],
    "Carica batterica a 22°C (UFC/mL)": [
        "Conta delle colonie a 22°C (UFC/mL)",
    ],
}

for final_column, original_columns in column_mapping.items():
    for original_column in original_columns:
        grab_samples_df[final_column] = grab_samples_df[
            final_column
        ].combine_first(grab_samples_df[original_column])
    grab_samples_df.drop(columns=original_columns, inplace=True)

In [21]:
metadata_columns = list(
    set(all_metadata_columns) & set(grab_samples_df.columns)
)
feature_columns = list(set(all_features_columns) & set(grab_samples_df.columns))
target_columns = list(set(all_target_columns) & set(grab_samples_df.columns))

In [22]:
grab_samples_df.dropna(
    axis=0, subset=feature_columns + target_columns, how="all", inplace=True
)
grab_samples_df.dropna(axis=1, how="all", inplace=True)

In [23]:
import re


def convert_string_values(s):
    if isinstance(s, (int, float)):
        return s
    elif pd.isna(s):
        return None
    else:
        if "," in s:
            s = s.replace(",", ".")
        if "<" in s:
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) / 2 if number else None
        elif ">" in s:
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) if number else None
        elif "*" in s or re.search("[a-zA-Z]", s):
            number = re.findall(r"\d+\.?\d*", s)
            return float(number[0]) if number else None
        else:
            return None

In [24]:
convert_columns = list(
    set(grab_samples_df.columns.to_list()) - set(metadata_columns)
)

In [25]:
grab_samples_df[convert_columns] = grab_samples_df[convert_columns].applymap(
    convert_string_values
)

  grab_samples_df[convert_columns] = grab_samples_df[convert_columns].applymap(


In [26]:
# convert each column to the right data type
grab_samples_df[convert_columns] = grab_samples_df[convert_columns].astype(
    float
)

In [27]:
# order columns based on their type (metadata, features, target)
ordered_columns = [col for col in all_columns if col in grab_samples_df.columns]

grab_samples_df = grab_samples_df[ordered_columns]

In [30]:
grab_samples_df["Data di prelievo"] = pd.to_datetime(
    grab_samples_df["Data di prelievo"], format="%Y/%m/%d"
)

In [31]:
grab_samples_df

Unnamed: 0,Analisi programmate,Campagna,Codice punto di prelievo,Data di prelievo,Punto di prelievo,Rapporto di prova,ZONA,Cloro residuo libero (al prelievo) (mg/L di Cl2),Cloro residuo libero (mg/L di Cl2),Colore (CU),...,Bromoformio (µg/L),Carica batterica a 22°C (UFC/mL),Carica batterica a 37°C (UFC/mL),Cloroformio (µg/L),Conteggio colonie a 30°C (UFC/mL),Dibromoclorometano (µg/L),Enterococchi (MPN/100 mL),Escherichia coli (MPN/100 mL),Legionella spp (UFC/L),Pseudomonas aeruginosa (UFC/250 mL)
0,Analisi all'utenza di acque potabili,UTENZA,TEST00216,2011-12-14,"""Al Tempio d'Oro"" - Via delle Leghe, 23",5006/11,varie,,,0.005,...,,,,,,,,,,
1,"Coliformi, E-Coli, Enterococchi, Pseudomonas",MICRO_PS,TEST00216,2014-01-02,"""Al Tempio d'Oro"" - Via delle Leghe, 23",15/14,varie,,0.00,,...,,,,,,,0.0,0.0,,
2,Analisi Generica,ANALISI,TEST00210,2011-11-22,"""Capoverde"" - Via Leoncavallo, 16",4641/11,varie,,0.05,0.670,...,,0.0,0.0,,,,0.0,0.0,,
7,Analisi all'utenza di acque potabili,UTENZA,TEST00228,2012-02-29,"""Non ho fretta"" - Via Teodosio, 19",860/12,varie,,0.07,0.005,...,,5.0,0.0,,,,0.0,0.0,,
10,"CBT 22° e 37°, Coliformi, E-Coli, Enterococchi...",MICR_UT_PS,FALC_02_02,2013-02-26,1A-1,733/13,falciola,,0.00,,...,,54.0,6.0,,,,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76091,Routine A,ROUTINE-A,015146C003,2023-05-31,Italia,2561/23,SUD - OVEST,,,0.500,...,0.25,,,,,0.25,0.0,0.0,,
76092,Tubazione,TUBAZIONE,TUBAZIONE,2023-05-31,Via Vespri Siciliani,2562/23,-,0.02,,,...,,,,,,,0.0,0.0,,
76093,Tubazione,TUBAZIONE,TUBAZIONE,2023-05-31,Via Calchi Taeggi,2563/23,-,0.02,,0.500,...,,,,,,,0.0,0.0,,
76094,Brick,BRICK,BRICK,2023-05-31,Acqua del Sindaco Lotto 30052023 Inizio Produz...,2571/23,-,0.02,,,...,0.25,0.5,0.5,0.25,,0.25,,,,0.0


In [32]:
grab_samples_df.to_excel(
    os.path.join(datasets_folder, "All grab samples.xlsx"), index=False
)