# Supply Points (Case dell'Acqua) Data Preprocessing

In [2]:
import os
import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Paths

In [37]:
# Folder with helper files (relative to the notebook location)
utils_folder = os.path.join('..', '..', 'utils')

# The first line of onedrive.txt is used as the root of the cloud data folder
onedrive_pointer = os.path.join(utils_folder, 'onedrive.txt')
with open(onedrive_pointer, 'r') as f:
    onedrive_root = f.readline().strip()
cloud_data_folder = os.path.join(onedrive_root, "Case dell'acqua")

# Raw inputs: laboratory grab samples and sensor exports
grab_samples_folder, sensors_folder = (
    os.path.join(cloud_data_folder, subfolder)
    for subfolder in ("Grab Samples", "Sensori")
)

# Local output location for the cleaned data
local_data_folder = os.path.join('..', '..', 'data', 'second_phase')
clean_data_folder = os.path.join(local_data_folder, "Clean Data")

In [4]:
# The grab samples do not include ORP, while
# the sensors do not include DOC (they have TOC instead) or UVA254

# So the parameters available in both sources are:
# Color, TOC, Nitrate, Turbidity, pH, Temperature, Conductivity, Free Chlorine

# Load Grab Samples

In [5]:
# Read every grab-sample workbook in the folder and stack them into one frame.
# Hidden files (e.g. ".DS_Store") and Office lock files ("~$...") are skipped
# because they are not workbooks; names are sorted so the row order does not
# depend on the arbitrary order returned by os.listdir.
grab_frames = [
    pd.read_excel(os.path.join(grab_samples_folder, filename))
    for filename in sorted(os.listdir(grab_samples_folder))
    if not filename.startswith((".", "~$"))
]

# ignore_index gives every row a unique label. Without it each file restarts
# at 0, and label-based operations such as DataFrame.drop would also hit
# unrelated rows that share a label.
grab_df = (
    pd.concat(grab_frames, ignore_index=True) if grab_frames else pd.DataFrame()
)

In [None]:
# Preview the first five rows of grab_df
grab_df.head(5)

In [7]:
# Load the column grouping (metadata / features / targets) from the utils folder.
# JSON text is UTF-8, so the encoding is pinned explicitly: relying on the
# platform default could mis-decode non-ASCII column names (e.g. "°", "μ").
with open(os.path.join(utils_folder, "columns_types.json"), encoding="utf-8") as f:
    column_types = json.load(f)

In [8]:
# Unpack the three column groups stored in column_types
metadata_columns, features_columns, targets_columns = (
    column_types[group_key]
    for group_key in ("metadata_columns", "features_columns", "targets_columns")
)

In [9]:
# Keep only the columns listed in columns_types.json, in the order
# metadata -> features -> targets
known_columns = [*metadata_columns, *features_columns, *targets_columns]
grab_df = grab_df[known_columns]

In [None]:
# Preview the first five rows of grab_df
grab_df.head(5)

## Fix LOD values

In [11]:
import re


def convert_string_values(s):
    """Convert a raw cell value into a number.

    Rules, applied in this order:

    * ``int`` / ``float`` values are returned unchanged.
    * Missing values (``pd.isna``) become ``None``.
    * In strings, a decimal comma is first replaced by a decimal point.
    * ``"<x"`` (below a limit) becomes ``x / 2``.
    * ``">x"`` (above a limit) becomes ``x``.
    * Strings containing ``*`` or letters become the first number found in them.
    * Any other string is parsed as a plain number (e.g. ``"7,5"`` -> ``7.5``).

    Args:
        s: The raw cell value (number, string or missing value).

    Returns:
        The numeric value, or ``None`` when no number can be extracted.
    """
    if isinstance(s, (int, float)):
        return s
    if pd.isna(s):
        return None

    # Decimal comma -> decimal point, so the regex and float() can parse it
    s = s.replace(",", ".")
    number = re.findall(r"\d+\.?\d*", s)

    if "<" in s:
        # Below-limit readings are replaced by half of the reported limit
        return float(number[0]) / 2 if number else None
    if ">" in s or "*" in s or re.search("[a-zA-Z]", s):
        return float(number[0]) if number else None

    # Plain numeric text used to fall through to None and was silently lost;
    # parse it, and only give up when it is not a valid number.
    try:
        return float(s)
    except ValueError:
        return None

In [12]:
def set_label(value):
    """Classify a raw cell by the comparison sign it carries.

    Numbers and missing values are "Normal". A string containing "<" is
    "Less than"; otherwise a string containing ">" is "Greater than"; any
    other string is "Normal".
    """
    if isinstance(value, (int, float)) or pd.isna(value):
        return "Normal"

    # "<" is checked before ">", so it wins when both signs are present
    for sign, label in (("<", "Less than"), (">", "Greater than")):
        if sign in value:
            return label
    return "Normal"

In [13]:
# For every feature/target column add a "<column>_label" column recording
# whether the raw cell was a "<", a ">" or a plain value (see set_label)
measurement_columns = features_columns + targets_columns
for measurement in measurement_columns:
    labels = grab_df[measurement].apply(set_label)
    grab_df.loc[:, measurement + "_label"] = labels

In [14]:
# Replace the raw feature and target cells with the values returned by
# convert_string_values, one column group at a time
for column_group in (features_columns, targets_columns):
    converted = grab_df[column_group].map(convert_string_values)
    grab_df[column_group] = converted

In [None]:
# List the current column names of grab_df
grab_df.columns.to_list()

# Load Sensor Samples

In [16]:
# Collect the sensor workbooks per supply point. The key is the part of the
# file name before the first underscore.
sensor_frames = {}

for sensor_entry in sorted(os.listdir(sensors_folder)):
    sensor_folder = os.path.join(sensors_folder, sensor_entry)
    # Only sub-folders are scanned: stray files such as ".DS_Store" cannot be
    # listed and would make os.listdir fail.
    if not os.path.isdir(sensor_folder):
        continue

    for filename in sorted(os.listdir(sensor_folder)):
        # Keep .xlsx workbooks only, and skip the "~$..." lock files that
        # Office leaves next to a workbook while it is open.
        if not filename.endswith(".xlsx") or filename.startswith("~$"):
            continue

        house_code = filename.split("_")[0]
        # header=1: the column names are taken from the second row of the sheet
        df = pd.read_excel(os.path.join(sensor_folder, filename), header=1)
        sensor_frames.setdefault(house_code, []).append(df)

# Concatenate once per supply point instead of re-copying a growing frame for
# every additional file.
sensor_dict = {
    house_code: pd.concat(frames)
    for house_code, frames in sensor_frames.items()
}

In [None]:
# List the raw column headers of the 'via TABACCHI' sensor frame
sensor_dict['via TABACCHI'].columns.to_list()

In [18]:
# Raw sensor export headers -> column names used for the grab samples.
# Three different "Measurement interval" headers all map to "DateTime".
columns_mapping = {
    "Measurement interval=900[sec] (Export-Aggregation disabled)": "DateTime",
    "Measurement interval=999[sec] (Export-Aggregation disabled)": "DateTime",
    "Measurement interval=0[sec] (Export-Aggregation disabled)": "DateTime",
    "COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)": "Color (CU)",
    "TOCeq - Measured value [mg/l] (Limit:0.00-22.00)": "TOC (mg/l)",
    "NO3eq - Measured value [mg/l] (Limit:0.00-88.00)": "Nitrate (mg/l)",
    "UV254t - Measured value [Abs/m] (Limit:0.00-71.00)": "UVA254 (1/m)",
    "Turbidity - Measured value [FTUeq] (Limit:0.00-170.00)": "Turbidity (FTU)",
    "pH - Measured value (Limit:0.00-14.00)": "pH",
    "Temperature - Measured value [C] (Limit:-5.00-100.00)": "Temperature (°C)",
    "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)": "Conductivity (μS/cm)",
    "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00)": "Free Chlorine (mg/l)",
}

# Unique target names in mapping order. dict.fromkeys preserves insertion
# order, so every frame (and every run) gets the same column order; iterating
# a set of strings gives an order that can change between interpreter runs.
columns = list(dict.fromkeys(columns_mapping.values()))

for house_code, df in sensor_dict.items():
    # Rename the raw headers, then keep only the mapped columns
    sensor_dict[house_code] = df.rename(columns=columns_mapping)[columns]

# Processing

## Grab Samples

In [19]:
# Remove the two identifier columns from grab_df
unused_metadata = ['Codice punto di prelievo', 'Rapporto di prova']
grab_df.drop(columns=unused_metadata, inplace=True)

In [None]:
# Show the distinct values of the 'Punto di prelievo' column
grab_df['Punto di prelievo'].unique()

In [21]:
# change the 'Punto di prelievo' values to match the supply point codes
def change_name(name):
    """Map a free-text sampling point name to its short code.

    The first rule whose keyword occurs in ``name`` wins; a name that
    matches no rule is returned unchanged.
    """
    # (keywords, code) pairs, checked in this order
    rules = (
        (("Tognazzi",), "Tognazzi"),
        (("Tabacchi",), "Tabacchi"),
        (("Gramsci",), "Gramsci"),
        (("Berna",), "Berna"),
        (("Bande Nere", "Piazzale Giovanni"), "Bande Nere"),
        (("Prealpi",), "Prealpi"),
        (("Chiostergi",), "Chiostergi"),
        (("Montevideo", "Montevid"), "Montevideo"),
        (("Fortunato",), "Fortunato"),
    )
    for keywords, code in rules:
        if any(keyword in name for keyword in keywords):
            return code
    return name

In [22]:
# Replace the free-text sampling point names with their short codes
sampling_points = grab_df['Punto di prelievo']
grab_df['Punto di prelievo'] = sampling_points.map(change_name)

In [23]:
# Convert the sampling dates to pandas datetimes
sampling_dates = grab_df['Data di prelievo']
grab_df['Data di prelievo'] = pd.to_datetime(sampling_dates)

In [None]:
# Display grab_df
grab_df

## Sensor Samples

In [None]:
# Show the keys currently used in sensor_dict
sensor_dict.keys()

In [26]:
# Re-key the sensor frames whose key differs from the code used in grab_df
for raw_key, grab_code in (
    ('via TABACCHI', 'Tabacchi'),
    ('via Tognazzi', 'Tognazzi'),
    ('Piazza Prealpi', 'Prealpi'),
):
    sensor_dict[grab_code] = sensor_dict.pop(raw_key)

In [27]:
# Parse the timestamps and use them as the index of every sensor frame.
# The loop runs over sensor_dict itself rather than over the codes found in
# grab_df: the time interpolation further down is applied to every frame in
# the dictionary and needs a datetime index on each of them, and a grab code
# without sensor data must not raise a KeyError here.
sensor_dict = {
    code: sensor_df.assign(
        DateTime=pd.to_datetime(sensor_df['DateTime'])
    ).set_index('DateTime')
    for code, sensor_df in sensor_dict.items()
}

# Missing Values

## Grab

In [None]:
# Report, per supply point, how many values are missing in each
# feature/target column
measurement_columns = features_columns + targets_columns
for code in grab_df['Punto di prelievo'].unique():
    at_code = grab_df['Punto di prelievo'] == code
    missing_per_column = grab_df.loc[at_code, measurement_columns].isna().sum()
    for column, missing_values in missing_per_column.items():
        if missing_values > 0:
            print(f"{code} has {missing_values} missing values in column {column}")

In [None]:
# Display grab_df
grab_df

In [None]:
# Report, per supply point, how many rows have at least one missing
# feature/target value
row_has_gap = grab_df[features_columns + targets_columns].isna().any(axis=1)
for code in grab_df['Punto di prelievo'].unique():
    missing_values = row_has_gap[grab_df['Punto di prelievo'] == code].sum()
    if missing_values > 0:
        print(f"{code} has {missing_values} rows with missing values")

In [31]:
# Remove the Berna rows that have at least one missing feature/target value.
# The rows are filtered with a boolean mask rather than dropped by index label:
# grab_df is a concatenation of several files, so index labels can repeat and
# a label-based drop would also delete unrelated rows sharing a label.
is_berna = grab_df['Punto di prelievo'] == "Berna"
has_missing = grab_df[features_columns + targets_columns].isna().any(axis=1)
rows_to_remove = is_berna & has_missing

# Labels of the removed rows, kept for inspection
row_index = grab_df.index[rows_to_remove]

grab_df = grab_df[~rows_to_remove]


In [32]:
# for the moment no imputation is done

## Sensor

In [None]:
# Report, per supply point, the sensor columns that contain missing values
for code, sensor_df in sensor_dict.items():
    missing_per_column = sensor_df.isna().sum()
    incomplete = missing_per_column[missing_per_column > 0]
    for column, missing_values in incomplete.items():
        print(f"{code} has {missing_values} missing values in column {column}")

In [34]:
# The number of missing values is very low, so they are filled by time-based
# interpolation.
# Each frame is replaced under its existing key. Popping and re-inserting keys
# while iterating over the dictionary changes it during iteration, which can
# raise "RuntimeError: dictionary keys changed during iteration" or skip
# entries; assigning to an existing key is safe.
for code, sensor_df in sensor_dict.items():
    sensor_dict[code] = sensor_df.interpolate(method='time')

# Store Data

In [362]:
# Create the output folder if needed; to_excel does not create missing
# directories and would fail on a fresh checkout.
os.makedirs(clean_data_folder, exist_ok=True)
grab_df.to_excel(os.path.join(clean_data_folder, "grab.xlsx"), index=False)

In [363]:
# One workbook per supply point, written to "<clean_data_folder>/sensors".
sensors_output_folder = os.path.join(clean_data_folder, "sensors")

# makedirs also creates missing parent folders (os.mkdir would fail on them),
# and exist_ok avoids the separate exists-then-create check.
os.makedirs(sensors_output_folder, exist_ok=True)

for code, sensor_df in sensor_dict.items():
    # index=True keeps the frame's index as the first column of the sheet
    sensor_df.to_excel(os.path.join(sensors_output_folder, f"{code}.xlsx"), index=True)