# Supply Points (Case dell'Acqua) Data Preprocessing

In [None]:
import os
import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Paths

In [None]:
# Local project folders, addressed relative to this notebook.
utils_folder = os.path.join("..", "..", "utils")
local_data_folder = os.path.join("..", "..", "data")
raw_data_folder = os.path.join(local_data_folder, "Raw Data")
intermediate_data_folder = os.path.join(local_data_folder, "Intermediate Data")
clean_data_folder = os.path.join(local_data_folder, "Clean Data")

# Cloud folder: the first line of utils/onedrive.txt is used as the base path.
with open(os.path.join(utils_folder, "onedrive.txt"), "r") as f:
    cloud_data_folder = os.path.join(f.readline().strip(), "Case dell'acqua")

grab_samples_folder = os.path.join(cloud_data_folder, "Grab Samples")
sensors_folder = os.path.join(cloud_data_folder, "Sensori")

# Inputs used when combining the historical grab samples.
all_grab_samples_path = os.path.join(raw_data_folder, "Tutti punti - Grab Samples")
grab_samples_supply_points_path = os.path.join(
    raw_data_folder,
    "Case dell'acqua - Grab Samples (main)/0. Case acqua - 2010-2023.xlsx",
)

In [None]:
# Tra i grab non c'è l'ORP, mentre
# tra i sensori non c'è DOC (c'è il TOC) e L'UVA254

# Quindi in comune abbiamo:
# Color, TOC, Nitrati, Turbidity, pH, Temperature, Conductivity, Free Chlorine

# Load Grab Samples

In [None]:
# Read every workbook in the grab-samples folder and stack them into one frame.
# - hidden files (e.g. ".DS_Store") and Excel lock files ("~$...") are skipped
#   because pd.read_excel cannot parse them;
# - files are visited in sorted order so the row order is reproducible;
# - ignore_index=True gives every row a unique label, so later label-based
#   operations (e.g. DataFrame.drop) cannot hit rows coming from another file.
grab_frames = [
    pd.read_excel(os.path.join(grab_samples_folder, filename))
    for filename in sorted(os.listdir(grab_samples_folder))
    if not filename.startswith((".", "~$"))
]

# pd.concat raises on an empty list, so fall back to an empty frame.
grab_df = (
    pd.concat(grab_frames, ignore_index=True) if grab_frames else pd.DataFrame()
)

In [None]:
grab_df.head(5)

In [None]:
# Load the column classification (metadata / features / targets).
# The encoding is pinned to UTF-8 because the column names handled in this
# notebook contain non-ASCII characters (e.g. "°", "à"); relying on the
# platform default encoding could decode them differently and break matching.
with open(os.path.join(utils_folder, "columns_types.json"), encoding="utf-8") as f:
    column_types = json.load(f)

In [None]:
# Unpack the three column groups declared in columns_types.json.
metadata_columns, features_columns, targets_columns = (
    column_types[group]
    for group in ("metadata_columns", "features_columns", "targets_columns")
)

In [None]:
# Keep only the declared columns that exist in grab_df.
# The order of the JSON lists is preserved (dict.fromkeys removes duplicates
# without reordering); a set intersection would yield an arbitrary order that
# can change from one run to the next.
common_metadata_columns = [
    column for column in dict.fromkeys(metadata_columns) if column in grab_df.columns
]
common_features_columns = [
    column for column in dict.fromkeys(features_columns) if column in grab_df.columns
]
common_targets_columns = [
    column for column in dict.fromkeys(targets_columns) if column in grab_df.columns
]

In [None]:
# remove columns that are not in the column_types.json file
grab_df = grab_df[common_metadata_columns + common_features_columns + common_targets_columns]

In [None]:
grab_df.head(5)

## Fix LOD values

In [None]:
import re


def convert_string_values(s):
    """Convert a raw cell value into a number, or ``None`` when impossible.

    Rules, applied in order:

    * numeric values (Python or NumPy) are returned unchanged;
    * missing values (``None``, ``NaN``, ``pd.NA``) give ``None``;
    * in strings, a decimal comma is treated as a decimal point;
    * ``"<x"`` gives ``x / 2``;
    * ``">x"`` gives ``x``;
    * strings containing ``*`` or letters give the first number they contain;
    * any other string is parsed as a plain number (``"7,5"`` gives ``7.5``).

    Strings from which no number can be extracted give ``None``.
    """
    if isinstance(s, (int, float, np.number)):
        return s
    if pd.isna(s):
        return None

    s = s.replace(",", ".")

    if "<" in s or ">" in s or "*" in s or re.search("[a-zA-Z]", s):
        number = re.search(r"\d+\.?\d*", s)
        if number is None:
            return None
        value = float(number.group())
        # "<" is checked first, so it wins when both signs are present.
        return value / 2 if "<" in s else value

    # No qualifier at all: the string should be a bare number.
    try:
        return float(s)
    except ValueError:
        return None

In [None]:
def set_label(value):
    """Classify a raw cell value.

    Returns ``"NaN"`` for missing values, ``"Normal"`` for ints and floats,
    ``"Less than"`` / ``"Greater than"`` for values containing ``<`` / ``>``
    (``<`` is checked first), and ``"NaN"`` for anything else.
    """
    if pd.isna(value):
        return "NaN"
    if isinstance(value, (int, float)):
        return "Normal"
    for marker, label in (("<", "Less than"), (">", "Greater than")):
        if marker in value:
            return label
    return "NaN"

In [None]:
# Add a "<column>_label" companion column for every feature and target,
# computed from the raw values before they are converted to numbers.
for column in [*common_features_columns, *common_targets_columns]:
    grab_df.loc[:, f"{column}_label"] = grab_df[column].apply(set_label)

In [None]:
# Convert the raw feature cells, then the raw target cells, to numbers.
for value_columns in (common_features_columns, common_targets_columns):
    grab_df[value_columns] = grab_df[value_columns].map(convert_string_values)

In [None]:
grab_df.columns.to_list()

# Load Sensor Samples

In [None]:
# Collect the sensor exports per house. The house name is the part of the
# file name before the first underscore.
sensor_frames = {}

for sensor_file in sorted(os.listdir(sensors_folder)):
    sensor_folder = os.path.join(sensors_folder, sensor_file)

    # Only sub-folders are scanned; stray files such as ".DS_Store" are skipped.
    if not os.path.isdir(sensor_folder):
        continue

    for filename in sorted(os.listdir(sensor_folder)):
        # Ignore non-.xlsx files and the "~$" lock files of open workbooks.
        if not filename.endswith(".xlsx") or filename.startswith("~$"):
            continue

        house_code = filename.split("_")[0]
        sensor_frames.setdefault(house_code, []).append(
            pd.read_excel(os.path.join(sensor_folder, filename), header=1)
        )

# Concatenate once per house instead of growing a frame inside the loop.
sensor_dict = {
    house_code: pd.concat(frames) for house_code, frames in sensor_frames.items()
}

In [None]:
sensor_dict['via TABACCHI'].columns.to_list()

In [None]:
# Raw export headers -> short column names used in the rest of the notebook.
# Several "Measurement interval" headers map to the same "DateTime" name.
columns_mapping = {
    "Measurement interval=900[sec] (Export-Aggregation disabled)": "DateTime",
    "Measurement interval=999[sec] (Export-Aggregation disabled)": "DateTime",
    "Measurement interval=0[sec] (Export-Aggregation disabled)": "DateTime",
    "COLORtrue - Measured value [Hazen-eq.] (Limit:0.00-300.00)": "Color (CU)",
    "TOCeq - Measured value [mg/l] (Limit:0.00-22.00)": "TOC (mg/l)",
    "NO3eq - Measured value [mg/l] (Limit:0.00-88.00)": "Nitrate (mg/l)",
    "UV254t - Measured value [Abs/m] (Limit:0.00-71.00)": "UVA254 (1/m)",
    "Turbidity - Measured value [FTUeq] (Limit:0.00-170.00)": "Turbidity (FTU)",
    "pH - Measured value (Limit:0.00-14.00)": "pH",
    "Temperature - Measured value [C] (Limit:-5.00-100.00)": "Temperature (°C)",
    "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)": "Conductivity (μS/cm)",
    "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00)": "Free Chlorine (mg/l)",
}

# Unique target names in declaration order. dict.fromkeys de-duplicates
# without reordering, so every house gets the same, reproducible column order
# (a set would give an arbitrary one).
columns = list(dict.fromkeys(columns_mapping.values()))

for house_code, df in sensor_dict.items():
    # Rename, then keep only the mapped columns. Replacing the value of an
    # existing key is safe while iterating over the dict.
    sensor_dict[house_code] = df.rename(columns=columns_mapping)[columns]

# Processing

## Grab Samples

In [None]:
grab_df.drop(
    columns=[
        'Codice punto di prelievo',
        'Rapporto di prova',
    ],
    inplace=True
)

In [None]:
grab_df['Punto di prelievo'].unique()

In [None]:
# change name of 'Punto di prelievo' values to match codes
def change_name(name):
    """Map a full sampling-point description to its short station name.

    The first marker found in ``name`` (case-sensitive substring test, in the
    order listed below) decides the result. Names matching no marker, and
    non-string values such as ``NaN``, are returned unchanged.
    """
    if not isinstance(name, str):
        # Missing or non-text values cannot be matched; pass them through.
        return name

    markers = (
        ("Tognazzi", "Tognazzi"),
        ("Tabacchi", "Tabacchi"),
        ("Gramsci", "Gramsci"),
        ("Berna", "Berna"),
        ("Bande Nere", "Bande Nere"),
        ("Piazzale Giovanni", "Bande Nere"),
        ("Prealpi", "Prealpi"),
        ("Chiostergi", "Chiostergi"),
        # "Montevid" also covers the full spelling "Montevideo".
        ("Montevid", "Montevideo"),
        ("Fortunato", "Fortunato"),
    )
    for marker, station in markers:
        if marker in name:
            return station
    return name

In [None]:
grab_df['Punto di prelievo'] = grab_df['Punto di prelievo'].map(change_name)

In [None]:
grab_df['Data di prelievo'] = pd.to_datetime(grab_df['Data di prelievo'])

In [None]:
grab_df

## Sensor Samples

In [None]:
sensor_dict.keys()

In [None]:
# change the name of the keys to match the names in the grab_df
# Re-key the sensor frames so their names match those used in grab_df.
for sensor_key, grab_name in (
    ('via TABACCHI', 'Tabacchi'),
    ('via Tognazzi', 'Tognazzi'),
    ('Piazza Prealpi', 'Prealpi'),
):
    sensor_dict[grab_name] = sensor_dict.pop(sensor_key)

In [None]:
# Give every sensor frame a DatetimeIndex built from its 'DateTime' column.
# The loop runs over the sensor frames themselves, not over the grab-sample
# codes, so no frame is left without the index required by the time
# interpolation below, and a grab code without sensor data cannot raise
# KeyError. list(...) snapshots the keys before values are replaced.
for code in list(sensor_dict):
    sensor_df = sensor_dict[code]
    sensor_df['DateTime'] = pd.to_datetime(sensor_df['DateTime'])
    sensor_dict[code] = sensor_df.set_index('DateTime')

# Missing Values

## Grab

In [None]:
# Report, per sampling point, every feature/target column that has gaps.
value_columns = common_features_columns + common_targets_columns
for code in grab_df['Punto di prelievo'].unique():
    code_df = grab_df[grab_df['Punto di prelievo'] == code]
    for column in value_columns:
        missing_values = code_df[column].isna().sum()
        if missing_values <= 0:
            continue
        print(f"{code} has {missing_values} missing values in column {column}")

In [None]:
grab_df

In [None]:
# Report, per sampling point, how many rows have at least one missing
# feature or target value.
for code in grab_df['Punto di prelievo'].unique():
    code_df = grab_df[grab_df['Punto di prelievo'] == code]
    incomplete_rows = code_df[common_features_columns + common_targets_columns].isna().any(axis=1)
    missing_values = incomplete_rows.sum()
    if missing_values > 0:
        print(f"{code} has {missing_values} rows with missing values")

In [None]:
# Remove the Berna rows that have at least one missing feature/target value.
# Rows are removed with a boolean mask, not by index label: if index labels
# repeat (e.g. after concatenating files without resetting the index), a
# label-based drop would also remove unrelated rows sharing those labels.
berna_missing_mask = (grab_df['Punto di prelievo'] == "Berna") & (
    grab_df[common_features_columns + common_targets_columns].isna().any(axis=1)
)

# Labels of the removed rows, kept under the name the notebook already used.
row_index = grab_df.index[berna_missing_mask]

grab_df = grab_df[~berna_missing_mask].copy()


In [None]:
# for the moment no imputation is done

## Sensor

In [None]:
# Report every sensor column that contains missing values, per house.
for code, sensor_df in sensor_dict.items():
    for column in sensor_df.columns:
        missing_values = sensor_df[column].isna().sum()
        if missing_values <= 0:
            continue
        print(f"{code} has {missing_values} missing values in column {column}")

In [None]:
# The number of missing values is very low, so gaps are filled by
# time-weighted interpolation.
# Only the value of each existing key is replaced: popping and re-inserting
# keys while iterating over the same dict reorders it mid-iteration and can
# raise "RuntimeError: dictionary keys changed during iteration".
for code, sensor_df in sensor_dict.items():
    sensor_dict[code] = sensor_df.interpolate(method='time')

# Outliers

## Sensor Samples

In [None]:
# Notes

# - GRAMSCI
# Turbidity selected upper threshold is 1.5
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 0.8
# UVA254 selected upper threshold is 1.5


# - BERNA
# Turbidity selected upper threshold is 1.5
# Temperature selected upper threshold is 19.5
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 0.2

# - BANDE NERE
# Turbidity selected upper threshold is 1
# Conductivity selected lower threshold is 400
# Nitrate selected lower threshold is 20
# UVA254 selected upper threshold is 0.4

# - CHIOSTERGI
# Free Chlorine selected upper threshold is 0.06

# - FORTUNATO
# Turbidity selected upper threshold is 1
# Conductivity selected lower threshold is 400
# Nitrate selected lower threshold is 25
# UVA254 selected upper threshold is 0.4

# - MONTEVIDEO
# Color selected upper threshold is 4
# Turbidity selected upper threshold is 1
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 1
# Nitrate selected lower threshold is 20
# TOC selected upper threshold is 1
# UVA254 selected upper threshold is 4

# - PREALPI
# Turbidity selected upper threshold is 0.7
# UVA254 selected upper threshold is 1.5

# - TABACCHI

# - TOGNAZZI
# Conductivity selected lower threshold is 400
# Free Chlorine selected upper threshold is 0.4


In [None]:
# Manually selected thresholds, per house and column: (value, annotation).
plot_thresholds = {
    "Gramsci": {
        "Turbidity (FTU)": (1.5, "Upper Threshold"),
        "Conductivity (μS/cm)": (400, "Lower Threshold"),
        "Free Chlorine (mg/l)": (0.8, "Upper Threshold"),
        "UVA254 (1/m)": (1.5, "Upper Threshold"),
    },
    "Berna": {
        "Turbidity (FTU)": (1.5, "Upper Threshold"),
        "Temperature (°C)": (19.5, "Upper Threshold"),
        "Conductivity (μS/cm)": (400, "Lower Threshold"),
        "Free Chlorine (mg/l)": (0.2, "Upper Threshold"),
    },
    "Bande Nere": {
        "Turbidity (FTU)": (1, "Upper Threshold"),
        "Conductivity (μS/cm)": (400, "Lower Threshold"),
        "Nitrate (mg/l)": (20, "Lower Threshold"),
        "UVA254 (1/m)": (0.4, "Upper Threshold"),
    },
    "Chiostergi": {
        "Free Chlorine (mg/l)": (0.06, "Upper Threshold"),
    },
    "Fortunato": {
        "Turbidity (FTU)": (1, "Upper Threshold"),
        "Conductivity (μS/cm)": (400, "Lower Threshold"),
        "Nitrate (mg/l)": (25, "Lower Threshold"),
        "UVA254 (1/m)": (0.4, "Upper Threshold"),
    },
    "Montevideo": {
        "Color (CU)": (4, "Upper Threshold"),
        "Turbidity (FTU)": (1, "Upper Threshold"),
        "Conductivity (μS/cm)": (400, "Lower Threshold"),
        "Free Chlorine (mg/l)": (1, "Upper Threshold"),
        "Nitrate (mg/l)": (20, "Lower Threshold"),
        "TOC (mg/l)": (1, "Upper Threshold"),
        "UVA254 (1/m)": (4, "Upper Threshold"),
    },
    "Prealpi": {
        "Turbidity (FTU)": (0.7, "Upper Threshold"),
        "UVA254 (1/m)": (1.5, "Upper Threshold"),
    },
    # No thresholds were selected for Tabacchi.
    "Tabacchi": {},
    "Tognazzi": {
        "Conductivity (μS/cm)": (400, "Lower Threshold"),
        "Free Chlorine (mg/l)": (0.4, "Upper Threshold"),
    },
}

# One figure per house and column: the series plus, when a threshold was
# selected for that pair, a dotted red horizontal line.
for code, sensor_df in sensor_dict.items():
    house_thresholds = plot_thresholds.get(code, {})
    for column in sensor_df.columns:
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=sensor_df.index, y=sensor_df[column], mode='lines', name=column))

        if column in house_thresholds:
            threshold_value, threshold_text = house_thresholds[column]
            fig.add_hline(y=threshold_value, line_dash="dot", line_color="red", annotation_text=threshold_text)

        fig.update_layout(title=f"{code} - {column}")
        fig.show()

In [None]:
# Remove the rows that have values outside the selected thresholds, then
# rename the columns to the naming used by the grab samples.
#
# Thresholds per house and column as (lower, upper); None means "no bound".
# A row is removed when a value is < lower or > upper. Houses that are not
# listed (e.g. Tabacchi) keep all their rows; previously such a house reused
# the row labels computed for the house processed before it.
outlier_thresholds = {
    "Gramsci": {
        "Turbidity (FTU)": (None, 1.5),
        "Conductivity (μS/cm)": (400, None),
        "Free Chlorine (mg/l)": (None, 0.8),
        "UVA254 (1/m)": (None, 1.5),
    },
    "Berna": {
        "Turbidity (FTU)": (None, 1.5),
        "Temperature (°C)": (None, 19.5),
        "Conductivity (μS/cm)": (400, None),
        "Free Chlorine (mg/l)": (None, 0.2),
    },
    "Bande Nere": {
        "Turbidity (FTU)": (None, 1),
        "Conductivity (μS/cm)": (400, None),
        "Nitrate (mg/l)": (20, None),
        "UVA254 (1/m)": (None, 0.4),
    },
    "Chiostergi": {
        "Free Chlorine (mg/l)": (None, 0.06),
    },
    "Fortunato": {
        "Turbidity (FTU)": (None, 1),
        "Conductivity (μS/cm)": (400, None),
        "Nitrate (mg/l)": (25, None),
        "UVA254 (1/m)": (None, 0.4),
    },
    "Montevideo": {
        "Color (CU)": (None, 4),
        "Turbidity (FTU)": (None, 1),
        "Conductivity (μS/cm)": (400, None),
        "Free Chlorine (mg/l)": (None, 1),
        "Nitrate (mg/l)": (20, None),
        "TOC (mg/l)": (None, 1),
        "UVA254 (1/m)": (None, 4),
    },
    "Prealpi": {
        "Turbidity (FTU)": (None, 0.7),
        "UVA254 (1/m)": (None, 1.5),
    },
    "Tognazzi": {
        "Conductivity (μS/cm)": (400, None),
        "Free Chlorine (mg/l)": (None, 0.4),
    },
}

sensor_columns_renaming = {
    'Conductivity (μS/cm)': 'Conductivity (uS/cm)',
    'TOC (mg/l)': 'TOC (mg/L)',
    'Nitrate (mg/l)': 'Nitrate (mg/L)',
    'Free Chlorine (mg/l)': 'Free Chlorine (mg/L)',
    'Turbidity (FTU)': 'Turbidity (NTU)',
}

for code in sensor_dict.keys():
    sensor_df = sensor_dict[code]

    # Positional boolean mask: unaffected by repeated timestamps in the index.
    outlier_mask = np.zeros(len(sensor_df), dtype=bool)
    for column, (lower, upper) in outlier_thresholds.get(code, {}).items():
        if lower is not None:
            outlier_mask |= (sensor_df[column] < lower).to_numpy()
        if upper is not None:
            outlier_mask |= (sensor_df[column] > upper).to_numpy()

    # Replacing the value of an existing key is safe during iteration.
    sensor_dict[code] = sensor_df.loc[~outlier_mask].rename(
        columns=sensor_columns_renaming
    )

# Combine Historical Grab Samples

In [None]:
# Read every historical workbook, keep only the columns declared in
# columns_types.json and stack the results.
wanted_columns = set(metadata_columns + features_columns + targets_columns)

grab_samples = []

for file in sorted(os.listdir(all_grab_samples_path)):
    # Hidden files (e.g. ".DS_Store") and Excel lock files are not workbooks.
    if file.startswith((".", "~$")):
        continue

    # The header row is at offset 11 in ".xlsx" files and 15 in all others.
    header_row = 11 if file.endswith(".xlsx") else 15
    df = pd.read_excel(os.path.join(all_grab_samples_path, file), header=header_row)

    # Filtering in the workbook's own column order keeps the result
    # reproducible; a set intersection would yield an arbitrary order.
    common_cols = [column for column in df.columns if column in wanted_columns]
    grab_samples.append(df[common_cols])

grab_samples_df = pd.concat(grab_samples, ignore_index=True)

In [None]:
grab_samples_df

In [None]:
column_list = "CS, CT"

meta_supply_points_df = pd.read_excel(
    grab_samples_supply_points_path, usecols=column_list, header=4
)

In [None]:
meta_supply_points_df

In [None]:
hist_grab_df = grab_samples_df.merge(
    meta_supply_points_df,
    left_on=["Punto di prelievo", "Codice punto di prelievo"],
    right_on=["filtro 1", "filtro 2"],
    how="inner",
)

In [None]:
# supply_points_df.drop(columns=["filtro 1", "filtro 2"], inplace=True)
hist_grab_df.drop_duplicates(inplace=True)

In [None]:
# Merge differently spelled columns into one canonical column:
# key = canonical name, value = list of variant spellings.
column_mapping = {
    "Temperatura (°C)": [
        "Temperatura - °C",
        "Temperatura (al prelievo) (°C)",
    ],
    "Cloro residuo libero (mg/L di Cl2)": [
        "Cloro residuo libero (al prelievo) (mg/L di Cl2)",
    ],
    "Torbidità (NTU)": [
        "Torbidità (NTu)",
    ],
    "Batteri coliformi a 37°C (MPN/100 mL)": [
        "Batteri coliformi a 37°C (MPN / 100 mL)",
    ],
    "Colore (CU)": [
        "Colore (Cu)",
    ],
    "Escherichia coli (MPN/100 mL)": [
        "Escherichia Coli (MPN / 100mL)",
    ],
    "Enterococchi (MPN/100 mL)": [
        "Enterococchi (MPN / 100mL)",
    ],
}

for final_column, original_columns in column_mapping.items():
    # Only variants that exist in the data can be merged; absent ones are
    # skipped instead of raising KeyError.
    present_columns = [
        column for column in original_columns if column in hist_grab_df.columns
    ]
    for original_column in present_columns:
        if final_column in hist_grab_df.columns:
            # Values already in the canonical column win; gaps are filled
            # from the variant.
            hist_grab_df[final_column] = hist_grab_df[final_column].combine_first(
                hist_grab_df[original_column]
            )
        else:
            # Canonical column absent: start it from the first variant found.
            hist_grab_df[final_column] = hist_grab_df[original_column]
    hist_grab_df.drop(columns=present_columns, inplace=True)

In [None]:
hist_grab_df.columns.to_list()

In [None]:
# Re-read the three column groups declared in columns_types.json.
metadata_columns, features_columns, targets_columns = (
    column_types[group]
    for group in ("metadata_columns", "features_columns", "targets_columns")
)

In [None]:
# Declared columns that exist in hist_grab_df, kept in the order of the JSON
# lists (dict.fromkeys removes duplicates without reordering); a set
# intersection would yield an arbitrary, run-dependent order.
hist_common_metadata_columns = [
    column for column in dict.fromkeys(metadata_columns) if column in hist_grab_df.columns
]
hist_common_features_columns = [
    column for column in dict.fromkeys(features_columns) if column in hist_grab_df.columns
]
hist_common_targets_columns = [
    column for column in dict.fromkeys(targets_columns) if column in hist_grab_df.columns
]

In [None]:
def print_columns(title, columns):
    """Print ``title`` followed by one bulleted line per column and a blank line."""
    lines = [f"{title}:"]
    lines.extend(f"  - {col}" for col in columns)
    print("\n".join(lines))
    print()

# Print the historical and current column groups side by side for comparison.
for group_title, group_columns in (
    ("Historical Common Metadata Columns", hist_common_metadata_columns),
    ("Common Metadata Columns", common_metadata_columns),
    ("Historical Common Features Columns", hist_common_features_columns),
    ("Common Features Columns", common_features_columns),
    ("Historical Common Targets Columns", hist_common_targets_columns),
    ("Common Targets Columns", common_targets_columns),
):
    print_columns(group_title, group_columns)

In [None]:
# Map historical column names to the current ones, for features and targets.
# Both name lists are sorted and paired position by position.
# NOTE(review): this pairing assumes both lists have the same length and sort
# into the same relative order; zip silently truncates otherwise - confirm
# with the lists printed above.
mapping_features = {
    hist_name: current_name
    for hist_name, current_name in zip(
        sorted(hist_common_features_columns), sorted(common_features_columns)
    )
}
mapping_targets = {
    hist_name: current_name
    for hist_name, current_name in zip(
        sorted(hist_common_targets_columns), sorted(common_targets_columns)
    )
}

In [None]:
hist_grab_df.rename(columns=mapping_features, inplace=True)
hist_grab_df.rename(columns=mapping_targets, inplace=True)

In [None]:
hist_grab_df.columns.to_list()

In [None]:
hist_grab_df.drop(
    columns=[
        'filtro 1',
        'filtro 2',
    ],
    inplace=True
)

In [None]:
# Keep only the historical rows whose 'Punto di prelievo' or
# 'Codice punto di prelievo' contains one of the sampling-point names present
# in grab_df (case-insensitive).
# - names are escaped so they are matched literally, not as regex syntax;
# - non-string entries (e.g. NaN) are left out because they cannot be joined;
# - .copy() makes the result an independent frame, since it is modified later.
known_points = [
    point for point in grab_df['Punto di prelievo'].unique() if isinstance(point, str)
]
points_pattern = "|".join(re.escape(point) for point in known_points)

matches_point = hist_grab_df['Punto di prelievo'].str.contains(
    points_pattern, case=False, na=False
)
matches_code = hist_grab_df['Codice punto di prelievo'].str.contains(
    points_pattern, case=False, na=False
)

hist_grab_df = hist_grab_df[matches_point | matches_code].copy()

In [None]:
hist_grab_df

In [None]:
grab_df['Punto di prelievo'].unique()

In [None]:
grab_df.columns.to_list()

In [None]:
from operator import contains


# replace a value with the grab_df['Punto di prelievo'] name it contains, if any; otherwise keep the value
def change_name(value):
    """Return the known sampling-point name contained in ``value``.

    The names are the unique values of ``grab_df['Punto di prelievo']``; the
    first one found as a substring of ``value`` is returned. Values containing
    no known name, and non-string values such as ``NaN``, are returned
    unchanged.
    """
    if not isinstance(value, str):
        # Missing values cannot be searched; pass them through.
        return value
    for name in grab_df['Punto di prelievo'].unique():
        # Non-string names (e.g. NaN) are not valid substrings; skip them.
        if isinstance(name, str) and name in value:
            return name
    return value

In [None]:
hist_grab_df['Punto di prelievo'] = hist_grab_df['Punto di prelievo'].map(change_name)

In [None]:
# Add a "<column>_label" column for every feature/target present in
# hist_grab_df, computed from the raw values.
for column in common_features_columns + common_targets_columns:
    if column in hist_grab_df.columns:
        hist_grab_df.loc[:, f"{column}_label"] = hist_grab_df[column].apply(set_label)

In [None]:
# Convert the raw cells of every feature/target present in hist_grab_df.
for column in common_features_columns + common_targets_columns:
    if column in hist_grab_df.columns:
        hist_grab_df[column] = hist_grab_df[column].map(convert_string_values)

In [None]:
grab_df.shape, hist_grab_df.shape

In [None]:
# give me the columns that are in grab_df but not in hist_grab_df
for column in grab_df.columns:
    if column not in hist_grab_df.columns:
        print(column)

In [None]:
for column in hist_grab_df.columns:
    if column not in grab_df.columns:
        print(column)

In [None]:
hist_grab_df.drop(
    columns=[
        'Rapporto di prova',
        'Codice punto di prelievo',
    ],
    inplace=True
)

In [None]:
grab_df = pd.concat([grab_df, hist_grab_df], ignore_index=True)

In [None]:
grab_df.shape

In [None]:
grab_df.columns.to_list()

In [None]:
# %%script false --no-raise-error
# FIXME this piece of code needs to be rearranged
columns_mapping = {
    "Data di prelievo": "DateTime",
    "Punto di prelievo": "Code",
    "Colore (Cu)": "Color (CU)",
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free Chlorine (mg/L)",
    "Concentrazione ioni idrogeno (unità pH)": "pH",
    "Conduttività a 20°C (µS/cm)": "Conductivity (uS/cm)",
    "TOC - carbonio organico totale (mg/L di C)": "TOC (mg/L)",
    "Temperatura (al prelievo) (°C)": "Temperature (°C)",
    "Torbidità (NTu)": "Turbidity (NTU)",
    "Nitrati (mg/L)": "Nitrate (mg/L)",
}

grab_df.rename(columns=columns_mapping, inplace=True)

# Combine with First Batch

## Grab

In [None]:
first_batch_grab_df = pd.read_excel(
    os.path.join(clean_data_folder, "Riunione 24-04-2024", "Grab Samples.xlsx")
)

In [None]:
first_batch_grab_df.head(5)

In [None]:
# Keep only the rows whose 'Type' is 'Ingresso', then drop that column.
first_batch_grab_df = first_batch_grab_df.loc[
    first_batch_grab_df['Type'] == 'Ingresso'
].drop(columns=['Type'])

In [None]:
grab_df.head(5)

In [None]:
first_batch_grab_df['Code'].unique()

In [None]:
grab_df['Code'].unique()

In [None]:
code_mapping = {
    'HOUSE_BANDENERE': 'Bande Nere',
    'HOUSE_BERNA': 'Berna',
    'HOUSE_CHIOSTERGI': 'Chiostergi',
    'HOUSE_FORTUNATO': 'Fortunato',
    'HOUSE_GRAMSCI': 'Gramsci',
    'HOUSE_MONTEVIDEO': 'Montevideo',
    'HOUSE_PREALPI': 'Prealpi',
    'HOUSE_TABACCHI': 'Tabacchi',
    'HOUSE_TOGNAZZI': 'Tognazzi',
}

In [None]:
first_batch_grab_df['Code'] = first_batch_grab_df['Code'].map(code_mapping)

In [None]:
grab_df.shape, first_batch_grab_df.shape

In [None]:
for column in grab_df.columns:
    if column not in first_batch_grab_df.columns:
        print(column)

In [None]:
for column in first_batch_grab_df.columns:
    if column not in grab_df.columns:
        print(column)

In [None]:
# combine the two dataframes
grab_df = pd.concat([grab_df, first_batch_grab_df])

In [None]:
grab_df

In [None]:
grab_df.sort_values(by='DateTime', inplace=True)

## Sensors

In [None]:
first_batch_sensor_df = pd.read_excel(
    os.path.join(clean_data_folder, "Riunione 24-04-2024", "Sensor Data.xlsx")
)

In [None]:
first_batch_sensor_df

In [None]:
first_batch_sensor_df['Code'] = first_batch_sensor_df['Code'].map(code_mapping)

In [None]:
first_batch_sensor_df.drop(
    columns=[
        'Flow Rate (m³/s)'
    ],
    inplace=True
)

In [None]:
sensor_dict['Bande Nere']

In [None]:
# Append the first-batch sensor rows to the matching house frame.
# Codes that are NaN (not covered by code_mapping) are skipped, and a house
# that has first-batch data only gets a new entry instead of raising KeyError.
for code in first_batch_sensor_df['Code'].dropna().unique():
    df = first_batch_sensor_df[first_batch_sensor_df['Code'] == code].copy()
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    df = df.set_index('DateTime').drop(columns=['Code'])

    frames = [sensor_dict[code], df] if code in sensor_dict else [df]

    # Sort by timestamp so rows from both batches are interleaved correctly.
    sensor_dict[code] = pd.concat(frames).sort_index()

In [None]:
sensor_dict['Berna']

# Store Data

In [None]:
grab_df.to_excel(os.path.join(clean_data_folder, "grab.xlsx"), index=False)

In [None]:
# Write one workbook per house into "<clean data>/sensors".
sensors_output_folder = os.path.join(clean_data_folder, "sensors")

# makedirs(exist_ok=True) creates missing parent folders too and does not
# fail when the folder already exists.
os.makedirs(sensors_output_folder, exist_ok=True)

for code, sensor_df in sensor_dict.items():
    # index=True keeps the DateTime index as the first column.
    sensor_df.to_excel(os.path.join(sensors_output_folder, f"{code}.xlsx"), index=True)