# Supply Points (Case dell'Acqua) Data Preprocessing

In [None]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Paths

In [None]:
# Root of the project data, relative to the notebook's working directory.
data_folder = os.path.join("..", "data")

# Sub-folders for raw inputs, intermediate datasets and temporary artefacts.
raw_data_folder, datasets_folder, store_folder = (
    os.path.join(data_folder, subfolder)
    for subfolder in ("Raw Data", "Intermediate Data", "temporary results")
)

# Workbook holding the supply-point ("Case dell'acqua") grab samples.
grab_samples_supply_points_path = os.path.join(
    raw_data_folder,
    "Case dell'acqua - Grab Samples (main)/0. Case acqua - 2010-2023.xlsx",
)

## Collect all grab samples for Supply Points

### Load all grab samples

In [None]:
# Read the consolidated grab-samples workbook from the intermediate datasets.
grab_samples_df = pd.read_excel(
    io=os.path.join(datasets_folder, "All grab samples.xlsx")
)

### Import column types

In [None]:
# Load the column classification (metadata / features / targets) from JSON.
# The encoding is explicit so the read does not depend on the platform default.
with open(
    os.path.join(store_folder, "columns_types.json"), "r", encoding="utf-8"
) as f:
    column_types = json.load(f)

all_metadata_columns = column_types["metadata_columns"]
all_feature_columns = column_types["features_columns"]
all_target_columns = column_types["targets_columns"]

all_columns = all_metadata_columns + all_feature_columns + all_target_columns

# Restrict each group to the columns actually present in grab_samples_df.
# Filtering the JSON lists (rather than intersecting sets) keeps the column
# order reproducible between runs; dict.fromkeys drops repeated names, as the
# set intersection did.
metadata_columns = [
    col
    for col in dict.fromkeys(all_metadata_columns)
    if col in grab_samples_df.columns
]
feature_columns = [
    col
    for col in dict.fromkeys(all_feature_columns)
    if col in grab_samples_df.columns
]
target_columns = [
    col
    for col in dict.fromkeys(all_target_columns)
    if col in grab_samples_df.columns
]

### Get Name and ID of Supply Points

In [None]:
# Excel column letters selecting the two lookup columns of the workbook.
column_list = "CS, CT"

# header=4 makes the fifth sheet row (0-based index 4) the column header row.
meta_supply_points_df = pd.read_excel(
    io=grab_samples_supply_points_path,
    header=4,
    usecols=column_list,
)

In [None]:
# Notebook inspection cell: the bare expression renders the lookup table
# read from the supply-points workbook as the cell output.
meta_supply_points_df

### Get Supply Points Grab Samples

In [None]:
# Keep only the grab samples whose (name, code) pair appears in the
# supply-point lookup table.
#
# pandas matches NaN join keys with each other, so blank lookup rows would
# pair with every sample that has no name/code, and repeated lookup rows
# would multiply the matching samples. The lookup is therefore reduced to
# its distinct, non-empty key pairs before joining.
supply_points_df = grab_samples_df.merge(
    meta_supply_points_df.dropna(
        subset=["filtro 1", "filtro 2"], how="all"
    ).drop_duplicates(),
    left_on=["Punto di prelievo", "Codice punto di prelievo"],
    right_on=["filtro 1", "filtro 2"],
    how="inner",
)

In [None]:
# Remove fully identical rows. The "filtro 1" / "filtro 2" helper columns are
# deliberately kept here; a later cell filters on them before dropping them.
supply_points_df = supply_points_df.drop_duplicates()

In [None]:
# Notebook inspection cell: render the merged, de-duplicated supply-point
# samples as the cell output.
supply_points_df

In [None]:
# Keep the rows that carry at least one of the two lookup keys, then discard
# the helper columns, which are no longer needed after the join.
# Filter and drop are chained without `inplace`: mutating a boolean-filtered
# slice in place is what pandas can flag with SettingWithCopyWarning.
supply_points_df = supply_points_df[
    supply_points_df[["filtro 1", "filtro 2"]].notna().any(axis=1)
].drop(columns=["filtro 1", "filtro 2"])

In [None]:
# When several rows share the same "Rapporto di prova", retain only the one
# with the highest number of non-NaN cells (the first such row on ties).
#
# The per-row completeness score is computed on the fly and grouped by the
# report identifier; idxmax yields the index label of the best row per group,
# and .loc selects exactly those rows.
# NOTE(review): groupby skips NaN keys by default, so rows without a
# "Rapporto di prova" are not kept — confirm this is intended.
supply_points_df = supply_points_df.loc[
    supply_points_df.count(axis=1)
    .groupby(supply_points_df["Rapporto di prova"])
    .idxmax()
]

In [None]:
# Two clean-up passes, applied in order:
#   1. discard rows whose non-metadata columns are all NaN;
#   2. discard columns that are NaN for every remaining row.
supply_points_df = supply_points_df.dropna(
    axis="index",
    how="all",
    subset=[
        name for name in supply_points_df.columns if name not in metadata_columns
    ],
).dropna(axis="columns", how="all")

In [None]:
# Notebook inspection cell: render the frame after the all-NaN rows and
# columns have been removed.
supply_points_df

In [None]:
# Trim leading/trailing whitespace from the sampling-point codes.
# NOTE(review): .str.strip() returns NaN for non-string entries — confirm the
# column holds text only.
supply_points_df = supply_points_df.assign(
    **{
        "Codice punto di prelievo": lambda frame: frame[
            "Codice punto di prelievo"
        ].str.strip()
    }
)

In [None]:
# Notebook inspection cell: render the frame after the sampling-point codes
# have been whitespace-stripped.
supply_points_df

In [None]:
# Persist the supply-point samples next to the other intermediate datasets;
# the DataFrame index is not written to the sheet.
supply_points_df.to_excel(
    excel_writer=os.path.join(
        datasets_folder, "All grab samples - supply points.xlsx"
    ),
    index=False,
)