# All Grab Samples Data Preprocessing

In [1]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Paths

In [2]:
# Top-level project folders, addressed relative to two directories up.
data_folder = os.path.join("..", "..", "data")
utils_folder = os.path.join("..", "..", "utils")

# Sub-folders of the data directory.
raw_data_folder = os.path.join(data_folder, "Raw Data")
datasets_folder = os.path.join(data_folder, "Intermediate Data")

# Folder holding the grab-sample workbooks.
all_grab_samples_path = os.path.join(
    raw_data_folder,
    "Tutti punti - Grab Samples",
)

## Collect all grab samples

### Load all grab samples

In [3]:
# Load the column groups (metadata, features, targets) from the JSON config.
# The file is opened explicitly as UTF-8: column names handled in this
# notebook contain non-ASCII characters (e.g. "°C", "Torbidità"), and relying
# on the platform default encoding can garble them so that later name
# matching silently finds nothing.
with open(
    os.path.join(utils_folder, "columns_types.json"), "r", encoding="utf-8"
) as f:
    column_types = json.load(f)

all_metadata_columns = column_types["metadata_columns"]
all_features_columns = column_types["features_columns"]
all_target_columns = column_types["targets_columns"]

# Canonical column order used later on: metadata, then features, then targets.
all_columns = all_metadata_columns + all_features_columns + all_target_columns

In [None]:
# Notebook display: show the combined list of expected column names.
all_columns

In [8]:
# Read every workbook in the grab-samples folder, keeping only known columns.
grab_samples = []
known_columns = set(all_columns)

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame reproducible between runs.
for file in sorted(os.listdir(all_grab_samples_path)):
    file_path = os.path.join(all_grab_samples_path, file)
    # Skip sub-directories, hidden files and Excel lock files ("~$name.xlsx"):
    # none of them can be parsed as a workbook and they would abort the load.
    if file.startswith((".", "~$")) or not os.path.isfile(file_path):
        continue
    # The header row index depends on the extension: 11 for ".xlsx" files,
    # 15 for everything else.
    header_row = 11 if file.endswith(".xlsx") else 15
    df = pd.read_excel(file_path, header=header_row)
    # Keep the sheet's own column order instead of an arbitrary set order.
    common_cols = [col for col in df.columns if col in known_columns]
    df = df[common_cols]
    grab_samples.append(df)

grab_samples_df = pd.concat(grab_samples, ignore_index=True)

### Fix Redundant Columns

In [6]:
# Merge alternative spellings of a column (the listed values) into the
# canonical column (the key), then drop the alternative spellings.
column_mapping = {
    "Temperatura (°C)": [
        "Temperatura - °C",
    ],
    "Torbidità (NTU)": [
        "Torbidità (NTu)",
    ],
    "Batteri coliformi a 37°C (MPN/100 mL)": [
        "Batteri coliformi a 37°C (MPN / 100 mL)",
    ],
    "Colore (CU)": [
        "Colore (Cu)",
    ],
    "Escherichia coli (MPN/100 mL)": [
        "Escherichia Coli (MPN / 100mL)",
    ],
    "Enterococchi (MPN/100 mL)": [
        "Enterococchi (MPN / 100mL)",
    ],
}

for final_column, original_columns in column_mapping.items():
    # Only use the spellings that actually occur in the loaded data, so a
    # missing canonical column or a missing variant no longer raises KeyError.
    available = [
        name
        for name in [final_column, *original_columns]
        if name in grab_samples_df.columns
    ]
    if not available:
        continue

    # The canonical column (when present) takes priority; each variant only
    # fills the cells that are still empty, in the order listed above.
    merged = grab_samples_df[available[0]]
    for name in available[1:]:
        merged = merged.combine_first(grab_samples_df[name])
    grab_samples_df[final_column] = merged

    grab_samples_df.drop(
        columns=[name for name in available if name != final_column],
        inplace=True,
    )

In [None]:
# Notebook display: column names of the combined frame as a plain list.
grab_samples_df.columns.to_list()

In [None]:
# Notebook display: column names of the combined frame as a pandas Index.
grab_samples_df.columns

In [16]:
# Restrict each configured column group to the columns present in the data.
# List comprehensions keep the order given in the configuration; the previous
# set intersections produced an arbitrary, run-dependent order.
present_columns = set(grab_samples_df.columns)
metadata_columns = [c for c in all_metadata_columns if c in present_columns]
feature_columns = [c for c in all_features_columns if c in present_columns]
target_columns = [c for c in all_target_columns if c in present_columns]

In [17]:
# Remove rows in which every feature and target value is missing ...
measurement_columns = feature_columns + target_columns
grab_samples_df.dropna(
    subset=measurement_columns, how="all", axis="index", inplace=True
)
# ... and then columns that contain no value at all.
grab_samples_df.dropna(how="all", axis="columns", inplace=True)

In [18]:
import re


def convert_string_values(s):
    """Convert a raw cell value to a number.

    Numeric inputs (Python or NumPy scalars) are returned unchanged.
    Strings are parsed after replacing decimal commas with dots:

    * strings containing "<" yield half of the first number found;
    * strings containing ">", "*" or ASCII letters yield the first number
      found;
    * any other string is parsed as a whole with ``float`` (so plain numeric
      text such as "7,5" or "-3" is kept instead of being discarded).

    Returns ``None`` for missing values, for non-string objects and for
    strings that hold no parsable number.
    """
    if isinstance(s, (int, float, np.number)):
        return s
    # Covers None / NA markers as well as any other non-text object, which
    # would otherwise fail on the substring checks below.
    if not isinstance(s, str):
        return None

    text = s.replace(",", ".")

    if (
        "<" in text
        or ">" in text
        or "*" in text
        or re.search("[a-zA-Z]", text)
    ):
        found = re.search(r"\d+\.?\d*", text)
        if found is None:
            return None
        number = float(found.group(0))
        # "<" takes precedence over the other markers and halves the number.
        return number / 2 if "<" in text else number

    # Plain text without any marker: accept it only if the whole string is a
    # valid number, so things like "12-03-2020" are still rejected.
    try:
        return float(text)
    except ValueError:
        return None

In [19]:
def set_label(value):
    """Classify a raw cell value.

    Returns "NaN" for missing values, "Normal" for ``int``/``float`` values,
    "Less than" for strings containing "<", "Greater than" for strings
    containing ">" (checked after "<"), and "NaN" for any other string.
    """
    if pd.isna(value):
        return "NaN"
    if isinstance(value, (int, float)):
        return "Normal"

    # Markers are tested in this order, so "<" wins when both are present.
    for marker, label in (("<", "Less than"), (">", "Greater than")):
        if marker in value:
            return label
    return "NaN"

In [20]:
# Columns selected for numeric conversion: everything that is not metadata.
# Iterating over the DataFrame columns keeps their existing order; the
# previous set difference produced an arbitrary, run-dependent order.
metadata_lookup = set(metadata_columns)
convert_columns = [
    col for col in grab_samples_df.columns if col not in metadata_lookup
]

In [21]:
# For each value column, add a companion "<column>_label" column holding the
# set_label() classification of the raw (not yet converted) cell values.
for col in convert_columns:
    label_col = col + "_label"
    raw_values = grab_samples_df[col]
    grab_samples_df[label_col] = raw_values.map(set_label)

In [22]:
# Apply convert_string_values to every cell of the value columns.
converted_values = grab_samples_df[convert_columns].map(convert_string_values)
grab_samples_df[convert_columns] = converted_values

In [23]:
# convert each column to the right data type
grab_samples_df[convert_columns] = grab_samples_df[convert_columns].astype(
    float
)

In [24]:
# Arrange columns in the configured order (metadata, features, targets),
# placing each "<column>_label" column right after the column it describes.
existing_columns = set(grab_samples_df.columns)
ordered_columns = []
for col in all_columns:
    if col not in existing_columns:
        continue
    label_col = col + "_label"
    ordered_columns.extend(
        name for name in (col, label_col) if name in existing_columns
    )

grab_samples_df = grab_samples_df[ordered_columns]

In [25]:
# Parse the "Data di prelievo" column into datetimes (year/month/day format).
sampling_dates = pd.to_datetime(
    grab_samples_df["Data di prelievo"],
    format="%Y/%m/%d",
)
grab_samples_df["Data di prelievo"] = sampling_dates

In [None]:
# Notebook display: preview the processed frame.
grab_samples_df

In [27]:
# Sort the rows in place by the "Data di prelievo" column (ascending).
grab_samples_df.sort_values(by="Data di prelievo", inplace=True)

In [28]:
# Write the processed samples to the intermediate-data folder.
# The folder is created first if needed: to_excel() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(datasets_folder, exist_ok=True)
grab_samples_df.to_excel(
    os.path.join(datasets_folder, "All grab samples.xlsx"), index=False
)