# All Grab Samples Data Preprocessing

In [25]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Paths

In [26]:
# All locations are expressed relative to the current working directory.
data_folder = os.path.join("..", "..", "data")

# Raw inputs, and the folder that receives the processed datasets.
raw_data_folder = os.path.join(data_folder, "Raw Data")
datasets_folder = os.path.join(data_folder, "Intermediate Data")

# Folder holding the grab-sample workbooks that are loaded further down.
all_grab_samples_path = os.path.join(raw_data_folder, "Tutti punti - Grab Samples")

# Folder holding shared resources such as the column-type definitions.
utils_folder = os.path.join("..", "..", "utils")

## Collect all grab samples

### Load all grab samples

In [27]:
# Load the column groups (metadata / features / targets) from the shared JSON
# definition. The encoding is pinned to UTF-8 so that column names containing
# non-ASCII characters decode identically on every platform instead of
# depending on the locale's default encoding.
with open(
    os.path.join(utils_folder, "columns_types.json"), "r", encoding="utf-8"
) as f:
    column_types = json.load(f)

all_metadata_columns = column_types["metadata_columns"]
all_features_columns = column_types["features_columns"]
all_target_columns = column_types["targets_columns"]

# Canonical column order used later on: metadata, then features, then targets.
all_columns = all_metadata_columns + all_features_columns + all_target_columns

In [42]:
# Read every workbook in the grab-samples folder, keeping only known columns.
grab_samples = []
known_columns = set(all_columns)

# os.listdir() returns entries in arbitrary order; sorting makes the load
# (and therefore the concatenation) order reproducible between runs.
for file in sorted(os.listdir(all_grab_samples_path)):
    # Hidden files and Excel lock files ("~$...") are not workbooks and would
    # make read_excel fail, so they are skipped.
    if file.startswith((".", "~$")):
        continue

    # ".xlsx" workbooks have their header on row index 11, any other file on 15.
    header_row = 11 if file.endswith(".xlsx") else 15
    df = pd.read_excel(
        os.path.join(all_grab_samples_path, file), header=header_row
    )

    # Keep the sheet's own column order rather than an arbitrary set order.
    common_cols = [name for name in df.columns if name in known_columns]
    df = df[common_cols]
    grab_samples.append(df)

grab_samples_df = pd.concat(grab_samples, ignore_index=True)

### Fix Redundant Columns

In [None]:
# Show the column names of the concatenated frame.
grab_samples_df.columns.to_list()

In [29]:
# Restrict each column group to the columns actually present in the data.
present_columns = set(grab_samples_df.columns)

metadata_columns = list(present_columns.intersection(all_metadata_columns))
feature_columns = list(present_columns.intersection(all_features_columns))
target_columns = list(present_columns.intersection(all_target_columns))

In [30]:
# Discard rows in which every feature and target value is missing, then
# discard columns that are empty across all remaining rows.
measurement_columns = [*feature_columns, *target_columns]
grab_samples_df.dropna(
    subset=measurement_columns, how="all", axis=0, inplace=True
)
grab_samples_df.dropna(how="all", axis=1, inplace=True)

In [31]:
import re

# A run of digits with an optional decimal part, e.g. "12" or "0.25".
_NUMBER_PATTERN = re.compile(r"\d+\.?\d*")


def convert_string_values(s):
    """Convert a raw cell value to a number.

    Rules, applied in order:

    * ``int`` / ``float`` values are returned unchanged.
    * Missing values (``None``, ``NaN``, ...) become ``None``.
    * Text has its decimal comma replaced by a dot, then:
        - text containing ``<`` yields half of the first number found
          (a stand-in for a value reported as below a limit);
        - text containing ``>``, ``*`` or any ASCII letter yields the first
          number found;
        - any other text is parsed as a plain number, so ``"3,5"`` gives
          ``3.5``. This matches ``set_label``, which labels such text
          "Normal".

    Args:
        s: A number, a missing value, or a string.

    Returns:
        The numeric value, or ``None`` when no number can be extracted.
    """
    if isinstance(s, (int, float)):
        return s
    if pd.isna(s):
        return None

    # Accept the decimal comma used in the raw text ("0,5" -> "0.5").
    if "," in s:
        s = s.replace(",", ".")

    found = _NUMBER_PATTERN.search(s)
    number = float(found.group()) if found else None

    if "<" in s:
        return number / 2 if number is not None else None
    if ">" in s or "*" in s or re.search("[a-zA-Z]", s):
        return number

    # Plain text without any marker: keep it when it is a valid number
    # instead of silently discarding it.
    try:
        return float(s)
    except ValueError:
        return None

In [120]:

def set_label(value):
    """Classify how a raw cell value was reported.

    Returns:
        "NaN" for missing values, "Less than" for text containing "<",
        "Greater than" for text containing ">" (and no "<"), and "Normal"
        for numbers and for any other text.
    """
    if pd.isna(value):
        return "NaN"

    # Only non-numeric values can carry a "<" / ">" marker.
    if not isinstance(value, (int, float)):
        if "<" in value:
            return "Less than"
        if ">" in value:
            return "Greater than"

    return "Normal"

In [33]:
# Every column that is not metadata is treated as a measurement to convert.
convert_columns = list(
    set(grab_samples_df.columns).difference(metadata_columns)
)

In [34]:
# For each measurement column, add a "<column>_label" companion recording how
# the raw value was reported (see set_label).
label_names = {col: col + "_label" for col in convert_columns}
for col, label_col in label_names.items():
    grab_samples_df[label_col] = grab_samples_df[col].apply(set_label)

In [35]:
# Replace the raw measurement cells with their numeric interpretation,
# element by element.
numeric_values = grab_samples_df[convert_columns].map(convert_string_values)
grab_samples_df[convert_columns] = numeric_values

In [36]:
# Cast every converted measurement column to float.
measurements_as_float = grab_samples_df[convert_columns].astype(float)
grab_samples_df[convert_columns] = measurements_as_float

In [37]:
# Arrange the columns in the order of all_columns (metadata, features,
# targets), placing each "<column>_label" companion right after its column.
existing_columns = grab_samples_df.columns
ordered_columns = [
    name
    for col in all_columns
    if col in existing_columns
    for name in (col, col + "_label")
    if name in existing_columns
]

grab_samples_df = grab_samples_df[ordered_columns]

In [38]:
# Parse the sampling date column as year/month/day.
# NOTE(review): the single-workbook cell further down parses a column with the
# same name using "%d/%m/%Y" - confirm which layout the raw dates really use.
sampling_dates = pd.to_datetime(
    grab_samples_df["Data di prelievo"], format="%Y/%m/%d"
)
grab_samples_df["Data di prelievo"] = sampling_dates

In [None]:
# Display the frame after the sampling dates have been parsed.
grab_samples_df

In [40]:
# Order the samples by sampling date (ascending), modifying the frame in place.
grab_samples_df.sort_values(by="Data di prelievo", inplace=True)

In [None]:
# Display the frame after sorting by sampling date.
grab_samples_df

In [None]:
# Write the processed samples to the intermediate-data folder, without the
# row index.
all_grab_samples_file = os.path.join(datasets_folder, "All grab samples.xlsx")
grab_samples_df.to_excel(all_grab_samples_file, index=False)

In [121]:
# Reload one raw workbook on its own (header on row index 11); the cells
# below work on this frame.
archive_path = os.path.join(
    all_grab_samples_path,
    'PG 07_M13 Archivio Completo 2018-2022 safecrew.xlsx',
)
df = pd.read_excel(archive_path, header=11)

In [122]:
# Parse the sampling date column as day/month/year.
parsed_dates = pd.to_datetime(df['Data di prelievo'], format='%d/%m/%Y')
df['Data di prelievo'] = parsed_dates

In [123]:
# Keep only samples taken from 18 Sep 2023 to 18 Sep 2024, both ends included.
# ISO-formatted bounds remove the day/month ambiguity of "18-09-2023"-style
# strings, and .copy() yields an independent frame so that the in-place edits
# in the following cells do not operate on a slice of the original.
in_window = df['Data di prelievo'].between('2023-09-18', '2024-09-18')
df = df[in_window].copy()

In [124]:
# Remove, in place, columns that are not used by the analysis below.
unused_columns = [
    'Rapporto di prova',
    'Codice punto di prelievo',
    'Campagna',
    'Analisi programmate',
    'filtro 1',
    'filtro 2',
    'Note',
]
df.drop(columns=unused_columns, inplace=True)

In [125]:
# Keep only the target columns. The explicit copy makes the result an
# independent frame, so the column assignments in the following cells do not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[all_target_columns].copy()

In [None]:
# Display the frame restricted to the target columns.
df

In [127]:
# Add a "<column>_label" companion to every target column, recording how the
# raw value was reported (see set_label).
target_label_names = {col: col + "_label" for col in all_target_columns}
for col, label_col in target_label_names.items():
    df[label_col] = df[col].apply(set_label)

In [128]:
# Convert the raw target cells to numbers, then cast to float exactly as the
# all-samples pipeline above does. Without the cast, a column with no parsable
# value can remain object-typed and break the statistics computed below.
df[all_target_columns] = (
    df[all_target_columns].map(convert_string_values).astype(float)
)

In [134]:
# Build a per-target summary table: mean, standard deviation, quartiles and
# the number of values carrying each report label.
stat_names = ["mean", "std", "25%", "50%", "75%", "below LOD", 'above LOD', 'NaN']
stats_df = pd.DataFrame(index=all_target_columns, columns=stat_names)

quantile_levels = {"25%": 0.25, "50%": 0.50, "75%": 0.75}
# NOTE(review): set_label can also produce "Greater than"; values with that
# label are not counted in any of the three columns below - confirm intent.
label_for_stat = {"below LOD": "Less than", "above LOD": "Normal", "NaN": "NaN"}

for col in all_target_columns:
    values = df[col]
    labels = df[col + "_label"]

    stats_df.loc[col, "mean"] = values.mean()
    stats_df.loc[col, "std"] = values.std()
    for stat, level in quantile_levels.items():
        stats_df.loc[col, stat] = values.quantile(level)

    # Number of rows whose label equals the one mapped to each statistic.
    for stat, label in label_for_stat.items():
        stats_df.loc[col, stat] = labels[labels == label].count()

In [135]:
# Write the statistics table to an Excel file; the path is relative, so the
# file lands in the current working directory.
stats_df.to_excel('microbial_stats.xlsx')