# Ecoli Thames Data Processing

In [None]:
import os
import json
import pandas as pd
import bng_latlon as bl

### Specify the path(s) to the data

In [None]:
# Root of the project folder. Every location below is derived from it, so the
# notebook can be pointed at another checkout by editing this single line.
thames_ecoli_dir = "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/Thames Ecoli"

# Folder scanned for new sample files, and folder that receives the results.
dir_new_samples_path = f"{thames_ecoli_dir}/Reader Output"
dir_store_path = f"{thames_ecoli_dir}/temporary results"
filename_new_samples = "merged_data.csv"
# file_old_samples_path = "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/SafeCREW/Thames Ecoli/historical_samples.csv"
file_old_samples_path = None

# paths to further features
further_features_dir = f"{thames_ecoli_dir}/further_features"
manual_counting_path = f"{further_features_dir}/Thames Sampling Ecoli - manual counting.xlsx"
water_quality_path = f"{further_features_dir}/Thames Sampling Water Quality Data.xlsx"
historic_discharges_path = f"{further_features_dir}/Thames Water Historic Discharges by site (missing fleet main before 2023).xlsx"

### Validate paths

In [None]:
# Each entry pairs a lazily evaluated check with the message raised when it
# fails. Checks run in order and stop at the first failure, so a later check
# is never evaluated once an earlier one has failed.
_path_checks = (
    (
        lambda: os.path.isdir(dir_new_samples_path),
        "The path to the folder containing the new samples excel files is not a directory",
    ),
    (
        lambda: os.path.isdir(dir_store_path),
        "The path to store the concatenated excel file is not a directory",
    ),
    (
        # The old-samples file is optional: None means "nothing to check".
        lambda: file_old_samples_path is None
        or os.path.isfile(file_old_samples_path),
        "The path to the file containing the old samples excel file is not a file",
    ),
    (
        lambda: filename_new_samples.endswith(".csv"),
        "The name of the new file must include the csv extension",
    ),
)

for _check, _message in _path_checks:
    if not _check():
        raise ValueError(_message)

## Load data

In [None]:
from genericpath import isdir


def append_data(new_data_folder, old_dataset_path):
    """
    Load every sample file under ``new_data_folder`` and stack the rows.

    The folder is traversed recursively with ``os.walk``. Each ``.xlsx`` and
    ``.csv`` file is read exactly once and ``.DS_Store`` entries are skipped.
    When ``old_dataset_path`` is given, that dataset is read once and its rows
    are placed before the new ones.

    Parameters
    ----------
    new_data_folder : str
        Root directory containing the new sample files.
    old_dataset_path : str or None
        Path to an existing ``.xlsx``/``.csv`` dataset to prepend, or ``None``
        to return only the new samples.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows, re-indexed from 0.

    Raises
    ------
    ValueError
        If a file (new or old) has an unsupported extension, or if no sample
        file is found under ``new_data_folder``.
    """
    new_samples = []
    for root, dirs, files in os.walk(new_data_folder):
        # os.walk already descends into every sub-directory; recursing here as
        # well would load nested files more than once. Sorting ``dirs`` in
        # place (and the file names) makes the row order deterministic.
        dirs.sort()
        for file in sorted(files):
            path = os.path.join(root, file)
            if file.endswith(".xlsx"):
                new_samples.append(pd.read_excel(path))
            elif file.endswith(".csv"):
                new_samples.append(pd.read_csv(path))
            elif file.endswith(".DS_Store"):
                # macOS folder metadata, not a sample file.
                continue
            else:
                raise ValueError(
                    f"The file extension is not supported: {path}"
                )

    if not new_samples:
        # pd.concat([]) would raise a less helpful ValueError of its own.
        raise ValueError(
            f"No .xlsx or .csv sample files found in {new_data_folder}"
        )

    new_samples_df = pd.concat(new_samples, ignore_index=True)
    if old_dataset_path is None:
        return new_samples_df

    if old_dataset_path.endswith(".xlsx"):
        old_samples_df = pd.read_excel(old_dataset_path)
    elif old_dataset_path.endswith(".csv"):
        old_samples_df = pd.read_csv(old_dataset_path)
    else:
        raise ValueError("The file extension is not supported")

    return pd.concat([old_samples_df, new_samples_df], ignore_index=True)

In [None]:
# Build the raw dataset from every file under the new-samples folder. The
# configured historical dataset path (None unless set in the configuration
# cell) is passed through so that enabling it actually takes effect instead of
# being silently ignored.
raw_reader_output_df = append_data(dir_new_samples_path, file_old_samples_path)

In [None]:
# Notebook display: show the concatenated raw reader output for inspection.
raw_reader_output_df

In [None]:
# Notebook display: shows the raw reader output again (same expression as the
# previous cell).
raw_reader_output_df

In [None]:
# Manual counting and water quality workbooks are read with pandas' default
# options.
raw_manual_counting_df, raw_water_quality_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (manual_counting_path, water_quality_path)
)

# The historic discharges workbook is read with header=None, so its first row
# is kept as data rather than used for column names.
raw_historic_discharges_df = pd.read_excel(historic_discharges_path, header=None)

In [None]:
# Notebook display: show the manual counting sheet as loaded.
raw_manual_counting_df

In [None]:
# Notebook display: show the water quality sheet as loaded.
raw_water_quality_df

In [None]:
# Notebook display: show the historic discharges sheet as loaded (no header row).
raw_historic_discharges_df

## Data Preprocessing

### Raw Automated Counting

In [None]:
# Work on a de-duplicated copy so the raw reader output stays untouched.
reader_output_df = raw_reader_output_df.copy()

reader_output_df.drop_duplicates(inplace=True)

# "Sample ID" is split on "_" into three parts: date, time and a combined
# site/sample token. The de-duplicated frame itself is split, so the result
# does not depend on index alignment with the raw frame.
reader_output_df[["Date", "Time", "SiteSample"]] = reader_output_df[
    "Sample ID"
].str.split("_", expand=True)

reader_output_df["Date"] = pd.to_datetime(
    reader_output_df["Date"], format="%Y%m%d"
).dt.date
# Values that do not match HHMM (including the "XXXX" placeholder) are coerced
# to NaT and come out of strftime as missing values, so no separate in-place
# replace on the column is needed.
reader_output_df["Time"] = pd.to_datetime(
    reader_output_df["Time"], format="%H%M", errors="coerce"
).dt.strftime("%H:%M")
reader_output_df["Image Date Time"] = pd.to_datetime(
    reader_output_df["Image Date Time"]
)

# Move the new Date/Time columns from the end of the frame to positions 2 and 3.
date_col = reader_output_df.pop("Date")
time_col = reader_output_df.pop("Time")

reader_output_df.insert(2, "Date", date_col)
reader_output_df.insert(3, "Time", time_col)

# Site is the first run of letters in the token, Sample the first run of
# digits/dots. expand=False returns a Series, which is what insert expects.
reader_output_df.insert(
    4,
    "Site",
    reader_output_df["SiteSample"].str.extract(r"([A-Za-z]+)", expand=False),
)
reader_output_df.insert(
    5,
    "Sample",
    reader_output_df["SiteSample"].str.extract(r"([\d.]+)", expand=False),
)

# drop the parsed helper columns and the columns that are not used downstream
reader_output_df.drop(
    columns=[
        "SiteSample",
        "Sample ID",
        "Barcode Text",
        "Plate Type",
        "Red With Gas Raw Count",
        "Red With Gas Edited Count",
        "Red Without Gas Raw Count",
        "Red Without Gas Edited Count",
        "Red Without Gas Calculated Result",
        "Blue With Gas Raw Count",
        "Blue With Gas Edited Count",
        "Blue Without Gas Raw Count",
        "Blue Without Gas Edited Count",
        "Blue Without Gas Calculated Result",
        "Comments",
    ],
    inplace=True,
)

# The two remaining calculated results are renamed to the count column names
# used by the other tables in this notebook.
reader_output_df.rename(
    {
        "Red With Gas Calculated Result": "Coliform (1ml)",
        "Blue With Gas Calculated Result": "Ecoli (1ml)",
    },
    axis=1,
    inplace=True,
)

reader_output_df["Sample"] = reader_output_df["Sample"].astype(float)
reader_output_df["Site"] = reader_output_df["Site"].str.upper()

# Rows without both counts are discarded.
reader_output_df.dropna(subset=["Coliform (1ml)", "Ecoli (1ml)"], inplace=True)


reader_output_df

In [None]:
# Extra data file located directly in the project folder; it is read as a
# semicolon-separated CSV.
raw_extra_data_path = "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/Thames Ecoli/2023_08_10_AM_pt2.csv"
raw_extra_data_df = pd.read_csv(raw_extra_data_path, sep=";")

In [None]:
# Work on a de-duplicated copy so the raw extra data stays untouched.
extra_data_df = raw_extra_data_df.copy()

extra_data_df.drop_duplicates(inplace=True)

# "Sample ID" is split on "_" into three parts: date, time and a combined
# site/sample token. The de-duplicated frame itself is split, so the result
# does not depend on index alignment with the raw frame.
extra_data_df[["Date", "Time", "SiteSample"]] = extra_data_df[
    "Sample ID"
].str.split("_", expand=True)

extra_data_df["Date"] = pd.to_datetime(
    extra_data_df["Date"], format="%Y%m%d"
).dt.date
# Values that do not match HHMM (including the "XXXX" placeholder) are coerced
# to NaT and come out of strftime as missing values, so no separate in-place
# replace on the column is needed.
extra_data_df["Time"] = pd.to_datetime(
    extra_data_df["Time"], format="%H%M", errors="coerce"
).dt.strftime("%H:%M")
extra_data_df["Image Date Time"] = pd.to_datetime(
    extra_data_df["Image Date Time"]
)

# Move the new Date/Time columns from the end of the frame to positions 2 and 3.
date_col = extra_data_df.pop("Date")
time_col = extra_data_df.pop("Time")

extra_data_df.insert(2, "Date", date_col)
extra_data_df.insert(3, "Time", time_col)

# Site is the first run of letters in the token, Sample the first run of
# digits/dots. expand=False returns a Series, which is what insert expects.
extra_data_df.insert(
    4,
    "Site",
    extra_data_df["SiteSample"].str.extract(r"([A-Za-z]+)", expand=False),
)
extra_data_df.insert(
    5,
    "Sample",
    extra_data_df["SiteSample"].str.extract(r"([\d.]+)", expand=False),
)

# Drop the parsed helper columns and the plate type.
extra_data_df.drop(columns=["SiteSample", "Sample ID", "Plate Type"], inplace=True)

extra_data_df["Sample"] = extra_data_df["Sample"].astype(float)
extra_data_df["Site"] = extra_data_df["Site"].str.upper()

# Rows without both counts are discarded.
extra_data_df.dropna(subset=["Coliform (1ml)", "Ecoli (1ml)"], inplace=True)

In [None]:
# Stack the extra rows under the reader output with a fresh index, then remove
# rows that are exact duplicates of an earlier row.
reader_output_df = pd.concat(
    (reader_output_df, extra_data_df), ignore_index=True
).drop_duplicates()

### Raw Manual Counting

In [None]:
# Build the cleaned manual-counting table as one chain on a de-duplicated copy
# of the raw sheet (the raw frame itself is not modified):
#   * "Counter" is renamed to "Technician";
#   * "Sample" is cast to float;
#   * "Date" keeps only the calendar date;
#   * "Time" is normalised to "HH:MM" text; values that do not match HH:MM:SS
#     become missing.
manual_counting_df = (
    raw_manual_counting_df.drop_duplicates()
    .rename(columns={"Counter": "Technician"})
    .assign(
        Sample=lambda frame: frame["Sample"].astype(float),
        Date=lambda frame: frame["Date"].dt.date,
        Time=lambda frame: pd.to_datetime(
            frame["Time"], format="%H:%M:%S", errors="coerce"
        ).dt.strftime("%H:%M"),
    )
)

# Rows without both counts are discarded.
manual_counting_df.dropna(subset=["Coliform (1ml)", "Ecoli (1ml)"], inplace=True)

In [None]:
# Notebook display: show the cleaned manual counting table.
manual_counting_df

### Raw Water Quality

In [None]:
# Work on a copy so the raw water quality sheet stays untouched.
water_quality_df = raw_water_quality_df.copy()

water_quality_df["Sample"] = water_quality_df["Sample"].astype(float)
water_quality_df.rename(
    {"Sampler": "Technician"},
    axis=1,
    inplace=True,
)

# Rows marked "X" in "FOLLOW UP" are excluded. The explicit copy makes the
# filtered frame independent of the unfiltered one, so the column assignments
# and in-place operations below act on it directly instead of on a slice
# (which triggers SettingWithCopyWarning).
water_quality_df = water_quality_df[water_quality_df["FOLLOW UP"] != "X"].copy()
water_quality_df["Date"] = water_quality_df["Date"].dt.date
# "Time" is normalised to "HH:MM" text; values that do not match HH:MM:SS
# become missing.
water_quality_df["Time"] = pd.to_datetime(
    water_quality_df["Time"], format="%H:%M:%S", errors="coerce"
).dt.strftime("%H:%M")

water_quality_df.drop(columns=["FOLLOW UP"], inplace=True)
# Rows missing any of the three measurements are discarded.
water_quality_df.dropna(
    subset=["Temp C", "Ph", "Cond (ms)"], inplace=True
)

In [None]:
# Notebook display: show the cleaned water quality table.
water_quality_df

## Merge Data

In [None]:
# Stack the automated (reader) counts and the manual counts into one table
# with a fresh 0..n-1 index.
_count_tables = (reader_output_df, manual_counting_df)
reader_manual_df = pd.concat(_count_tables, ignore_index=True)

In [None]:
# Outer join of the counts with the water quality measurements: rows that
# exist on only one side are kept, with the other side's columns left missing.
full_df = reader_manual_df.merge(
    water_quality_df,
    on=["Technician", "Date", "Time", "Site", "Sample"],
    how="outer",
)

## Final Cleaning

In [None]:
# The replaced columns are assigned back to the frame. Calling
# replace(..., inplace=True) on a column selection is chained assignment: with
# pandas Copy-on-Write it leaves full_df unchanged.

# The "TNTC" marker in the count columns is mapped to the numeric value 250.
full_df["Coliform (1ml)"] = full_df["Coliform (1ml)"].replace(
    to_replace="TNTC", value=250
)
full_df["Ecoli (1ml)"] = full_df["Ecoli (1ml)"].replace(
    to_replace="TNTC", value=250
)

# Every ", " inside a Technician value becomes " - " (regex substitution).
full_df["Technician"] = full_df["Technician"].replace(
    to_replace=", ", value=" - ", regex=True
)

In [None]:
# Notebook display: show the merged and cleaned dataset.
full_df

In [None]:
# Write the merged dataset to "full_dataset.csv" in the store folder, without
# the index column.
# NOTE(review): filename_new_samples is defined and validated in the
# configuration cells but is not used for this output name — confirm whether
# it was meant to be.
full_df.to_csv(os.path.join(dir_store_path, "full_dataset.csv"), index=False)

## Site Positions

In [None]:
# Folder holding the site position tables. The three file paths are derived
# from it so the location only has to be changed in one place.
sites_dir = "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/Thames Ecoli/sites"

all_overflow_sites_path = f"{sites_dir}/all_overflows.csv"
nearest_overflow_sites_path = f"{sites_dir}/nearest_overflows.csv"
sampling_sites_path = f"{sites_dir}/sampling.csv"

In [None]:
# Load the three site tables with pandas' default CSV options.
all_overflow_sites_df, nearest_overflow_sites_df, sampling_sites_df = (
    pd.read_csv(sites_csv_path)
    for sites_csv_path in (
        all_overflow_sites_path,
        nearest_overflow_sites_path,
        sampling_sites_path,
    )
)

In [None]:
# Continue on a copy so the following steps leave full_df unchanged.
final_df = full_df.copy()

In [None]:
# Cast both count columns to float in a single call; the other columns keep
# their dtypes.
final_df = final_df.astype({"Coliform (1ml)": float, "Ecoli (1ml)": float})

In [None]:
# Collapse rows that share the same sample identity into one row holding the
# mean and standard deviation of each measured column.
# NOTE(review): groupby excludes rows whose key columns contain missing values
# by default (dropna=True) — confirm that leaving out samples with a missing
# Technician or Time is intended.
_sample_keys = ["Technician", "Date", "Time", "Site", "Sample"]
_measured_columns = [
    "Coliform (1ml)",
    "Ecoli (1ml)",
    "Temp C",
    "Ph",
    "Cond (ms)",
]
final_df = final_df.groupby(_sample_keys, as_index=False).agg(
    {column: ["mean", "std"] for column in _measured_columns}
)

In [None]:
# Notebook display: show the aggregated dataset.
final_df