In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

In [41]:
# Directory holding the intermediate CSV files used by this notebook.
# The location can be overridden per machine through the
# THAMES_ECOLI_RESULTS_DIR environment variable; when the variable is unset the
# original hard-coded location is used, so existing setups keep working.
temporary_results_path = os.environ.get(
    "THAMES_ECOLI_RESULTS_DIR",
    '/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/Thames Ecoli/temporary results',
)

# CSV file that this notebook loads as its input dataset.
full_dataset_path = os.path.join(temporary_results_path, 'full_dataset.csv')

In [None]:
# Load the input CSV into a DataFrame using pandas' default parsing options.
dataset_df = pd.read_csv(filepath_or_buffer=full_dataset_path)

In [None]:
# Preview the first ten rows of the freshly loaded dataset.
dataset_df.head(n=10)

In [None]:
# Rows where either plate count holds the literal string "TNTC" instead of a
# number (presumably "too numerous to count" - confirm with the data owners).
#
# An explicit copy is taken because this frame is renamed and has columns
# dropped in place further down; doing that on a bare slice of dataset_df can
# trigger SettingWithCopyWarning and leaves it unclear which object is mutated.
tntc_df = dataset_df[
    (dataset_df["Coliform (1ml)"] == "TNTC")
    | (dataset_df["Ecoli (1ml)"] == "TNTC")
].copy()

In [None]:
# Keep every row of the dataset whose index label is not among the TNTC rows.
clean_df = dataset_df.drop(index=tntc_df.index)

In [None]:
# With the "TNTC" rows removed, cast both count columns to float64 in one call.
clean_df = clean_df.astype(
    {
        "Coliform (1ml)": "float64",
        "Ecoli (1ml)": "float64",
    }
)

In [None]:
# Display the frame after the TNTC rows were dropped and both count columns
# were cast to float64.
clean_df

In [None]:
# Collapse all rows sharing the same Date/Time/Site/Bottle/Sample key into a
# single row: the technician fields keep the first value seen in the group,
# while each measurement gets its mean and standard deviation across the group.
# Aggregating with lists yields two-level column labels such as
# ("Temp C", "mean") and ("Temp C", "std").
#
# The grouping keys are turned back into ordinary columns by reset_index()
# alone. Combining as_index=False with reset_index() (as done previously)
# additionally inserted the meaningless positional index as an extra "index"
# column, which then showed up in describe() and in the exported CSV.
group_keys = ["Date", "Time", "Site", "Bottle", "Sample"]

clean_df = (
    clean_df.groupby(group_keys)
    .agg(
        {
            "Technician Water Quality": "first",
            "Technician Counting": "first",
            "Temp C": ["mean", "std"],
            "Ph": ["mean", "std"],
            "Cond (ms)": ["mean", "std"],
            "Coliform (1ml)": ["mean", "std"],
            "Ecoli (1ml)": ["mean", "std"],
        },
    )
    .reset_index()
)

In [None]:
# Keep only the groups for which every aggregated measurement is present.
# Selecting a top-level label on the two-level columns picks all sub-columns
# under it, so both the "mean" and the "std" values are checked here.
# NOTE(review): this presumably also removes groups whose std is NaN (e.g. a
# group made of a single row) - confirm that this is intended.
measurement_labels = ["Coliform (1ml)", "Ecoli (1ml)", "Temp C", "Ph", "Cond (ms)"]
is_complete = clean_df[measurement_labels].notnull().all(axis=1)
clean_df = clean_df[is_complete]

In [None]:
# Flatten the two-level column labels into plain strings:
#   ("Temp C", "mean") -> "Temp C_mean", ("Temp C", "std") -> "Temp C_std";
#   every other label keeps only its first element, e.g. ("Site", "") -> "Site".
flat_labels = []
for label in clean_df.columns.values:
    if label[1] in ("mean", "std"):
        flat_labels.append("_".join(label))
    else:
        flat_labels.append(label[0])

clean_df.columns = flat_labels

In [None]:
# Display the aggregated frame with its flattened "<measurement>_mean" /
# "<measurement>_std" column names.
clean_df

In [None]:
# Coefficient of variation (std divided by mean) of each count within a group.
for organism in ("Coliform", "Ecoli"):
    clean_df[f"CV_{organism}"] = (
        clean_df[f"{organism} (1ml)_std"] / clean_df[f"{organism} (1ml)_mean"]
    )

# Data Visualization and Analysis

In [None]:
# Split the aggregated frame into one sub-frame per distinct "Site" value,
# keyed by the site label, in order of first appearance.
site_dict = {
    site_name: clean_df[clean_df["Site"] == site_name]
    for site_name in clean_df["Site"].unique()
}

In [None]:
# Print, for every site, the describe() summary of its sub-frame followed by
# the first and last sampling date found in its "Date" column.
for site, site_df in site_dict.items():
    print("Site: ", site)
    print("-" * 30)
    print(site_df.describe().to_string())
    print("\n")

    first_day = pd.to_datetime(site_df["Date"]).min().strftime('%Y-%m-%d')
    last_day = pd.to_datetime(site_df["Date"]).max().strftime('%Y-%m-%d')
    print("Timespan: " + first_day + ' - ' + last_day)
    print("\n")

## Correlation Hypothesis Tests

### Coliform

In [None]:
# Per site: Pearson correlation between mean temperature and mean coliform count.
for site, site_df in site_dict.items():
    print("-" * 30)
    print("Site: ", site)

    x_values = site_df["Temp C_mean"]
    y_values = site_df["Coliform (1ml)_mean"]
    corr_coef, p_value = stats.pearsonr(x_values, y_values)

    print("Pearsons correlation: ", corr_coef)
    print("Pearsons p-value: ", p_value)

In [None]:
# Per site: Pearson correlation between mean pH and mean coliform count.
for site, site_df in site_dict.items():
    print("-" * 30)
    print("Site: ", site)

    x_values = site_df["Ph_mean"]
    y_values = site_df["Coliform (1ml)_mean"]
    corr_coef, p_value = stats.pearsonr(x_values, y_values)

    print("Pearsons correlation: ", corr_coef)
    print("Pearsons p-value: ", p_value)

In [None]:
# Per site: Pearson correlation between mean conductivity and mean coliform count.
for site, site_df in site_dict.items():
    print("-" * 30)
    print("Site: ", site)

    x_values = site_df["Cond (ms)_mean"]
    y_values = site_df["Coliform (1ml)_mean"]
    corr_coef, p_value = stats.pearsonr(x_values, y_values)

    print("Pearsons correlation: ", corr_coef)
    print("Pearsons p-value: ", p_value)

In [None]:
# Per site: Pearson correlation between mean temperature and mean E. coli count.
for site, site_df in site_dict.items():
    print("-" * 30)
    print("Site: ", site)

    x_values = site_df["Temp C_mean"]
    y_values = site_df["Ecoli (1ml)_mean"]
    corr_coef, p_value = stats.pearsonr(x_values, y_values)

    print("Pearsons correlation: ", corr_coef)
    print("Pearsons p-value: ", p_value)

### E. coli

In [None]:
# Per site: Pearson correlation between mean pH and mean E. coli count.
for site, site_df in site_dict.items():
    print("-" * 30)
    print("Site: ", site)

    x_values = site_df["Ph_mean"]
    y_values = site_df["Ecoli (1ml)_mean"]
    corr_coef, p_value = stats.pearsonr(x_values, y_values)

    print("Pearsons correlation: ", corr_coef)
    print("Pearsons p-value: ", p_value)

In [None]:
# Per site: Pearson correlation between mean conductivity and mean E. coli count.
for site, site_df in site_dict.items():
    print("-" * 30)
    print("Site: ", site)

    x_values = site_df["Cond (ms)_mean"]
    y_values = site_df["Ecoli (1ml)_mean"]
    corr_coef, p_value = stats.pearsonr(x_values, y_values)

    print("Pearsons correlation: ", corr_coef)
    print("Pearsons p-value: ", p_value)

In [None]:
# Per site: Pearson correlation between mean E. coli count and mean coliform count.
for site, site_df in site_dict.items():
    print("-" * 30)
    print("Site: ", site)

    x_values = site_df["Ecoli (1ml)_mean"]
    y_values = site_df["Coliform (1ml)_mean"]
    corr_coef, p_value = stats.pearsonr(x_values, y_values)

    print("Pearsons correlation: ", corr_coef)
    print("Pearsons p-value: ", p_value)

## Correlation Matrix Heatmap

In [None]:
# Aggregated mean columns that enter the correlation matrix.
cols = [
    "Temp C_mean",
    "Ph_mean",
    "Cond (ms)_mean",
    "Coliform (1ml)_mean",
    "Ecoli (1ml)_mean",
]

# Choice of correlation method:
# - Pearson: two quantitative continuous variables with a linear relationship.
# - Spearman: two quantitative variables whose link is only partially linear,
#   or one ordinal qualitative variable and one quantitative variable.
# - Kendall: often used for two ordinal qualitative variables.

# Draw one annotated heatmap per site, on its own 5x5 figure, with the colour
# scale fixed to [-1, 1] and centred on 0.
for site, site_df in site_dict.items():
    corr_matrix = site_df[cols].corr(method="pearson")

    plt.figure(figsize=(5, 5))
    plt.title("Site: " + site)

    heatmap_options = dict(
        vmin=-1,
        vmax=1,
        center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
        annot=True,
        fmt=".3f",
    )
    ax = sns.heatmap(corr_matrix, **heatmap_options)

    # Tilt the x tick labels so the long column names do not overlap.
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment="right")

## Scatter Plots

In [None]:
# Aggregated mean columns shown in the scatter-plot matrix.
cols = [
    "Temp C_mean",
    "Ph_mean",
    "Cond (ms)_mean",
    "Coliform (1ml)_mean",
    "Ecoli (1ml)_mean",
]

# One pairwise scatter-plot grid per site; the figure title is placed slightly
# above the grid (y=1.08).
for site, site_df in site_dict.items():
    grid = sns.pairplot(data=site_df[cols])
    grid.fig.suptitle("Site: " + site, y=1.08)

## Boxplots

In [None]:
# Aggregated mean columns to summarise; this list is also read by the
# timeseries cell that follows.
cols = [
    "Temp C_mean",
    "Ph_mean",
    "Cond (ms)_mean",
    "Coliform (1ml)_mean",
    "Ecoli (1ml)_mean",
]

# One vertical boxplot per (site, column) pair, each shown as its own figure.
for site, site_df in site_dict.items():
    for col in cols:
        figure_title = "Site: " + site + " - " + col
        sns.boxplot(y=site_df[col], orient="v")
        plt.title(figure_title)
        plt.show()

## Timeseries

In [None]:
# One line chart per site and measurement, plotted against the "Date" column
# in the order the rows are stored. Uses the `cols` list defined in an earlier
# cell.
#
# The title is handed to DataFrame.plot itself: each plot() call opens a new
# figure, so calling plt.title() beforehand (as done previously) labelled the
# previous figure - or an empty one on the first iteration - instead of the
# chart being drawn.
for site, site_df in site_dict.items():
    for col in cols:
        site_df.plot(
            x="Date",
            y=col,
            figsize=(15, 5),
            grid=True,
            title="Site: " + site + " - " + col,
        )

# Further Processing

In [None]:
# Start the export frame from an independent (deep) copy of the aggregated data.
full_df = clean_df.copy(deep=True)

In [None]:
# Map each raw measurement column name onto the "<name>_mean" name used by the
# aggregated frame.
rename_dict = {
    raw_name: raw_name + "_mean"
    for raw_name in (
        "Temp C",
        "Ph",
        "Cond (ms)",
        "Coliform (1ml)",
        "Ecoli (1ml)",
    )
}

In [None]:
# Give the TNTC rows the same "<name>_mean" column names as the aggregated
# frame. The result is rebound instead of using inplace=True: tntc_df is
# obtained by boolean-indexing dataset_df, and mutating such a selection in
# place can trigger SettingWithCopyWarning.
tntc_df = tntc_df.rename(columns=rename_dict)

In [None]:
# Remove the two columns that are not carried into the export frame. The
# result is rebound instead of using inplace=True: tntc_df is obtained by
# boolean-indexing dataset_df, and dropping columns in place on such a
# selection can trigger SettingWithCopyWarning.
tntc_df = tntc_df.drop(columns=["Image Date Time", "Dilution"])

In [None]:
# Stack the aggregated rows and the TNTC rows on top of each other.
# NOTE(review): the TNTC rows still carry the string "TNTC" in the count
# columns, so those columns presumably end up with mixed types - confirm that
# downstream consumers expect this.
full_df = pd.concat(objs=[full_df, tntc_df], axis=0)

In [None]:
# Display the combined frame (aggregated rows followed by the TNTC rows).
full_df

In [None]:
# Strip the spread statistics (standard deviations and coefficients of
# variation) so that only the key, technician and "_mean" columns remain.
derived_columns = [
    "Temp C_std",
    "Ph_std",
    "Cond (ms)_std",
    "Coliform (1ml)_std",
    "Ecoli (1ml)_std",
    "CV_Coliform",
    "CV_Ecoli",
]
full_df = full_df.drop(columns=derived_columns)

In [42]:
# Destination of the processed dataset, inside the temporary results directory.
processed_dataset_path = os.path.join(
    temporary_results_path,
    'processed_dataset.csv',
)

# Write the combined frame as CSV with pandas' default options.
# NOTE(review): by default the row index is written as the first column -
# confirm that this is wanted.
full_df.to_csv(path_or_buf=processed_dataset_path)