In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from scipy import stats

In [2]:
dataset_path = "/Users/massimilianoarca/Library/CloudStorage/OneDrive-PolitecnicodiMilano/Thames Ecoli/temporary results/full_dataset.csv"

In [3]:
dataset_df = pd.read_csv(dataset_path)

In [4]:
dataset_df.head(10)

Unnamed: 0,Technician Counting,Date,Time,Site,Bottle,Sample,Image Date Time,Dilution,Coliform (1ml),Ecoli (1ml),Technician Water Quality,Temp C,Ph,Cond (ms)
0,Admin,2023-09-05,10:10,B,1,1.0,2023-10-13 13:05:35,1:1,334,7,,22.0,7.6,2.15625
1,Admin,2023-09-05,10:10,B,1,2.0,2023-10-13 13:06:12,1:1,364,9,,22.0,7.6,2.15625
2,Admin,2023-09-05,10:10,B,2,1.0,2023-10-13 13:06:57,1:1,26,4,,21.0,7.6,2.125
3,Admin,2023-09-05,10:10,B,2,2.0,2023-10-13 13:07:50,1:1,43,10,,21.0,7.6,2.125
4,Admin,2023-09-05,11:26,B,1,1.0,2023-10-13 13:09:26,1:1,36,5,,21.0,7.6,1.8125
5,Admin,2023-09-05,11:26,B,1,2.0,2023-10-13 13:10:12,1:1,33,6,,21.0,7.6,1.8125
6,Admin,2023-09-05,11:26,B,2,1.0,2023-10-13 13:11:07,1:1,39,8,,21.0,7.6,1.8125
7,Admin,2023-09-05,11:26,B,2,2.0,2023-10-13 13:11:58,1:1,58,5,,21.0,7.6,1.8125
8,Admin,2023-09-05,11:42,B,1,1.0,2023-10-13 13:13:11,1:1,101,9,,23.0,7.6,1.734375
9,Admin,2023-09-05,11:42,B,1,2.0,2023-10-13 13:14:04,1:1,123,11,,23.0,7.6,1.734375


In [5]:
tntc_df = dataset_df[
    (dataset_df["Coliform (1ml)"] == "TNTC")
    | (dataset_df["Ecoli (1ml)"] == "TNTC")
]

In [6]:
clean_df = dataset_df.drop(tntc_df.index)

In [7]:
clean_df["Coliform (1ml)"] = clean_df["Coliform (1ml)"].astype("float64")
clean_df["Ecoli (1ml)"] = clean_df["Ecoli (1ml)"].astype("float64")

In [8]:
clean_df

Unnamed: 0,Technician Counting,Date,Time,Site,Bottle,Sample,Image Date Time,Dilution,Coliform (1ml),Ecoli (1ml),Technician Water Quality,Temp C,Ph,Cond (ms)
0,Admin,2023-09-05,10:10,B,1,1.0,2023-10-13 13:05:35,1:1,334.0,7.0,,22.0,7.6,2.15625
1,Admin,2023-09-05,10:10,B,1,2.0,2023-10-13 13:06:12,1:1,364.0,9.0,,22.0,7.6,2.15625
2,Admin,2023-09-05,10:10,B,2,1.0,2023-10-13 13:06:57,1:1,26.0,4.0,,21.0,7.6,2.12500
3,Admin,2023-09-05,10:10,B,2,2.0,2023-10-13 13:07:50,1:1,43.0,10.0,,21.0,7.6,2.12500
4,Admin,2023-09-05,11:26,B,1,1.0,2023-10-13 13:09:26,1:1,36.0,5.0,,21.0,7.6,1.81250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2507,,2023-09-07,14:45,C,1,,,,,,,27.0,7.9,1.61000
2508,,2023-09-07,14:45,C,2,,,,,,,26.0,7.8,1.60000
2509,,2023-09-07,14:45,C,2,,,,,,,25.0,7.7,1.65000
2510,,2023-09-07,15:45,E,1,,,,,,,25.0,7.8,1.52000


In [9]:
clean_df = (
    clean_df.groupby(
        ["Date", "Time", "Site", "Bottle", "Sample"], as_index=False
    )
    .agg(
        {
            "Technician Water Quality": "first",
            "Technician Counting": "first",
            "Temp C": ["mean", "std"],
            "Ph": ["mean", "std"],
            "Cond (ms)": ["mean", "std"],
            "Coliform (1ml)": ["mean", "std"],
            "Ecoli (1ml)": ["mean", "std"],
        },
    )
    .reset_index()
)

In [10]:
clean_df = clean_df[
    clean_df[["Coliform (1ml)", "Ecoli (1ml)", "Temp C", "Ph", "Cond (ms)"]]
    .notnull()
    .all(axis=1)
]

In [11]:
clean_df.columns = [
    "_".join(col) if col[1] == "mean" or col[1] == "std" else col[0]
    for col in clean_df.columns.values
]

In [12]:
clean_df

Unnamed: 0,index,Date,Time,Site,Bottle,Sample,Technician Water Quality,Technician Counting,Temp C_mean,Temp C_std,Ph_mean,Ph_std,Cond (ms)_mean,Cond (ms)_std,Coliform (1ml)_mean,Coliform (1ml)_std,Ecoli (1ml)_mean,Ecoli (1ml)_std
16,16,2023-02-07,08:18,D,1,0.0,Randa,Randa,6.0,0.0,8.1,0.0,0.80,0.0,16.0,0.000000,6.0,0.000000
17,17,2023-02-07,08:18,D,2,0.0,Randa,Randa,6.0,0.0,8.2,0.0,0.77,0.0,10.0,0.000000,6.0,0.000000
28,28,2023-02-14,09:55,D,1,0.0,Randa,Randa,9.0,0.0,7.8,0.0,0.80,0.0,11.0,0.000000,5.0,0.000000
29,29,2023-02-14,09:55,D,2,0.0,Randa,Randa,8.0,0.0,8.0,0.0,0.80,0.0,22.0,0.000000,7.0,0.000000
40,40,2023-02-21,16:10,D,1,0.0,Randa,Randa,11.0,0.0,8.0,0.0,1.25,0.0,19.0,0.000000,6.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,892,2023-09-04,14:06,D,2,2.0,,Admin,21.9,0.0,7.8,0.0,1.46,0.0,35.5,0.707107,3.5,0.707107
901,901,2023-09-04,14:22,D,1,1.0,,Admin,23.0,0.0,7.5,0.0,1.49,0.0,67.5,0.707107,8.0,0.000000
1000,1000,2023-09-05,14:15,D,1,1.0,,Vee,23.9,0.0,7.6,0.0,1.46,0.0,34.5,0.707107,8.0,0.000000
1001,1001,2023-09-05,14:15,D,1,2.0,,Vee,23.9,0.0,7.6,0.0,1.46,0.0,36.5,0.707107,8.5,0.707107


In [13]:
clean_df['CV_Coliform'] = clean_df['Coliform (1ml)_std'] / clean_df['Coliform (1ml)_mean']
clean_df['CV_Ecoli'] = clean_df['Ecoli (1ml)_std'] / clean_df['Ecoli (1ml)_mean']

# Data Visualization and Analysis

In [None]:
site_dict = {}
for site in clean_df["Site"].unique():
    site_dict[site] = clean_df[clean_df["Site"] == site]

In [None]:
for site in site_dict:
    print("Site: ", site)
    print("-" * 30)
    print(site_dict[site].describe().to_string())

## Hypothesis Tests Correlations

### Coliform

In [None]:
for site in site_dict:
    print("-" * 30)
    print("Site: ", site)
    r, p = stats.pearsonr(
        site_dict[site]["Temp C_mean"], site_dict[site]["Coliform (1ml)_mean"]
    )

    print("Pearsons correlation: ", r)
    print("Pearsons p-value: ", p)

In [None]:
for site in site_dict:
    print("-" * 30)
    print("Site: ", site)
    r, p = stats.pearsonr(
        site_dict[site]["Ph_mean"], site_dict[site]["Coliform (1ml)_mean"]
    )

    print("Pearsons correlation: ", r)
    print("Pearsons p-value: ", p)

In [None]:
for site in site_dict:
    print("-" * 30)
    print("Site: ", site)
    r, p = stats.pearsonr(
        site_dict[site]["Cond (ms)_mean"],
        site_dict[site]["Coliform (1ml)_mean"],
    )

    print("Pearsons correlation: ", r)
    print("Pearsons p-value: ", p)

In [None]:
for site in site_dict:
    print("-" * 30)
    print("Site: ", site)
    r, p = stats.pearsonr(
        site_dict[site]["Temp C_mean"], site_dict[site]["Ecoli (1ml)_mean"]
    )

    print("Pearsons correlation: ", r)
    print("Pearsons p-value: ", p)

### Ecoli

In [None]:
for site in site_dict:
    print("-" * 30)
    print("Site: ", site)
    r, p = stats.pearsonr(
        site_dict[site]["Ph_mean"], site_dict[site]["Ecoli (1ml)_mean"]
    )

    print("Pearsons correlation: ", r)
    print("Pearsons p-value: ", p)

In [None]:
for site in site_dict:
    print("-" * 30)
    print("Site: ", site)
    r, p = stats.pearsonr(
        site_dict[site]["Cond (ms)_mean"], site_dict[site]["Ecoli (1ml)_mean"]
    )

    print("Pearsons correlation: ", r)
    print("Pearsons p-value: ", p)

In [None]:
for site in site_dict:
    print("-" * 30)
    print("Site: ", site)
    r, p = stats.pearsonr(
        site_dict[site]["Ecoli (1ml)_mean"],
        site_dict[site]["Coliform (1ml)_mean"],
    )

    print("Pearsons correlation: ", r)
    print("Pearsons p-value: ", p)

## Correlation Matrix Heatmap

In [None]:
cols = [
    "Temp C_mean",
    "Ph_mean",
    "Cond (ms)_mean",
    "Coliform (1ml)_mean",
    "Ecoli (1ml)_mean",
]


# Pearson, used for two quantitative continuous variables which have a linear relationship
# Spearman, used for two quantitative variables if the link is partially linear, or for one qualitative ordinal variable and one quantitative variable
# Kendall, often used for two qualitative ordinal variables

for site in site_dict:
    corr = site_dict[site][cols].corr(method="pearson")

    plt.figure(figsize=(5, 5))
    plt.title("Site: " + site)
    ax = sns.heatmap(
        corr,
        vmin=-1,
        vmax=1,
        center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
        annot=True,
        fmt=".3f",
    )
    ax.set_xticklabels(
        ax.get_xticklabels(), rotation=45, horizontalalignment="right"
    )

## Scatter Plots

In [None]:
cols = [
    "Temp C_mean",
    "Ph_mean",
    "Cond (ms)_mean",
    "Coliform (1ml)_mean",
    "Ecoli (1ml)_mean",
]

for site in site_dict:
    plot = sns.pairplot(data=site_dict[site][cols])
    plot.fig.suptitle("Site: " + site, y=1.08)

## Boxplots

In [None]:
# TODO finire i grafici per sito e continuare il lavoro dopo. LEGGI APPUNTI SUL QUADERNO per safecrew
cols = [
    "Temp C_mean",
    "Ph_mean",
    "Cond (ms)_mean",
    "Coliform (1ml)_mean",
    "Ecoli (1ml)_mean",
]

for site in site_dict:
    for col in cols:
        sns.boxplot(y=site_dict[site][col], orient="v")
        plt.title("Site: " + site + " - " + col)
        plt.show()

## Timeseries

In [None]:
for site in site_dict:
    for col in cols:
        plt.title("Site: " + site + " - " + col)
        site_dict[site].plot(x="Date", y=col, figsize=(15, 5), grid=True)

# Modelling - Classification

In [None]:
classification_df = clean_df.copy()

In [None]:
rename_dict = {
    "Temp C": "Temp C_mean",
    "Ph": "Ph_mean",
    "Cond (ms)": "Cond (ms)_mean",
    "Coliform (1ml)": "Coliform (1ml)_mean",
    "Ecoli (1ml)": "Ecoli (1ml)_mean",
}

In [None]:
tntc_df.rename(columns=rename_dict, inplace=True)
tntc_df.drop(columns=["Image Date Time", "Dilution"], inplace=True)

In [None]:
classification_df = pd.concat([classification_df, tntc_df])

In [None]:
classification_df

In [None]:
classification_df["isTNTC"] = np.where(
    (classification_df["Coliform (1ml)_mean"] == "TNTC")
    | (classification_df["Ecoli (1ml)_mean"] == "TNTC"),
    1,
    0,
)

In [None]:
classification_df.drop(
    columns=[
        "Temp C_std",
        "Ph_std",
        "Cond (ms)_std",
        "Coliform (1ml)_std",
        "Ecoli (1ml)_std",
        "Coliform (1ml)_mean",
        "Ecoli (1ml)_mean",
    ],
    inplace=True,
)

In [None]:
classification_df

In [None]:
filtered_df = classification_df[['Site', 'Temp C_mean', 'Ph_mean', 'Cond (ms)_mean', 'isTNTC']]

filtered_df['isTNTC'] = filtered_df['isTNTC'].astype('category')

In [None]:
for site in site_dict.keys():
    fig = px.scatter_3d(filtered_df[filtered_df['Site'] == site], x='Temp C_mean', y='Ph_mean', z='Cond (ms)_mean', color='isTNTC')
    fig.update_layout(title_text="Site: " + site)
    fig.show()