# Supply Points Analysis between Grab and Sensors

In [None]:
import os
import json
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.figure_factory import create_distplot

import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

import umap
import umap.plot

from sklearn.metrics import pairwise

In [445]:
# Relative locations (two directory levels up) of the helper and data folders.
utils_folder = os.path.join("..", "..", "utils")

data_folder = os.path.join("..", "..", "data")
clean_data_folder = os.path.join(data_folder, "Clean Data")
metadata_folder = os.path.join(data_folder, "Metadata")
plot_folder = os.path.join(data_folder, "Plots")

# The per-sensor Excel workbooks are read from this sub-folder of the clean data.
sensor_folder = os.path.join(clean_data_folder, "sensors")

# Load Data

In [446]:
# Grab-sample table, read from "grab.xlsx" in the clean-data folder.
grab_df = pd.read_excel(os.path.join(clean_data_folder, "grab.xlsx"))

In [447]:
# One DataFrame per ".xlsx" workbook found in the sensor folder, keyed by the
# part of the file name that precedes the first ".".
sensor_dict = {
    workbook.split(".")[0]: pd.read_excel(os.path.join(sensor_folder, workbook))
    for workbook in os.listdir(sensor_folder)
    if workbook.endswith(".xlsx")
}

In [448]:
# Load the column groups stored in "columns_types.json".
# The encoding is given explicitly so that any non-ASCII text in the file is
# decoded the same way on every platform instead of using the locale default.
with open(
    os.path.join(utils_folder, "columns_types.json"), encoding="utf-8"
) as f:
    column_types = json.load(f)

metadata_columns = column_types["metadata_columns"]
features_columns = column_types["features_columns"]
targets_columns = column_types["targets_columns"]

In [None]:
# Notebook output: show the grab table as loaded.
grab_df

In [450]:
from operator import contains  # import kept so the name stays available to other cells


# Names of the grab columns that contain the substring "label".
label_columns = [name for name in grab_df.columns if "label" in name]

In [None]:
# Notebook output: list the detected label columns.
label_columns

In [452]:
# Translation tables used to rename the grab columns.
# Keys are the original (Italian) variable names, values the English names
# used throughout the rest of the notebook.

# Variables treated as features.
feature_mapping = {
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free Chlorine (mg/L)",
    "Colore (Cu)": "Color (CU)",
    "Concentrazione ioni idrogeno (unità pH)": "pH",
    "Conduttività a 20°C (µS/cm)": "Conductivity (uS/cm)",
    "TOC - carbonio organico totale (mg/L di C)": "TOC (mg/L)",
    "Temperatura (al prelievo) (°C)": "Temperature (°C)",
    "Torbidità (NTu)": "Turbidity (NTU)",
    "Nitrati (mg/L)": "Nitrate (mg/L)",
}

# Variables treated as targets.
targets_mapping = {
    "Batteri coliformi a 37°C (MPN/100 mL)": "Coliforms (MPN/100mL)",
    "Bromodiclorometano (µg/L)": "Bromodichloromethane (µg/L)",
    "Bromoformio (µg/L)": "Bromoform (µg/L)",
    "Cloroformio (µg/L)": "Chloroform (µg/L)",
    "Conta delle colonie a 22°C (UFC/mL)": "Colony count at 22°C (UFC/mL)",
    "Conteggio colonie a 30°C (UFC/mL)": "Colony count at 30°C (UFC/mL)",
    "Conta delle colonie a 37°C (UFC/mL)": "Colony count at 37°C (UFC/mL)",
    "Dibromoclorometano (µg/L)": "Dibromochloromethane (µg/L)",
    "Enterococchi (MPN/100 mL)": "Enterococci (MPN/100mL)",
    "Escherichia coli (MPN/100 mL)": "Escherichia coli (MPN/100mL)",
    "Pseudomonas aeruginosa (UFC/250 mL)": "Pseudomonas aeruginosa (UFC/250mL)",
    "Acido Perfluoroottanoico PFOA (µg/L)": "Perfluorooctanoic acid PFOA (µg/L)",
    "Acido Perfluoroottansolfonico PFOS (µg/L)": "Perfluorooctanesulfonic acid PFOS (µg/L)",
    "Somma di PFAS (µg/L)": "Sum of PFAS (µg/L)",
}

In [453]:
# Translate the grab column names.
# A column equal to a target name is renamed directly; a column of the form
# "<name>_<suffix>" is renamed when <name> is a known feature or target, and
# keeps only the first suffix segment.
for column in grab_df.columns:
    if column in targets_mapping:
        grab_df.rename(columns={column: targets_mapping[column]}, inplace=True)

    parts = column.split("_")
    if len(parts) < 2:
        continue

    prefix = parts[0]
    for mapping in (feature_mapping, targets_mapping):
        if prefix in mapping:
            grab_df.rename(
                columns={column: f"{mapping[prefix]}_{parts[1]}"}, inplace=True
            )

In [454]:
# Translate the label columns: "<name>_<suffix>" becomes
# "<translated name>_<suffix>" when <name> is a known feature or target.
for column in grab_df.columns:
    if column not in label_columns:
        continue

    parts = column.split("_")
    variable_name = parts[0]

    for mapping in (feature_mapping, targets_mapping):
        if variable_name in mapping:
            grab_df.rename(
                columns={column: mapping[variable_name] + "_" + parts[1]},
                inplace=True,
            )

In [None]:
# Notebook output: show the grab table after the column renames.
grab_df

# Metadata Info

## Grab

In [456]:
# Empty summary table for the features: one row per supply-point code and,
# for every feature, one column per statistic listed below.
feature_stat_names = [
    "N° Entries",
    "N° Valid Samples",
    "% Missing",
    "N° < LOQ",
    "Mean Valid",
    "Std Valid",
    "LOQ values",
    "Start Date",
    "End Date",
]

feature_df = pd.DataFrame(
    index=grab_df["Code"].unique(),
    columns=pd.MultiIndex.from_product(
        [feature_mapping.values(), feature_stat_names]
    ),
)

In [457]:
# Fill feature_df: for every supply point and feature, summarise the grab
# samples taken between the first and the last day that has a complete record
# (date, value and label all present).
for code in grab_df["Code"].unique():
    code_df = grab_df[grab_df["Code"] == code]

    for feature in feature_mapping.values():
        label_column = feature + "_label"
        df = code_df[["DateTime", feature, label_column]].copy()

        # Nothing to summarise when no row is complete.
        if df.dropna().empty:
            continue

        df["DateTime"] = pd.to_datetime(df["DateTime"])

        complete_dates = df.dropna()["DateTime"]
        first_day = complete_dates.min().normalize()
        last_day = complete_dates.max().normalize()
        start_date = first_day.strftime("%Y-%m-%d")
        end_date = last_day.strftime("%Y-%m-%d")

        # Compare calendar days rather than "YYYY-MM-DD" strings: a date
        # string is interpreted as midnight, which would drop every sample
        # taken later on the last day (and leave the frame empty when all
        # complete samples fall on a single day).
        sample_days = df["DateTime"].dt.normalize()
        df = df[sample_days.between(first_day, last_day)]

        valid_df = df[df[label_column] == "Normal"]
        loq_df = df[df[label_column] == "Less than"]

        missing_values = (
            df[df[label_column].isna()].shape[0] / df.shape[0] * 100
        )

        feature_df.loc[code, (feature, "N° Entries")] = df.shape[0]
        feature_df.loc[code, (feature, "% Missing")] = round(missing_values, 2)
        feature_df.loc[code, (feature, "N° Valid Samples")] = valid_df.shape[0]
        feature_df.loc[code, (feature, "N° < LOQ")] = loq_df.shape[0]

        feature_df.loc[code, (feature, "Mean Valid")] = round(
            valid_df[feature].mean(), 2
        )
        feature_df.loc[code, (feature, "Std Valid")] = round(
            valid_df[feature].std(), 2
        )

        # The reported LOQ is twice the stored value of the "Less than" rows
        # (presumably those rows hold LOQ / 2 - TODO confirm).
        loq_values = loq_df[feature].unique() * 2
        feature_df.loc[code, (feature, "LOQ values")] = ", ".join(
            str(value) for value in loq_values
        )

        feature_df.loc[code, (feature, "Start Date")] = start_date
        feature_df.loc[code, (feature, "End Date")] = end_date

In [None]:
# Notebook output: show the filled feature summary.
feature_df

In [459]:
# Sort the rows by their index (the supply-point codes).
feature_df.sort_index(inplace=True)

In [460]:
# Sort the first column level (feature names) case-insensitively;
# sort_remaining=False is passed to maintain the order of the second level.
feature_df = feature_df.sort_index(axis=1, level=0, sort_remaining=False, key=lambda x: x.str.lower())

In [461]:
# Empty summary table for the targets: one row per supply-point code and,
# for every target, one column per statistic listed below.
target_stat_names = [
    "N° Entries",
    "N° Valid Samples",
    "% Missing",
    "N° < LOQ",
    "Mean Valid",
    "Std Valid",
    "LOQ values",
    "Start Date",
    "End Date",
]

targets_df = pd.DataFrame(
    index=grab_df["Code"].unique(),
    columns=pd.MultiIndex.from_product(
        [targets_mapping.values(), target_stat_names]
    ),
)

In [462]:
# Fill targets_df: for every supply point and target, summarise the grab
# samples taken between the first and the last day that has a complete record
# (date, value and label all present).
for code in grab_df["Code"].unique():
    code_df = grab_df[grab_df["Code"] == code]

    for target in targets_mapping.values():
        label_column = target + "_label"
        df = code_df[["DateTime", target, label_column]].copy()

        # Nothing to summarise when no row is complete.
        if df.dropna().empty:
            continue

        df["DateTime"] = pd.to_datetime(df["DateTime"])

        complete_dates = df.dropna()["DateTime"]
        first_day = complete_dates.min().normalize()
        last_day = complete_dates.max().normalize()
        start_date = first_day.strftime("%Y-%m-%d")
        end_date = last_day.strftime("%Y-%m-%d")

        # Compare calendar days rather than "YYYY-MM-DD" strings: a date
        # string is interpreted as midnight, which would drop every sample
        # taken later on the last day (and leave the frame empty when all
        # complete samples fall on a single day).
        sample_days = df["DateTime"].dt.normalize()
        df = df[sample_days.between(first_day, last_day)]

        valid_df = df[df[label_column] == "Normal"]
        loq_df = df[df[label_column] == "Less than"]

        missing_values = (
            df[df[label_column].isna()].shape[0] / df.shape[0] * 100
        )

        targets_df.loc[code, (target, "N° Entries")] = df.shape[0]
        targets_df.loc[code, (target, "% Missing")] = round(missing_values, 2)
        targets_df.loc[code, (target, "N° Valid Samples")] = valid_df.shape[0]
        targets_df.loc[code, (target, "N° < LOQ")] = loq_df.shape[0]

        targets_df.loc[code, (target, "Mean Valid")] = round(
            valid_df[target].mean(), 2
        )
        targets_df.loc[code, (target, "Std Valid")] = round(
            valid_df[target].std(), 2
        )

        # The reported LOQ is twice the stored value of the "Less than" rows
        # (presumably those rows hold LOQ / 2 - TODO confirm).
        loq_values = loq_df[target].unique() * 2
        targets_df.loc[code, (target, "LOQ values")] = ", ".join(
            str(value) for value in loq_values
        )

        targets_df.loc[code, (target, "Start Date")] = start_date
        targets_df.loc[code, (target, "End Date")] = end_date

In [None]:
# Notebook output: show the filled target summary.
targets_df

In [464]:
# Sort the rows by their index (the supply-point codes).
targets_df.sort_index(inplace=True)

In [465]:
# Sort the first column level (target names) case-insensitively;
# sort_remaining=False is passed so the second level is not sorted as well.
targets_df = targets_df.sort_index(axis=1, level=0, sort_remaining=False, key=lambda x: x.str.lower())

In [466]:
# %%script false --no-raise-error
# Export both grab summaries to Excel. The destination folder is created
# first (as the plot cells do for their outputs) so that the export does not
# fail on a fresh checkout where "Metadata/Grab" does not exist yet.
grab_metadata_folder = os.path.join(metadata_folder, "Grab")
os.makedirs(grab_metadata_folder, exist_ok=True)

feature_df.to_excel(os.path.join(grab_metadata_folder, "features.xlsx"))

targets_df.to_excel(os.path.join(grab_metadata_folder, "targets.xlsx"))

## Sensor

In [467]:
### Fix Conductivity name
# Rename the sensor conductivity column to the "uS/cm" spelling, in place,
# in every sensor table.
conductivity_rename = {"Conductivity (μS/cm)": "Conductivity (uS/cm)"}

for readings in sensor_dict.values():
    readings.rename(columns=conductivity_rename, inplace=True)

In [468]:
# Measured variables: every column of the "Berna" sensor table except DateTime.
# NOTE(review): the same column list is then used for every sensor - confirm
# that all sensor workbooks share these columns.
sensor_columns = sensor_dict["Berna"].columns.difference(["DateTime"])

In [None]:
# Notebook output: list the sensor variables.
sensor_columns

In [470]:
# Empty summary table for the sensors: one row per sensor and, for every
# sensor variable, one column per statistic listed below.
sensor_stat_names = ["N° Data", "N° Missing", "Mean", "Std", "Start Date", "End Date"]

sensors_df = pd.DataFrame(
    index=list(sensor_dict.keys()),
    columns=pd.MultiIndex.from_product([sensor_columns, sensor_stat_names]),
)

In [471]:
# Fill sensors_df with count, missing count, mean, std and the covered date
# range for every sensor variable.
for sensor in sensor_dict.keys():
    sensor_data = sensor_dict[sensor]

    # The date range is a property of the sensor, not of the column.
    start_date = sensor_data["DateTime"].min().strftime("%Y-%m-%d")
    end_date = sensor_data["DateTime"].max().strftime("%Y-%m-%d")

    for column in sensor_columns:
        series = sensor_data[column]

        if sensor == "Berna" and column == "Turbidity (FTU)":
            # remove rows with Turbidity > 2; NaN readings are kept so that
            # they are still reported under "N° Missing" (they are ignored by
            # count/mean/std anyway)
            series = series[~(series > 2)]

        sensors_df.loc[sensor, (column, "N° Data")] = series.count()
        sensors_df.loc[sensor, (column, "N° Missing")] = series.isna().sum()
        sensors_df.loc[sensor, (column, "Mean")] = series.mean()
        sensors_df.loc[sensor, (column, "Std")] = series.std()

        # Written for every column, including the filtered Berna turbidity,
        # which previously skipped these two cells.
        sensors_df.loc[sensor, (column, "Start Date")] = start_date
        sensors_df.loc[sensor, (column, "End Date")] = end_date

In [472]:
# Sort the rows by their index (the sensor names).
sensors_df.sort_index(inplace=True)

In [None]:
# Notebook output: show the sensor summary.
sensors_df

In [474]:
# Export the sensor summary to Excel. The destination folder is created first
# (as the plot cells do for their outputs) so that the export does not fail
# when "Metadata/Sensor" does not exist yet.
sensor_metadata_folder = os.path.join(metadata_folder, "Sensor")
os.makedirs(sensor_metadata_folder, exist_ok=True)

sensors_df.to_excel(os.path.join(sensor_metadata_folder, "sensors.xlsx"))

# Check on Chiostergi

Anomalous value in Pseudomonas aeruginosa: check the date on which it occurs.

In [476]:
# Independent copy of the grab samples whose Code is "Chiostergi".
chiostergi_df = grab_df[grab_df["Code"] == "Chiostergi"].copy()

In [None]:
# Line plot of Pseudomonas aeruginosa over time for Chiostergi.
pseudomonas_column = "Pseudomonas aeruginosa (UFC/250mL)"

pseudomonas_trace = go.Scatter(
    x=chiostergi_df["DateTime"],
    y=chiostergi_df[pseudomonas_column],
    mode="lines",
    name=pseudomonas_column,
)

fig = go.Figure()
fig.add_trace(pseudomonas_trace)
fig.update_layout(
    title=f"{pseudomonas_column} in Chiostergi",
    xaxis_title="Date",
    yaxis_title=pseudomonas_column,
)
fig.show()

In [478]:
# October and December 2021 are the anomalous dates

In [None]:
# check other variables: one line plot per feature for Chiostergi
for column in grab_df[feature_mapping.values()]:
    feature_trace = go.Scatter(
        x=chiostergi_df["DateTime"],
        y=chiostergi_df[column],
        mode="lines",
        name=column,
    )

    fig = go.Figure()
    fig.add_trace(feature_trace)
    fig.update_layout(
        title=f"{column} in Chiostergi",
        xaxis_title="Date",
        yaxis_title=column,
    )
    fig.show()
    

In [None]:
# check other variables: one line plot per target for Chiostergi
for column in grab_df[targets_mapping.values()]:
    target_trace = go.Scatter(
        x=chiostergi_df["DateTime"],
        y=chiostergi_df[column],
        mode="lines",
        name=column,
    )

    fig = go.Figure()
    fig.add_trace(target_trace)
    fig.update_layout(
        title=f"{column} in Chiostergi",
        xaxis_title="Date",
        yaxis_title=column,
    )
    fig.show()
    

In [483]:
# Nothing weird in the other variables

# Scatter Plot Pair

In [None]:
# scatter plot pair grid for grab features
# sns.pairplot is a figure-level function: it always creates its own figure,
# so a preceding plt.figure(figsize=...) only leaves an empty figure behind.
# The overall size is therefore requested through the per-facet `height`
# (about 20 inches across all facets, excluding the legend).
pair_variables = list(feature_mapping.values())

sns.pairplot(
    grab_df,
    vars=pair_variables,
    hue="Code",
    height=20 / len(pair_variables),
)

# Keep `fig` bound to the figure that was actually drawn.
fig = plt.gcf()

plt.show()

# Time Series Filtering

In [33]:
%%script false --no-raise-error
# Cell disabled by the magic above.
# For every sensor and every measured column, save a 2x2 figure: the raw
# series, the series with its moving average, and the two matching histograms.

# Window of the moving average, expressed in hours.
# NOTE(review): the window of 4 * n_hours rows assumes 4 readings per hour - confirm.
n_hours = 24

for code in sensor_dict:
    sensor_df = sensor_dict[code].copy()

    # One output folder per sensor; exist_ok avoids the exists()/makedirs() pair.
    path = os.path.join(plot_folder, 'Clean Data', 'Moving Average', code)
    os.makedirs(path, exist_ok=True)

    for column in sensor_df.columns:
        if column == 'DateTime':
            continue

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle(f'{code} - {column}', fontsize=16)

        # Before Moving Average
        sns.lineplot(ax=axes[0, 0], x=sensor_df['DateTime'], y=sensor_df[column], color='blue')
        axes[0, 0].set_title('Before MA')
        axes[0, 0].set_xlabel('Time')
        axes[0, 0].set_ylabel(column)
        axes[0, 0].grid()

        # After Moving Average
        df = sensor_df[['DateTime', column]].copy()
        df.set_index('DateTime', inplace=True)

        # Keep only the first row of every duplicated timestamp.
        df = df[~df.index.duplicated(keep='first')]

        df = df.rolling(window=4 * n_hours).mean()
        sns.lineplot(ax=axes[0, 1], x=sensor_df['DateTime'], y=sensor_df[column], color='blue', alpha=0.3)
        sns.lineplot(ax=axes[0, 1], x=df.index, y=df[column], color='green')
        axes[0, 1].set_title('After MA')
        axes[0, 1].set_xlabel('Time')
        axes[0, 1].set_ylabel(column)
        axes[0, 1].grid()

        # Histogram Before Moving Average
        sns.histplot(ax=axes[1, 0], data=sensor_df, x=column, color='purple', kde=True, stat='density')
        axes[1, 0].set_title('Before MA')
        axes[1, 0].set_xlabel(column)
        axes[1, 0].set_ylabel('Density')

        # Histogram After Moving Average
        sns.histplot(ax=axes[1, 1], data=sensor_df, x=column, color='purple', stat='density', alpha=0.3)
        sns.histplot(ax=axes[1, 1], data=df, x=column, color='green', kde=True, stat='density')
        axes[1, 1].set_title('After MA')
        axes[1, 1].set_xlabel(column)
        axes[1, 1].set_ylabel('Density')

        plt.tight_layout()

        # "/" appears in unit names and cannot be part of a file name.
        column_ = column.replace('/', '_')

        fig.savefig(os.path.join(path, column_ + '.png'), dpi=300)
        plt.close(fig)

        # plt.show()


# Time Series Comparison

In [34]:
# plot the time series of the sensors and the grab data
# For every supply point and feature, save a figure with the raw sensor
# series, its moving average and the grab samples taken since the first
# sensor day.

# Window of the moving average, expressed in hours.
# NOTE(review): the window of 4 * n_hours rows assumes 4 readings per hour - confirm.
n_hours = 24

for code in grab_df["Code"].unique():
    code_g_df = grab_df[grab_df["Code"] == code]

    # Everything below depends only on the supply point, so it is computed
    # once per code instead of once per feature.
    s_df = sensor_dict[code].copy()

    start_date = s_df["DateTime"].min().strftime("%Y-%m-%d")

    # moving average on sensor data
    ma_s_df = s_df.set_index("DateTime").rolling(window=4 * n_hours).mean()

    output_folder = os.path.join(
        plot_folder, "Comparison", "15min", "Timeseries", code
    )
    os.makedirs(output_folder, exist_ok=True)

    for feature in feature_mapping.values():
        # grab samples with a value, taken from the first sensor day onwards
        g_df = code_g_df[["DateTime", feature]].dropna()
        g_df = g_df[g_df["DateTime"] >= start_date]

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=s_df["DateTime"], y=s_df[feature], mode="lines", name="Sensor"
            )
        )

        fig.add_trace(
            go.Scatter(
                x=ma_s_df.index,
                y=ma_s_df[feature],
                mode="lines",
                name="Sensor MA",
                line=dict(color="green"),
            )
        )

        fig.add_trace(
            go.Scatter(
                x=g_df["DateTime"],
                y=g_df[feature],
                mode="markers",
                name="Grab",
                marker=dict(size=8, color="red"),
            )
        )

        fig.update_layout(
            title=f"{code} - {feature}",
            xaxis_title="DateTime",
            yaxis_title=feature,
        )

        # "/" appears in unit names and cannot be part of a file name.
        feature_ = feature.replace("/", "_")

        fig.write_image(
            os.path.join(output_folder, f"{feature_}.png"),
            height=600,
            width=1200,
            scale=3,
        )

        # fig.show()

# Boxplot Comparison 

In [35]:
def month_to_season(month):
    """Return the season name for a month number (1-12).

    December to February is "Winter", March to May "Spring", June to August
    "Summer" and September to November "Autumn". Any other value yields None.
    """
    seasons = (
        ("Winter", (12, 1, 2)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
        ("Autumn", (9, 10, 11)),
    )
    for season_name, season_months in seasons:
        if month in season_months:
            return season_name
    return None

In [None]:
# plot the box plot of grab data and sensor data
# For every supply point and feature: box plots of the grab samples taken
# before / after the first sensor day (valid, below-LOQ and overall) next to
# the smoothed sensor data, plus one per-season summary table for each period.


def _add_season(frame):
    """Return a copy of *frame* with a "Season" column derived from DateTime."""
    # assign() builds a new frame, so no value is written onto a filtered slice
    return frame.assign(Season=frame["DateTime"].dt.month.apply(month_to_season))


def _season_table(valid_df, loq_df, feature):
    """Build the per-season table (count, mean, std) for valid and LOQ samples."""
    cell_values = []
    for frame in (valid_df, loq_df):
        by_season = frame.groupby("Season")[feature]
        cell_values.extend(
            [
                by_season.size().index,
                by_season.count().values,
                by_season.mean().values.round(2),
                by_season.std().values.round(2),
            ]
        )

    return go.Table(
        header=dict(
            values=["Season Valid", "N° Points Valid", "Mean Valid", "Std Valid", "Season LOQ", "N° Points LOQ", "Mean LOQ", "Std LOQ"],
            align="center",
        ),
        cells=dict(values=cell_values, align="center"),
    )


for code in grab_df["Code"].unique():
    # Everything in this first part depends only on the supply point, so it
    # is computed once per code instead of once per feature.

    # moving average on sensor data (n_hours is set in the time-series cell)
    s_df = sensor_dict[code].copy()
    s_df["DateTime"] = pd.to_datetime(s_df["DateTime"])
    s_df.set_index("DateTime", inplace=True)
    s_df = s_df.rolling(window=4 * n_hours).mean()

    sensor_start_date = s_df.index.min().strftime("%Y-%m-%d")

    g_df = grab_df[grab_df["Code"] == code].copy()
    g_df["DateTime"] = pd.to_datetime(g_df["DateTime"])

    before_g_df = g_df[g_df["DateTime"] < sensor_start_date]
    after_g_df = g_df[g_df["DateTime"] >= sensor_start_date]

    output_folder = os.path.join(
        plot_folder, "Comparison", "Daily", "Boxplot", code
    )
    os.makedirs(output_folder, exist_ok=True)

    for feature in feature_mapping.values():
        label_column = feature + "_label"

        valid_g_df = g_df[g_df[label_column] == "Normal"]
        loq_g_df = g_df[g_df[label_column] == "Less than"]

        # divide before and after into seasons
        valid_before_g_df = _add_season(
            valid_g_df[valid_g_df["DateTime"] < sensor_start_date]
        )
        valid_after_g_df = _add_season(
            valid_g_df[valid_g_df["DateTime"] >= sensor_start_date]
        )
        loq_before_g_df = _add_season(
            loq_g_df[loq_g_df["DateTime"] < sensor_start_date]
        )
        loq_after_g_df = _add_season(
            loq_g_df[loq_g_df["DateTime"] >= sensor_start_date]
        )

        fig = make_subplots(
            rows=3,
            cols=1,
            specs=[[{"type": "xy"}], [{"type": "table"}], [{"type": "table"}]],
            subplot_titles=(
                "",
                f"Grab Samples Before {sensor_start_date}",
                f"Grab Samples After {sensor_start_date}",
            ),
        )

        # One box per grab subset, in display order, then the sensor box.
        grab_boxes = [
            ("Valid Old Grab", valid_before_g_df),
            ("LOQ Old Grab", loq_before_g_df),
            ("Overall Old Grab", before_g_df),
            ("Valid New Grab", valid_after_g_df),
            ("LOQ New Grab", loq_after_g_df),
            ("Overall New Grab", after_g_df),
        ]
        for box_label, frame in grab_boxes:
            fig.add_trace(
                go.Box(
                    y=frame[feature],
                    name=f"{box_label}<br>N° Points: {frame[feature].count()}",
                ),
                row=1,
                col=1,
            )

        fig.add_trace(go.Box(y=s_df[feature], name="Sensor"), row=1, col=1)

        # divide by season for both old and new grab data
        fig.add_trace(
            _season_table(valid_before_g_df, loq_before_g_df, feature),
            row=2,
            col=1,
        )
        fig.add_trace(
            _season_table(valid_after_g_df, loq_after_g_df, feature),
            row=3,
            col=1,
        )

        fig.update_layout(
            title=f"{code} - {feature}",
            yaxis_title=feature,
        )

        fig.add_annotation(
            dict(
                x=-0.022,
                y=1.07,
                xref="paper",
                yref="paper",
                showarrow=False,
                text=f"Grab Samples divided by date {sensor_start_date}",
                font=dict(size=12, color="gray"),
            )
        )

        # "/" appears in unit names and cannot be part of a file name.
        feature_ = feature.replace("/", "_")

        fig.write_image(
            os.path.join(output_folder, f'{feature_}.png'),
            height=800,
            width=1200,
            scale=3
        )

        fig.show()

# Bland-Altman Test

In [37]:
import seaborn as sns

In [None]:
# Bland-Altman plot (difference vs. mean) with all the supply points together.
# Each grab sample is paired with the smoothed sensor value recorded on the
# same calendar date between 10:00 and 10:14.

pair_columns = ["Code", "DateTime", "Feature", "Value"]
grab_frames = []
sensor_frames = []

for code in grab_df["Code"].unique():
    # The sensor preprocessing depends only on the supply point, so it is done
    # once per code instead of once per (code, feature) pair.
    sensor_df = sensor_dict[code].copy()
    sensor_df["DateTime"] = pd.to_datetime(sensor_df["DateTime"])
    sensor_df.set_index("DateTime", inplace=True)
    # moving average over 8 consecutive readings
    # (2 hours, assuming one reading every 15 minutes - TODO confirm)
    sensor_df = sensor_df.rolling(window=4 * 2).mean()
    # regular 15-minute grid; gaps are filled by time-weighted interpolation
    sensor_df = sensor_df.resample("15min").mean().interpolate(method="time")

    sensor_start_date = sensor_df.index.dropna().min().strftime("%Y-%m-%d")
    sensor_end_date = sensor_df.dropna().index.max().strftime("%Y-%m-%d")

    # grab samples of this supply point inside the sensor coverage period
    code_g_df = grab_df[grab_df["Code"] == code].set_index("DateTime")
    code_g_df = code_g_df[
        (code_g_df.index >= sensor_start_date)
        & (code_g_df.index <= sensor_end_date)
    ]

    for feature in feature_mapping.values():
        g_df = code_g_df[feature].dropna()

        # keep only the sensor values recorded between 10:00 and 10:14 on a
        # date that also has a grab sample
        on_grab_date = pd.Series(sensor_df.index.date, index=sensor_df.index).isin(
            g_df.index.date
        )
        s_df = sensor_df[on_grab_date.to_numpy()]
        s_df = s_df[(s_df.index.hour == 10) & (s_df.index.minute <= 14)]

        # if there is more than one value for the same date, take the mean
        s_df = s_df.groupby(s_df.index.date).mean()

        grab_frame = pd.DataFrame(
            {"Code": code, "DateTime": g_df.index, "Feature": feature, "Value": g_df.values}
        )
        sensor_frame = pd.DataFrame(
            {"Code": code, "DateTime": s_df.index, "Feature": feature, "Value": s_df[feature].values}
        )
        # empty frames are skipped so that they cannot influence the dtypes
        # of the concatenated result
        if not grab_frame.empty:
            grab_frames.append(grab_frame)
        if not sensor_frame.empty:
            sensor_frames.append(sensor_frame)

# a single concatenation instead of growing a DataFrame inside the loop
total_g_df = (
    pd.concat(grab_frames, ignore_index=True)
    if grab_frames
    else pd.DataFrame(columns=pair_columns)
)
total_s_df = (
    pd.concat(sensor_frames, ignore_index=True)
    if sensor_frames
    else pd.DataFrame(columns=pair_columns)
)

output_dir = os.path.join(plot_folder, "Bland-Altman", "Supply Points", "All")
os.makedirs(output_dir, exist_ok=True)

for feature in feature_mapping.values():
    # explicit copies: the DateTime columns are rewritten below
    g_df = total_g_df[total_g_df["Feature"] == feature].copy()
    s_df = total_s_df[total_s_df["Feature"] == feature].copy()

    # The sensor values are aggregated per calendar date, so the join key has
    # to be the calendar date on the grab side as well.
    g_df["DateTime"] = pd.to_datetime(g_df["DateTime"]).dt.normalize()
    s_df["DateTime"] = pd.to_datetime(s_df["DateTime"])

    df = pd.merge(g_df, s_df, on=["Code", "DateTime"], suffixes=("_Grab", "_Sensor"))
    # a pair with a missing value would turn every statistic into NaN
    df = df.dropna(subset=["Value_Grab", "Value_Sensor"])

    n_pairs = df.shape[0]
    if n_pairs == 0:
        print(f"{feature}: no paired grab/sensor values, plot skipped")
        continue

    df["Difference"] = df["Value_Grab"] - df["Value_Sensor"]
    df["Mean"] = (df["Value_Grab"] + df["Value_Sensor"]) / 2

    difference_mean = np.mean(df["Difference"].values)
    difference_std = np.std(df["Difference"].values)
    # the standard error of the mean difference is based on the number of
    # paired observations, not on the number of grab samples
    std_error = difference_std / np.sqrt(n_pairs)

    ci_difference_mean = 1.96 * std_error

    upper_limit = difference_mean + 1.96 * difference_std
    lower_limit = difference_mean - 1.96 * difference_std
    # x position of the line labels, close to the right edge of the data
    label_x = df["Mean"].quantile(0.97)

    f, ax = plt.subplots(1, 1, figsize=(15, 10))
    sns.scatterplot(data=df, x="Mean", y="Difference", hue="Code", ax=ax, s=100)

    ax.axhline(y=difference_mean, color="green", linestyle="--", label="Mean")
    ax.text(x=label_x, y=difference_mean + std_error, s=f"Mean: {difference_mean:.2f}", color="green")

    # limits of agreement, each with a label just above its line
    ax.axhline(y=upper_limit, color="red", linestyle="--", label="1.96 * Std")
    ax.text(x=label_x, y=upper_limit + std_error, s="+ 1.96 * Std", color="red")

    ax.axhline(y=lower_limit, color="red", linestyle="--", label="-1.96 * Std")
    ax.text(x=label_x, y=lower_limit + std_error, s="-1.96 * Std", color="red")

    ax.axhline(y=0, color="black", linestyle="--")

    plt.annotate(
        f"Std error: {std_error:.2f}\nDifference mean CI: {difference_mean:.2f} ± {ci_difference_mean:.2f}",
        xy=(0.05, 0.9),
        xycoords="axes fraction",
        bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="lightblue"),
        color="green",
    )

    plt.title(f"{feature} - Bland-Altman Plot")
    plt.tight_layout()

    # "/" is not allowed in a file name
    feature_ = feature.replace("/", "_")

    plt.savefig(os.path.join(output_dir, f"{feature_}.png"), dpi=300)
    plt.close(f)

    # plt.show()

In [None]:
# Grab-minus-sensor difference over time with all the supply points together.
# Each grab sample is paired with the smoothed sensor value recorded on the
# same calendar date between 10:00 and 10:14.

pair_columns = ["Code", "DateTime", "Feature", "Value"]
grab_frames = []
sensor_frames = []

for code in grab_df["Code"].unique():
    # The sensor preprocessing depends only on the supply point, so it is done
    # once per code instead of once per (code, feature) pair.
    sensor_df = sensor_dict[code].copy()
    sensor_df["DateTime"] = pd.to_datetime(sensor_df["DateTime"])
    sensor_df.set_index("DateTime", inplace=True)
    # moving average over 8 consecutive readings
    # (2 hours, assuming one reading every 15 minutes - TODO confirm)
    sensor_df = sensor_df.rolling(window=4 * 2).mean()
    # regular 15-minute grid; gaps are filled by time-weighted interpolation
    sensor_df = sensor_df.resample("15min").mean().interpolate(method="time")

    sensor_start_date = sensor_df.index.dropna().min().strftime("%Y-%m-%d")
    sensor_end_date = sensor_df.dropna().index.max().strftime("%Y-%m-%d")

    # grab samples of this supply point inside the sensor coverage period
    code_g_df = grab_df[grab_df["Code"] == code].set_index("DateTime")
    code_g_df = code_g_df[
        (code_g_df.index >= sensor_start_date)
        & (code_g_df.index <= sensor_end_date)
    ]

    for feature in feature_mapping.values():
        g_df = code_g_df[feature].dropna()

        # keep only the sensor values recorded between 10:00 and 10:14 on a
        # date that also has a grab sample
        on_grab_date = pd.Series(sensor_df.index.date, index=sensor_df.index).isin(
            g_df.index.date
        )
        s_df = sensor_df[on_grab_date.to_numpy()]
        s_df = s_df[(s_df.index.hour == 10) & (s_df.index.minute <= 14)]

        # if there is more than one value for the same date, take the mean
        s_df = s_df.groupby(s_df.index.date).mean()

        grab_frame = pd.DataFrame(
            {"Code": code, "DateTime": g_df.index, "Feature": feature, "Value": g_df.values}
        )
        sensor_frame = pd.DataFrame(
            {"Code": code, "DateTime": s_df.index, "Feature": feature, "Value": s_df[feature].values}
        )
        # empty frames are skipped so that they cannot influence the dtypes
        # of the concatenated result
        if not grab_frame.empty:
            grab_frames.append(grab_frame)
        if not sensor_frame.empty:
            sensor_frames.append(sensor_frame)

# a single concatenation instead of growing a DataFrame inside the loop
total_g_df = (
    pd.concat(grab_frames, ignore_index=True)
    if grab_frames
    else pd.DataFrame(columns=pair_columns)
)
total_s_df = (
    pd.concat(sensor_frames, ignore_index=True)
    if sensor_frames
    else pd.DataFrame(columns=pair_columns)
)

output_dir = os.path.join(plot_folder, "Bland-Altman", "Supply Points", "All_Date")
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): hard-coded x position of the line labels; presumably chosen
# close to the end of the sampling period - confirm against the data range.
label_x = pd.Timestamp("2024-08-15")

for feature in feature_mapping.values():
    # explicit copies: the DateTime columns are rewritten below
    g_df = total_g_df[total_g_df["Feature"] == feature].copy()
    s_df = total_s_df[total_s_df["Feature"] == feature].copy()

    # The sensor values are aggregated per calendar date, so the join key has
    # to be the calendar date on the grab side as well.
    g_df["DateTime"] = pd.to_datetime(g_df["DateTime"]).dt.normalize()
    s_df["DateTime"] = pd.to_datetime(s_df["DateTime"])

    df = pd.merge(g_df, s_df, on=["Code", "DateTime"], suffixes=("_Grab", "_Sensor"))
    # a pair with a missing value would turn every statistic into NaN
    df = df.dropna(subset=["Value_Grab", "Value_Sensor"])

    n_pairs = df.shape[0]
    if n_pairs == 0:
        print(f"{feature}: no paired grab/sensor values, plot skipped")
        continue

    df["Difference"] = df["Value_Grab"] - df["Value_Sensor"]
    df["Mean"] = (df["Value_Grab"] + df["Value_Sensor"]) / 2

    difference_mean = np.mean(df["Difference"].values)
    difference_std = np.std(df["Difference"].values)
    # the standard error of the mean difference is based on the number of
    # paired observations, not on the number of grab samples
    std_error = difference_std / np.sqrt(n_pairs)

    ci_difference_mean = 1.96 * std_error

    upper_limit = difference_mean + 1.96 * difference_std
    lower_limit = difference_mean - 1.96 * difference_std

    f, ax = plt.subplots(1, 1, figsize=(15, 10))
    sns.scatterplot(data=df, x="DateTime", y="Difference", hue="Code", ax=ax, s=100)

    ax.axhline(y=difference_mean, color="green", linestyle="--", label="Mean")
    ax.text(x=label_x, y=difference_mean + std_error, s=f"Mean: {difference_mean:.2f}", color="green")

    # limits of agreement, each with a label just above its line
    ax.axhline(y=upper_limit, color="red", linestyle="--", label="1.96 * Std")
    ax.text(x=label_x, y=upper_limit + std_error, s="+ 1.96 * Std", color="red")

    ax.axhline(y=lower_limit, color="red", linestyle="--", label="-1.96 * Std")
    ax.text(x=label_x, y=lower_limit + std_error, s="-1.96 * Std", color="red")

    ax.axhline(y=0, color="black", linestyle="--")

    plt.annotate(
        f"Std error: {std_error:.2f}\nDifference mean CI: {difference_mean:.2f} ± {ci_difference_mean:.2f}",
        xy=(0.05, 0.9),
        xycoords="axes fraction",
        bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="lightblue"),
        color="green",
    )

    plt.title(f"{feature} - Bland-Altman Plot")
    plt.tight_layout()

    # "/" is not allowed in a file name
    feature_ = feature.replace("/", "_")

    plt.savefig(os.path.join(output_dir, f"{feature_}.png"), dpi=300)
    plt.close(f)

    # plt.show()

In [None]:
# Bland-Altman plot for each supply point and feature.
# Each grab sample is paired with the smoothed sensor value recorded on the
# same calendar date between 10:00 and 10:14.

for code in grab_df["Code"].unique():
    # The sensor preprocessing depends only on the supply point, so it is done
    # once per code instead of once per (code, feature) pair.
    sensor_df = sensor_dict[code].copy()
    sensor_df["DateTime"] = pd.to_datetime(sensor_df["DateTime"])
    sensor_df.set_index("DateTime", inplace=True)
    # moving average over 8 consecutive readings
    # (2 hours, assuming one reading every 15 minutes - TODO confirm)
    sensor_df = sensor_df.rolling(window=4 * 2).mean()
    # regular 15-minute grid; gaps are filled by time-weighted interpolation
    sensor_df = sensor_df.resample("15min").mean().interpolate(method="time")

    sensor_start_date = sensor_df.index.dropna().min().strftime("%Y-%m-%d")
    sensor_end_date = sensor_df.dropna().index.max().strftime("%Y-%m-%d")

    # grab samples of this supply point inside the sensor coverage period
    code_g_df = grab_df[grab_df["Code"] == code].set_index("DateTime")
    code_g_df = code_g_df[
        (code_g_df.index >= sensor_start_date)
        & (code_g_df.index <= sensor_end_date)
    ]

    output_dir = os.path.join(plot_folder, "Bland-Altman", "Supply Points", code)
    os.makedirs(output_dir, exist_ok=True)

    for feature in feature_mapping.values():
        g_df = code_g_df[feature].dropna()

        # keep only the sensor values recorded between 10:00 and 10:14 on a
        # date that also has a grab sample
        on_grab_date = pd.Series(sensor_df.index.date, index=sensor_df.index).isin(
            g_df.index.date
        )
        s_df = sensor_df[on_grab_date.to_numpy()]
        s_df = s_df[(s_df.index.hour == 10) & (s_df.index.minute <= 14)]

        # if there is more than one value for the same date, take the mean
        s_df = s_df.groupby(s_df.index.date).mean()

        # Pair grab and sensor values explicitly by calendar date instead of
        # relying on both arrays having the same length and order.
        paired = pd.DataFrame({"Date": g_df.index.date, "Grab": g_df.to_numpy()})
        paired["Sensor"] = paired["Date"].map(s_df[feature])
        # a pair with a missing value would turn every statistic into NaN
        paired = paired.dropna(subset=["Grab", "Sensor"])

        n_pairs = paired.shape[0]
        if n_pairs == 0:
            print(f"{code} - {feature}: no paired grab/sensor values, plot skipped")
            continue

        grab_values = paired["Grab"].to_numpy(dtype=float)
        sensor_values = paired["Sensor"].to_numpy(dtype=float)

        f, ax = plt.subplots(figsize=(10, 6))

        sm.graphics.mean_diff_plot(grab_values, sensor_values, ax=ax)

        plt.title(f"{code} - {feature}")

        differences = grab_values - sensor_values
        difference_mean = np.mean(differences)
        difference_std = np.std(differences)
        # standard error based on the number of paired observations
        std_error = difference_std / np.sqrt(n_pairs)

        ci_difference_mean = 1.96 * std_error

        plt.annotate(
            f"Std error: {std_error:.2f}\nDifference mean CI: {difference_mean:.2f} ± {ci_difference_mean:.2f}",
            xy=(0.05, 0.9),
            xycoords="axes fraction",
            bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="lightblue"),
            color="red",
        )

        # "/" is not allowed in a file name
        feature_ = feature.replace("/", "_")

        plt.savefig(os.path.join(output_dir, f"{feature_}.png"), dpi=300)
        plt.close(f)

        # plt.show()

# Check Measurement Time between Supply Points

In [None]:
# Collect the grab timestamps that are shared by every supply point.

# None means "no supply point processed yet". An empty set must stay empty:
# using emptiness as the sentinel would restart the intersection from the
# next supply point as soon as two points have no timestamp in common.
shared_timestamps = None

for code in grab_df["Code"].unique():
    code_timestamps = set(grab_df.loc[grab_df["Code"] == code, "DateTime"])

    if shared_timestamps is None:
        shared_timestamps = code_timestamps
    else:
        shared_timestamps &= code_timestamps

common_dates = pd.Series(
    list(shared_timestamps or []), dtype=grab_df["DateTime"].dtype
).sort_values(ignore_index=True)

common_dates

In [42]:
# cannot compare measurements on the same day, because the grab samples are taken at different times

In [None]:
# One scatter figure per feature: grab measurements over time, with one
# trace per supply point.
for feature in feature_mapping.values():
    fig = go.Figure()

    for code in grab_df["Code"].unique():
        code_df = grab_df[grab_df["Code"] == code]
        fig.add_scatter(
            x=code_df["DateTime"],
            y=code_df[feature],
            mode="markers",
            name=code,
        )

    fig.update_layout(title=feature, xaxis_title="DateTime", yaxis_title=feature)
    fig.show()

# Data Imputation with MICE

In [None]:
grab_df.columns.to_list()

In [45]:
def replace_non_normal(row, column):
    """Return ``row[column]`` when its ``<column>_label`` entry is "Normal", otherwise NaN."""
    if row[column + "_label"] == "Normal":
        return row[column]
    return np.nan

In [46]:
# get the columns that contain the word 'label'
label_columns = [col for col in grab_df.columns if 'label' in col]

In [47]:
# Blank out (set to NaN) every measurement whose label is not "Normal".
label_removed_grab_df = grab_df.copy()

for column in grab_df.columns.difference(["Code", "DateTime"] + label_columns):
    # Vectorised equivalent of applying replace_non_normal to every row:
    # values are kept where the label equals "Normal" and become NaN elsewhere.
    is_normal = grab_df[column + "_label"] == "Normal"
    label_removed_grab_df[column] = grab_df[column].where(is_normal)

In [48]:
# Sanity check of the replacement: a value is NaN exactly when its label is
# not "Normal".
for column in label_removed_grab_df.columns.difference(["Code", "DateTime"] + label_columns):
    value_is_nan = label_removed_grab_df[column].isna()
    label_is_normal = label_removed_grab_df[column + "_label"] == 'Normal'

    # no NaN value may carry the label "Normal"
    assert not (value_is_nan & label_is_normal).any()

    # no non-NaN value may carry a label other than "Normal"
    assert not (~value_is_nan & ~label_is_normal).any()

In [None]:
# For every supply point and feature, store the row indexes of the grab
# samples that are labelled "Less than" (i.e. below the LOQ).
indexes_df = pd.DataFrame(
    columns=list(feature_mapping.values()),
    index=grab_df["Code"].unique(),
    dtype=object,
)

for code in grab_df["Code"].unique():
    # filter the supply point once instead of copying it for every feature
    code_df = grab_df[grab_df["Code"] == code]

    for feature in feature_mapping.values():
        below_loq = code_df[feature + "_label"] == "Less than"
        # .at stores the list as a single cell value, also when it is empty
        indexes_df.at[code, feature] = code_df.index[below_loq].to_list()

indexes_df

In [50]:
# remove the label columns
label_removed_grab_df = label_removed_grab_df.drop(columns=label_columns)

In [None]:
import miceforest as mf

In [52]:
# Build the MICE input: the feature columns plus DateTime and Code.
mice_columns = [*feature_mapping.values(), 'DateTime', 'Code']
df = label_removed_grab_df.copy()[mice_columns]

# DateTime is converted to its numeric representation and Code to a
# categorical column
df['DateTime'] = pd.to_numeric(df['DateTime'])
df['Code'] = df['Code'].astype('category')

df = df.reset_index(drop=True)

# variable_schema lists every column except DateTime and Code
imputed_variables = df.columns.difference(['DateTime', 'Code']).to_list()

kernel = mf.ImputationKernel(
    data=df,
    variable_schema=imputed_variables,
    random_state=42,
    mean_match_strategy='shap',
)

In [None]:
# Rows in which every imputed feature is NaN, per supply point.
# Code and DateTime are excluded from the check because they are never
# missing, which would make the test always false. The loop avoids relying on
# a `code` variable left over from an earlier cell.
imputed_columns = df.columns.difference(['DateTime', 'Code'])

for code in grab_df["Code"].unique():
    all_nan_rows = df.loc[df['Code'] == code, imputed_columns].isnull().all(axis=1).sum()
    print(f"{code}: {all_nan_rows} rows with all NaN")

In [None]:
# Missing turbidity values per supply point; the loop avoids relying on a
# `code` variable left over from an earlier cell.
for code in grab_df["Code"].unique():
    missing_turbidity = df.loc[df['Code'] == code, 'Turbidity (NTU)'].isnull().sum()
    print(f"{code}: {missing_turbidity} NaN values for Turbidity (NTU)")

In [None]:
# Columns that are entirely NaN, per supply point; the loop avoids relying on
# a `code` variable left over from an earlier cell.
for code in grab_df["Code"].unique():
    all_nan_columns = df[df['Code'] == code].isnull().all(axis=0).sum()
    print(f"{code}: {all_nan_columns} columns with all NaN")

In [None]:
# Perform MICE imputation
kernel.mice(4, verbose=True)

In [57]:
completed_dataset = kernel.complete_data(dataset=0)

In [None]:
completed_dataset

In [59]:
# Plot the values imputed for the samples that were originally below the LOQ,
# one figure per feature, together with the LOQ itself.

# limit of quantification used for each plotted feature
loq_by_feature = {
    'Free Chlorine (mg/L)': 0.04,
    'Color (CU)': 1,
    'Turbidity (NTU)': 0.3,
}

loq_plot_folder = os.path.join(plot_folder, 'Imputation', 'LOQ')
os.makedirs(loq_plot_folder, exist_ok=True)

for feature, loq_value in loq_by_feature.items():

    fig = go.Figure()

    for code in grab_df["Code"].unique():

        indexes = indexes_df.loc[code, feature]

        # rows of this supply point that were labelled "Less than"
        is_selected = (label_removed_grab_df["Code"] == code) & (
            label_removed_grab_df.index.isin(indexes)
        )
        selected_index = label_removed_grab_df.index[is_selected]

        fig.add_trace(
            go.Scatter(
                x=selected_index,
                y=completed_dataset.loc[selected_index, feature],
                mode='markers',
                name=code,
            )
        )

    fig.add_hline(y=loq_value, line_dash='dash', line_color='black', opacity=0.3)

    fig.update_layout(
        title=f'{feature} - LOQ: {loq_value}',
        xaxis_title='Index',
        yaxis_title=feature
    )

    # "/" is not allowed in a file name
    feature_ = feature.replace('/', '_')

    fig.write_image(os.path.join(loq_plot_folder, f'{feature_}.png'), height=600, width=1200, scale=3)

    
    

In [60]:
# convert the datetime column back to datetime
completed_dataset['DateTime'] = pd.to_datetime(completed_dataset['DateTime'])

## Bland-Altman Imputed Data

In [None]:
# Bland-Altman plot (difference vs. mean) on the imputed grab data, with all
# the supply points together. Each grab sample is paired with the smoothed
# sensor value recorded on the same calendar date between 10:00 and 10:14.

pair_columns = ["Code", "DateTime", "Feature", "Value"]
grab_frames = []
sensor_frames = []

for code in grab_df["Code"].unique():
    # The sensor preprocessing depends only on the supply point, so it is done
    # once per code instead of once per (code, feature) pair.
    sensor_df = sensor_dict[code].copy()
    sensor_df["DateTime"] = pd.to_datetime(sensor_df["DateTime"])
    sensor_df.set_index("DateTime", inplace=True)
    # moving average over 8 consecutive readings
    # (2 hours, assuming one reading every 15 minutes - TODO confirm)
    sensor_df = sensor_df.rolling(window=4 * 2).mean()
    # regular 15-minute grid; gaps are filled by time-weighted interpolation
    sensor_df = sensor_df.resample("15min").mean().interpolate(method="time")

    sensor_start_date = sensor_df.index.dropna().min().strftime("%Y-%m-%d")
    sensor_end_date = sensor_df.dropna().index.max().strftime("%Y-%m-%d")

    # imputed grab samples of this supply point inside the sensor coverage period
    code_g_df = completed_dataset[completed_dataset["Code"] == code].set_index("DateTime")
    code_g_df = code_g_df[
        (code_g_df.index >= sensor_start_date)
        & (code_g_df.index <= sensor_end_date)
    ]

    for feature in feature_mapping.values():
        g_df = code_g_df[feature].dropna()

        # keep only the sensor values recorded between 10:00 and 10:14 on a
        # date that also has a grab sample
        on_grab_date = pd.Series(sensor_df.index.date, index=sensor_df.index).isin(
            g_df.index.date
        )
        s_df = sensor_df[on_grab_date.to_numpy()]
        s_df = s_df[(s_df.index.hour == 10) & (s_df.index.minute <= 14)]

        # if there is more than one value for the same date, take the mean
        s_df = s_df.groupby(s_df.index.date).mean()

        grab_frame = pd.DataFrame(
            {"Code": code, "DateTime": g_df.index, "Feature": feature, "Value": g_df.values}
        )
        sensor_frame = pd.DataFrame(
            {"Code": code, "DateTime": s_df.index, "Feature": feature, "Value": s_df[feature].values}
        )
        # empty frames are skipped so that they cannot influence the dtypes
        # of the concatenated result
        if not grab_frame.empty:
            grab_frames.append(grab_frame)
        if not sensor_frame.empty:
            sensor_frames.append(sensor_frame)

# a single concatenation instead of growing a DataFrame inside the loop
total_g_df = (
    pd.concat(grab_frames, ignore_index=True)
    if grab_frames
    else pd.DataFrame(columns=pair_columns)
)
total_s_df = (
    pd.concat(sensor_frames, ignore_index=True)
    if sensor_frames
    else pd.DataFrame(columns=pair_columns)
)

output_dir = os.path.join(plot_folder, "Bland-Altman", "Supply Points", "Imputed")
os.makedirs(output_dir, exist_ok=True)

for feature in feature_mapping.values():
    # explicit copies: the DateTime columns are rewritten below
    g_df = total_g_df[total_g_df["Feature"] == feature].copy()
    s_df = total_s_df[total_s_df["Feature"] == feature].copy()

    # The sensor values are aggregated per calendar date, so the join key has
    # to be the calendar date on the grab side as well.
    g_df["DateTime"] = pd.to_datetime(g_df["DateTime"]).dt.normalize()
    s_df["DateTime"] = pd.to_datetime(s_df["DateTime"])

    df = pd.merge(g_df, s_df, on=["Code", "DateTime"], suffixes=("_Grab", "_Sensor"))
    # a pair with a missing value would turn every statistic into NaN
    df = df.dropna(subset=["Value_Grab", "Value_Sensor"])

    n_pairs = df.shape[0]
    if n_pairs == 0:
        print(f"{feature}: no paired grab/sensor values, plot skipped")
        continue

    df["Difference"] = df["Value_Grab"] - df["Value_Sensor"]
    df["Mean"] = (df["Value_Grab"] + df["Value_Sensor"]) / 2

    difference_mean = np.mean(df["Difference"].values)
    difference_std = np.std(df["Difference"].values)
    # the standard error of the mean difference is based on the number of
    # paired observations, not on the number of grab samples
    std_error = difference_std / np.sqrt(n_pairs)

    ci_difference_mean = 1.96 * std_error

    upper_limit = difference_mean + 1.96 * difference_std
    lower_limit = difference_mean - 1.96 * difference_std
    # x position of the line labels, close to the right edge of the data
    label_x = df["Mean"].quantile(0.97)

    f, ax = plt.subplots(1, 1, figsize=(15, 10))
    sns.scatterplot(data=df, x="Mean", y="Difference", hue="Code", ax=ax, s=100)

    ax.axhline(y=difference_mean, color="green", linestyle="--", label="Mean")
    ax.text(x=label_x, y=difference_mean + std_error, s=f"Mean: {difference_mean:.2f}", color="green")

    # limits of agreement, each with a label just above its line
    ax.axhline(y=upper_limit, color="red", linestyle="--", label="1.96 * Std")
    ax.text(x=label_x, y=upper_limit + std_error, s="+ 1.96 * Std", color="red")

    ax.axhline(y=lower_limit, color="red", linestyle="--", label="-1.96 * Std")
    ax.text(x=label_x, y=lower_limit + std_error, s="-1.96 * Std", color="red")

    ax.axhline(y=0, color="black", linestyle="--")

    plt.annotate(
        f"Std error: {std_error:.2f}\nDifference mean CI: {difference_mean:.2f} ± {ci_difference_mean:.2f}",
        xy=(0.05, 0.9),
        xycoords="axes fraction",
        bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="lightblue"),
        color="green",
    )

    plt.title(f"{feature} - Bland-Altman Plot")
    plt.tight_layout()

    # "/" is not allowed in a file name
    feature_ = feature.replace("/", "_")

    plt.savefig(os.path.join(output_dir, f"{feature_}.png"), dpi=300)
    plt.close(f)

    # plt.show()

# Data Imputation with LOQ/2

Values labelled 'Less than' (below the limit of quantification, LOQ) are replaced with LOQ/2, while missing values (NaN) are imputed with MICE.

In [157]:
def replace_loq(row, column):
    """Return ``row[column]``, halved when its ``<column>_label`` entry is "Less than"."""
    value = row[column]
    if row[column + "_label"] == "Less than":
        return value / 2
    return value

In [158]:
label_columns = [col for col in grab_df.columns if 'label' in col]

In [159]:
# Halve every value labelled "Less than"; all other values are kept as they are.
df = grab_df.copy()

value_columns = grab_df.columns.difference(["Code", "DateTime"] + label_columns)

for column in value_columns:
    df[column] = df.apply(replace_loq, axis=1, args=(column,))

In [160]:
df = df.drop(columns=label_columns)

In [161]:
# Keep the feature columns plus DateTime and Code, on a fresh 0..n-1 index.
mice_columns = [*feature_mapping.values(), 'DateTime', 'Code']
df = df[mice_columns].reset_index(drop=True)

# DateTime is converted to its numeric representation and Code to a
# categorical column
df['DateTime'] = pd.to_numeric(df['DateTime'])
df['Code'] = df['Code'].astype('category')

In [162]:
# Report the supply points that have rows in which every feature is NaN.
# Code and DateTime are excluded from the check because they are never
# missing, which would make the test always false.
imputed_columns = df.columns.difference(['DateTime', 'Code'])

for code in grab_df["Code"].unique():
    all_nan_rows = df.loc[df['Code'] == code, imputed_columns].isnull().all(axis=1).sum()
    if all_nan_rows > 0:  # Rows with all NaN
        # double quotes outside: nesting the same quote type inside an
        # f-string is a syntax error before Python 3.12
        print(f"{code} has {all_nan_rows} rows with all NaN")

In [None]:
# Report the supply points with missing turbidity values.
for code in grab_df["Code"].unique():
    missing_turbidity = df[df['Code'] == code]['Turbidity (NTU)'].isnull().sum()
    if missing_turbidity > 0:
        print(f'{code} has {missing_turbidity} NaN values for Turbidity (NTU)')

In [164]:
# Report the supply points that have columns made only of NaN.
for code in grab_df["Code"].unique():
    all_nan_columns = df[df['Code'] == code].isnull().all(axis=0).sum()
    if all_nan_columns > 0:  # Columns with all NaN
        print(f'{code} has {all_nan_columns} columns with all NaN')

In [166]:
# drop the turbidity since it has a lot of missing values and the sensors are not well calibrated
# errors='ignore' keeps the cell re-runnable once the column is already gone
df.drop(columns=['Turbidity (NTU)'], inplace=True, errors='ignore')


In [None]:
# Remove turbidity from the feature mapping as well; the default makes the
# cell re-runnable once the key has already been removed.
feature_mapping.pop('Torbidità (NTu)', None)

In [None]:
df

In [None]:
# Perform MICE imputation

# variable_schema lists every column except DateTime and Code
imputed_variables = df.columns.difference(['DateTime', 'Code']).to_list()

kernel = mf.ImputationKernel(
    data=df,
    variable_schema=imputed_variables,
    random_state=42,
    mean_match_strategy='shap',
)

# 4 MICE iterations
kernel.mice(4, verbose=True)

In [170]:
completed_dataset = kernel.complete_data(dataset=0)

In [None]:
completed_dataset.head(5)

# UMAP Visualization

In [264]:
# Encode every supply point code as an integer (order of first appearance).
house_codes = grab_df["Code"].unique()
code_mapping = dict(zip(house_codes, range(len(house_codes))))

# imputed features plus the integer-encoded, categorical Code column
df = completed_dataset[['Code', *feature_mapping.values()]].copy()
df['Code'] = df['Code'].map(code_mapping)
df['Code'] = df['Code'].astype('category')

In [265]:
# revert the code mapping
# df['Code'] = df['Code'].map({v: k for k, v in code_mapping.items()})

In [None]:
df.shape

In [267]:
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
# Fit a UMAP embedding (10 neighbours) on the integer-encoded Code column
# together with the feature columns.
# NOTE(review): 'Code' is part of the fitted columns and is also used as the
# plot label afterwards - confirm that using it as an input is intended.
# NOTE(review): no random_state is passed, so the embedding presumably
# differs between runs - confirm whether reproducibility is needed.
mapper = umap.UMAP(n_neighbors=10).fit(df[['Code'] + list(feature_mapping.values())])

In [None]:
code_mapping

In [None]:
umap.plot.points(mapper, labels=df['Code'])

In [271]:
# Hover table for the interactive plot: running position, integer code and
# the original supply point name.
hover_data = pd.DataFrame(
    {
        'index': np.arange(len(df)),
        'label': df['Code'],
    }
)

# invert code_mapping (integer -> supply point name)
name_by_code = {number: name for name, number in code_mapping.items()}
hover_data['item'] = hover_data['label'].map(name_by_code)

In [None]:
umap.plot.output_notebook()

In [None]:
p = umap.plot.interactive(mapper, labels=df['Code'], hover_data=hover_data, point_size=4)
umap.plot.show(p)

In [274]:
from bokeh.plotting import save, output_file

In [None]:
output_file(os.path.join(plot_folder, 'Imputation', 'UMAP.html'))

save(p)

In [276]:
# Tognazzi appears to be fairly distant from the other supply points; the pair plot also shows a slight difference from the others

# Clustering

## COP - KMeans

In [277]:
from copkmeans.cop_kmeans import cop_kmeans

In [None]:
df

In [279]:
from itertools import combinations

In [303]:
# COP-KMeans on the feature columns, forcing all the samples of the same
# supply point into the same cluster through must-link constraints.
n_clusters = 5

must_link = []

code_column = df['Code']

for code in df['Code'].unique():
    # The constraints refer to rows of np_df, so the row positions are used
    # instead of the index labels (they differ once rows have been dropped).
    positions = np.flatnonzero((df['Code'] == code).to_numpy()).tolist()
    # every pair of samples with the same code must be linked
    must_link.extend(combinations(positions, 2))

np_df = df[list(feature_mapping.values())].to_numpy()

clusters, centers = cop_kmeans(np_df, n_clusters, ml=must_link)

In [304]:
df['Cluster'] = clusters
df['Code'] = code_column

In [None]:
code_mapping

In [None]:
centers

In [None]:
# List the supply points that fall in each cluster.
for cluster in df['Cluster'].unique():
    print(f'Cluster {cluster}')
    in_cluster = df['Cluster'] == cluster
    codes = df[in_cluster]['Code'].unique().tolist()
    # translate the integer codes back to the supply point names
    codes = [name for name, number in code_mapping.items() if number in codes]
    print(codes)

In [326]:
from sklearn.cluster import AgglomerativeClustering
from scipy import sparse

In [309]:
df.drop(columns=['Cluster'], inplace=True)

In [334]:
# define the connectivity matrix such that each sample with the same code is connected
connectivity = np.zeros((df.shape[0], df.shape[0]))

for code in df['Code'].unique():
    # indexes = df[df['Code'] == code].index.to_numpy()
    # # Set connectivity for all pairs of samples with the same 'Code' to 1
    # connectivity[indexes[:, None], indexes] = 1
    
    index_pairs = list(combinations(df[df['Code'] == code].index, 2))
    for i, j in index_pairs:
        connectivity[i, j] = 1
        connectivity[j, i] = 1

connectivity = sparse.csr_matrix(connectivity)

In [374]:
ward = AgglomerativeClustering(n_clusters=9, linkage='ward', connectivity=connectivity)

In [None]:
# Fit the clustering on the integer-encoded Code column together with the
# feature columns.
# NOTE(review): the COP-KMeans cell clusters on the feature columns only,
# while 'Code' is included here - confirm that this difference is intended.
ward.fit(df[['Code'] + list(feature_mapping.values())])

In [376]:
df['Cluster'] = ward.labels_

In [None]:
# List the supply points that fall in each cluster.
for cluster in df['Cluster'].unique():
    print(f'Cluster {cluster}')
    in_cluster = df['Cluster'] == cluster
    codes = df[in_cluster]['Code'].unique().tolist()
    # translate the integer codes back to the supply point names
    codes = [name for name, number in code_mapping.items() if number in codes]
    print(codes)

In [108]:
from sklearn.preprocessing import MinMaxScaler
# import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Wasserstein distance between the temperature distributions of two supply
# points, before and after min-max scaling.

# imported here so that the cell does not depend on an import made elsewhere
from scipy.stats import wasserstein_distance

df1 = completed_dataset[completed_dataset['Code'] == 'Berna']['Temperature (°C)'].copy()
df2 = completed_dataset[completed_dataset['Code'] == 'Tabacchi']['Temperature (°C)'].copy()

print("Wasserstein no standardization:", wasserstein_distance(df1, df2))
scaler = MinMaxScaler()

# each series is scaled on its own (the scaler is re-fitted for every series)
df1 = pd.Series(scaler.fit_transform(df1.values.reshape(-1, 1)).flatten())

df2 = pd.Series(scaler.fit_transform(df2.values.reshape(-1, 1)).flatten())

print("Wasserstein with standardization:", wasserstein_distance(df1, df2))

In [110]:
from itertools import combinations

In [None]:
# Mean cosine similarity between the feature rows of every pair of houses.
feature_names = list(feature_mapping.values())
house_of_row = completed_dataset['Code']

for house1, house2 in combinations(house_codes, 2):
    df1 = completed_dataset.loc[house_of_row == house1, feature_names].copy()
    df2 = completed_dataset.loc[house_of_row == house2, feature_names].copy()

    print(f"House1: {house1} - House2: {house2}")
    # average over all (row of house1, row of house2) similarities
    print(cosine_similarity(df1, df2).mean())
    print()

In [112]:
# Standardization affects the Wasserstein distance.

## On THMs

In [378]:
# The four trihalomethane concentration columns of the grab samples.
thms_columns = [
    'Bromodichloromethane (µg/L)',
    'Bromoform (µg/L)',
    'Chloroform (µg/L)',
    'Dibromochloromethane (µg/L)',
]

In [None]:
# Display the house code next to the THM columns.
grab_df[['Code', *thms_columns]]

In [380]:
# Distinct house codes in order of first appearance, and their integer encoding.
house_codes = grab_df.loc[:, "Code"].unique()

code_mapping = dict(zip(house_codes, range(len(house_codes))))

In [381]:
# Working copy holding the house code and the THM columns only.
df = grab_df.loc[:, ['Code', *thms_columns]].copy()

# Encode the house code as an integer and store it as a categorical column.
df['Code'] = df['Code'].map(code_mapping).astype('category')

In [396]:
# Independent copy of df restricted to the rows without missing values.
dataframe = df.copy().dropna()

In [397]:
# TTHM = row-wise sum of the THM columns.
dataframe['TTHM'] = dataframe.loc[:, thms_columns].sum(axis='columns')

In [398]:
# Renumber the rows 0..n-1 in place (the old index is discarded).
# Presumably needed because the must-link pairs are built from dataframe.index
# and are used together with the positional NumPy array passed to cop_kmeans.
dataframe.reset_index(drop=True, inplace=True)

In [420]:
# Must-link constraints: every pair of rows sharing the same house code.
must_link = []

code_column = dataframe['Code']
for code in code_column.unique():
    same_code_rows = dataframe.index[code_column == code]
    # all unordered pairs of row labels belonging to this code
    must_link += combinations(same_code_rows, 2)

# TTHM as an (n_samples, 1) array for the clustering routine
np_df = dataframe[['TTHM']].to_numpy()

In [421]:
# Elbow analysis: for each number of clusters in 2..9, run the constrained
# k-means on TTHM and record the average within-cluster variance.
variances = []

# TTHM values by position; the cluster labels are aligned with the same order
tthm_values = dataframe['TTHM'].to_numpy()

for n_cluster in range(2, 10):
    clusters, centers = cop_kmeans(np_df, n_cluster, ml=must_link)
    labels = np.asarray(clusters)

    # Sum of the population variances (ddof=0) of TTHM over the clusters that
    # received samples. The labels are kept in a local array, so the shared
    # dataframe is not modified while scanning the candidate cluster counts.
    variance = sum(
        np.var(tthm_values[labels == cluster]) for cluster in np.unique(labels)
    )

    # compute average variance for n_cluster
    variances.append(variance / n_cluster)
    

In [None]:
# Elbow curve: average within-cluster variance against the number of clusters.
plt.plot(range(2, 10), variances)

elbow_axes = plt.gca()
elbow_axes.set_xlabel('Number of clusters')
elbow_axes.set_ylabel('Average Variance')

plt.show()




In [None]:
# the optimal number of clusters is 3

In [423]:
# Final constrained clustering with the number of clusters chosen from the elbow curve.
n_clusters = 3

# Must-link constraints: every pair of rows sharing the same house code.
must_link = []

code_column = dataframe['Code']
for code in code_column.unique():
    same_code_rows = dataframe.index[code_column == code]
    # all unordered pairs of row labels belonging to this code
    must_link += combinations(same_code_rows, 2)

# TTHM as an (n_samples, 1) array for the clustering routine
np_df = dataframe[['TTHM']].to_numpy()

clusters, centers = cop_kmeans(np_df, n_clusters, ml=must_link)

In [424]:
# Attach to each row the cluster label returned by cop_kmeans.
dataframe['Cluster'] = clusters

In [None]:
# Display the house-code -> integer mapping used to encode the 'Code' column.
code_mapping

In [None]:
# Display the cluster centers returned by cop_kmeans for the chosen n_clusters.
centers

In [None]:
# List, for every TTHM cluster, the house names whose samples fall in it.
for cluster in dataframe['Cluster'].unique():
    print(f'Cluster {cluster}')
    in_cluster = dataframe['Cluster'] == cluster
    codes = dataframe.loc[in_cluster, 'Code'].unique().tolist()
    # translate the integer codes back to house names (code_mapping order)
    codes = [name for name, number in code_mapping.items() if number in codes]
    print(codes)

In [None]:
# Scatter plot of TTHM per house: one trace per (cluster, house) pair.
# Marker colour encodes the cluster, marker symbol encodes the house code.
fig = go.Figure()

symbols = ["circle", "square", "diamond", "cross", "triangle-up", "triangle-down",
           "pentagon", "hexagon", "star"]

colors = ["red", "blue", "green"]

# Integer code -> house name, built once instead of on every iteration
code_names = {v: k for k, v in code_mapping.items()}

for i, cluster in enumerate(dataframe['Cluster'].unique()):
    for j, code in enumerate(dataframe['Code'].unique()):
        subset = dataframe[(dataframe["Cluster"] == cluster) & (dataframe["Code"] == code)]

        # nothing to draw when this house has no sample in this cluster
        if subset.empty:
            continue

        code_name = code_names[code]

        fig.add_trace(go.Scatter(
            x=subset['Code'].map(code_names),  # X-axis: house name
            y=subset["TTHM"],  # Y-axis: total THM of each sample
            mode="markers",
            marker=dict(
                # Wrap around the palettes so that more houses/clusters than
                # listed entries reuse a symbol/colour instead of failing
                symbol=symbols[j % len(symbols)],  # symbol per code
                size=10,
                line=dict(width=1),
                color=colors[i % len(colors)],  # colour per cluster
            ),
            name=f"{code_name}, Cluster: {cluster}"
        ))

fig.update_layout(
    title="Clusters",
    xaxis_title="Code",
    yaxis_title="TTHM"
)

fig.show()