# Treatment Plants Analysis

In [None]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

# Paths

In [None]:
# Data directories, expressed relative to the notebook's working directory.
data_folder = os.path.join(os.pardir, os.pardir, "data")
raw_data_folder, intermediate_data_folder = (
    os.path.join(data_folder, subfolder)
    for subfolder in ("Raw Data", "Intermediate Data")
)

# Load Data

In [None]:
# Read the "All grab samples" workbook from the intermediate data folder.
grab_samples_path = os.path.join(intermediate_data_folder, "All grab samples.xlsx")
grab_samples_df = pd.read_excel(grab_samples_path)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
grab_samples_df.head()

In [None]:
# Keep only the rows whose ZONA is one of the 4 treatment plants.
tp_names = ["NORD - OVEST", "NORD - EST", "SUD - OVEST", "SUD - EST"]

# Take an explicit copy: boolean indexing returns a slice of the loaded frame,
# and later cells assign into `grab_samples_df` with `.loc`, which would
# otherwise raise SettingWithCopyWarning.
grab_samples_df = grab_samples_df[grab_samples_df["ZONA"].isin(tp_names)].copy()

In [None]:
# List every column name of the filtered frame.
list(grab_samples_df.columns)

In [None]:
# Columns plotted as "features" in the inspection sections below.
# Each one is expected to have a companion "<column>_label" column, which the
# plotting cells read via `col + "_label"`.
feature_columns = [
    "Torbidità (NTU)",
    "Conduttività a 20°C (µS/cm)",
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)",
    "Temperatura (°C)",
    "Nitrati (mg/L)",
    "Nitriti (mg/L)",
    "TOC - carbonio organico totale (mg/L di C)",
    "Concentrazione ioni idrogeno (unità pH)",
]

# Columns plotted as "targets" in the inspection sections below.
# The names are kept exactly as written (including the double space in the
# last entry) because they are used as DataFrame column keys.
target_columns = [
    "Batteri coliformi a 37°C (MPN/100 mL)",
    "Bromodiclorometano (µg/L)",
    "Bromoformio (µg/L)",
    "Carica batterica a 22°C (UFC/mL)",
    "Carica batterica a 37°C (UFC/mL)",
    "Cloroformio (µg/L)",
    "Conteggio colonie a 30°C (UFC/mL)",
    "Dibromoclorometano (µg/L)",
    "Enterococchi (MPN/100 mL)",
    "Escherichia coli (MPN/100 mL)",
    "Legionella spp (UFC/L)",
    "Pseudomonas aeruginosa (UFC/250 mL)",
    "Sommatoria totale trialometani  (µg/L)",
]

# Inspection

## Time series

### Features

In [None]:
# Marker colour and symbol for each value found in the "<column>_label" columns.
# Both mappings are notebook-level names and are reused by the targets
# time-series cell.
color_mapping = {
    "Less than": "red",
    "Greater than": "blue",
    "Normal": "green",
}

symbol_mapping = {
    "Less than": "circle",
    "Greater than": "square",
    "Normal": "diamond",
}

date_col = "Data di prelievo"

# The per-plant frames do not depend on the plotted column, so build them once
# instead of once per column. An explicit copy is taken before converting the
# date column: assigning into a boolean-indexed slice with `.loc[:, ...]`
# raises SettingWithCopyWarning and, being an in-place set, may leave the
# column dtype unchanged.
tp_frames = {}
for tp_name in tp_names:
    tp_df = grab_samples_df[grab_samples_df["ZONA"] == tp_name].copy()
    tp_df[date_col] = pd.to_datetime(tp_df[date_col])
    tp_frames[tp_name] = tp_df.sort_values(date_col)

# One figure per feature column: a line per treatment plant, plus one marker
# trace per (plant, label) pair.
for col in feature_columns:
    fig = go.Figure()
    for tp_name, tp_df in tp_frames.items():
        # Line trace with every sample of this plant, in chronological order
        fig.add_trace(
            go.Scatter(
                x=tp_df[date_col],
                y=tp_df[col],
                mode="lines",
                name=tp_name,
                showlegend=True,
            )
        )

        # Marker traces: one per label, styled through the two mappings
        for label, color in color_mapping.items():
            label_df = tp_df[tp_df[col + "_label"] == label]
            fig.add_trace(
                go.Scatter(
                    x=label_df[date_col],
                    y=label_df[col],
                    mode="markers",
                    name=f"{tp_name} - {label}",
                    marker=dict(
                        color=color,
                        symbol=symbol_mapping[label],
                    ),
                    showlegend=True,
                )
            )

    fig.update_layout(title=col)
    fig.show()

### Targets

In [None]:
# For each target column, plot one line per treatment plant plus marker traces
# per label. Relies on `color_mapping` and `symbol_mapping` defined by the
# features time-series cell.

date_col = "Data di prelievo"

# The per-plant frames do not depend on the plotted column, so build them once
# instead of once per column. An explicit copy is taken before converting the
# date column: assigning into a boolean-indexed slice with `.loc[:, ...]`
# raises SettingWithCopyWarning and, being an in-place set, may leave the
# column dtype unchanged.
tp_frames = {}
for tp_name in tp_names:
    tp_df = grab_samples_df[grab_samples_df["ZONA"] == tp_name].copy()
    tp_df[date_col] = pd.to_datetime(tp_df[date_col])
    tp_frames[tp_name] = tp_df.sort_values(date_col)

for col in target_columns:
    fig = go.Figure()

    # Marker traces are named by label only, so the same name repeats for every
    # plant. Only the first trace of each label gets a legend entry.
    labels_in_legend = set()

    for tp_name, tp_df in tp_frames.items():
        # Line trace with every sample of this plant, in chronological order
        fig.add_trace(
            go.Scatter(
                x=tp_df[date_col],
                y=tp_df[col],
                mode="lines",
                name=tp_name,
                showlegend=True,
            )
        )

        # Marker traces: one per label, styled through the two mappings
        for label, color in color_mapping.items():
            label_df = tp_df[tp_df[col + "_label"] == label]
            fig.add_trace(
                go.Scatter(
                    x=label_df[date_col],
                    y=label_df[col],
                    mode="markers",
                    name=label,
                    marker=dict(
                        color=color,
                        symbol=symbol_mapping[label],
                    ),
                    showlegend=label not in labels_in_legend,
                )
            )
            labels_in_legend.add(label)

    fig.update_layout(title=col)
    fig.show()

## Boxplots

### Features

In [None]:
# For each feature column, draw the individual samples per treatment plant
# (coloured by label) and overlay one box plot per plant.

# NOTE: this rebinds the notebook-level `color_mapping` with a different
# palette than the one used by the time-series cells.
color_mapping = {
    "Less than": "red",
    "Greater than": "green",
    "Normal": "blue",
}

# Samples further than k * IQR outside the quartiles of their own plant are
# treated as outliers and hidden from the plot.
k = 3

for col in feature_columns:
    label_col = col + "_label"

    # Work on a copy holding only the plotted columns, so that masking outliers
    # never overwrites `grab_samples_df`. Mutating the shared frame would make
    # this cell non-idempotent: each re-run would recompute the quartiles on
    # already-trimmed data and drop more samples.
    plot_df = grab_samples_df[["ZONA", col, label_col]].copy()

    # Per-plant quartiles, broadcast back to one value per row
    by_plant = plot_df.groupby("ZONA")[col]
    q1 = plot_df["ZONA"].map(by_plant.quantile(0.25))
    q3 = plot_df["ZONA"].map(by_plant.quantile(0.75))
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # Replace outliers with NaN; missing values compare False and stay missing
    is_outlier = (plot_df[col] < lower_bound) | (plot_df[col] > upper_bound)
    plot_df[col] = plot_df[col].mask(is_outlier)

    fig = px.strip(
        plot_df,
        x="ZONA",
        y=col,
        color=label_col,
        title=col,
        labels={col: col, "ZONA": "Treatment Plant"},
        stripmode="overlay",
        color_discrete_map=color_mapping,
    )

    # One box per treatment plant, drawn from the outlier-masked values
    for tp_name in tp_names:
        tp_values = plot_df.loc[plot_df["ZONA"] == tp_name, col]
        fig.add_trace(
            go.Box(
                x=[tp_name] * len(tp_values),
                y=tp_values,
                name=tp_name,
            )
        )

    fig.show()

### Targets

In [None]:
# For each target column, draw the individual samples per treatment plant
# (coloured by label) and overlay one box plot per plant.

# NOTE: this rebinds the notebook-level `color_mapping` with a different
# palette than the one used by the time-series cells.
color_mapping = {
    "Less than": "red",
    "Greater than": "green",
    "Normal": "blue",
}

# Samples further than k * IQR outside the quartiles of their own plant are
# treated as outliers and hidden from the plot.
k = 3

for col in target_columns:
    label_col = col + "_label"

    # Work on a copy holding only the plotted columns, so that masking outliers
    # never overwrites `grab_samples_df`. Mutating the shared frame would make
    # this cell non-idempotent: each re-run would recompute the quartiles on
    # already-trimmed data and drop more samples.
    plot_df = grab_samples_df[["ZONA", col, label_col]].copy()

    # Per-plant quartiles, broadcast back to one value per row
    by_plant = plot_df.groupby("ZONA")[col]
    q1 = plot_df["ZONA"].map(by_plant.quantile(0.25))
    q3 = plot_df["ZONA"].map(by_plant.quantile(0.75))
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # Replace outliers with NaN; missing values compare False and stay missing
    is_outlier = (plot_df[col] < lower_bound) | (plot_df[col] > upper_bound)
    plot_df[col] = plot_df[col].mask(is_outlier)

    fig = px.strip(
        plot_df,
        x="ZONA",
        y=col,
        color=label_col,
        title=col,
        labels={col: col, "ZONA": "Treatment Plant"},
        stripmode="overlay",
        color_discrete_map=color_mapping,
    )

    # One box per treatment plant, drawn from the outlier-masked values
    for tp_name in tp_names:
        tp_values = plot_df.loc[plot_df["ZONA"] == tp_name, col]
        fig.add_trace(
            go.Box(
                x=[tp_name] * len(tp_values),
                y=tp_values,
                name=tp_name,
            )
        )

    fig.show()