# Treatment Plants Analysis

In [30]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

# Paths

In [31]:
# Project data directories, expressed relative to the current working directory.
data_folder = os.path.join("..", "..", "data")
raw_data_folder = os.path.join(data_folder, "Raw Data")
intermediate_data_folder = os.path.join(data_folder, "Intermediate Data")

# Root folder for the treatment-plant figures written by the cells below.
treatments_plants_folder = os.path.join(
    data_folder, "Plots", "Riunione 24-04-2024", "Treatment Plants"
)

# Load Data

In [32]:
# Read the grab-samples workbook from the intermediate data folder.
grab_samples_df = pd.read_excel(
    io=os.path.join(intermediate_data_folder, "All grab samples.xlsx"),
)

In [None]:
# Notebook display of the loaded samples table.
grab_samples_df

In [34]:
# Keep only the samples taken at the sampling points listed below, matched
# against the "Punto di prelievo" column.
# NOTE: an earlier, now disabled, variant filtered on the "ZONA" column using
# "NORD - OVEST", "NORD - EST", "SUD - OVEST" and "SUD - EST" instead.
tp_names = [
    'Abbiategrasso',
    'Anfossi',
    'Armi',
    'Assiano',
    'Baggio',
    'Bicocca',
    'Cantore',
    'Chiusabella',
    'Cimabue',
    'Comasina',
    'Crescenzago',
    'Este',
    'Feltre',
    'Gorla',
    'Italia',
    'Lambro',
    'Linate',
    'Martini',
    'Novara',
    'Ovidio',
    'Padova',
    'Parco',
    'Salemi',
    'San Siro',
    'Suzzani',
    'Tonezza',
    'Vialba',
]

# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the in-place `.loc` writes done by the box-plot cells operate on a
# regular DataFrame instead of on a slice.
grab_samples_df = grab_samples_df[
    grab_samples_df["Punto di prelievo"].isin(tp_names)
].copy()

In [None]:
# Notebook display of every column name in the filtered samples table.
grab_samples_df.columns.to_list()

In [84]:
# Feature columns: key = column name as it appears in grab_samples_df,
# value = English label (used as the y-axis title of the box plots).
# The plotting cells also read a companion "<key>_label" column for each key.
feature_columns = {
    "Torbidità (NTU)": "Turbidity (NTU)",
    "Conduttività a 20°C (µS/cm)": "Conductivity at 20°C (µS/cm)",
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free chlorine (mg/L)",
    "Temperatura (°C)": "Temperature (°C)",
    "Nitrati (mg/L)": "Nitrates (mg/L)",
    "Nitriti (mg/L)": "Nitrites (mg/L)",
    "TOC - carbonio organico totale (mg/L di C)": "TOC (mg/L)",
    "Concentrazione ioni idrogeno (unità pH)": "pH",
}

# Target columns: same key/value convention as feature_columns.
# The keys must match the DataFrame column names exactly (including spacing).
target_columns = {
    "Batteri coliformi a 37°C (MPN/100 mL)": "Coliform bacteria at 37°C (MPN/100 mL)",
    "Bromodiclorometano (µg/L)": "Bromodichloromethane (µg/L)",
    "Bromoformio (µg/L)": "Bromoform (µg/L)",
    "Carica batterica a 22°C (UFC/mL)": "Bacterial load at 22°C (CFU/mL)",
    "Carica batterica a 37°C (UFC/mL)": "Bacterial load at 37°C (CFU/mL)",
    "Cloroformio (µg/L)": "Chloroform (µg/L)",
    "Conteggio colonie a 30°C (UFC/mL)": "Colony count at 30°C (CFU/mL)",
    "Dibromoclorometano (µg/L)": "Dibromochloromethane (µg/L)",
    "Enterococchi (MPN/100 mL)": "Enterococci (MPN/100 mL)",
    "Escherichia coli (MPN/100 mL)": "Escherichia coli (MPN/100 mL)",
    "Legionella spp (UFC/L)": "Legionella spp (CFU/L)",
    "Pseudomonas aeruginosa (UFC/250 mL)": "Pseudomonas aeruginosa (CFU/250 mL)",
    "Sommatoria totale trialometani  (µg/L)": "Total trihalomethanes (µg/L)"
}

# Inspection

## Time series

In [37]:
# Output folder for the time-series HTML figures.
time_series_folder = os.path.join(treatments_plants_folder, "Time Series")

### Features

In [38]:
# For each feature column, draw one line per treatment plant and overlay the
# samples as markers styled by their "<column>_label" value.

# Marker colour and symbol for each value of the "<column>_label" columns.
color_mapping = {
    "Less than": "red",
    "Greater than": "blue",
    "Normal": "green",
}

symbol_mapping = {
    "Less than": "circle",
    "Greater than": "square",
    "Normal": "diamond",
}

# Make sure the output folder exists before any figure is written.
os.makedirs(time_series_folder, exist_ok=True)

# Build each plant's date-sorted frame once instead of once per column.
# Working on an explicit copy avoids assigning into a slice of grab_samples_df.
plant_frames = {}
for tp_name in tp_names:
    tp_df = grab_samples_df[grab_samples_df["Punto di prelievo"] == tp_name].copy()
    tp_df["Data di prelievo"] = pd.to_datetime(tp_df["Data di prelievo"])
    plant_frames[tp_name] = tp_df.sort_values("Data di prelievo")

for col in feature_columns:
    fig = go.Figure()
    for tp_name, tp_df in plant_frames.items():
        # Line through all samples of this plant.
        fig.add_trace(
            go.Scatter(
                x=tp_df["Data di prelievo"],
                y=tp_df[col],
                mode="lines",
                name=tp_name,
                showlegend=True,
            )
        )

        # One marker trace per label value.
        for label, color in color_mapping.items():
            label_df = tp_df[tp_df[col + "_label"] == label]
            fig.add_trace(
                go.Scatter(
                    x=label_df["Data di prelievo"],
                    y=label_df[col],
                    mode="markers",
                    name=f"{tp_name} - {label}",
                    marker=dict(
                        color=color,
                        symbol=symbol_mapping[label],
                    ),
                    showlegend=True,
                )
            )

    fig.update_layout(title=col)

    # Column names contain "/" (units such as mg/L), which cannot be part of
    # a file name.
    col_ = col.replace("/", "_")

    fig.write_html(os.path.join(time_series_folder, f"{col_}.html"))

### Targets

In [39]:
# For each target column, draw one line per treatment plant and overlay the
# samples as markers styled by their "<column>_label" value.

# Defined in this cell as well so that it does not pick up the different
# `color_mapping` that the box-plot cells bind under the same name when the
# notebook is run out of order.
color_mapping = {
    "Less than": "red",
    "Greater than": "blue",
    "Normal": "green",
}

symbol_mapping = {
    "Less than": "circle",
    "Greater than": "square",
    "Normal": "diamond",
}

# Make sure the output folder exists before any figure is written.
os.makedirs(time_series_folder, exist_ok=True)

# Build each plant's date-sorted frame once instead of once per column.
# Working on an explicit copy avoids assigning into a slice of grab_samples_df.
plant_frames = {}
for tp_name in tp_names:
    tp_df = grab_samples_df[grab_samples_df["Punto di prelievo"] == tp_name].copy()
    tp_df["Data di prelievo"] = pd.to_datetime(tp_df["Data di prelievo"])
    plant_frames[tp_name] = tp_df.sort_values("Data di prelievo")

for col in target_columns:
    fig = go.Figure()

    # Marker traces are named by label only, so the same name repeats for
    # every plant; only the first trace of each label gets a legend entry.
    legend_labels_shown = set()

    for tp_name, tp_df in plant_frames.items():
        # Line through all samples of this plant.
        fig.add_trace(
            go.Scatter(
                x=tp_df["Data di prelievo"],
                y=tp_df[col],
                mode="lines",
                name=tp_name,
                showlegend=True,
            )
        )

        # One marker trace per label value.
        for label, color in color_mapping.items():
            label_df = tp_df[tp_df[col + "_label"] == label]
            fig.add_trace(
                go.Scatter(
                    x=label_df["Data di prelievo"],
                    y=label_df[col],
                    mode="markers",
                    name=f"{label}",
                    marker=dict(
                        color=color,
                        symbol=symbol_mapping[label],
                    ),
                    showlegend=label not in legend_labels_shown,
                )
            )
            legend_labels_shown.add(label)

    fig.update_layout(title=col)

    # Column names contain "/" (units such as µg/L), which cannot be part of
    # a file name.
    col_ = col.replace("/", "_")

    fig.write_html(os.path.join(time_series_folder, f"{col_}.html"))

## Boxplots

In [40]:
# Output folder for the box-plot PNG images.
boxplots_folder = os.path.join(treatments_plants_folder, "Boxplots")

### Features

In [45]:
import matplotlib.cm as cm

In [85]:
# For each feature column: mask per-plant outliers, then draw one box per
# treatment plant and save the figure as a PNG.

# NOTE(review): `color_mapping` is not used by the box traces below (it was only
# referenced by a disabled px.strip variant of this plot), but the assignment
# rebinds the notebook-level name; confirm no other cell relies on these values
# before removing it.
color_mapping = {
    "Less than": "red",
    "Greater than": "green",
    "Normal": "blue",
}

# One colour per plant, evenly spaced over the plasma colormap.
num_colors = len(tp_names)
colors = cm.plasma(np.linspace(0, 1, num_colors))

# Fence multiplier: values outside [Q1 - k*IQR, Q3 + k*IQR] are masked.
k = 3

# Make sure the output folder exists before any image is written.
os.makedirs(boxplots_folder, exist_ok=True)

for col in feature_columns:
    # NOTE: the masked values are written back into grab_samples_df, so the
    # outliers stay removed for later cells, and re-running this cell
    # recomputes the fences on already-trimmed data.
    for tp_name in tp_names:
        plant_rows = grab_samples_df["Punto di prelievo"] == tp_name
        values = grab_samples_df.loc[plant_rows, col]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - k * iqr
        upper_bound = q3 + k * iqr

        # Replace out-of-fence values with NaN; missing values stay missing.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        grab_samples_df.loc[plant_rows, col] = values.mask(is_outlier)

    fig = go.Figure()

    for tp_name, rgba in zip(tp_names, colors):
        values = grab_samples_df.loc[
            grab_samples_df["Punto di prelievo"] == tp_name, col
        ]

        # Convert the colormap's 0-1 RGB channels to a CSS "rgb(r, g, b)" string.
        red, green, blue = (int(channel * 255) for channel in rgba[:3])

        fig.add_trace(
            go.Box(
                x=[tp_name] * len(values),
                y=values,
                name=tp_name,
                marker_color=f"rgb({red}, {green}, {blue})",
            )
        )

    fig.update_layout(
        yaxis_title=feature_columns[col],
        showlegend=False,
    )

    # Column names contain "/" (units such as mg/L), which cannot be part of
    # a file name.
    col_ = col.replace("/", "_")

    fig.write_image(os.path.join(boxplots_folder, f"{col_}.png"))
    

### Targets

In [86]:
# For each target column: mask per-plant outliers, then draw one box per
# treatment plant and save the figure as a PNG.

# NOTE(review): `color_mapping` is not used by the box traces below (it was only
# referenced by a disabled px.strip variant of this plot), but the assignment
# rebinds the notebook-level name; confirm no other cell relies on these values
# before removing it.
color_mapping = {
    "Less than": "red",
    "Greater than": "green",
    "Normal": "blue",
}

# One colour per plant, evenly spaced over the plasma colormap. Computed here
# so the cell does not depend on `colors` left over from the feature cell.
colors = cm.plasma(np.linspace(0, 1, len(tp_names)))

# Fence multiplier: values outside [Q1 - k*IQR, Q3 + k*IQR] are masked.
k = 3

# Make sure the output folder exists before any image is written.
os.makedirs(boxplots_folder, exist_ok=True)

for col in target_columns:
    # NOTE: the masked values are written back into grab_samples_df, so the
    # outliers stay removed for later cells, and re-running this cell
    # recomputes the fences on already-trimmed data.
    for tp_name in tp_names:
        plant_rows = grab_samples_df["Punto di prelievo"] == tp_name
        values = grab_samples_df.loc[plant_rows, col]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - k * iqr
        upper_bound = q3 + k * iqr

        # Replace out-of-fence values with NaN; missing values stay missing.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        grab_samples_df.loc[plant_rows, col] = values.mask(is_outlier)

    fig = go.Figure()

    for tp_name, rgba in zip(tp_names, colors):
        values = grab_samples_df.loc[
            grab_samples_df["Punto di prelievo"] == tp_name, col
        ]

        # Convert the colormap's 0-1 RGB channels to a CSS "rgb(r, g, b)" string.
        red, green, blue = (int(channel * 255) for channel in rgba[:3])

        fig.add_trace(
            go.Box(
                x=[tp_name] * len(values),
                y=values,
                name=tp_name,
                marker_color=f"rgb({red}, {green}, {blue})",
            )
        )

    fig.update_layout(
        yaxis_title=target_columns[col],
        showlegend=False,
    )

    # Column names contain "/" (units such as µg/L), which cannot be part of
    # a file name.
    col_ = col.replace("/", "_")

    fig.write_image(os.path.join(boxplots_folder, f"{col_}.png"))