In [None]:
import os

import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from prophet import Prophet
from prophet.plot import plot_components_plotly, plot_plotly

import statsmodels.api as sm

from sklearn.metrics import mean_squared_error

# Paths

In [None]:
# All locations are relative to the current working directory.
data_folder = os.path.join("..", "..", "data")

# Raw-data tree: Raw Data / Riunione 24-04-2024 / feltre
raw_data_folder = os.path.join(data_folder, "Raw Data")
reunion_folder = os.path.join(raw_data_folder, "Riunione 24-04-2024")
feltre_folder = os.path.join(reunion_folder, "feltre")

intermediate_data_folder = os.path.join(data_folder, "Intermediate Data")

# Metadata tree: Metadata / Feltre
metadata_folder = os.path.join(data_folder, "Metadata")
metadata_feltre_folder = os.path.join(metadata_folder, "Feltre")

utils_folder = os.path.join("..", "..", "utils")

# Load Data

In [None]:
# Resolve the four workbook locations first, then read them.
spectra_path = os.path.join(
    feltre_folder, "Spectra_ISOIL_21.06.2024-06.08.2024.xlsx"
)
data_path = os.path.join(feltre_folder, "Data_ISOIL_21.06.2024-06.08.2024.xlsx")
bacteria_path = os.path.join(feltre_folder, "Bactosense_dati_Luglio2024.xlsx")
grab_samples_path = os.path.join(intermediate_data_folder, "All grab samples.xlsx")

# The two ISOIL exports take their column names from the second sheet row
# (header=1); the other two workbooks use the default header row.
spectra_df = pd.read_excel(spectra_path, header=1)
data_df = pd.read_excel(data_path, header=1)
bacteria_df = pd.read_excel(bacteria_path)

grab_samples_df = pd.read_excel(grab_samples_path)

# Inspection

3 dataset diversi:

* Spectra: misurazioni di assorbanza a diverse lunghezze d'onda, da 200 a 420 nm.

* Data: misurazioni di:
    * Cloro Libero
    * Cloro Totale (no valori)
    * Conduttività
    * DOC (valori non attendibili)
    * Nitrati
    * Nitriti (valori non attendibili)
    * pH
    * Pressione
    * Temperatura
    * TOC
    * Torbidità
    * UVA254
    
* BactoSense: misurazioni di:
    * ICC [1/mL]: concentrazione di cellule intatte
    * HNAC [1/mL]: concentrazione di cellule ad alto contenuto di acido nucleico
    * LNAC [1/mL]: concentrazione di cellule a basso contenuto di acido nucleico
    * HNAP [%]: frazione di ICC costituita da cellule ad alto contenuto di acido nucleico
    * TCC [1/mL] (no valori)
    * GATE+ [1/mL] (no valori)
    * ACC [1/mL] (no valori)
    * HACC [1/mL] (no valori)
    * LACC [1/mL] (no valori)
    * HACP [%] (no valori)


Secondo questo [sito](https://amf.ch/application/microfluidic-flow-cytometry-quality-water-analysis/) l'HNAP sarebbe definito come HNAP = HNAC/TCC, per cui il TCC si potrebbe ricavare dall'HNAP.
Dalla mail di Marco, però, risulta che HNAP = HNAC/ICC.

## Spectra

In [None]:
# Display the spectra table as loaded, before any cleaning is applied.
spectra_df

In [None]:
# Clean the spectra export:
#   1. keep only the rows whose status text contains "Ok",
#   2. rename the timestamp column to "DateTime",
#   3. drop the status column and index the table by "DateTime".
status_column = "Status (Source:0)"

# na=False treats a missing status as "not Ok". Without it a NaN status puts
# NaN into the mask, and pandas refuses to index with a non-boolean mask.
status_is_ok = spectra_df[status_column].str.contains("Ok", na=False)

# A single chain builds a new frame, so nothing is mutated in place on a
# filtered slice (which is what triggers SettingWithCopyWarning).
spectra_df = (
    spectra_df.loc[status_is_ok]
    .rename(
        columns={
            "Measurement interval=0[sec] (Export-Aggregation disabled)": "DateTime",
        }
    )
    .drop(columns=[status_column])
    .set_index("DateTime")
)

## Other Params

In [None]:
# Display the parameter table as loaded, before any cleaning is applied.
data_df

In [None]:
# Clean the parameter export:
#   1. discard every column whose name contains "Status",
#   2. rename the timestamp column to "DateTime",
#   3. index the table by "DateTime".
# The "Status" filter only needs to run once, and chaining avoids in-place
# edits on the column-filtered slice.
data_df = (
    data_df.loc[:, ~data_df.columns.str.contains("Status")]
    .rename(
        columns={
            "Measurement interval=0[sec] (Export-Aggregation disabled)": "DateTime",
        }
    )
    .set_index("DateTime")
)

## Bacteria

Dato isolato al 16/07, poi dal 22/07 ogni due ore.

• ICC [1/mL]: concentrazione di cellule intatte;

• HNAC [1/mL]: concentrazione di cellule ad alto contenuto di acido nucleico;

• LNAC [1/mL]: concentrazione di cellule a basso contenuto di acido nucleico;

• HNAP [%]: frazione di ICC costituita da cellule ad alto contenuto di acido nucleico.

Tendenzialmente, HNAC+LNAC dovrebbe risultare in ICC.

In [None]:
# Display the bacteria table as loaded, before any cleaning is applied.
bacteria_df

In [None]:
# List every column name of the bacteria table.
bacteria_df.columns.to_list()

In [None]:
# Columns removed from the bacteria table before the analysis.
bacteria_columns_to_drop = [
    "Timestamp",
    "Date [local]",
    "Date [GMT]",
    "Instrument Name",
    "Instrument SN",
    "Mode",
    "Name",
    "Protocol",
    "Warnings",
    "Alarms",
    "Cartridge Fill",
]

# Drop those columns, index the table by the sampling date (renamed to
# "DateTime") and skip the first row.
# NOTE(review): the first row is presumably the isolated 16/07 reading
# mentioned in the section notes — confirm.
bacteria_df = (
    bacteria_df.drop(columns=bacteria_columns_to_drop)
    .rename(columns={"Sampling Date [local]": "DateTime"})
    .set_index("DateTime")
    .iloc[1:]
)

# Compare UV254

In [None]:
# Overlay the UV254 series from the parameter table (red) and the 254 nm
# channel of the spectra table (blue) on one time axis.
uv254_sources = [
    (data_df["UV254 - Result (Limit:-Infinity-+Infinity)"], "UV254 - Data", "red"),
    (spectra_df["254 nm"], "UV254 - Spectra", "blue"),
]

fig = go.Figure(
    data=[
        go.Scatter(
            x=series.index,
            y=series,
            mode="lines",
            name=label,
            line={"color": color},
        )
        for series, label, color in uv254_sources
    ]
)

fig.update_layout(showlegend=True)

fig.show()

In [None]:
# RMSE between the two UV254 sources.
# The spectra table had rows removed by the status filter, so the two series
# cannot be compared positionally: pair them on their timestamp index and
# keep only the timestamps where both values are present.
uv254_column = "UV254 - Result (Limit:-Infinity-+Infinity)"

uv254_pairs = (
    data_df[[uv254_column]]
    .join(spectra_df[["254 nm"]], how="inner")
    .dropna()
)

rmse = np.sqrt(
    mean_squared_error(uv254_pairs[uv254_column], uv254_pairs["254 nm"])
)
rmse

# Plots

## Time-series

### Spectra

In [None]:
# Wavelength channels to plot, one figure each.
columns = ["254 nm"]

for col in columns:
    channel_trace = go.Scatter(
        x=spectra_df.index,
        y=spectra_df[col],
        mode="lines",
        name=col,
    )

    fig = go.Figure(data=[channel_trace])
    fig.update_layout(showlegend=True)
    fig.show()

### Other Params

In [None]:
# get the first part of the name of each column in the data_df before the '-'
columns = data_df.columns
columns = [column.split("-")[0] for column in columns]

# get unique values
columns = sorted(list(set(columns)))

In [None]:
# One figure per parameter prefix, with one line per matching column.
for col in columns:
    # `col` is used as a regular expression and may match anywhere in a name.
    df = data_df.filter(regex=col, axis=1)

    fig = go.Figure()

    for column in df.columns:
        # The full column name is already unique, so it is used directly as
        # the trace name. Rebuilding it from the pieces around "]" appended a
        # stray "]" to columns without a unit and truncated names holding
        # more than one "]".
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df[column],
                mode="lines",
                name=column,
            )
        )

    fig.update_layout(
        showlegend=True,
        legend=dict(
            orientation="h", yanchor="top", y=-0.12, font=dict(size=10)
        ),
    )

    fig.show()

### Bacteria

In [None]:
# One line chart per bacteria column.
for col in bacteria_df.columns:
    column_trace = go.Scatter(
        x=bacteria_df.index,
        y=bacteria_df[col],
        mode="lines",
        name=col,
    )

    fig = go.Figure(data=[column_trace])
    fig.update_layout(showlegend=True)
    fig.show()

#### HNAC+LNAC dovrebbe risultare uguale a ICC

In [None]:
# Show the columns currently available in the bacteria table.
bacteria_df.columns

In [None]:
# Compare the reported ICC with the sum HNAC + LNAC computed from the table.
icc_reported = bacteria_df["ICC [1/mL]"]
icc_from_parts = bacteria_df["HNAC [1/mL]"] + bacteria_df["LNAC [1/mL]"]

fig = go.Figure(
    data=[
        go.Scatter(
            x=bacteria_df.index,
            y=icc_reported,
            mode="lines",
            name="ICC [1/mL]",
        ),
        go.Scatter(
            x=bacteria_df.index,
            y=icc_from_parts,
            mode="lines",
            name="HNAC [1/mL] + LNAC [1/mL]",
        ),
    ]
)

fig.show()

#### HNAP dovrebbe essere uguale a HNAC/ICC

In [None]:
# Compare the reported HNAP with the ratio HNAC / ICC computed from the table.
# NOTE(review): the reported column is labelled in percent while the computed
# ratio is a plain fraction — confirm whether the ratio should be scaled by
# 100 before the two curves are compared.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=bacteria_df.index,
        y=bacteria_df["HNAP [%]"],
        mode="lines",
        # This trace shows HNAP, so it is labelled as HNAP (it was "ICC").
        name="HNAP [%]",
    )
)

fig.add_trace(
    go.Scatter(
        x=bacteria_df.index,
        y=bacteria_df["HNAC [1/mL]"] / bacteria_df["ICC [1/mL]"],
        mode="lines",
        name="HNAC [1/mL] / ICC [1/mL]",
    )
)

fig.show()

### Before and After 4th July

#### Before

##### Spectra

In [None]:
# Spectra channels to plot for the period before 4 July 2024.
columns = ["254 nm"]

for col in columns:
    # Rows strictly earlier than midnight at the start of 4 July 2024.
    before_df = spectra_df[spectra_df.index < "2024-07-04 00:00:00"]

    channel_trace = go.Scatter(
        x=before_df.index,
        y=before_df[col],
        mode="lines",
        name=col,
    )

    fig = go.Figure(data=[channel_trace])
    fig.update_layout(showlegend=True)
    fig.show()

##### Other Params

In [None]:
# get the first part of the name of each column in the data_df before the '-'
columns = data_df.columns
columns = [column.split("-")[0] for column in columns]

# get unique values
columns = sorted(list(set(columns)))

In [None]:
# One line chart per parameter, restricted to readings before 4 July 2024.
for col in columns:
    # Keep the columns where the prefix is followed, somewhere later in the
    # name, by "Measured" or "Result"; the first match is the one plotted.
    pattern = col + ".*(?=Measured|Result)"
    df = data_df.filter(regex=pattern, axis=1)
    column = df.columns.to_list()[0]

    # Split the column name on "-": the first piece is the parameter name and
    # the second piece may carry the unit between square brackets.
    parts = column.split("-")
    name = parts[0]
    descriptor = parts[1]

    if "[" in descriptor:
        unit = descriptor.split("[")[1].split("]")[0]
    else:
        unit = ""
    if unit:
        unit = "[" + unit + "]"

    label = name + unit

    before_df = df[df.index < "2024-07-04 00:00:00"]

    fig = go.Figure(
        data=[
            go.Scatter(
                x=before_df.index,
                y=before_df[column],
                mode="lines",
                name=label,
            )
        ]
    )

    fig.update_layout(
        showlegend=True,
        title={
            "text": f"{label} before 4th July 2024",
            "font": {"size": 20},
        },
        # Horizontal legend anchored just above the top-right of the plot.
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.01,
            "xanchor": "right",
            "x": 1,
            "font": {"size": 20},
        },
        yaxis_title=label,
        xaxis={"tickfont": {"size": 20}},
    )

    fig.show()

#### After

##### Spectra

In [None]:
# Spectra channels to plot for the period after 4 July 2024.
columns = ["254 nm"]

for col in columns:
    # Rows strictly later than the last second of 4 July 2024.
    after_df = spectra_df[spectra_df.index > "2024-07-04 23:59:59"]

    channel_trace = go.Scatter(
        x=after_df.index,
        y=after_df[col],
        mode="lines",
        name=col,
    )

    fig = go.Figure(data=[channel_trace])
    fig.update_layout(showlegend=True)
    fig.show()

##### Other Params

In [None]:
# get the first part of the name of each column in the data_df before the '-'
columns = data_df.columns
columns = [column.split("-")[0] for column in columns]

# get unique values
columns = sorted(list(set(columns)))

In [None]:
# One line chart per parameter, restricted to readings after 4 July 2024.
for col in columns:
    # Keep the columns where the prefix is followed, somewhere later in the
    # name, by "Measured" or "Result"; the first match is the one plotted.
    pattern = col + ".*(?=Measured|Result)"
    df = data_df.filter(regex=pattern, axis=1)
    column = df.columns.to_list()[0]

    # Split the column name on "-": the first piece is the parameter name and
    # the second piece may carry the unit between square brackets.
    parts = column.split("-")
    name = parts[0]
    descriptor = parts[1]

    if "[" in descriptor:
        unit = descriptor.split("[")[1].split("]")[0]
    else:
        unit = ""
    if unit:
        unit = "[" + unit + "]"

    label = name + unit

    after_df = df[df.index > "2024-07-04 23:59:59"]

    fig = go.Figure(
        data=[
            go.Scatter(
                x=after_df.index,
                y=after_df[column],
                mode="lines",
                name=label,
            )
        ]
    )

    fig.update_layout(
        showlegend=True,
        title={
            "text": f"{label} after 4th July 2024",
            "font": {"size": 20},
        },
        # Horizontal legend anchored just above the top-right of the plot.
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.01,
            "xanchor": "right",
            "x": 1,
            "font": {"size": 20},
        },
        yaxis_title=label,
        xaxis={"tickfont": {"size": 20}},
    )

    fig.show()

## Boxplots

Something seems to have happened on 4th July, so we draw two separate boxplots: one for the data before that date and one for the data after it.

### Spectra

In [None]:
# Spectra channels to summarise with a before/after boxplot.
columns = ["254 nm"]

for col in columns:
    df = spectra_df[col]

    # Readings before the start of 4 July 2024 and after its last second.
    before_df = df[df.index < "2024-07-04 00:00:00"]
    after_df = df[df.index > "2024-07-04 23:59:59"]

    fig = go.Figure(
        data=[
            go.Box(y=before_df, name="Before", marker_color="blue"),
            go.Box(y=after_df, name="After", marker_color="red"),
        ]
    )

    fig.update_layout(
        title=f"Box plot of {col} before and after 4th July 2024",
        yaxis_title=col,
    )

    fig.show()

### Other Params

In [None]:
# get the first part of the name of each column in the data_df before the '-'
columns = data_df.columns
columns = [column.split("-")[0] for column in columns]

# get unique values
columns = sorted(list(set(columns)))

In [None]:
# List the data_df columns whose name matches "Temperature".
data_df.filter(regex="Temperature", axis=1).columns.to_list()

In [None]:
# drop redundant temperature columns
data_df.drop(
    columns=[
        "Temperature - Measured value [C] (Limit:-5.00-100.00_Coefs:-0.40 1.00 0.00 0.00)",
        "Temperature - Clean value [C] (Limit:-5.00-100.00)",
        "Temperature - Measured value [°C] (Limit:0.00-45.00_Coefs:0.00 0.00 0.00 0.00)",
        "Temperature - Clean value [°C] (Limit:0.00-45.00)",
    ],
    inplace=True,
)

In [None]:
# One figure per parameter with two boxes: readings before and after
# 4 July 2024.
for col in columns:
    # Keep the columns where the prefix is followed, somewhere later in the
    # name, by "Measured" or "Result"; the first match is the one plotted.
    pattern = col + ".*(?=Measured|Result)"
    df = data_df.filter(regex=pattern, axis=1)
    column = df.columns.to_list()[0]

    # Split the column name on "-": the first piece is the parameter name and
    # the second piece may carry the unit between square brackets.
    parts = column.split("-")
    name = parts[0]
    descriptor = parts[1]

    if "[" in descriptor:
        unit = descriptor.split("[")[1].split("]")[0]
    else:
        unit = ""
    if unit:
        unit = "[" + unit + "]"

    label = name + unit

    before_df = df[df.index < "2024-07-04 00:00:00"]
    after_df = df[df.index > "2024-07-04 23:59:59"]

    fig = go.Figure(
        data=[
            go.Box(y=before_df[column], name="Before"),
            go.Box(y=after_df[column], name="After"),
        ]
    )

    fig.update_layout(
        showlegend=True,
        title={
            "text": f"{label} before and after 4th July 2024",
            "font": {"size": 20},
        },
        # Horizontal legend anchored just above the top-right of the plot.
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.01,
            "xanchor": "right",
            "x": 1,
            "font": {"size": 20},
        },
        yaxis_title=label,
        xaxis={"tickfont": {"size": 20}},
    )

    fig.show()

# Trend Analysis

## Other Params

In [None]:
# Show the data_df columns available at this point.
data_df.columns

In [None]:
# remove Total Chlorine column
data_df.drop(
    columns=[
        "Total Chlorine - Measured value [mg/l] (Limit:0.00-2.00_Coefs:0.00 1.00 0.00 0.00)"
    ],
    inplace=True,
)

In [None]:
# get the first part of the name of each column in the data_df before the '-'
columns = data_df.columns
columns = [column.split("-")[0] for column in columns]

# get unique values
columns = sorted(list(set(columns)))

In [None]:
# For each parameter: average the readings over 2-hour bins, fit a Prophet
# model on the data after 4 July 2024, and show its component plot with the
# fitted data added as an extra trace.
for col in columns:
    # Keep the columns where the prefix is followed, somewhere later in the
    # name, by "Measured" or "Result"; the first match is the one modelled.
    regex = col + ".*(?=Measured|Result)"

    df = data_df.filter(regex=regex, axis=1).resample("2h").mean()

    column = df.columns.to_list()[0]

    full_name = column.split("-")

    name = full_name[0]

    # take the unit of measurement between the square brackets
    unit = (
        full_name[1].split("[")[1].split("]")[0] if "[" in full_name[1] else ""
    )
    unit = "[" + unit + "]" if unit else ""

    # Prophet expects the timestamp column to be called "ds" and the target
    # column "y".
    after_df = (
        df[df.index > "2024-07-04 23:59:59"]
        .reset_index()
        .rename(columns={"DateTime": "ds", column: "y"})
    )

    model = Prophet(weekly_seasonality=False, changepoint_prior_scale=0.1)
    model.fit(after_df)

    # periods=0 adds no future timestamps to the prediction frame.
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # The figure comes from Prophet, so no empty figure is created beforehand.
    fig = plot_components_plotly(model, forecast)

    fig.add_trace(
        go.Scatter(
            x=after_df["ds"],
            y=after_df["y"],
            mode="lines",
            name="Data",
        )
    )

    fig.update_layout(
        legend=dict(traceorder="normal"),
        title=dict(
            text=f"{name + unit} after 4th July 2024",
            font=dict(size=20),
        ),
    )

    fig.show()

In [None]:
# For each parameter: average the readings over 2-hour bins, fit a default
# Prophet model on the data after 4 July 2024, and plot the fit together with
# its trend and changepoints.
for col in columns:
    # Keep the columns where the prefix is followed, somewhere later in the
    # name, by "Measured" or "Result"; the first match is the one modelled.
    regex = col + ".*(?=Measured|Result)"

    df = data_df.filter(regex=regex, axis=1).resample("2h").mean()

    column = df.columns.to_list()[0]

    full_name = column.split("-")

    name = full_name[0]

    # take the unit of measurement between the square brackets
    unit = (
        full_name[1].split("[")[1].split("]")[0] if "[" in full_name[1] else ""
    )
    unit = "[" + unit + "]" if unit else ""

    # Prophet expects the timestamp column to be called "ds" and the target
    # column "y".
    after_df = (
        df[df.index > "2024-07-04 23:59:59"]
        .reset_index()
        .rename(columns={"DateTime": "ds", column: "y"})
    )

    model = Prophet()
    model.fit(after_df)

    # periods=0 adds no future timestamps to the prediction frame.
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    fig = plot_plotly(
        model,
        forecast,
        trend=True,
        changepoints=True,
        xlabel="Date",
        ylabel=name + unit,
        changepoints_threshold=0.8,
    )

    # Hide unnamed traces from the legend.
    for trace in fig.data:
        if trace.name is None:
            trace.update(showlegend=False)

    fig.update_layout(
        showlegend=True,
        legend=dict(traceorder="normal"),
        title=dict(
            text=f"{name + unit} after 4th July 2024",
            font=dict(size=20),
        ),
    )

    fig.show()

## Bacteria

In [None]:
# Show the bacteria columns available at this point.
bacteria_df.columns

In [None]:
# remove 0 columns
bacteria_df.drop(
    columns=[
        "TCC [1/mL]",
        "GATE+ [1/mL]",
        "ACC [1/mL]",
        "HACC [1/mL]",
        "LACC [1/mL]",
        "HACP [%]",
    ],
    inplace=True,
)

In [None]:
# Minimum mean |delta| for a changepoint to count as significant. The same
# value drives both the plot and the slope computation so they cannot drift.
CHANGEPOINT_THRESHOLD = 0.5


def _trend_slope(forecast_part):
    """Return the OLS slope of the ``trend`` column against the row index.

    Parameters
    ----------
    forecast_part : pandas.DataFrame
        A (slice of a) Prophet forecast frame; its index values are the
        regressor and its ``trend`` column the response.

    Returns
    -------
    float
        Slope per index step, or NaN when fewer than two rows are available
        (a line cannot be fitted through a single point).
    """
    if len(forecast_part) < 2:
        return float("nan")

    design = sm.add_constant(forecast_part.index.values)
    ols_fit = sm.OLS(forecast_part["trend"].values, design).fit()
    return ols_fit.params[1]


# For each bacteria column: fit a default Prophet model, plot the fit with
# trend and changepoints, and annotate the trend slope(s) and the mean
# sampling interval.
for column in bacteria_df.columns:
    # Prophet expects the timestamp column to be called "ds" and the target
    # column "y".
    df = (
        bacteria_df[column]
        .reset_index()
        .rename(columns={"DateTime": "ds", column: "y"})
    )

    model = Prophet()
    model.fit(df)

    forecast = model.predict()

    fig = plot_plotly(
        model,
        forecast,
        trend=True,
        changepoints=True,
        xlabel="Date",
        ylabel=column,
        changepoints_threshold=CHANGEPOINT_THRESHOLD,
    )

    # get the change points chosen to be shown
    signif_changepoints = model.changepoints[
        np.abs(np.nanmean(model.params["delta"], axis=0))
        >= CHANGEPOINT_THRESHOLD
    ]

    # mean time between consecutive samples
    sensor_freq = df["ds"].diff().dropna().mean()

    if len(signif_changepoints) > 0:
        # Only the first significant changepoint splits the trend.
        first_change = signif_changepoints.values[0]

        slope_before = _trend_slope(forecast[forecast["ds"] < first_change])
        slope_after = _trend_slope(forecast[forecast["ds"] > first_change])

        annotation_text = (
            f"Slope before change: {slope_before:.2f}<br>"
            f"Slope after change: {slope_after:.2f}"
        )
    else:
        slope = _trend_slope(forecast)

        annotation_text = f"Slope: {slope:.3f}"

    # add a single annotation with all the information
    fig.add_annotation(
        x=0.05,
        y=0.95,
        xref="paper",
        yref="paper",
        text=annotation_text
        + f"<br>Mean Sensor frequency: {sensor_freq.components.hours}h {sensor_freq.components.minutes}min",
        showarrow=False,
        font=dict(size=20),
        align="left",
        bordercolor="black",
        borderwidth=2,
        borderpad=4,
        bgcolor="lightgrey",
    )

    # Hide unnamed traces from the legend.
    for trace in fig.data:
        if trace.name is None:
            trace.update(showlegend=False)

    fig.update_layout(
        showlegend=True,
        legend=dict(traceorder="normal"),
        title=dict(
            text=f"{column}",
            font=dict(size=20),
        ),
    )

    fig.show()

# Comparison between Historical Grab and New Sensors

In [None]:
# Show the data_df columns available for the sensor/grab comparison.
data_df.columns

In [None]:
# just the data after the 4th of July 2024
# Keep only the sensor readings later than the last second of 4 July 2024.
is_after_july4 = data_df.index > "2024-07-04 23:59:59"
data_df = data_df.loc[is_after_july4]

In [None]:
# Keep only the grab samples whose sampling point is "Feltre".
is_feltre = grab_samples_df["Punto di prelievo"] == "Feltre"
grab_samples_df = grab_samples_df.loc[is_feltre]

## Summary Historical Feltre

In [None]:
# Display the grab samples retained for Feltre.
grab_samples_df

In [None]:
# Columns to summarise: everything from the fifth column onward, except the
# companion "<name>_label" columns.
info_columns = [
    column_name
    for column_name in grab_samples_df.columns[4:]
    if "_label" not in column_name
]

In [None]:
# Show the columns selected for the summary table.
info_columns

In [None]:
# Empty summary table: one row per statistic, one column per parameter.
summary_rows = [
    "N° Entries",
    "N° Valid Samples",
    "N° Missing",
    "N° < LOQ",
    "Mean",
    "Std",
    "Start Date",
    "End Date",
]

info_df = pd.DataFrame(
    index=pd.Index(summary_rows, name="Info"),
    columns=info_columns,
)


In [None]:
# store the information in the station_info_df
for column in info_columns:
    
    df = grab_samples_df[['Data di prelievo', column, column + "_label" ]].copy()
    
    if df.dropna().shape[0] == 0:
        continue
    
    df['Data di prelievo'] = pd.to_datetime(df['Data di prelievo'])

  
    start_date = df.dropna()['Data di prelievo'].min().strftime("%Y-%m-%d")
    end_date = df.dropna()['Data di prelievo'].max().strftime("%Y-%m-%d")
    

    df = df[(df['Data di prelievo'] >= start_date) & (df['Data di prelievo'] <= end_date)]

    missing_values = df[df[column + '_label'].isna()].shape[0] / df.shape[0] * 100
    
    info_df.loc["N° Entries", column] = df.shape[0]

    info_df.loc["N° Valid Samples", column] = (
        df[column + "_label"].notna().sum()
    )
    info_df.loc[
        "N° Missing", column
    ] = round(missing_values, 2)
    
    info_df.loc["N° < LOQ", column] = df[df[column + "_label"] == "Less than"].shape[0]
    
    info_df.loc["Mean",  column] = df[column].mean().round(2)
    info_df.loc["Std", column] = df[column].std().round(2)
    
    info_df.loc["Start Date", column] = start_date
    info_df.loc["End Date", column] = end_date

In [None]:
# Display the filled summary table.
info_df

In [None]:
# Write the summary table to the Feltre metadata folder, creating the folder
# first so the export does not fail when it is missing.
os.makedirs(metadata_feltre_folder, exist_ok=True)
info_df.to_excel(os.path.join(metadata_feltre_folder, "Summary_Historical.xlsx"))

## Compare only the time period that sensors and grab samples have in common

In [None]:
# just the data from the month of July
grab_samples_df = grab_samples_df[(grab_samples_df['Data di prelievo'].dt.month == 7) & (grab_samples_df['Data di prelievo'].dt.year > 2015)]

In [None]:
# Display the grab samples left after the July filter.
grab_samples_df

In [None]:
# Sensor column name -> short label used in figure titles and axis titles.
column_renames = {
    "TOCeq - Measured value [mg/l] (Limit:0.00-8.00_Coefs:0.00 1.00 0.00 0.00)": (
        "TOC (mg/L)"
    ),
    "Turbidity - Measured value [FTU] (Limit:0.00-20.00_Coefs:0.00 1.00 0.00 0.00)": (
        "Turbidity (FTU)"
    ),
    "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)": (
        "Conductivity (uS/cm)"
    ),
    "Temperature - Measured value [°C] (Limit:-20.00-130.00)": (
        "Temperature (°C)"
    ),
    "pH - Measured value (Limit:0.00-14.00_Coefs:-2.40 0.97 0.00 0.00)": (
        "pH"
    ),
    "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00_Coefs:0.00 7.60 0.00 0.00)": (
        "Free Chlorine (mg/L)"
    ),
    "nitrati - Result (Limit:-Infinity-+Infinity)": (
        "Nitrates (mg/L)"
    ),
}

In [None]:
# Sensor column name -> matching column of the grab-sample table.
columns_mapping = {
    "TOCeq - Measured value [mg/l] (Limit:0.00-8.00_Coefs:0.00 1.00 0.00 0.00)": (
        "TOC - carbonio organico totale (mg/L di C)"
    ),
    "Turbidity - Measured value [FTU] (Limit:0.00-20.00_Coefs:0.00 1.00 0.00 0.00)": (
        "Torbidità (NTU)"
    ),
    "Conductivity - Measured value [uS/cm] (Limit:0.10-600000.00)": (
        "Conduttività a 20°C (µS/cm)"
    ),
    "Temperature - Measured value [°C] (Limit:-20.00-130.00)": (
        "Temperatura (°C)"
    ),
    "pH - Measured value (Limit:0.00-14.00_Coefs:-2.40 0.97 0.00 0.00)": (
        "Concentrazione ioni idrogeno (unità pH)"
    ),
    "Free Chlorine - Measured value [mg/l] (Limit:0.00-2.00_Coefs:0.00 7.60 0.00 0.00)": (
        "Cloro residuo libero (mg/L di Cl2)"
    ),
    "nitrati - Result (Limit:-Infinity-+Infinity)": (
        "Nitrati (mg/L)"
    ),
}

In [None]:
# boxplot to compare the grab samples with the data
for sensor_column, grab_column in columns_mapping.items():
    fig = go.Figure()
    
    s_df = data_df[sensor_column].copy()
    s_df = s_df.resample('D').mean()
    
    print(grab_samples_df[grab_column].count())

    # add box plot before and after the change
    fig.add_trace(go.Box(y=s_df, name="Sensor", marker_color="blue"))

    fig.add_trace(go.Box(y=grab_samples_df[grab_column], name="Grab", marker_color="red"))

    fig.update_layout(
        title=f"Feltre - {column_renames[sensor_column]}",
        yaxis_title=column_renames[sensor_column],
    )

    fig.show(
        renderer='svg',
        width=1000,
    )