# Supply Points Analysis between Grab and Sensors

In [None]:
import os
import json
import pandas as pd
import plotly.graph_objects as go

from statsmodels.tsa.seasonal import STL

In [None]:
# Every location is a relative path that climbs two directory levels from
# the current working directory.
data_folder = os.path.join('..', '..', 'data')
utils_folder = os.path.join('..', '..', 'utils')

# Sub-folders of the data directory: cleaned inputs, summary tables, figures.
clean_data_folder, metadata_folder, plot_folder = (
    os.path.join(data_folder, name)
    for name in ('Clean Data', 'Metadata', 'Plots')
)

# Sensor workbooks live inside the cleaned-data folder.
sensor_folder = os.path.join(clean_data_folder, 'sensors')

# Load Data

In [None]:
# Grab-sample measurements, read from the cleaned-data workbook.
grab_path = os.path.join(clean_data_folder, 'grab.xlsx')
grab_df = pd.read_excel(grab_path)

In [None]:
# One DataFrame per .xlsx workbook in the sensor folder, keyed by the file
# name up to its first dot.
sensor_dict = {
    file.split('.')[0]: pd.read_excel(os.path.join(sensor_folder, file))
    for file in os.listdir(sensor_folder)
    if file.endswith('.xlsx')
}

In [None]:
# Read the column groupings from columns_types.json.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's locale default (JSON text is UTF-8).
with open(os.path.join(utils_folder, "columns_types.json"), encoding="utf-8") as f:
    column_types = json.load(f)

# Split the three groups into their own names for convenience.
metadata_columns = column_types["metadata_columns"]
features_columns = column_types["features_columns"]
targets_columns = column_types["targets_columns"]

In [None]:
# Display the grab-sample table as loaded from the workbook.
grab_df

In [None]:
# NOTE: this import is no longer needed by this cell; it is retained only so
# that any other cell relying on the name keeps working.
from operator import contains


# Names of all columns whose header includes the substring 'label'.
# Headers read from Excel are not guaranteed to be strings (they can be
# numbers or dates), so non-text headers are skipped instead of raising.
label_columns = [
    col for col in grab_df.columns
    if isinstance(col, str) and 'label' in col
]

In [None]:
# Display the column names that were collected as label columns.
label_columns

In [None]:
# rename grab columns
# Translation table for the feature columns: original (Italian) column name
# -> English name used by the rest of the notebook.
feature_mapping = {
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free Chlorine (mg/L)",
    "Colore (Cu)": "Color (CU)",
    "Concentrazione ioni idrogeno (unità pH)": "pH",
    "Conduttività a 20°C (µS/cm)": "Conductivity (uS/cm)",
    "TOC - carbonio organico totale (mg/L di C)": "TOC (mg/L)",
    "Temperatura (al prelievo) (°C)": "Temperature (°C)",
    "Torbidità (NTu)": "Turbidity (NTU)",
    "Nitrati (mg/L)": "Nitrate (mg/L)",
    
}

# Translation table for the target columns, same direction as above:
# original (Italian) column name -> English name.
targets_mapping = {
    "Batteri coliformi a 37°C (MPN/100 mL)": "Coliforms (MPN/100mL)",
    "Bromodiclorometano (µg/L)": "Bromodichloromethane (µg/L)",
    "Bromoformio (µg/L)": "Bromoform (µg/L)",
    "Cloroformio (µg/L)": "Chloroform (µg/L)",
    "Conta delle colonie a 22°C (UFC/mL)": "Colony count at 22°C (UFC/mL)",
    "Conteggio colonie a 30°C (UFC/mL)": "Colony count at 30°C (UFC/mL)",
    "Conta delle colonie a 37°C (UFC/mL)": "Colony count at 37°C (UFC/mL)",
    "Dibromoclorometano (µg/L)": "Dibromochloromethane (µg/L)",
    "Enterococchi (MPN/100 mL)": "Enterococci (MPN/100mL)",
    "Escherichia coli (MPN/100 mL)": "Escherichia coli (MPN/100mL)",
    "Pseudomonas aeruginosa (UFC/250 mL)": "Pseudomonas aeruginosa (UFC/250mL)",
    "Acido Perfluoroottanoico PFOA (µg/L)": "Perfluorooctanoic acid PFOA (µg/L)",
    "Acido Perfluoroottansolfonico PFOS (µg/L)": "Perfluorooctanesulfonic acid PFOS (µg/L)",
    "Somma di PFAS (µg/L)": "Sum of PFAS (µg/L)",
}

In [None]:
# rename grab_df columns that contain features or targets
#
# A column is translated when its whole name, or the part before its first
# underscore, is a key of feature_mapping or targets_mapping. In the second
# case everything from the first underscore onwards is kept as the suffix, so
# a name with several underscores is not truncated.
#
# Exact feature names are translated as well as exact target names; this is
# a no-op when the table contains no such column.
column_translations = {**feature_mapping, **targets_mapping}

renamed_columns = {}
for column in grab_df.columns:
    if not isinstance(column, str):
        # Non-text headers cannot match any mapping key.
        continue

    if column in column_translations:
        renamed_columns[column] = column_translations[column]
        continue

    base_name, separator, suffix = column.partition('_')
    if separator and base_name in column_translations:
        renamed_columns[column] = column_translations[base_name] + separator + suffix

# Apply every translation in a single pass.
grab_df.rename(columns=renamed_columns, inplace=True)

In [None]:
# rename the label columns
# label_columns was collected from the column names as they were before any
# renaming, so only columns that still carry one of those names are handled.
for column in grab_df.columns:
    if column not in label_columns:
        continue

    parts = column.split('_')
    variable_name = parts[0]

    # Try the feature translation first, then the target translation.
    for mapping in (feature_mapping, targets_mapping):
        if variable_name in mapping:
            new_name = mapping[variable_name] + '_' + parts[1]
            grab_df.rename(columns={column: new_name}, inplace=True)

In [None]:
# Display the grab-sample table again, after the renaming cells.
grab_df

# Metadata Info

## Grab

In [None]:
# Empty summary table for the features: one row per distinct 'Code' value and,
# for every translated feature name, one column per statistic listed below.
feature_summary_columns = pd.MultiIndex.from_product(
    [
        feature_mapping.values(),
        ['N° Entries', 'N° Valid Samples', 'N° Missing', 'N° < LOQ', 'Mean', 'Std', 'Start Date', 'End Date'],
    ]
)

feature_df = pd.DataFrame(
    index=grab_df['Code'].unique(),
    columns=feature_summary_columns,
)

In [None]:
# Fill feature_df: for every code and every feature, summarise the grab
# samples between the first and the last day on which a complete row
# (DateTime, value and label all present) exists.
for code in grab_df['Code'].unique():
    code_rows = grab_df[grab_df['Code'] == code]

    for feature in feature_mapping.values():
        label = feature + "_label"
        df = code_rows[['DateTime', feature, label]].copy()

        # Rows with a timestamp, a value and a label.
        complete = df.notna().all(axis=1)
        if not complete.any():
            continue

        df['DateTime'] = pd.to_datetime(df['DateTime'])

        first_sample = df.loc[complete, 'DateTime'].min()
        last_sample = df.loc[complete, 'DateTime'].max()
        start_date = first_sample.strftime("%Y-%m-%d")
        end_date = last_sample.strftime("%Y-%m-%d")

        # Keep whole calendar days. Comparing against the formatted end date
        # would cut the window at midnight and drop samples taken later on
        # the last day (including the last valid sample itself), which could
        # even leave the window empty.
        window_start = first_sample.normalize()
        window_end = last_sample.normalize() + pd.Timedelta(days=1)
        df = df[(df['DateTime'] >= window_start) & (df['DateTime'] < window_end)]

        n_entries = df.shape[0]
        n_unlabelled = int(df[label].isna().sum())

        feature_df.loc[code, (feature, 'N° Entries')] = n_entries
        feature_df.loc[code, (feature, 'N° Valid Samples')] = df[label].notna().sum()
        # Despite the column name, this value is the percentage of rows in
        # the window whose label is missing.
        feature_df.loc[code, (feature, "N° Missing")] = round(n_unlabelled / n_entries * 100, 2)
        feature_df.loc[code, (feature, 'N° < LOQ')] = df[df[label] == "Less than"].shape[0]

        feature_df.loc[code, (feature, "Mean")] = df[feature].mean()
        feature_df.loc[code, (feature, "Std")] = df[feature].std()

        feature_df.loc[code, (feature, "Start Date")] = start_date
        feature_df.loc[code, (feature, "End Date")] = end_date

In [None]:
# Display the per-code summary table for the features.
feature_df

In [None]:
# Empty summary table for the targets: one row per distinct 'Code' value and,
# for every translated target name, one column per statistic listed below.
target_summary_columns = pd.MultiIndex.from_product(
    [
        targets_mapping.values(),
        ['N° Entries', 'N° Valid Samples', 'N° Missing', 'N° < LOQ', 'Mean', 'Std', 'Start Date', 'End Date'],
    ]
)

targets_df = pd.DataFrame(
    index=grab_df['Code'].unique(),
    columns=target_summary_columns,
)

In [None]:
# Fill targets_df: for every code and every target, summarise the grab
# samples between the first and the last day on which a complete row
# (DateTime, value and label all present) exists.
for code in grab_df['Code'].unique():
    code_rows = grab_df[grab_df['Code'] == code]

    for target in targets_mapping.values():
        label = target + "_label"
        df = code_rows[['DateTime', target, label]].copy()

        # Rows with a timestamp, a value and a label.
        complete = df.notna().all(axis=1)
        if not complete.any():
            continue

        df['DateTime'] = pd.to_datetime(df['DateTime'])

        first_sample = df.loc[complete, 'DateTime'].min()
        last_sample = df.loc[complete, 'DateTime'].max()
        start_date = first_sample.strftime("%Y-%m-%d")
        end_date = last_sample.strftime("%Y-%m-%d")

        # Keep whole calendar days. Comparing against the formatted end date
        # would cut the window at midnight and drop samples taken later on
        # the last day (including the last valid sample itself), which could
        # even leave the window empty.
        window_start = first_sample.normalize()
        window_end = last_sample.normalize() + pd.Timedelta(days=1)
        df = df[(df['DateTime'] >= window_start) & (df['DateTime'] < window_end)]

        n_entries = df.shape[0]
        n_unlabelled = int(df[label].isna().sum())

        targets_df.loc[code, (target, 'N° Entries')] = n_entries
        targets_df.loc[code, (target, 'N° Valid Samples')] = df[label].notna().sum()
        # Despite the column name, this value is the percentage of rows in
        # the window whose label is missing.
        targets_df.loc[code, (target, "N° Missing")] = round(n_unlabelled / n_entries * 100, 2)
        targets_df.loc[code, (target, 'N° < LOQ')] = df[df[label] == "Less than"].shape[0]

        targets_df.loc[code, (target, "Mean")] = df[target].mean()
        targets_df.loc[code, (target, "Std")] = df[target].std()

        targets_df.loc[code, (target, "Start Date")] = start_date
        targets_df.loc[code, (target, "End Date")] = end_date

In [None]:
# Display the per-code summary table for the targets.
targets_df

In [None]:
# %%script false --no-raise-error
# Write both grab summary tables to the 'Grab' metadata folder. The folder is
# created first so the export does not fail on a fresh checkout.
grab_metadata_folder = os.path.join(metadata_folder, 'Grab')
os.makedirs(grab_metadata_folder, exist_ok=True)

feature_df.to_excel(os.path.join(grab_metadata_folder, 'features.xlsx'))
targets_df.to_excel(os.path.join(grab_metadata_folder, 'targets.xlsx'))

## Sensor

In [None]:
### Fix Conductivity name
# The micro prefix can be encoded either as U+00B5 (micro sign) or as U+03BC
# (Greek small mu). The two look alike, so both spellings are normalised to
# the ASCII 'uS' form that feature_mapping uses for the grab data.
conductivity_aliases = {
    'Conductivity (\u00b5S/cm)': 'Conductivity (uS/cm)',
    'Conductivity (\u03bcS/cm)': 'Conductivity (uS/cm)',
}

for frame in sensor_dict.values():
    frame.rename(columns=conductivity_aliases, inplace=True)

In [None]:
# Measurement columns are taken from the 'Berna' sensor table: every column
# except the timestamp.
berna_columns = sensor_dict['Berna'].columns
sensor_columns = berna_columns.difference(['DateTime'])

In [None]:
# Display the sensor measurement column names.
sensor_columns

In [None]:
# Empty summary table for the sensors: one row per sensor name and, for every
# measurement column, one column per statistic listed below.
sensor_summary_columns = pd.MultiIndex.from_product(
    [sensor_columns, ['N° Data', 'N° Missing', 'Mean', 'Std']]
)

sensors_df = pd.DataFrame(
    index=list(sensor_dict.keys()),
    columns=sensor_summary_columns,
)

In [None]:
# Fill sensors_df with the count, number of missing values, mean and standard
# deviation of every measurement column of every sensor.
for sensor, frame in sensor_dict.items():
    for column in sensor_columns:
        values = frame[column]

        if sensor == 'Berna' and column == 'Turbidity (FTU)':
            # Discard readings above 2 for this sensor, but keep the NaN rows:
            # filtering on `<= 2` alone also removes them and would make
            # 'N° Missing' always 0.
            values = values[values.isna() | (values <= 2)]

        sensors_df.loc[sensor, (column, 'N° Data')] = values.count()
        sensors_df.loc[sensor, (column, 'N° Missing')] = values.isna().sum()
        sensors_df.loc[sensor, (column, 'Mean')] = values.mean()
        sensors_df.loc[sensor, (column, 'Std')] = values.std()

In [None]:
# Display the per-sensor summary table.
sensors_df

In [None]:
# Write the sensor summary table to the 'Sensor' metadata folder. The folder
# is created first so the export does not fail when it is missing.
sensor_metadata_folder = os.path.join(metadata_folder, 'Sensor')
os.makedirs(sensor_metadata_folder, exist_ok=True)

sensors_df.to_excel(os.path.join(sensor_metadata_folder, 'sensors.xlsx'))

# Time Series Comparison

In [None]:
# plot the time series of the sensors and the grab data
#
# For each code, one figure per feature with three traces: the grab samples
# (markers), the raw sensor series (line) and a moving average of the sensor
# series (line).

for code in grab_df['Code'].unique():
    if code not in sensor_dict:
        # No sensor workbook was loaded under this code: nothing to compare.
        print(f'No sensor data for {code!r}; skipping time series plots')
        continue

    # Everything below depends only on the code, so it is prepared once
    # instead of once per feature.
    g_df = grab_df[grab_df['Code'] == code]
    s_df = sensor_dict[code]

    # moving average on sensor data
    # The window is 4 * 24 = 96 consecutive rows.
    # NOTE(review): presumably one day of 15-minute readings, as suggested by
    # the "15min" output folder - confirm against the sensor sampling rate.
    ma_s_df = s_df.set_index('DateTime').rolling(window=4 * 24).mean()

    timeseries_folder = os.path.join(plot_folder, "Comparison", "15min", "Timeseries", code)
    os.makedirs(timeseries_folder, exist_ok=True)

    for feature in feature_mapping.values():
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=g_df['DateTime'],
                y=g_df[feature],
                mode='markers',
                name='Grab'
            )
        )

        fig.add_trace(
            go.Scatter(
                x=s_df['DateTime'],
                y=s_df[feature],
                mode='lines',
                name='Sensor'
            )
        )

        fig.add_trace(
            go.Scatter(
                x=ma_s_df.index,
                y=ma_s_df[feature],
                mode='lines',
                name='Sensor MA'
            )
        )

        fig.update_layout(
            title=f'{code} - {feature}',
            xaxis_title='DateTime',
            yaxis_title=feature
        )

        # '/' cannot appear in a file name, so it is replaced for the export.
        feature_ = feature.replace('/', '_')

        # Image export is currently disabled; uncomment to save the figure.
        # fig.write_image(
        #     os.path.join(timeseries_folder, f'{feature_}.png')
        # )

        fig.show()

# Boxplot Comparison 

In [None]:
# plot the boxplots of the sensor data all together, without the code
#
# All sensor tables are stacked into one table and one box plot per
# measurement column is saved as a PNG in the 'Sensor' metadata folder.

sensor_df = pd.concat(sensor_dict.values())

# Create the output folder up front, as the comparison plots do for theirs.
sensor_metadata_folder = os.path.join(metadata_folder, 'Sensor')
os.makedirs(sensor_metadata_folder, exist_ok=True)

for column in sensor_columns:

    fig = go.Figure()

    fig.add_trace(
        go.Box(
            y=sensor_df[column],
            name=column
        )
    )

    fig.update_layout(
        title=f'{column}',
        yaxis_title=column
    )

    # '/' cannot appear in a file name, so it is replaced for the export.
    column_ = column.replace('/', '_')

    fig.write_image(
        os.path.join(sensor_metadata_folder, f'{column_}.png')
    )



In [None]:
# plot the box plot of grab data and sensor data
#
# For each code, one figure per feature with two boxes: the grab samples and
# the daily means of the sensor readings.

# Upper bounds applied to the daily sensor means before plotting. The keys
# must be spelled exactly like the values of feature_mapping, because those
# are the names the loop iterates over; the earlier spellings
# ('... (mg/l)', 'Turbidity (FTU)') never matched, so no filter was applied.
sensor_upper_limits = {
    'Free Chlorine (mg/L)': 5,
    'TOC (mg/L)': 2,
    'Turbidity (NTU)': 1.5,
}

for code in grab_df['Code'].unique():
    if code not in sensor_dict:
        # No sensor workbook was loaded under this code: nothing to compare.
        print(f'No sensor data for {code!r}; skipping box plots')
        continue

    g_df = grab_df[grab_df['Code'] == code]

    # resample the sensor data
    # Done once per code rather than once per feature.
    s_df = sensor_dict[code].copy()
    s_df['DateTime'] = pd.to_datetime(s_df['DateTime'])
    s_df = s_df.set_index('DateTime').resample('D').mean().reset_index()

    boxplot_folder = os.path.join(plot_folder, "Comparison", "15min", "Boxplot", code)
    os.makedirs(boxplot_folder, exist_ok=True)

    for feature in feature_mapping.values():
        sensor_values = s_df[feature]

        upper_limit = sensor_upper_limits.get(feature)
        if upper_limit is not None:
            # Strictly below the limit; NaN daily means are dropped as well.
            sensor_values = sensor_values[sensor_values < upper_limit]

        fig = go.Figure()

        fig.add_trace(
            go.Box(
                y=g_df[feature],
                name='Grab'
            )
        )

        fig.add_trace(
            go.Box(
                y=sensor_values,
                name='Sensor'
            )
        )

        fig.update_layout(
            title=f'{code} - {feature}',
            yaxis_title=feature
        )

        # '/' cannot appear in a file name, so it is replaced for the export.
        feature_ = feature.replace('/', '_')

        fig.write_image(
            os.path.join(boxplot_folder, f'{feature_}.png')
        )
        