# Supply Points Analysis between Grab and Sensors

In [8]:
import os
import json
import pandas as pd
import plotly.graph_objects as go

In [9]:
# Project-relative locations used throughout the notebook.
project_root = os.path.join('..', '..')

utils_folder = os.path.join(project_root, 'utils')
data_folder = os.path.join(project_root, 'data')

# Cleaned measurements, and the folder the summary exports are written to.
clean_data_folder = os.path.join(data_folder, 'Clean Data')
sensor_folder = os.path.join(clean_data_folder, 'sensors')
metadata_folder = os.path.join(data_folder, 'Metadata')

# Load Data

In [10]:
# Load the cleaned grab-sample workbook.
grab_path = os.path.join(clean_data_folder, 'grab.xlsx')
grab_df = pd.read_excel(grab_path)

In [11]:
# Load every sensor workbook into a dict keyed by the file name without its extension.
sensor_dict = {}

# os.listdir returns entries in arbitrary order; sort so the dict (and anything
# built from its key order) is reproducible between runs.
for file in sorted(os.listdir(sensor_folder)):
    # Skip non-workbooks and the "~$..." lock files Excel creates while a
    # workbook is open (they end in .xlsx but cannot be parsed).
    if not file.endswith('.xlsx') or file.startswith('~$'):
        continue

    # splitext removes only the final extension, so names containing dots stay intact.
    sensor_name = os.path.splitext(file)[0]
    sensor_dict[sensor_name] = pd.read_excel(os.path.join(sensor_folder, file))

In [12]:
# Read the column groupings stored in columns_types.json.
# JSON text is UTF-8; state it explicitly so the platform default encoding
# cannot mis-decode non-ASCII characters.
with open(os.path.join(utils_folder, "columns_types.json"), encoding="utf-8") as f:
    column_types = json.load(f)

# Expose each group under its own name.
metadata_columns = column_types["metadata_columns"]
features_columns = column_types["features_columns"]
targets_columns = column_types["targets_columns"]

In [None]:
# Display the grab table as loaded, before any column is renamed.
grab_df

In [14]:
# Give the sampling-point and sampling-date columns their English names, in place.
identifier_renames = {
    'Punto di prelievo': 'Code',
    'Data di prelievo': 'DateTime',
}
grab_df.rename(columns=identifier_renames, inplace=True)

In [15]:
# Translation tables for the grab column headers: original header -> English header.
# The values of each table are also used later as the list of columns to
# summarise and plot, so they must match the renamed grab columns exactly.

# Physico-chemical parameters (the "features").
feature_mapping = {
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free Chlorine (mg/l)",
    "Colore (Cu)": "Color (CU)",
    "Concentr. ioni idrogeno (al prelievo) (unità pH)": "pH",
    "Conduttività a 20°C (µS/cm)": "Conductivity (uS/cm)",
    "TOC - carbonio organico totale (mg/L di C)": "TOC (mg/l)",
    "Temperatura (al prelievo) (°C)": "Temperature (°C)",
    "Torbidità (NTu)": "Turbidity (FTU)",
    "Nitrati (mg/L)": "Nitrate (mg/l)",
    
}

# Microbiological and chemical-contaminant parameters (the "targets").
targets_mapping = {
    "Batteri coliformi a 37°C (MPN/100 mL)": "Coliforms (MPN/100ml)",
    "Bromodiclorometano (µg/L)": "Bromodichloromethane (µg/l)",
    "Bromoformio (µg/L)": "Bromoform (µg/l)",
    "Cloroformio (µg/L)": "Chloroform (µg/l)",
    "Conta delle colonie a 22°C (UFC/mL)": "Colony count at 22°C (UFC/ml)",
    "Dibromoclorometano (µg/L)": "Dibromochloromethane (µg/l)",
    "Enterococchi (MPN/100 mL)": "Enterococci (MPN/100ml)",
    "Escherichia coli (MPN/100 mL)": "Escherichia coli (MPN/100ml)",
    "Pseudomonas aeruginosa (UFC/250 mL)": "Pseudomonas aeruginosa (UFC/250ml)",
    "Acido Perfluoroottanoico PFOA (µg/L)": "Perfluorooctanoic acid PFOA (µg/l)",
    "Acido Perfluoroottansolfonico PFOS (µg/L)": "Perfluorooctanesulfonic acid PFOS (µg/l)",
    "Somma di PFAS (µg/L)": "Sum of PFAS (µg/l)",
}

In [16]:
# Translate the grab column headers to English with a single rename.
# A header is translated when it equals a mapping key, or when the text before
# its first underscore does; in that case the remainder is carried over as a suffix.
column_renames = {}

for column in grab_df.columns:
    # Non-string labels cannot match a mapping key and have no .partition();
    # leave them untouched instead of crashing.
    if not isinstance(column, str):
        continue

    if column in feature_mapping:
        column_renames[column] = feature_mapping[column]
    elif column in targets_mapping:
        column_renames[column] = targets_mapping[column]
    else:
        base, separator, suffix = column.partition('_')
        if not separator:
            continue

        # Everything after the first underscore is kept, so "<name>_a_b" and
        # "<name>_a_c" do not collapse onto the same translated header.
        # NOTE(review): feature suffixes are joined with a space and target
        # suffixes with an underscore — confirm this asymmetry is intended.
        if base in feature_mapping:
            column_renames[column] = feature_mapping[base] + ' ' + suffix
        elif base in targets_mapping:
            column_renames[column] = targets_mapping[base] + '_' + suffix

# One rename call instead of one per matching column.
grab_df.rename(columns=column_renames, inplace=True)

In [None]:
# Display the grab table again to check the translated column headers.
grab_df

# Metadata Info

## Grab

In [18]:
# Empty summary table: one row per supply-point code, four statistics per feature.
summary_stats = ['N° Data', 'N° Missing', 'Mean', 'Std']
feature_summary_columns = pd.MultiIndex.from_product([feature_mapping.values(), summary_stats])
supply_point_codes = grab_df['Code'].unique()

feature_df = pd.DataFrame(index=supply_point_codes, columns=feature_summary_columns)

In [19]:
# Fill the feature summary: for each supply point, the number of values, the
# number of missing values, the mean and the standard deviation of every feature.
for code in grab_df['Code'].unique():
    # Select this supply point's rows once instead of re-filtering the whole
    # table for every single statistic.
    code_rows = grab_df[grab_df['Code'] == code]

    for feature in feature_mapping.values():
        values = code_rows[feature]
        feature_df.loc[code, (feature, 'N° Data')] = values.count()
        feature_df.loc[code, (feature, 'N° Missing')] = values.isna().sum()
        feature_df.loc[code, (feature, 'Mean')] = values.mean()
        feature_df.loc[code, (feature, 'Std')] = values.std()

In [None]:
# Display the per-supply-point feature summary.
feature_df

In [21]:
# Empty summary table: one row per supply-point code, four statistics per target.
summary_stats = ['N° Data', 'N° Missing', 'Mean', 'Std']
target_summary_columns = pd.MultiIndex.from_product([targets_mapping.values(), summary_stats])
supply_point_codes = grab_df['Code'].unique()

targets_df = pd.DataFrame(index=supply_point_codes, columns=target_summary_columns)

In [22]:
# Fill the target summary: for each supply point, the number of values, the
# number of missing values, the mean and the standard deviation of every target.
for code in grab_df['Code'].unique():
    # Select this supply point's rows once instead of re-filtering the whole
    # table for every single statistic.
    code_rows = grab_df[grab_df['Code'] == code]

    for target in targets_mapping.values():
        values = code_rows[target]
        targets_df.loc[code, (target, 'N° Data')] = values.count()
        targets_df.loc[code, (target, 'N° Missing')] = values.isna().sum()
        targets_df.loc[code, (target, 'Mean')] = values.mean()
        targets_df.loc[code, (target, 'Std')] = values.std()

In [None]:
# Display the per-supply-point target summary.
targets_df

In [24]:
%%script false --no-raise-error
# Export the two grab summaries to Excel under <metadata_folder>/Grab.
# The `%%script false --no-raise-error` magic heading this cell hands the body
# to the `false` command, so these exports are not executed as Python; remove
# that line to regenerate the files.

# Feature statistics per supply point.
feature_df.to_excel(
    os.path.join(metadata_folder, 'Grab', 'features.xlsx')
)

# Target statistics per supply point.
targets_df.to_excel(
    os.path.join(metadata_folder, 'Grab', 'targets.xlsx')
)

## Sensor

In [25]:
# Rename the sensor conductivity column from its "μS/cm" spelling to the
# "uS/cm" spelling used by the translated grab feature names.
conductivity_rename = {'Conductivity (μS/cm)': 'Conductivity (uS/cm)'}

for sensor_frame in sensor_dict.values():
    sensor_frame.rename(columns=conductivity_rename, inplace=True)

In [26]:
# The measurement columns are taken from the 'Berna' sensor: every column
# except the timestamp. The same list is then used for all sensors.
berna_columns = sensor_dict['Berna'].columns
sensor_columns = berna_columns.difference(['DateTime'])

In [27]:
# Empty summary table: one row per sensor, mean and std for each measurement column.
sensor_stats_columns = pd.MultiIndex.from_product([sensor_columns, ['Mean', 'Std']])
sensors_df = pd.DataFrame(index=list(sensor_dict), columns=sensor_stats_columns)

In [28]:
# Fill the sensor summary with the mean and standard deviation of each column.
for sensor, sensor_frame in sensor_dict.items():
    for column in sensor_columns:
        series = sensor_frame[column]
        sensors_df.loc[sensor, (column, 'Mean')] = series.mean()
        sensors_df.loc[sensor, (column, 'Std')] = series.std()

In [None]:
# Display the per-sensor summary.
sensors_df

In [30]:
# Write the sensor summary to <metadata_folder>/Sensor/sensors.xlsx.
sensor_metadata_folder = os.path.join(metadata_folder, 'Sensor')

# to_excel does not create missing directories; make sure the target exists first.
os.makedirs(sensor_metadata_folder, exist_ok=True)

sensors_df.to_excel(os.path.join(sensor_metadata_folder, 'sensors.xlsx'))

# Time Series Comparison

In [None]:
# Overlay the grab samples (markers) on the sensor series (line), one figure
# per supply point / feature pair.
for code in grab_df['Code'].unique():
    # Both selections depend only on the supply point, so build them once per
    # code rather than once per feature.
    g_df = grab_df[grab_df['Code'] == code].copy()
    s_df = sensor_dict[code].copy()

    for feature in feature_mapping.values():
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=g_df['DateTime'],
                y=g_df[feature],
                mode='markers',
                name='Grab'
            )
        )

        fig.add_trace(
            go.Scatter(
                x=s_df['DateTime'],
                y=s_df[feature],
                mode='lines',
                name='Sensor'
            )
        )

        fig.update_layout(
            title=f'{code} - {feature}',
            xaxis_title='DateTime',
            yaxis_title=feature
        )

        fig.show()
        
        
        
        

# Boxplot Comparison 

In [None]:
# Box plots comparing the grab values with the daily-averaged sensor values,
# one figure per supply point / feature pair.

# Exclusive upper bounds applied to the daily sensor means before plotting.
sensor_upper_bounds = {
    'Free Chlorine (mg/l)': 5,
    'TOC (mg/l)': 2,
    'Turbidity (FTU)': 1.5,
}

# Only the figures of these supply points are rendered.
codes_to_show = ('Tabacchi', 'Montevideo')

for code in grab_df['Code'].unique():
    g_df = grab_df[grab_df['Code'] == code].copy()

    # The daily resampling does not depend on the feature: do it once per code.
    daily_df = sensor_dict[code].copy()
    daily_df['DateTime'] = pd.to_datetime(daily_df['DateTime'])
    daily_df = daily_df.set_index('DateTime').resample('D').mean().reset_index()

    for feature in feature_mapping.values():
        # Start from the full daily frame every time, so one feature's bound
        # never removes rows from another feature's plot.
        # The result stays bound to `s_df` because the following cell reads it.
        upper_bound = sensor_upper_bounds.get(feature)
        if upper_bound is None:
            s_df = daily_df
        else:
            s_df = daily_df[daily_df[feature] < upper_bound]

        fig = go.Figure()

        fig.add_trace(
            go.Box(
                y=g_df[feature],
                name='Grab'
            )
        )

        fig.add_trace(
            go.Box(
                y=s_df[feature],
                name='Sensor'
            )
        )

        fig.update_layout(
            title=f'{code} - {feature}',
            yaxis_title=feature
        )

        if code in codes_to_show:
            fig.show(
                renderer='svg',
                width=800,
            )
        

In [33]:
# Box plot of the daily-averaged sensor values for each feature, saved as a PNG
# under <metadata_folder>/Sensor.
#
# NOTE(review): `s_df` is not built in this cell; it is whatever the previous
# cell left behind (the sensor frame of the last supply point it processed).
# Confirm which sensor these figures are meant to describe.

# Exclusive upper bounds applied to the daily sensor means before plotting.
sensor_upper_bounds = {
    'Free Chlorine (mg/l)': 5,
    'TOC (mg/l)': 2,
    'Turbidity (FTU)': 1.5,
}

# Resample once, on a copy, and never overwrite the result. Re-filtering and
# re-resampling the same frame inside the loop made each feature's bound blank
# out whole days for every feature plotted after it.
daily_df = s_df.copy()
daily_df['DateTime'] = pd.to_datetime(daily_df['DateTime'])
daily_df = daily_df.set_index('DateTime').resample('D').mean().reset_index()

# write_image does not create missing directories; make sure the target exists.
sensor_figure_folder = os.path.join(metadata_folder, 'Sensor')
os.makedirs(sensor_figure_folder, exist_ok=True)

for feature in feature_mapping.values():
    upper_bound = sensor_upper_bounds.get(feature)
    if upper_bound is None:
        plot_df = daily_df
    else:
        plot_df = daily_df[daily_df[feature] < upper_bound]

    fig = go.Figure()

    fig.add_trace(
        go.Box(
            y=plot_df[feature],
            name='Sensor'
        )
    )

    fig.update_layout(
        title=f'{feature}',
        yaxis_title=feature
    )

    # A '/' in the feature name (e.g. "mg/l") would be read as a path separator.
    feature_ = feature.replace('/', '_')

    fig.write_image(
        os.path.join(sensor_figure_folder, f'{feature_}.png')
    )
    