# Feltre Sensor Data Preprocessing

In [None]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go

In [None]:
# Resolve the cloud and local folders used throughout the notebook.
utils_folder = os.path.join("..", "..", "utils")
onedrive_pointer_file = os.path.join(utils_folder, "onedrive.txt")

# The first line of onedrive.txt is used as the base path of the cloud data.
with open(onedrive_pointer_file, "r") as f:
    cloud_root = f.readline().strip()
cloud_data_folder = os.path.join(cloud_root, "Centrali")

# Cloud side: Centrali/Sensori/Feltre/Sonde/{Citometro, Multiparametrica}
sensor_folder = os.path.join(cloud_data_folder, "Sensori")
feltre_folder = os.path.join(sensor_folder, "Feltre")
probe_folder = os.path.join(feltre_folder, "Sonde")
cytometer_folder = os.path.join(probe_folder, "Citometro")
multiparam_folder = os.path.join(probe_folder, "Multiparametrica")

# Local side: cleaned datasets and exported plots
local_data_folder = os.path.join("..", "..", "data")
clean_data_folder = os.path.join(local_data_folder, "Clean Data")
plot_data_folder = os.path.join(local_data_folder, "Plots", "Feltre")

## Cytometer

In [None]:
# Load cytometer data
#
# Every .xlsx export in the cytometer folder is read once and routed by file
# name: names containing "auto" feed auto_cytometer_df, otherwise names
# containing "error" feed error_cytometer_df. Files matching neither are
# ignored. Frames are collected and concatenated once per group instead of
# re-concatenating the growing frame on every iteration.

# os.listdir returns entries in arbitrary order; sort so the row order of the
# concatenated frames is reproducible across runs and machines.
cytometer_files = sorted(f for f in os.listdir(cytometer_folder) if f.endswith(".xlsx"))

auto_frames = []
error_frames = []

for cytometer_file in cytometer_files:

    # "auto" is checked first, so a name containing both goes to the auto group
    if "auto" in cytometer_file:
        target_frames = auto_frames
    elif "error" in cytometer_file:
        target_frames = error_frames
    else:
        continue

    target_frames.append(pd.read_excel(os.path.join(cytometer_folder, cytometer_file)))

# pd.concat raises on an empty list, so fall back to an empty frame
auto_cytometer_df = pd.concat(auto_frames) if auto_frames else pd.DataFrame()
error_cytometer_df = pd.concat(error_frames) if error_frames else pd.DataFrame()

In [None]:
error_cytometer_df.head(10)

In [None]:
auto_cytometer_df.head(10)

In [None]:
# Look for "Date [local]" values that appear in both the auto and the error exports
auto_dates = auto_cytometer_df["Date [local]"]
error_dates = error_cytometer_df["Date [local]"]

common_dates = np.intersect1d(auto_dates, error_dates)
common_dates

In [None]:
# no common dates, so the error dataframe is not useful as the auto dataframe already removes the errors

## Multiparam

In [None]:
# Load multiparametric probe data
#
# Every .csv export is read once (semicolon-separated, column names on the
# second line) and routed by file name: names containing 'par' feed param_df,
# otherwise names containing 'spec' feed spectro_df. Files matching neither
# are ignored. Frames are concatenated once per group instead of
# re-concatenating the growing frame on every iteration.

# os.listdir returns entries in arbitrary order; sort so the row order of the
# concatenated frames is reproducible across runs and machines.
multiparam_files = sorted(f for f in os.listdir(multiparam_folder) if f.endswith(".csv"))

param_frames = []
spectro_frames = []

for multiparam_file in multiparam_files:

    # 'par' is checked first, so a name containing both goes to the param group
    if 'par' in multiparam_file:
        target_frames = param_frames
    elif 'spec' in multiparam_file:
        target_frames = spectro_frames
    else:
        continue

    target_frames.append(
        pd.read_csv(os.path.join(multiparam_folder, multiparam_file), sep=";", header=1)
    )

# pd.concat raises on an empty list, so fall back to an empty frame
param_df = pd.concat(param_frames) if param_frames else pd.DataFrame()
spectro_df = pd.concat(spectro_frames) if spectro_frames else pd.DataFrame()
            

# Clean Data

## Cytometer

In [None]:
# Work on a copy: the in-place drop/rename below would otherwise also alter
# auto_cytometer_df, and this cell could not be re-run to start over.
cyto_df = auto_cytometer_df.copy()

In [None]:
cyto_df.columns.to_list()

In [None]:
# Instrument metadata, redundant timestamps and the unused counting channels
# are removed; the remaining columns are the ones used in the rest of the notebook.
cytometer_columns_to_drop = [
    'Timestamp',
    'Date [local]',
    'Date [GMT]',
    'Instrument Name',
    'Instrument SN',
    'Mode',
    'Name',
    'Protocol',
    'TCC [1/mL]',
    'GATE+ [1/mL]',
    'ACC [1/mL]',
    'HACC [1/mL]',
    'LACC [1/mL]',
    'HACP [%]',
    'Cartridge Fill',
]

cyto_df.drop(columns=cytometer_columns_to_drop, inplace=True)

In [None]:
cyto_df

In [None]:
# Use the sampling timestamp as the common 'DateTime' column
cyto_df.rename(columns={'Sampling Date [local]': 'DateTime'}, inplace=True)

## Multiparam

In [None]:
param_df

In [None]:
spectro_df

In [None]:
param_df.columns.to_list()

In [None]:
spectro_df.columns.to_list()

In [None]:
timestamp_column = 'Measurement interval=900[sec] (Export-Aggregation disabled)'

# Keep only the measured values, since the online soft sensors will be built on them:
# every 'Clean' column goes, plus the global status and one temperature channel.
params_columns_to_drop = [name for name in param_df.columns if 'Clean' in name]
params_columns_to_drop += [
    'Status',
    'Temperature - Measured value [°C] (Limit:0.00-45.00_Coefs:0.00 0.00 0.00 0.00)',
]

# Remove all the wavelengths that are not the 254 nm one, but keep the timestamp column
spectro_columns_to_drop = [name for name in spectro_df.columns if '254' not in name]
spectro_columns_to_drop.remove(timestamp_column)
spectro_columns_to_drop += ['Status (Source:0)']

param_df.drop(columns=params_columns_to_drop, inplace=True)
spectro_df.drop(columns=spectro_columns_to_drop, inplace=True)

In [None]:
param_df.describe().T

In [None]:
# Drop the columns that are entirely NaN or hold only a few valid values
sparse_param_columns = [
    'Total Chlorine - Measured value [mg/l] (Limit:0.00-2.00_Coefs:0.00 1.00 0.00 0.00)',
    'pH - Measured value (Limit:0.00-14.00_Coefs:-2.40 0.97 0.00 0.00)',
]

param_df.drop(columns=sparse_param_columns, inplace=True)

In [None]:
param_df.isna().sum()

In [None]:
# Drop the second temperature channel and the status columns that are no longer needed
extra_param_columns = [
    'Temperature - Measured value [C] (Limit:-5.00-100.00_Coefs:-0.40 1.00 0.00 0.00)',
    'Status [Temperature - Measured value].1',
    'Status [Temperature - Measured value].2',
    'Status [Total Chlorine - Measured value]',
]

param_df.drop(columns=extra_param_columns, inplace=True)

In [None]:
spectro_df.describe().T

In [None]:
spectro_df.isna().sum()

# Preliminary Insights

## Cytometer

* BactoSense: misurazioni di:
    * ICC [1/mL]: concentrazione di cellule intatte
    * HNAC [1/mL]: concentrazione di cellule ad alto contenuto di acido nucleico
    * LNAC [1/mL]: concentrazione di cellule a basso contenuto di acido nucleico
    * HNAP [%]: frazione di ICC costituita da cellule ad alto contenuto di acido nucleico
    * TCC [1/mL] (no valori)
    * GATE+ [1/mL] (no valori)
    * ACC [1/mL] (no valori)
    * HACC [1/mL] (no valori)
    * LACC [1/mL] (no valori)
    * HACP [%] (no valori)

In [None]:
# Cytometer channels that are plotted, cleaned and exported in this notebook
measurement_columns = ['ICC [1/mL]', 'HNAC [1/mL]', 'LNAC [1/mL]', 'HNAP [%]']

In [None]:
# Plot every cytometer measurement over time and mark in red the samples
# that carry a warning.

# The rows with a warning do not depend on the plotted column: select them once
# instead of re-filtering the frame on every iteration.
warning_df = cyto_df[cyto_df['Warnings'].notna()]

for col in measurement_columns:

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=cyto_df["DateTime"],
            y=cyto_df[col],
            mode='lines',
            name=col,
        )
    )

    if not warning_df.empty:
        fig.add_trace(
            go.Scatter(
                x=warning_df["DateTime"],
                y=warning_df[col],
                mode='markers',
                marker=dict(color='red'),
                name='Warnings',
            )
        )

    fig.update_layout(
        title=col,
        xaxis_title="DateTime",
        yaxis_title=col,
    )

    fig.show()
    

In [None]:
# Remove first samples as BactoSense was not yet calibrated.
# .copy() detaches the result from the original frame, so the .loc assignments
# in the following cells write into cyto_df itself rather than into a slice
# (which would trigger pandas' SettingWithCopyWarning).
cyto_df = cyto_df[cyto_df['DateTime'] >= '2024-11-30'].copy()

In [None]:
# ICC and HNAC measurements above 70k are not reliable: blank them
for measurement in ('ICC [1/mL]', 'HNAC [1/mL]'):
    cyto_df.loc[cyto_df[measurement] > 70000, measurement] = np.nan

# Measurements equal to 0 are not reliable either: blank them in every channel
for measurement in ('ICC [1/mL]', 'HNAC [1/mL]', 'LNAC [1/mL]', 'HNAP [%]'):
    cyto_df.loc[cyto_df[measurement] == 0, measurement] = np.nan

In [None]:
from datetime import datetime

In [None]:
# The instrument was turning on during 18-02-2025: blank every value of that day,
# leaving only the timestamp and the warnings untouched.
startup_day = datetime(2025, 2, 18).date()

parameters = cyto_df.columns.difference(['DateTime', 'Warnings'])
condition = cyto_df['DateTime'].dt.date == startup_day

cyto_df.loc[condition, parameters] = np.nan

In [None]:
# Impute missing values with time interpolation.
# method='time' weights the gaps by the actual time between samples and needs a
# DatetimeIndex, hence DateTime is moved into the index first.
cyto_df.set_index('DateTime', inplace=True)
cyto_df.interpolate(method='time', inplace=True)

In [None]:
# Bring DateTime back as a regular column: the plots and filters below read cyto_df['DateTime']
cyto_df.reset_index(inplace=True)

In [None]:
# Plot every cytometer measurement again after cleaning and imputation,
# marking in red the samples that carry a warning.

# The rows with a warning do not depend on the plotted column: select them once
# instead of re-filtering the frame on every iteration.
warning_df = cyto_df[cyto_df['Warnings'].notna()]

for col in measurement_columns:

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=cyto_df["DateTime"],
            y=cyto_df[col],
            mode='lines',
            name=col,
        )
    )

    if not warning_df.empty:
        fig.add_trace(
            go.Scatter(
                x=warning_df["DateTime"],
                y=warning_df[col],
                mode='markers',
                marker=dict(color='red'),
                name='Warnings',
            )
        )

    fig.update_layout(
        title=col,
        xaxis_title="DateTime",
        yaxis_title=col,
    )

    fig.show()

## Multiparam

In [None]:
# The export labels its timestamp column with the measurement interval: call it 'DateTime'
param_timestamp_column = 'Measurement interval=900[sec] (Export-Aggregation disabled)'
param_df.rename(columns={param_timestamp_column: 'DateTime'}, inplace=True)

In [None]:
# Parse the timestamps so they can be sorted and compared chronologically
param_df['DateTime'] = pd.to_datetime(param_df['DateTime'])

In [None]:
# The frame is a concatenation of several exports: put the rows in chronological order
param_df.sort_values(by='DateTime', inplace=True)

In [None]:
def rename_columns(columns):
    """Shorten the verbose column headers of the sensor export.

    Each header is processed independently, in this order:

    1. Everything after the first ``]`` is discarded (a ``]`` is appended
       even when the header contains none).
    2. If the result contains ``Limit``, everything from the first ``(``
       onwards is discarded.
    3. The `` - Result`` and `` - Measured value`` markers are removed.
    4. The unit-less names ``pH``, ``UV254``, ``nitrati`` and ``nitriti``
       lose the trailing space left behind by step 2.

    Args:
        columns: Iterable of column names (strings).

    Returns:
        list: The renamed columns, in the same order as ``columns``.
    """
    # TODO add unit of measure to the column name (UV254)
    unitless_names = {
        'pH ': 'pH',
        'UV254 ': 'UV254',
        'nitrati ': 'nitrati',
        'nitriti ': 'nitriti',
    }

    new_columns = []

    for col in columns:
        name = col.split(']')[0] + ']'

        if 'Limit' in name:
            name = name.split('(')[0]

        # Both markers are stripped from the current name, so a header carrying
        # both of them is handled too (looking the name up again with
        # list.index() after the first replacement raised ValueError).
        name = name.replace(' - Result', '').replace(' - Measured value', '')

        new_columns.append(unitless_names.get(name, name))

    return new_columns
    

In [None]:
# Take DateTime out while the measurement columns are renamed, then put it back
# (it therefore ends up as the last column).
dt_column = param_df.pop('DateTime')

param_df.columns = rename_columns(param_df.columns)

param_df['DateTime'] = dt_column

In [None]:
param_df.columns.to_list()

In [None]:
# Plot every parameter over time. Timestamps whose status reports an error are
# marked with yellow vertical lines, timestamps with a NaN value with red ones.
for column in param_df.columns:

    if (column == 'DateTime') or ('Status' in column):
        continue

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=param_df["DateTime"],
            y=param_df[column],
            mode='lines',
            name=column,
        )
    )

    # rows where the parameter is missing
    nan_df = param_df[param_df[column].isna()]

    # rows where the status column of the parameter reports an error
    # TODO temporary, to be revisited once the units of measure are understood
    status_col = 'Status [' + column.split(' [')[0] + ']'
    error_df = param_df[param_df[status_col].str.contains('Error', na=False)]

    # errors first, then NaN values (same drawing order as the legend)
    for label, color, flagged_df in (('Error', 'yellow', error_df), ('NaN', 'red', nan_df)):

        if flagged_df.empty:
            continue

        for timestamp in flagged_df['DateTime']:
            fig.add_vline(x=timestamp, line=dict(color=color, width=0.5))

        # Vertical lines get no legend entry, so add an empty trace for it.
        # The trace is drawn in 'lines' mode: its legend colour comes from
        # line.color (marker.color is ignored in that mode).
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode='lines',
                line=dict(color=color),
                name=label,
            )
        )

    fig.update_layout(
        title=column,
        xaxis_title="DateTime",
        yaxis_title=column,
    )

    fig.show()

In [None]:
# pH-mV and nitriti are not useful for the analysis: drop them
useless_param_columns = ['pH-mV [mV]', 'nitriti']
param_df.drop(columns=useless_param_columns, inplace=True)

In [None]:
# The export labels its timestamp column with the measurement interval: call it 'DateTime'
spectro_df.rename(
    columns={
        'Measurement interval=900[sec] (Export-Aggregation disabled)': 'DateTime',
    },
    inplace=True
)

# Parse the timestamps as done for param_df, so that the sort below is
# chronological (not a lexical sort of strings) and both frames share the same
# x-axis type when they are plotted together.
spectro_df['DateTime'] = pd.to_datetime(spectro_df['DateTime'])

spectro_df.sort_values(by='DateTime', inplace=True)

In [None]:
# Overlay the 254 nm series of the spectrometer export and of the multiparametric export
uv254_traces = [
    go.Scatter(
        x=spectro_df["DateTime"],
        y=spectro_df['254 nm'],
        mode='lines',
        name='Spectro',
    ),
    go.Scatter(
        x=param_df["DateTime"],
        y=param_df['UV254'],
        mode='lines',
        name='Multiparam',
    ),
]

fig = go.Figure(data=uv254_traces)
fig.update_layout(title='254 nm', xaxis_title="DateTime", yaxis_title='254 nm')
fig.show()

In [None]:
# same exact values, so the spectro data is not useful

In [None]:
# Something happened around 25-09-2024, let's check the data

In [None]:
# For each parameter, print the timestamps at which its value is NaN
separator = '=' * 50

for column in param_df.columns:

    if column == 'DateTime' or 'Status' in column:
        continue

    nan_dates = param_df.loc[param_df[column].isna(), 'DateTime']

    # nothing to report for parameters without missing values
    if nan_dates.empty:
        continue

    print(separator)
    print(column)
    print(separator)
    print()
    print('NaN values dates:')
    print(nan_dates)
    print()

In [None]:
# Before the July NaN values, most of the measurements seem to come from a
# different distribution. The same happens around the September anomalies.
# Compare the three periods with one box plot per parameter.

for column in param_df.columns:

    if column == 'DateTime' or 'Status' in column:
        continue

    df = param_df[['DateTime', column]]
    timestamps = df['DateTime']

    # (legend label, row mask) for each period; rows from 2024-09-22 up to
    # 2024-09-30 fall in none of them
    periods = [
        ('Pre July', timestamps < '2024-07-03'),
        (
            'Between July and September',
            (timestamps >= '2024-07-03') & (timestamps < '2024-09-22'),
        ),
        ('Post September', timestamps >= '2024-09-30'),
    ]

    fig = go.Figure()

    for period_name, period_mask in periods:
        fig.add_trace(go.Box(y=df.loc[period_mask, column], name=period_name))

    fig.update_layout(title=column, yaxis_title=column)

    fig.show()

# Common Dates

* BactoSense: misurazioni di:
    * ICC [1/mL]: concentrazione di cellule intatte
    * HNAC [1/mL]: concentrazione di cellule ad alto contenuto di acido nucleico
    * LNAC [1/mL]: concentrazione di cellule a basso contenuto di acido nucleico
    * HNAP [%]: frazione di ICC costituita da cellule ad alto contenuto di acido nucleico

In [None]:
# get the range of dates of the cyto_df
min_date = cyto_df['DateTime'].min()
max_date = cyto_df['DateTime'].max()

# get only the rows of the param_df that are in the range of the cyto_df
# (.copy() detaches the result from the original frame, so the in-place
# operations applied to param_df later on do not act on a slice)
in_cyto_range = (param_df['DateTime'] >= min_date) & (param_df['DateTime'] <= max_date)
param_df = param_df[in_cyto_range].copy()

In [None]:
cyto_df.columns.to_list()

In [None]:
# Axis labels used in the exported plots, keyed by dataframe column name.
# NOTE(review): 'Turbidity [FTU]' is labelled in NTU and 'UV254' in 1/cm, while
# the merged dataset later names the latter 'UV254 [1/m]' - confirm the units.
multiparam_labels = {
    "Pressione [atm]": "Pressure (atm)",
    "pH": "pH",
    "Conductivity [uS/cm]": "Conductivity (uS/cm)",
    "Temperature [°C]": "Temperature (°C)",
    "UV254": "UVA254 (1/cm)",
    "nitrati": "Nitrate (mg/l)",
    "Turbidity [FTU]": "Turbidity (NTU)",
    "TOCeq [mg/l]": "TOC (mg/l)",
    "DOCeq [mg/l]": "DOC (mg/l)",
    "Free Chlorine [mg/l]": "Free Chlorine (mg/l)",
}

cytometer_labels = {
    "ICC [1/mL]": "ICC (1/mL)",
    "HNAC [1/mL]": "HNAC (1/mL)",
    "LNAC [1/mL]": "LNAC (1/mL)",
    "HNAP [%]": "HNAP (%)",
}

column_mapping = {**multiparam_labels, **cytometer_labels}
    

In [None]:
# Folder receiving the exported figures; create it up front, since writing an
# image into a missing folder fails.
dec_plot_data_folder = os.path.join(plot_data_folder, "Dec_24")
os.makedirs(dec_plot_data_folder, exist_ok=True)

In [None]:
# Show and export (PNG) one time-series plot per multiparametric parameter,
# restricted to the samples from 2024-11-29 onwards.

# The date filter is the same for every column: apply it once, with an
# unambiguous ISO date instead of a day-first string.
plot_start = pd.Timestamp('2024-11-29')
recent_param_df = param_df[param_df['DateTime'] >= plot_start]

for column in param_df.columns:

    if (column == 'DateTime') or ('Status' in column):
        continue

    axis_label = column_mapping[column]

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=recent_param_df["DateTime"],
            y=recent_param_df[column],
            mode='lines',
            name=axis_label,
            line=dict(color='green'),
        )
    )

    fig.update_layout(
        xaxis_title="Time",
        yaxis_title=axis_label,
        margin=dict(l=0, r=10, t=30, b=0),
        # overall font size
        font=dict(size=17),
    )

    fig.show()

    # '/' is not allowed in file names (e.g. 'mg/l')
    column_ = column.replace('/', '_')

    fig.write_image(
        os.path.join(dec_plot_data_folder, f"{column_}.png"),
        scale=3
    )
    

# Export (PNG) and show one time-series plot per cytometer measurement,
# restricted to the samples from 2024-11-29 onwards.

# The date filter is the same for every column: apply it once, with an
# unambiguous ISO date instead of a day-first string.
cyto_plot_start = pd.Timestamp('2024-11-29')
recent_cyto_df = cyto_df[cyto_df['DateTime'] >= cyto_plot_start]

for col in measurement_columns:

    axis_label = column_mapping[col]

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=recent_cyto_df["DateTime"],
            y=recent_cyto_df[col],
            mode='lines',
            name=axis_label,
            line=dict(color='blue'),
        )
    )

    fig.update_layout(
        xaxis_title="Time",
        yaxis_title=axis_label,
        margin=dict(l=0, r=10, t=30, b=0),
        # overall font size
        font=dict(size=17),
    )

    # '/' is not allowed in file names (e.g. '1/mL')
    col_ = col.replace('/', '_')

    fig.write_image(
        os.path.join(dec_plot_data_folder, f"{col_}.png"),
        scale=3
    )

    fig.show()

# Build Dataset

In [None]:
# The status columns were only needed for the diagnostic plots: remove them from param_df
status_columns = list(filter(lambda name: 'Status' in name, param_df.columns))
param_df.drop(columns=status_columns, inplace=True)

In [None]:
# The resampling and time interpolation below operate on a DatetimeIndex
param_df.set_index('DateTime', inplace=True)

In [None]:
# Put the probe data on a regular 15-minute grid (mean per bin), then fill the
# empty bins by time interpolation
param_15min_mean = param_df.resample('15min').mean()
param_df = param_15min_mean.interpolate(method='time')

In [None]:
# Warnings and Alarms are not measurements: remove them before resampling,
# then index the frame by time as required by resample()
cyto_df.drop(columns=['Warnings', 'Alarms'], inplace=True)
cyto_df.set_index('DateTime', inplace=True)

In [None]:
# cyto_df has a different sampling frequency than param_df: resample it to
# 15 minutes to match param_df and avoid losing information. Each bin takes the
# median of its samples; empty bins are then filled by time interpolation.
cyto_15min_median = cyto_df.resample('15min').median()
cyto_df = cyto_15min_median.interpolate(method='time')

In [None]:
param_df

In [None]:
cyto_df

In [None]:
# Join the cytometer and probe frames on their shared DateTime index
# (default inner join: only timestamps present in both are kept)
merged_df = cyto_df.merge(param_df, left_index=True, right_index=True)

In [None]:
merged_df

In [None]:
# Give the two unit-less columns a name that includes the unit of measure.
# NOTE(review): the plot labels use 1/cm for UV254, here it is 1/m - confirm the unit.
unit_names = {'nitrati': 'Nitrate [mg/l]', 'UV254': 'UV254 [1/m]'}
merged_df.rename(columns=unit_names, inplace=True)

# Input Data Cleaning

Since there is a big gap in the output variables, I decided to split the dataset into two separate parts and build a separate model for each part.

In [None]:
# Something happened on 26-02-2025: the data of that day is blanked for
# UVA254, Nitrate, Turbidity, TOC and DOC.
columns_to_remove = [
    'UV254 [1/m]',
    'Nitrate [mg/l]',
    'Turbidity [FTU]',
    'TOCeq [mg/l]',
    'DOCeq [mg/l]',
]

# rows whose timestamp falls on the anomalous day (index truncated to midnight)
anomaly_day = pd.Timestamp('2025-02-26')
on_anomaly_day = merged_df.index.normalize() == anomaly_day

merged_df.loc[on_anomaly_day, columns_to_remove] = np.nan

In [None]:
# Fill the NaN values of merged_df by time interpolation
merged_df.interpolate(method='time', inplace=True)

In [None]:
# Plot the cleaned input columns from 2024-11-29 onwards to check the result.

# The date filter is the same for every column: apply it once, with an
# unambiguous ISO date instead of a day-first string.
merged_plot_start = pd.Timestamp('2024-11-29')
recent_merged_df = merged_df[merged_df.index >= merged_plot_start]

for column in columns_to_remove:
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=recent_merged_df.index,
            y=recent_merged_df[column],
            mode='lines',
            name=column,
            line=dict(color='green'),
        )
    )

    fig.update_layout(
        xaxis_title="Time",
        yaxis_title=column,
        margin=dict(l=0, r=10, t=30, b=0),
        # overall font size
        font=dict(size=17),
    )
    fig.show()

In [None]:
# Split the dataset in two parts. Both bounds are inclusive and, having no time
# component, refer to midnight of the given day.
first_part_end = pd.Timestamp('2024-12-22')
second_part_start = pd.Timestamp('2025-02-19')

first_merged_df = merged_df.loc[merged_df.index <= first_part_end]
second_merged_df = merged_df.loc[merged_df.index >= second_part_start]

In [None]:
# Write the two parts of the dataset to Excel. The target folder is created
# first, since to_excel fails when it does not exist.
feltre_clean_folder = os.path.join(clean_data_folder, "Feltre")
os.makedirs(feltre_clean_folder, exist_ok=True)

first_merged_df.to_excel(os.path.join(feltre_clean_folder, "first_part.xlsx"))
second_merged_df.to_excel(os.path.join(feltre_clean_folder, "second_part.xlsx"))