# Handling of Outliers and NaNs

In [None]:
import os
import sys

sys.path.append("..")

import json
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

import scipy.stats as stats

from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance
from copy import deepcopy

from utils.functions.normalize_string import normalize_string

In [None]:
# Top-level project directories, located one level above this notebook.
data_folder, utils_folder = (
    os.path.join("..", top_level) for top_level in ("data", "utils")
)

# Sub-directories of the data folder used by this notebook.
interm_data_folder, clean_data_folder = (
    os.path.join(data_folder, stage)
    for stage in ("Intermediate Data", "Clean Data")
)

# Load Data

In [None]:
eb0_features_lab_df = pd.read_excel(
    os.path.join(interm_data_folder, "EB0_Features_Lab.xlsx")
)

eb0_targets_lab_df = pd.read_excel(
    os.path.join(interm_data_folder, "EB0_Targets_Lab.xlsx")
)

In [None]:
eb1_features_lab_df = pd.read_excel(
    os.path.join(interm_data_folder, "EB1_Features_Lab.xlsx")
)

eb1_targets_lab_df = pd.read_excel(
    os.path.join(interm_data_folder, "EB1_Targets_Lab.xlsx")
)

In [None]:
eb0_sensor_df = pd.read_excel(
    os.path.join(interm_data_folder, "EB0_Sensor.xlsx")
)

In [None]:
eb1_sensor_df = pd.read_excel(
    os.path.join(interm_data_folder, "EB1_Sensor.xlsx")
)

In [None]:
thms_df = pd.read_excel(os.path.join(interm_data_folder, "THMs.xlsx"))

In [None]:
# Import the feature mappings (lab feature name -> sensor column name).
# The files are opened explicitly as UTF-8: JSON is UTF-8 by specification and
# the lab feature names used as keys may contain accented characters, which
# would be decoded incorrectly if the platform default encoding were used.
with open(
    os.path.join(utils_folder, "mappings", "eb0_features_mapping.json"),
    encoding="utf-8",
) as f:
    eb0_features_mapping = json.load(f)

with open(
    os.path.join(utils_folder, "mappings", "eb1_features_mapping.json"),
    encoding="utf-8",
) as f:
    eb1_features_mapping = json.load(f)

In [None]:
"""
Per quanto riguarda outliers e missing values, ho deciso prima di rimuovere gli outliers
dato che per gestire i missing values utilizzo data imputation con KNN, che è molto sensibile
agli outliers.
Inoltre, prima di rimuovere gli outliers dai sensori, vengono prima rimossi i sample
che hanno un valore di ALARMA ESPECTRAL >= 3, e vengono rimossi anche quei valori = 0
per alcune variabili per le quali non è appurato che si tratta di un errore di misurazione.
Queste feature sono: index UV, Sulfats, Particules, pH, color, conductivity.
 
Data imputation viene fatta solo sulle features e non sulle variabili target in quanto
imputare i valori di target potrebbe portare ad avere molto bias e quindi a risultati non realistici.
"""

# Data Check-Up

In [None]:
# Display the EB0 lab feature table as loaded, for a visual sanity check
eb0_features_lab_df

In [None]:
# Display the EB0 lab target table
eb0_targets_lab_df

In [None]:
# Display the EB1 lab feature table
eb1_features_lab_df

In [None]:
# Display the EB1 lab target table
eb1_targets_lab_df

In [None]:
# Display the EB0 sensor table
eb0_sensor_df

In [None]:
# Display the EB1 sensor table
eb1_sensor_df

In [None]:
# Discard the "Unnamed: 0" column from both sensor tables, in place.
eb0_sensor_df.drop(columns=["Unnamed: 0"], inplace=True)
eb1_sensor_df.drop(columns=["Unnamed: 0"], inplace=True)

# Outliers

## Lab

### EB0

In [None]:
# Apply the same normalization to the EB0 lab column names and to the mapping
# keys, so that the keys can be used to select columns of the lab table.
eb0_features_lab_df.columns = list(
    map(normalize_string, eb0_features_lab_df.columns)
)

eb0_features_mapping = dict(
    zip(
        map(normalize_string, eb0_features_mapping),
        eb0_features_mapping.values(),
    )
)

In [None]:
# NaN count per feature
print(eb0_features_lab_df.isna().sum() / len(eb0_features_lab_df) * 100)
print()
print("Total number of samples: ", len(eb0_features_lab_df))

In [None]:
# NaN count per sample
eb0_features_lab_df.isna().sum(axis=1).hist()

In [None]:
# count of the samples with zero NaNs
(eb0_features_lab_df.isna().sum(axis=1) == 0).sum()

In [None]:
%%script false --no-raise-error
# NOTE: the cell magic above hands this cell to the `false` command, so the
# code below is not run by the Python kernel (the cell is disabled).
#
# Preview of an outlier filter on the EB0 lab features.
# The fences are built from the 0.05 and 0.95 quantiles, widened by 1.5 times
# their distance, since the lab dataset is small and not very noisy.

for lab_key in eb0_features_mapping.keys():
    quartile1, quartile3 = eb0_features_lab_df[lab_key].quantile([0.05, 0.95])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    # Temporary DataFrame with the column values before and after the filter;
    # values that are not strictly inside the fences are replaced by None
    temp_df = pd.DataFrame(
        {
            "Before IQR": eb0_features_lab_df[lab_key],
            "After IQR": eb0_features_lab_df[lab_key].apply(
                lambda x: x if (x > lower_bound and x < upper_bound) else None
            ),
        }
    )

    # Plot the temporary DataFrame: box plot on the left, histogram on the right

    fig, axs = plt.subplots(ncols=2, figsize=(20, 10))

    sns.boxplot(
        data=temp_df,
        palette="Set2",
        saturation=0.5,
        whis=1.5,
        fliersize=3,
        ax=axs[0],
    )

    sns.histplot(
        data=temp_df, palette="Set2", kde=True, stat="density", ax=axs[1]
    )

    # The title is applied to the current axes
    plt.title(lab_key)
    plt.show()

In [None]:
%%script false --no-raise-error

# Disabled cell (see the cell magic above):
# no outliers removal since the dataset is small and not very noisy

# Remove outliers: same fences as in the preview, applied to the lab table
for lab_key in eb0_features_mapping.keys():

    quartile1, quartile3 = eb0_features_lab_df[lab_key].quantile([0.05, 0.95])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    # Values that are not strictly inside the fences are replaced by None
    eb0_features_lab_df[lab_key] = eb0_features_lab_df[lab_key].apply(
        lambda x: x if (x > lower_bound and x < upper_bound) else None
    )

In [None]:
# NaN count per feature
print(eb0_features_lab_df.isna().sum() / len(eb0_features_lab_df) * 100)
print()
print("Total number of samples: ", len(eb0_features_lab_df))

In [None]:
# NaN count per sample
eb0_features_lab_df.isna().sum(axis=1).hist()

In [None]:
# count of the samples with zero NaNs
(eb0_features_lab_df.isna().sum(axis=1) == 0).sum()

### EB1

In [None]:
# Apply the same normalization to the EB1 lab column names and to the mapping
# keys, so that the keys can be used to select columns of the lab table.
eb1_features_lab_df.columns = list(
    map(normalize_string, eb1_features_lab_df.columns)
)

eb1_features_mapping = dict(
    zip(
        map(normalize_string, eb1_features_mapping),
        eb1_features_mapping.values(),
    )
)

In [None]:
# NaN count per feature
print(eb1_features_lab_df.isna().sum() / len(eb1_features_lab_df) * 100)
print()
print("Total number of samples: ", len(eb1_features_lab_df))

In [None]:
# NaN count per sample
eb1_features_lab_df.isna().sum(axis=1).hist()

In [None]:
# count of the samples with zero NaNs
(eb1_features_lab_df.isna().sum(axis=1) == 0).sum()

In [None]:
%%script false --no-raise-error
# NOTE: the cell magic above hands this cell to the `false` command, so the
# code below is not run by the Python kernel (the cell is disabled).
#
# Preview of an outlier filter on the EB1 lab features.
# The fences are built from the 0.05 and 0.95 quantiles, widened by 1.5 times
# their distance.

for lab_key in eb1_features_mapping.keys():
    quartile1, quartile3 = eb1_features_lab_df[lab_key].quantile([0.05, 0.95])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    # Temporary DataFrame with the column values before and after the filter;
    # values that are not strictly inside the fences are replaced by None
    temp_df = pd.DataFrame(
        {
            "Before IQR": eb1_features_lab_df[lab_key],
            "After IQR": eb1_features_lab_df[lab_key].apply(
                lambda x: x if (x > lower_bound and x < upper_bound) else None
            ),
        }
    )

    # Plot the temporary DataFrame: box plot on the left, histogram on the right

    fig, axs = plt.subplots(ncols=2, figsize=(20, 10))

    sns.boxplot(
        data=temp_df,
        palette="Set2",
        saturation=0.5,
        whis=1.5,
        fliersize=3,
        ax=axs[0],
    )

    sns.histplot(
        data=temp_df, palette="Set2", kde=True, stat="density", ax=axs[1]
    )

    # The title is applied to the current axes
    plt.title(lab_key)
    plt.show()

In [None]:
%%script false --no-raise-error

# Disabled cell (see the cell magic above):
# no outliers removal since the dataset is small and not very noisy

# Remove outliers: same fences as in the preview, applied to the lab table
for lab_key in eb1_features_mapping.keys():

    quartile1, quartile3 = eb1_features_lab_df[lab_key].quantile([0.05, 0.95])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    # Values that are not strictly inside the fences are replaced by None
    eb1_features_lab_df[lab_key] = eb1_features_lab_df[lab_key].apply(
        lambda x: x if (x > lower_bound and x < upper_bound) else None
    )

In [None]:
# NaN count per feature
print(eb1_features_lab_df.isna().sum() / len(eb1_features_lab_df) * 100)
print()
print("Total number of samples: ", len(eb1_features_lab_df))

In [None]:
# NaN count per sample
eb1_features_lab_df.isna().sum(axis=1).hist()

In [None]:
# count of the samples with zero NaNs
(eb1_features_lab_df.isna().sum(axis=1) == 0).sum()

## Sensor

### EB0

#### Remove invalid samples

In [None]:
# Keep only the EB0 sensor rows whose spectral alarm value is below 3
# (rows where the alarm is NaN do not satisfy the comparison and are dropped).
eb0_alarm_below_3 = eb0_sensor_df["ALARMA ESPECTRAL"].lt(3)
eb0_sensor_df = eb0_sensor_df.loc[eb0_alarm_below_3]

#### Remove Outliers

In [None]:
# Exploratory comparison, for every mapped EB0 feature, of the lab values and
# of the sensor values with/without zeros, before/after IQR outlier masking.
# Nothing is modified here: values outside the fences are only replaced by NaN
# in the temporary DataFrame that is plotted.

for lab_key, sensor_key in eb0_features_mapping.items():
    sensor_values = eb0_sensor_df[sensor_key]

    # Fences at 1.5 * IQR beyond the quartiles, computed on the whole sensor
    # column (zeros included)
    quartile1, quartile3 = sensor_values.quantile([0.25, 0.75])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    eb0_without_zeros = sensor_values[sensor_values != 0]

    # Vectorized masking: a value is kept only if it lies strictly inside the
    # fences; NaN fails both comparisons and therefore stays NaN
    after_iqr = sensor_values.where(
        (sensor_values > lower_bound) & (sensor_values < upper_bound)
    )
    after_iqr_without_zeros = eb0_without_zeros.where(
        (eb0_without_zeros > lower_bound) & (eb0_without_zeros < upper_bound)
    )

    # Temporary DataFrame with the column values before and after the IQR
    # operation (the series are combined on their index labels)
    temp_df = pd.DataFrame(
        {
            "Lab": eb0_features_lab_df[lab_key],
            "Before IQR": sensor_values,
            "Before IQR w/o zeros": eb0_without_zeros,
            "After IQR": after_iqr,
            "After IQR w/o zeros": after_iqr_without_zeros,
        }
    )

    # A single figure per feature, split into a left and a right half
    fig = plt.figure(figsize=(40, 20))
    gs0 = gridspec.GridSpec(1, 2, figure=fig)

    # Left half: box plot of every column of the temporary DataFrame
    gs00 = gridspec.GridSpecFromSubplotSpec(1, 1, subplot_spec=gs0[0])
    sns.boxplot(
        data=temp_df,
        palette="Set2",
        saturation=0.5,
        whis=1.5,
        fliersize=3,
        ax=fig.add_subplot(gs00[0]),
    )

    # Right half: 2x2 grid with one histogram per sensor-derived column
    # (the "Lab" column is skipped)
    gs01 = gridspec.GridSpecFromSubplotSpec(2, 2, subplot_spec=gs0[1])
    for i, column in enumerate(temp_df.columns[1:]):
        hist_ax = fig.add_subplot(gs01[i])
        sns.histplot(
            data=temp_df[column],
            kde=True,
            stat="density",
            ax=hist_ax,
        )
        # Title each panel so that the four histograms can be told apart
        hist_ax.set_title(column)

    fig.suptitle(lab_key)
    fig.tight_layout()
    plt.show()

In [None]:
list(eb0_features_mapping.values())

In [None]:
# Outliers are removed using the IQR method w/o zeros
# The features that can have 0 values are: RATIO_HG, TERBOLESA

for sensor_key in eb0_features_mapping.values():
    if sensor_key not in ["RATIO_HG", "TERBOLESA"]:
        eb0_sensor_df = eb0_sensor_df[eb0_sensor_df[sensor_key] > 0]

    quartile1, quartile3 = eb0_sensor_df[sensor_key].quantile([0.25, 0.75])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    eb0_sensor_df[sensor_key] = eb0_sensor_df[sensor_key].apply(
        lambda x: x if (x > lower_bound and x < upper_bound) else None
    )

### EB1

#### Remove invalid samples

In [None]:
# Keep the EB1 sensor rows whose spectral alarm is below 3 or missing (NaN).
eb1_alarm_below_3 = eb1_sensor_df["ALARMA SPECTRAL"].lt(3)
eb1_alarm_missing = eb1_sensor_df["ALARMA SPECTRAL"].isna()
eb1_sensor_df = eb1_sensor_df.loc[eb1_alarm_below_3 | eb1_alarm_missing]

#### Remove outliers

In [None]:
"""
Si potrebbe pensare di utilizzare come lower e upper bound per rimuovere gli outliers
dai samples dei sensori i valori di 0.25 e 0.75 dei samples da laboratorio
"""

In [None]:
# Exploratory comparison, for every mapped EB1 feature, of the lab values and
# of the sensor values with/without non-positive readings, before/after IQR
# outlier masking.
# Nothing is modified here: values outside the fences are only replaced by NaN
# in the temporary DataFrame that is plotted.

for lab_key, sensor_key in eb1_features_mapping.items():
    sensor_values = eb1_sensor_df[sensor_key]

    # Fences at 1.5 * IQR beyond the quartiles, computed on the whole sensor
    # column (zeros included)
    quartile1, quartile3 = sensor_values.quantile([0.25, 0.75])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    eb1_without_zeros = sensor_values[sensor_values > 0]

    # Vectorized masking: a value is kept only if it lies strictly inside the
    # fences; NaN fails both comparisons and therefore stays NaN
    after_iqr = sensor_values.where(
        (sensor_values > lower_bound) & (sensor_values < upper_bound)
    )
    after_iqr_without_zeros = eb1_without_zeros.where(
        (eb1_without_zeros > lower_bound) & (eb1_without_zeros < upper_bound)
    )

    # Temporary DataFrame with the column values before and after the IQR
    # operation (the series are combined on their index labels)
    temp_df = pd.DataFrame(
        {
            "Lab": eb1_features_lab_df[lab_key],
            "Before IQR": sensor_values,
            "Before IQR w/o zeros": eb1_without_zeros,
            "After IQR": after_iqr,
            "After IQR w/o zeros": after_iqr_without_zeros,
        }
    )

    # A single figure per feature, split into a left and a right half
    fig = plt.figure(figsize=(40, 20))
    gs0 = gridspec.GridSpec(1, 2, figure=fig)

    # Left half: box plot of every column of the temporary DataFrame
    gs00 = gridspec.GridSpecFromSubplotSpec(1, 1, subplot_spec=gs0[0])
    sns.boxplot(
        data=temp_df,
        palette="Set2",
        saturation=0.5,
        whis=1.5,
        fliersize=3,
        ax=fig.add_subplot(gs00[0]),
    )

    # Right half: 2x2 grid with one histogram per sensor-derived column
    # (the "Lab" column is skipped)
    gs01 = gridspec.GridSpecFromSubplotSpec(2, 2, subplot_spec=gs0[1])
    for i, column in enumerate(temp_df.columns[1:]):
        hist_ax = fig.add_subplot(gs01[i])
        sns.histplot(
            data=temp_df[column],
            kde=True,
            stat="density",
            ax=hist_ax,
        )
        # Title each panel so that the four histograms can be told apart
        hist_ax.set_title(column)

    fig.suptitle(lab_key)
    fig.tight_layout()
    plt.show()

In [None]:
list(eb1_features_mapping.values())

In [None]:
# Outliers are removed using the IQR method w/o zeros
for sensor_key in eb1_features_mapping.values():
    eb1_sensor_df = eb1_sensor_df[
        (eb1_sensor_df[sensor_key] > 0) | (eb1_sensor_df[sensor_key].isna())
    ]

    quartile1, quartile3 = eb1_sensor_df[sensor_key].quantile([0.25, 0.75])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (1.5 * iqr)
    upper_bound = quartile3 + (1.5 * iqr)

    eb1_sensor_df[sensor_key] = eb1_sensor_df[sensor_key].apply(
        lambda x: x if (x > lower_bound and x < upper_bound) else None
    )

# Missing Values

## Lab

### EB0

In [None]:
# NaN count per feature
print(eb0_features_lab_df.isna().sum() / len(eb0_features_lab_df) * 100)
print()
print("Total number of samples: ", len(eb0_features_lab_df))

In [None]:
# NaN count per sample
eb0_features_lab_df.isna().sum(axis=1).hist()

In [None]:
# (disabled) remove features with less than 60% of non-NaN values
# eb0_features_lab_df = eb0_features_lab_df.dropna(
#     thresh=len(eb0_features_lab_df) * 0.6, axis=1
# )

In [None]:
# NaN count per sample
eb0_features_lab_df.isna().sum(axis=1).hist()

In [None]:
# find sampling frequency
eb0_features_lab_df["DateTime"].diff().value_counts()

In [None]:
eb0_features_lab_df = eb0_features_lab_df.resample("D", on="DateTime").median()

In [None]:
# Fixing Conductivity 11-2022 outliers, multiply by 10

In [None]:
# Daily conductivity as a line, with the readings below 600 drawn on top as
# scatter points
low_conductivity_df = eb0_features_lab_df[
    eb0_features_lab_df["Conductivitat a 20oC"] < 600
]

plt.figure(figsize=(20, 10))
sns.lineplot(data=eb0_features_lab_df, x="DateTime", y="Conductivitat a 20oC")
sns.scatterplot(data=low_conductivity_df, x="DateTime", y="Conductivitat a 20oC")
plt.show()

# Tabular view of the same readings
low_conductivity_df

In [None]:
# Fixing Conductivity 11-2022 outliers, multiply by 10
eb0_features_lab_df.loc[
    (eb0_features_lab_df.index == "2022-11-02")
    | (eb0_features_lab_df.index == "2022-11-29"),
    "Conductivitat a 20oC",
] *= 10

In [None]:
# interpolate missing values
eb0_features_lab_df = eb0_features_lab_df.interpolate(method="time")

In [None]:
eb0_features_lab_df.isna().sum()

In [None]:
eb0_features_lab_df.fillna(method="bfill", inplace=True)

In [None]:
# One time-series line plot per EB0 lab feature
eb0_lab_plot_columns = eb0_features_lab_df.columns.difference(["DateTime"])
for column in eb0_lab_plot_columns:
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(data=eb0_features_lab_df, x="DateTime", y=column, ax=ax)
    ax.set_title(column)
    plt.show()

### EB1

In [None]:
# Percentage of NaNs per feature, followed by the number of samples
print(eb1_features_lab_df.isna().sum().div(len(eb1_features_lab_df)).mul(100))
print("\nTotal number of samples: ", len(eb1_features_lab_df))

EB0

Color                                     25.487365
Conductivitat a 20oC                       5.703971
Mercuri                                    7.364621
Quantitat més abundant de partícules    94.151625
Sulfats                                   84.187726
Temperatura                               61.227437
Terbolesa                                  5.703971
pH                                         5.703971
Índex UV                                 92.707581

In [None]:
# Histogram of the number of NaNs found in each sample (row)
eb1_features_lab_df.isna().sum(axis="columns").hist()

In [None]:
# (disabled) remove features with less than 60% of non-NaN values
# eb1_features_lab_df = eb1_features_lab_df.dropna(
#     thresh=len(eb1_features_lab_df) * 0.6, axis=1
# )

In [None]:
# NaN count per sample
eb1_features_lab_df.isna().sum(axis=1).hist()

In [None]:
# find sampling frequency
eb1_features_lab_df["DateTime"].diff().value_counts()

In [None]:
eb1_features_lab_df = eb1_features_lab_df.resample("D", on="DateTime").median()

In [None]:
# interpolate missing values
eb1_features_lab_df = eb1_features_lab_df.interpolate(method="time")

In [None]:
eb1_features_lab_df.isna().sum()

In [None]:
# One time-series line plot per EB1 lab feature
eb1_lab_plot_columns = eb1_features_lab_df.columns.difference(["DateTime"])
for column in eb1_lab_plot_columns:
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(data=eb1_features_lab_df, x="DateTime", y=column, ax=ax)
    ax.set_title(column)
    plt.show()

In [None]:
# Percentage of NaNs per target column, followed by the number of samples
print(eb1_targets_lab_df.isna().sum().div(len(eb1_targets_lab_df)).mul(100))
print("\nTotal number of samples: ", len(eb1_targets_lab_df))

## Sensor

### EB0

In [None]:
# NaN count per feature
print(eb0_sensor_df.isna().sum() / len(eb0_sensor_df) * 100)
print()
print("Total number of samples: ", len(eb0_sensor_df))

In [None]:
# NaN count per sample
eb0_sensor_df.isna().sum(axis=1).hist()

#### Drop useless and NaN columns

In [None]:
# EB0 sensor columns regarded as not useful.
# NOTE: the same variable name is reassigned later with the EB1 column list.
useless_columns = [
    "ALARMA ESPECTRAL",
    "OX",
    "PARTICULES",
    "RATIO_TERB_SIG",
    "RATIO_TLF_UV",
    "SULFAT",
    "TLF",
    "VIS436",
    "VIS525",
    "VIS620",
]

In [None]:
# The drop is currently disabled, so the columns listed above stay in the table
# eb0_sensor_df = eb0_sensor_df.drop(useless_columns, axis=1)

#### Imputation for missing values

In [None]:
# find sampling frequency
eb0_sensor_df["DateTime"].diff().value_counts()

In [None]:
eb0_sensor_df = eb0_sensor_df.resample("D", on="DateTime").median()

In [None]:
# interpolate missing values
eb0_sensor_df = eb0_sensor_df.interpolate(method="time")

In [None]:
eb0_features_lab_df.isna().sum()

In [None]:
# One time-series line plot per EB0 sensor column
eb0_sensor_plot_columns = eb0_sensor_df.columns.difference(["DateTime"])
for column in eb0_sensor_plot_columns:
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(data=eb0_sensor_df, x="DateTime", y=column, ax=ax)
    ax.set_title(column)
    plt.show()

### EB1

In [None]:
# Column names of the EB1 sensor table
eb1_sensor_df.columns.to_list()

In [None]:
# Column names of the EB1 lab feature table
eb1_features_lab_df.columns.to_list()

In [None]:
# EB1 sensor columns regarded as not useful
# (this overwrites the list defined earlier for EB0)
useless_columns = [
    "ALARMA SPECTRAL",
    "DOC",
    "ORP",
    "PARTICULES",
    "TEMPERATURA",
    "TLF",
    "TOC",
    "UVA",
]

In [None]:
# The drop is currently disabled, so the columns listed above stay in the table
# eb1_sensor_df = eb1_sensor_df.drop(useless_columns, axis=1)

#### Imputation for missing values

In [None]:
# find sampling frequency
eb1_sensor_df["DateTime"].diff().value_counts()

In [None]:
eb1_sensor_df = eb1_sensor_df.resample("D", on="DateTime").median()

In [None]:
# interpolate missing values
eb1_sensor_df = eb1_sensor_df.interpolate(method="time")

In [None]:
eb1_sensor_df.isna().sum()

In [None]:
eb1_sensor_df.fillna(method="bfill", inplace=True)

In [None]:
# One time-series line plot per EB1 sensor column
eb1_sensor_plot_columns = eb1_sensor_df.columns.difference(["DateTime"])
for column in eb1_sensor_plot_columns:
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.lineplot(data=eb1_sensor_df, x="DateTime", y=column, ax=ax)
    ax.set_title(column)
    plt.show()

# Show and Save Results

In [None]:
# Display the processed EB0 lab feature table
eb0_features_lab_df

In [None]:
# Display the processed EB1 lab feature table
eb1_features_lab_df

In [None]:
# Display the processed EB0 sensor table
eb0_sensor_df

In [None]:
# Display the processed EB1 sensor table
eb1_sensor_df

In [None]:
eb0_features_lab_df.reset_index(inplace=True)
eb1_features_lab_df.reset_index(inplace=True)

eb0_sensor_df.reset_index(inplace=True)
eb1_sensor_df.reset_index(inplace=True)

In [None]:
eb0_features_lab_df.to_excel(
    os.path.join(clean_data_folder, "EB0_Features_Lab.xlsx"), index=False
)

In [None]:
eb0_targets_lab_df.to_excel(
    os.path.join(clean_data_folder, "EB0_Targets_Lab.xlsx"), index=False
)

In [None]:
eb1_targets_lab_df.to_excel(
    os.path.join(clean_data_folder, "EB1_Targets_Lab.xlsx"), index=False
)

In [None]:
eb1_features_lab_df.to_excel(
    os.path.join(clean_data_folder, "EB1_Features_Lab.xlsx"), index=False
)

In [None]:
eb0_sensor_df.to_excel(
    os.path.join(clean_data_folder, "EB0_Sensor.xlsx"), index=False
)

In [None]:
eb1_sensor_df.to_excel(
    os.path.join(clean_data_folder, "EB1_Sensor.xlsx"), index=False
)