# Import Libraries

In [266]:
from glob import glob
import pandas as pd
import numpy as np
from typing import List
from sklearn.base import BaseEstimator, TransformerMixin

# Data Loading

In [267]:
# Collect every CSV file in the data directory. `glob` returns matches in
# arbitrary (filesystem-dependent) order, so the paths are sorted to keep the
# position of each dataset in `datasets` reproducible across runs and machines.
datasets_paths = sorted(glob("../data/*.csv"))

# Load each CSV into its own DataFrame, in the same order as `datasets_paths`.
datasets: List[pd.DataFrame] = [pd.read_csv(path) for path in datasets_paths]

# Data Preparation

- Replace the `value` of every record flagged with `anomaly = True` by `NaN`, so that it can be imputed later.
- Remove the `anomaly` column from the dataset.

In [268]:
for position in range(len(datasets)):
    frame = datasets[position]

    # Frames without an 'anomaly' column are left untouched
    if "anomaly" not in frame.columns:
        continue

    # Blank out the 'value' of every record flagged as an anomaly
    is_anomaly = frame["anomaly"] == 1
    frame.loc[is_anomaly, "value"] = np.nan

    # Store the frame back without its 'anomaly' column
    datasets[position] = frame.drop(columns=["anomaly"])

- Convert the timestamp column to the `datetime` datatype
- Set the timestamp column as the index of the dataframe

In [269]:
for i, dataset in enumerate(datasets):
    # Frames already indexed by 'timestamp' (e.g. when this cell is re-run) no
    # longer have a 'timestamp' column; looking it up would raise a KeyError,
    # so they are skipped. Frames that never had the column still fail loudly.
    if dataset.index.name == "timestamp":
        continue

    # Convert the 'timestamp' column to datetime and set it as the index.
    # `assign` builds a new frame, so the frame currently stored in the list
    # is not modified in place.
    datasets[i] = dataset.assign(
        timestamp=pd.to_datetime(dataset["timestamp"])
    ).set_index("timestamp")

- Mask outliers: set every row whose absolute z-score exceeds the threshold to `NaN`

In [270]:
class OutlierTransformer(BaseEstimator, TransformerMixin):
    """Mask outlier rows of a numeric DataFrame using per-column z-scores.

    A row is an outlier when, in at least one column, the absolute z-score of
    its value is greater than ``threshold``. Outlier rows are not dropped:
    every value in such a row is replaced by NaN, so the number of rows is
    preserved.

    Parameters
    ----------
    threshold : float, default=3
        Absolute z-score above which a value is considered an outlier.
    """

    def __init__(self, threshold=3):
        self.threshold = threshold

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def __zscore(self, X):
        """Return the z-score of every entry of the 1-D float array ``X``.

        The mean and standard deviation are computed from the finite entries
        only, so missing values do not distort them (substituting 0 for NaN
        would pull the mean towards 0 and change the spread). NaN entries get
        a z-score of 0 and are therefore never flagged. If the column has no
        finite entries or no spread, all z-scores are 0.
        """
        z_scores = np.zeros(X.shape, dtype=float)

        finite = np.isfinite(X)
        if not finite.any():
            return z_scores

        mean = X[finite].mean()
        std = X[finite].std()
        if std == 0:
            # Constant column: avoid dividing by zero.
            return z_scores

        present = ~np.isnan(X)
        z_scores[present] = (X[present] - mean) / std
        return z_scores

    def transform(self, X: pd.DataFrame, y=None):
        """
        Calculate the z-score for each column in the DataFrame and set the rows
        with |z-score| > threshold to NaN.

        Parameters
        ----------
        X : pd.DataFrame
            Numeric data; it is not modified.
        y : ignored

        Returns
        -------
        np.ndarray
            Float array with the same shape as ``X`` in which every outlier
            row is filled with NaN.
        """
        # Work on a float copy: NaN cannot be stored in an integer array.
        values = X.to_numpy(dtype=float, copy=True)

        # Score every column against the untouched input first, so that rows
        # masked because of one column do not influence the statistics of the
        # columns evaluated after it.
        outlier_rows = np.zeros(values.shape[0], dtype=bool)
        for i in range(values.shape[1]):
            z_scores = self.__zscore(values[:, i])
            outlier_rows |= np.abs(z_scores) > self.threshold

        values[outlier_rows] = np.nan
        return values

In [272]:
# Run a fresh OutlierTransformer over every dataset and wrap the resulting
# array back into a DataFrame that keeps the original columns and index.
# Note: this rebinds `datasets`, so re-running the cell filters its own output.
datasets = [
    pd.DataFrame(
        data=OutlierTransformer().fit_transform(dataset),
        columns=dataset.columns,
        index=dataset.index,
    )
    for dataset in datasets
]

timestamp,value
2021-07-01 00:00:00,0.41651
2021-07-01 01:00:00,
2021-07-01 02:00:00,0.536902
2021-07-01 03:00:00,
2021-07-01 04:00:00,0.707928


# Data Preprocessing Pipeline

## Data Cleaning