In [1]:
# File names of the train/test CSV splits; the loading cell joins them onto the data directory.
training_dataset_name, testing_dataset_name = 'train2023.csv', 'test2023.csv'

In [None]:
import os
import pandas as pd

# Both splits are read from the local 'data' directory as semicolon-separated
# files without a header row, so columns get integer positions as names.
train_df = pd.read_csv(
    os.path.join('data', training_dataset_name),
    header=None,
    sep=';',
)
test_df = pd.read_csv(
    os.path.join('data', testing_dataset_name),
    header=None,
    sep=';',
)

In [None]:
import logging

# Configure the root logger at INFO level so the logging.info(...) progress
# messages emitted by the preprocessing steps are actually displayed.
logging.basicConfig(level=logging.INFO)

In [None]:
import abc

class Preprocessor(abc.ABC):
    """Abstract interface for a preprocessing step.

    Concrete subclasses must override both ``fit_transform`` and ``transform``.
    They are declared as regular instance methods (not classmethods) because
    implementations keep their configuration on ``self``.
    """

    @abc.abstractmethod
    def fit_transform(self, X, y):
        """Fit the step on ``(X, y)`` and return the processed ``(X, y)`` pair.

        The base implementation is a pass-through that returns its inputs
        unchanged; subclasses may reach it via ``super()``.
        """
        return X, y

    @abc.abstractmethod
    def transform(self, X):
        """Apply the step to ``X``.

        The base implementation does nothing and returns ``None``.
        """
        pass

In [None]:
import numpy as np

from sklearn.base import OutlierMixin

class OutlierTransformer(Preprocessor):
    """Preprocessor that removes outliers from the training data, class by class.

    For every label in ``class_labels`` the detector is re-fitted on the rows
    of that class only, and only the rows it marks with ``1`` are kept.
    """

    def __init__(self, outlier_detector: OutlierMixin, class_labels):
        """Store the detector and the labels to process.

        Args:
            outlier_detector: Object exposing ``fit_predict(X=...)`` whose
                result is compared against ``1`` to find the rows to keep.
            class_labels: Iterable of label values to look up in ``y``.
        """
        self._outlier_detector = outlier_detector
        self._class_labels = class_labels

    def fit_transform(self, X, y):
        """Drop per-class outliers and return the surviving samples.

        Args:
            X: Array indexed as ``X[mask, :]`` (one row per sample).
            y: Labels compared element-wise against each class label; expected
                to be 1-D and aligned with the rows of ``X``.

        Returns:
            ``(X, y)`` with the kept rows grouped by class in ``class_labels``
            order and ``y`` a 1-D array holding one label per kept row.
        """
        before = X.shape

        # Pair each label with its cleaned rows instead of indexing a list by
        # the label value, so labels need not be 0..k-1 in order.
        kept_per_class = []
        for label in self._class_labels:
            X_class = X[y == label, :]
            inlier_mask = self._outlier_detector.fit_predict(X=X_class) == 1
            kept_per_class.append((label, X_class[inlier_mask, :]))

        X = np.vstack([kept for _, kept in kept_per_class])
        # Build y as a flat (N,) vector so it stays aligned with the rows of X;
        # a (1, N) array is not a valid target for the downstream fit.
        y = np.hstack([np.full(kept.shape[0], label) for label, kept in kept_per_class])
        after = X.shape
        logging.info(f'CLEARING OUTLIERS: {before} -> {after}')

        return X, y

    def transform(self, X):
        """Return ``X`` unchanged; outliers are only removed in ``fit_transform``."""
        return X

In [None]:
class FeatureSelector(Preprocessor):
    """Preprocessor that delegates feature selection to a wrapped selector."""

    def __init__(self, feature_selector):
        """Store the selector.

        Args:
            feature_selector: Object exposing ``fit_transform(X, y)``; it is
                also assumed to expose ``transform(X)`` (true for scikit-learn
                selectors - confirm for custom ones).
        """
        self._feature_selector = feature_selector

    def fit_transform(self, X, y):
        """Fit the selector on ``(X, y)`` and return the reduced ``X`` with ``y``.

        Returns:
            ``(X, y)`` where ``X`` is the selector's output and ``y`` is passed
            through untouched, so the result can be unpacked by callers.
        """
        before = X.shape
        X = self._feature_selector.fit_transform(X, y)
        logging.info(f'SELECTING FEATURES: {before} -> {X.shape}')

        return X, y

    def transform(self, X):
        """Apply the already-fitted selector to ``X``."""
        return self._feature_selector.transform(X)

In [None]:
from sklearn.pipeline import Pipeline

class Scheme:

    def __init__(self, pipeline: Pipeline, preprocessors = None):
        if preprocessors is None:
            self._preprocessors = []
        self._pipeline = pipeline

    def fit(self, X, y):
        for preprocessors in self._preprocessors:
            X, y = preprocessors.fit_transform(X=X, y=y)
        self._pipeline.fit(X = X, y = y)