In [34]:
training_dataset_name = 'train2023.csv'
testing_dataset_name = 'test2023.csv'

In [35]:
import os
import pandas as pd

train_df = pd.read_csv(os.path.join('data', training_dataset_name), sep=';', header=None)
test_df = pd.read_csv(os.path.join('data', testing_dataset_name), sep=';', header=None)

classes = [1, 2, 3]

In [36]:
import logging

logging.basicConfig(level=logging.DEBUG)

In [37]:
import numpy as np

from sklearn.base import BaseEstimator


class OutlierTransformer:

    def __init__(self, outlier_detector, class_labels, params: dict = None):
        self._class_labels = class_labels

        self._outlier_detector = outlier_detector
        self.set_params(params=params)

    def get_params(self, deep: bool):
        return self._outlier_detector.get_params(deep=deep)

    def set_params(self, params: dict):
        if params is None:
            return
        params = {param_name: param_value for param_name, param_value in params.items() if
                  param_name in self._outlier_detector.get_params().keys()}
        self._outlier_detector.set_params(params=params)

    def fit_transform(self, X, y):
        before = X.shape
        X_separated_by_class = [X[y == i, :] for i in self._class_labels]
        X_separated_by_class_cleared = [
            X_separated_by_class[i][self._outlier_detector.fit_predict(X=X_separated_by_class[i]) == 1, :] for i in
            self._class_labels]
        X = np.vstack(X_separated_by_class_cleared)
        after = X.shape
        y = np.hstack([np.full((1, X_separated_by_class_cleared[i].shape[0]), i) for i in self._class_labels])
        logging.debug(f'CLEARING OUTLIERS: {before} -> {after}')

        return X, y

In [38]:
from sklearn.pipeline import Pipeline


class Scheme(BaseEstimator):

    def __init__(self, pipeline: Pipeline, outlier_detector=None, class_labels=classes, params=None):
        if outlier_detector is not None:
            self._outlier_transformer = OutlierTransformer(outlier_detector=outlier_detector, class_labels=classes,
                                                           params=params)
        else:
            self._outlier_transformer = None
        self._pipeline = pipeline

    def get_params(self, deep: bool):
        return self._pipeline.get_params(deep=deep) | self._outlier_transformer.get_params(deep=deep)

    def set_params(self, **params):
        self._outlier_transformer.set_params(params=params)
        self._pipeline.set_params(kwargs=params)

    def fit(self, X, y):
        if self._outlier_transformer is not None:
            X, y = self._outlier_transformer.fit_transform(X=X, y=y)
        self._pipeline.fit(X=X, y=y)

    def predict(self, X):
        return self._pipeline.predict(X=X)

In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

metrics = {"ACCURACY": accuracy_score, "F1": f1_score, "PRECISION": precision_score, "RECALL": recall_score,
           "AUC": roc_auc_score}
scorers = {metric_name: make_scorer(metric_callable) for metric_name, metric_callable in metrics.items()}

In [40]:
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = test_df.to_numpy()

In [41]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler

scheme_knn = Scheme(pipeline=Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('feature_selector', SelectPercentile(score_func=mutual_info_classif, percentile=50)),
    ('classifier', KNeighborsClassifier())
]), outlier_detector=LocalOutlierFactor())
params_scheme_knn = {'n_neighbours': list(range(1, 31))}
gs_scheme_knn = GridSearchCV(estimator=scheme_knn, param_grid=params_scheme_knn, scoring=scorers, refit='ACCURACY',
                             n_jobs=-1)
logging.info('KNN SCHEME')
logging.info('TRAINING')
gs_scheme_knn.fit(X=X_train, y=y_train)
logging.info('BEST SCORE: ' + gs_scheme_knn.best_score_)
logging.info('BEST PARAMS: ' + gs_scheme_knn.best_params_)
logging.info('TESTING')
y_predict = gs_scheme_knn.predict(X=X_test)

INFO:root:KNN SCHEME
INFO:root:TRAINING


TypeError: Scheme.__init__() got an unexpected keyword argument 'steps'