Common settings

In [51]:
# import of auto-sklearn version that supports SMOTE, source has to be downloaded localy
if True:
    import sys
    sys.path.insert(0, "../my_autosklearn")
from enum import Enum


SEED = 0
K_FOLDS = 5
BASE_LEARNER = "RF"


class Task(Enum):
    DATA_IMBALANCE = 1
    FEATURE_INADEQUACY = 2
    SEMI_SUPERVISED = 3
    NOISY_DATA = 4


class LearnerType(Enum):
    CLASSIFICATION = 1
    REGRESSION = 2

XGB settings

In [52]:
from math import floor, sqrt
from xgboost import XGBClassifier, XGBRFRegressor


# TODO: figure out parameters, especially objective 
def get_XGBModel(X, y, base_learner, task):
    m = X.shape[1]

    if task == LearnerType.CLASSIFICATION:
        XGBModel = XGBClassifier
        objective = "binary:logistic"
    elif task == LearnerType.REGRESSION:
        XGBModel = XGBRFRegressor
        objective = "binary:logistic"
    else:
        assert False, f"Wrong arguent: {task}."

    if base_learner == "RF":
        return XGBModel(
            learning_rate = 0.2, # 0.0 - 1.0, log, not sure if each forest can have its own, it might lead to overfitting anyway
            max_depth = 20, # 2 - 50, 6 - 20 is another more conservative option
            subsample = 0.63, # 0.0 - 1.0
            # u can set only one of the colsample_by*
            # colsample_bytree: Optional[float] = None,
            # colsample_bylevel: Optional[float] = None,
            colsample_bynode = floor(sqrt(m))/m, # m..num_of_features, 0 - m, log?
            n_estimators = 10, # 100 - 500 = number of random forests in booster
            num_parallel_tree = 10, # 100 - 500 = number of trees in each random forest
            reg_lambda = 0, # -10. - 10.0, log = prunning of trees, higher value -> more prunning, not sure if negative values do anything
            min_child_weight = 2, # 0.0 - 10.0, log, higher value -> less options to choose from when selecting new nodes in trees
            objective = objective, # list at https://xgboost.readthedocs.io/en/stable/parameter.html, search for objective
            seed=SEED, 
            seed_per_iteration=True,
        )
    elif base_learner == "DecisionTree":
        return XGBModel(
            objective=objective,
            seed=SEED,
            seed_per_iteration=True
        )
    else:
        assert False, f"Wrong arguent: {base_learner}."

Auto-sklearn settings

In [53]:

from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from ConfigSpace.configuration_space import ConfigurationSpace


class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, **kwargs):
        for key, val in kwargs.items():
            setattr(self, key, val)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "NoPreprocessing",
            "name": "NoPreprocessing",
            "handles_regression": True,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(feat_type=None, dataset_properties=None):
        return ConfigurationSpace()


import autosklearn.pipeline.components.data_preprocessing
autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)

In [54]:
from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from ConfigSpace.configuration_space import ConfigurationSpace


class XGBClassifier_(AutoSklearnClassificationAlgorithm):
    def __init__(self, **kwargs):
        self.estimator = None
        for key, val in kwargs.items():
            setattr(self, key, val)

    def fit(self, X, y, sample_weight=None):
        self.estimator = get_XGBModel(X, y, BASE_LEARNER, LearnerType.CLASSIFICATION).fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "xgboost",
            "name": "xgboost",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(feat_type=None, dataset_properties=None):
        return ConfigurationSpace()  # TODO: optimize some of the parameters

import autosklearn.pipeline.components.classification
autosklearn.pipeline.components.classification.add_classifier(XGBClassifier_)

In [55]:
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import make_scorer, roc_auc, f1

from sklearn.model_selection import StratifiedKFold
from os.path import exists
import shutil


def minority_precision_(y_test, y_pred):
    from sklearn.metrics import precision_score
    return precision_score(y_test, y_pred, pos_label=1, average='binary', zero_division=0.0)


minority_precision = make_scorer(
    name="minority_precision",
    score_func=minority_precision_,
    optimum=1,
    greater_is_better=True,
    needs_proba=False,
    needs_threshold=False,
)


TEMP = "temp_folder"


# TODO: figure out parameters
def get_AutoSklearnClassifier(X, y, task):
    if task == Task.DATA_IMBALANCE:
        numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
        can_apply_smote = len(numerical_features) == X.shape[1]
        metric=[roc_auc, minority_precision]
        include={
            "data_preprocessor": ["NoPreprocessing"],
            "balancing": ["none", "weighting"] + (["SVMSMOTE"] if can_apply_smote else []),
            "feature_preprocessor": ["no_preprocessing"],
            "classifier": ["XGBClassifier_"]
        }
    elif task == Task.INADEQUATE_FEATURES:
        metric=[f1, minority_precision]
        include={
            "data_preprocessor": ["NoPreprocessing"],
            "balancing": ["none"],
            "classifier": ["XGBClassifier_"]
        }
    else:
        assert False, f"Wrong arguent: {task}."

    if exists(TEMP):
        shutil.rmtree(TEMP, ignore_errors=True)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)
    return AutoSklearnClassifier(
        time_left_for_this_task=5*60,
        metric=metric,
        initial_configurations_via_metalearning=0,
        ensemble_class=None,
        include=include,
        resampling_strategy=skf,
        seed=SEED,
        tmp_folder=TEMP,
        delete_tmp_folder_after_terminate=False,
        n_jobs=1,
        memory_limit=3000,
        per_run_time_limit=5*60
    ).fit(X, y)

Dataset loading

In [67]:
from sklearn.datasets import fetch_openml
import numpy as np


# TODO: find all of the missing datasets
IDS = [
    # ("breast-w", 15),
    # ("credit-approval", 29),
    # ("credit-g", 31),
    # ("diabetes", 37),
    # ("sick", 38),
    # ("spambase", 44),
    # ("tic-tac-toe", 50),
    # ("electricity", 151),
    # ("vowel", 307),
    # ("pc4", 1049),
    # ("pc3", 1050),
    ("JM1", 1053),
    # ("KC2", 1063),
    # ("kc1", 1067),
    # ("pc1", 1068),
    # ("bank-marketing", 1461),
    # ("blood-transfusion-service-center", 1464),
    # ("ilpd", 1480),
    # ("madelon", 1485),
    # ("nomao", 1486),
    # ("ozone-level-8hr", 1487),
    # ("phoneme", 1489),
    # ("qsar-biodeg", 1494),
    # ("adult", 1590),
    # ("Bioresponse", 4134),
    # ("cylinder-bands", 6332),
    # ("dresses-sales", 23381),
    # ("numerai28.6", 23517),
    # ("churn", 40701),
    # ("wilt", 40983),
    # ("climate-model-simulation-crashes", 40994),
]


DATASETS = []
for dataset_name, dataset_id in IDS:
    data = fetch_openml(data_id=dataset_id, parser="auto", as_frame=True)

    if data.frame.shape[0] < 1000:
        continue
    if len(data.target_names) != 1:
        continue
    target = data.target_names[0]
    if len(data.frame[target].unique()) != 2:
        continue

    if data.frame.isna().any().any():
        nan_percent = data.frame.isna().sum().sum() / data.frame.shape[0]
        assert nan_percent < 0.01, nan_percent
        data.frame.dropna(inplace=True)

    X = data.frame.drop(columns=[target])
    (l1, l2), (c1, c2) = np.unique(data.frame[target], return_counts=True)
    (c1, l1), (c2, l2) = sorted(((c1, l1), (c2, l2)))
    # NOTE: convention that minority label will always be 1
    y = data.frame[target] == l1
    DATASETS.append((dataset_name, X, y))

[name for name, _, _ in DATASETS]

2103 true 8777 false
Counter({False: 8777, True: 2103})


['JM1']

Utils for validation

In [71]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    confusion_matrix,
    recall_score, 
    precision_score, 
    f1_score, 
    fbeta_score, 
    roc_auc_score,
    roc_curve
)


# TODO: which metric and how to compute for which task?
# TODO: measure train time and inference time for each fold
def validate(X, y, model, task):
    res = {}
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        if hasattr(model, "refit"):
            model = model.refit(X_train, y_train)
        else:
            # NOTE:  XGB docs says that calling fit() multiple times
            # will cause re-fit, unless xgb_model parameter
            # is provided explicitly
            model = model.fit(X_train, y_train)

        if task == Task.DATA_IMBALANCE:
            y_prob = model.predict_proba(X_test)[:, 1]
            fpr, tpr, thresholds = roc_curve(y_test, y_prob)
            y_pred = thresholds[(tpr - fpr).argmax()] < y_prob

            res[fold] = {
                "auc_roc": roc_auc_score(y_test, y_prob),
                "confusion_matrix": confusion_matrix(y_test, y_pred),

                "minority_precision": precision_score(y_test, y_pred, pos_label=1, average='binary', zero_division=0.0),
                "majority_precision": precision_score(y_test, y_pred, pos_label=0, average='binary', zero_division=0.0),
                "weighted_precision": precision_score(y_test, y_pred, average='weighted', zero_division=0.0),
                "macro_precision": precision_score(y_test, y_pred, average='macro', zero_division=0.0),
                
                "minority_recall": recall_score(y_test, y_pred, pos_label=1, average='binary', zero_division=0.0),
                "majority_recall": recall_score(y_test, y_pred, pos_label=0, average='binary', zero_division=0.0),
                "weighted_recall": recall_score(y_test, y_pred, average='weighted', zero_division=0.0),
                "macro_recall": recall_score(y_test, y_pred, average='macro', zero_division=0.0),
                
                "minority_f1": f1_score(y_test, y_pred, pos_label=1, average='binary', zero_division=0.0),
                "majority_f1": f1_score(y_test, y_pred, pos_label=0, average='binary', zero_division=0.0),
                "weighted_f1": f1_score(y_test, y_pred, average='weighted', zero_division=0.0),
                "macro_f1": f1_score(y_test, y_pred, average='macro', zero_division=0.0),
                
                "minority_f2": fbeta_score(y_test, y_pred, beta=2, pos_label=1, average='binary', zero_division=0.0),
                "majority_f2": fbeta_score(y_test, y_pred, beta=2, pos_label=0, average='binary', zero_division=0.0),
                "weighted_f2": fbeta_score(y_test, y_pred, beta=2, average='weighted', zero_division=0.0),
                "macro_f2": fbeta_score(y_test, y_pred, beta=2, average='macro', zero_division=0.0),
                
                "y_pred": [float(y) for y in y_pred],
                "y_prob": [float(y) for y in y_prob],
                "y_test": [float(y) for y in y_test],
            }
        else:
            pass  # TODO
    return res

Data imbalance

In [58]:
from imblearn.datasets import make_imbalance


IMBALANCED_DATSETS = []
ratios = [0.5, 0.25] + [r/100 for r in range(1, 21)]
for name, X, y in DATASETS:
    for ratio in sorted(ratios, reverse=True):
        (l1, l2), (c1, c2) = np.unique(data.frame[target], return_counts=True)
        (c1, l1), (c2, l2) = sorted(((c1, l1), (c2, l2)))
        new_minority_count = int(c2 * ratio)
        if c1 < new_minority_count:
            continue
        X, y = make_imbalance(X, y, sampling_strategy={0: c2, 1: new_minority_count}, random_state=SEED)
        IMBALANCED_DATSETS.append((name, ratio, X, y))

[(name, ratio) for name, ratio, _, _ in IMBALANCED_DATSETS]

[('JM1', 0.2),
 ('JM1', 0.19),
 ('JM1', 0.18),
 ('JM1', 0.17),
 ('JM1', 0.16),
 ('JM1', 0.15),
 ('JM1', 0.14),
 ('JM1', 0.13),
 ('JM1', 0.12),
 ('JM1', 0.11),
 ('JM1', 0.1),
 ('JM1', 0.09),
 ('JM1', 0.08),
 ('JM1', 0.07),
 ('JM1', 0.06),
 ('JM1', 0.05),
 ('JM1', 0.04),
 ('JM1', 0.03),
 ('JM1', 0.02),
 ('JM1', 0.01)]

In [72]:
imbalance_scores = {}
for name, ratio, X, y in IMBALANCED_DATSETS:
    model = get_AutoSklearnClassifier(X, y, Task.DATA_IMBALANCE)
    imbalance_scores[f"{name}(ratio={ratio:.2f})"] = validate(X, y, model, Task.DATA_IMBALANCE)

imbalance_scores

Noisy data

In [None]:
NOISY_DATSETS = []
noise_amount = [a/100 for a in range(1, 11)]
for name, X, y in DATASETS:
    for noise in sorted(noise_amount):
        # TODO: add 1% of noise to X each iteration, dont over write previous noise
        # X = ...
        pass

[(name, noise) for name, noise, _, _ in NOISY_DATSETS]

[]

In [None]:
class NoisyXGBClassifier:
    def fit(self, X, y):
        X_train, y_train = self.remove_noise(X, y)
        model = get_XGBModel(X_train, y_train, BASE_LEARNER, LearnerType.CLASSIFICATION)
        model = model.fit(X_train, y_train)
        return model

    def remove_noise(self, X, y):
        model = get_XGBModel(X, y, BASE_LEARNER, LearnerType.REGRESSION)
        model = model.fit(X, y)
        # TODO: create a new dataset instead of renaming labels
        X_transformed, y_transformed = X, model.predict(X)
        return X_transformed, y_transformed


noisy_scores = {}
for name, noise, X, y in NOISY_DATSETS:
    model = NoisyXGBClassifier()
    noisy_scores[f"{name}(noise={noise})"] = validate(X, y, model, Task.NOISY_DATA)

noisy_scores

{}

semi-supervised

In [None]:
HIDDEN_DATSETS = []
hidden_amount = [a/100 for a in range(1, 11)]
for name, X, y in DATASETS:
    for hidden in sorted(hidden_amount):
        # TODO: create hidden instances
        X, X_hidden = X, X
        HIDDEN_DATSETS.append((name, hidden, X, X_hidden, y))

[(name, hidden) for name, hidden, _, _, _ in HIDDEN_DATSETS]

[('JM1', 0.01),
 ('JM1', 0.02),
 ('JM1', 0.03),
 ('JM1', 0.04),
 ('JM1', 0.05),
 ('JM1', 0.06),
 ('JM1', 0.07),
 ('JM1', 0.08),
 ('JM1', 0.09),
 ('JM1', 0.1)]

In [None]:
from sklearn.ensemble import BaggingClassifier


# TODO: how to set up bagging
N_ESTIMATORS_BAGGING = 10
N_JOBS_BAGGING = 1

semi_scores = {}
for name, hidden, X, X_hidden, y in HIDDEN_DATSETS:
    while True: # TODO: while there are any hidden instances
        model = BaggingClassifier(
            estimator=get_XGBModel(X, y, BASE_LEARNER, LearnerType.CLASSIFICATION),
            n_estimators = N_ESTIMATORS_BAGGING,
            n_jobs = N_JOBS_BAGGING,
            random_state = SEED
        )
        model.fit(X, y)
        y_hidden = model.predict_proba(X_hidden)
        # TODO: select instances from X_hidden that most of the ensemble agrees on,
        # add them together with labels to X, y and remove from X_hidden
    model = get_XGBModel(X, y, BASE_LEARNER, LearnerType.CLASSIFICATION)
    semi_scores[f"{name}(noise={noise})"] = validate(X, y, model, Task.SEMI_SUPERVISED)


KeyboardInterrupt: 

Inadequate features

In [None]:
inadequate_features_scores = {}
for name, X, y in DATASETS:
    model = get_AutoSklearnClassifier(X, y, task=Task.INADEQUATE_FEATURES)
    inadequate_features_scores[name] = validate(X, y, model, Task.FEATURE_INADEQUACY)

inadequate_features_scores