Auto-sklearn settings

In [None]:
# Seed reused by every randomised step below (CV splitter, make_imbalance, model seeds).
SEED = 0
# Passed to AutoSklearnClassifier as `time_left_for_this_task`.
TIME = 15
# Passed to AutoSklearnClassifier as `per_run_time_limit`.
TIME_PER_RUN = None
# Passed to AutoSklearnClassifier as `initial_configurations_via_metalearning`.
META = 0
# Passed to AutoSklearnClassifier as `tmp_folder`; deleted before each search if it exists.
TEMP = "temp_folder"
# Number of splits of the StratifiedKFold used for both the search and the evaluation.
K_FOLDS = 5
# Passed as `n_jobs` to AutoSklearnClassifier and BaggingClassifier.
N_JOBS = None
# Passed to AutoSklearnClassifier as `memory_limit`.
MEM = None

XGBClassifier settings

In [None]:
from math import floor, sqrt

# Hyper-parameters handed to XGBClassifier further down.
# NOTE: these assignments must not end with a comma. A trailing comma turns the
# value into a 1-tuple, e.g. `(0.2,)`, which then reaches XGBoost instead of
# the number and makes COLSAMPLE_BYNODE uncallable.
LEARNING_RATE = 0.2  # 0.0 - 1.0, log, not sure if each forest can have its own, it might lead to overfitting anyway

MAX_DEPTH = 20  # 2 - 50, 6 - 20 is another more conservative option
SUBSAMPLE = 0.63  # 0.0 - 1.0
# Only one of the colsample_by* options can be set:
# colsample_bytree: Optional[float] = None,
# colsample_bylevel: Optional[float] = None,


def COLSAMPLE_BYNODE(m):
    """Return the per-node column fraction floor(sqrt(m)) / m for m features."""
    # m..num_of_features, 0 - m, log?
    return floor(sqrt(m)) / m


N_ESTIMATORS = 10  # 100 - 500 = number of random forests in booster
NUM_PARALLEL_TREE = 10  # 100 - 500 = number of trees in each random forest
REG_LAMBDA = 0  # -10. - 10.0, log = pruning of trees, higher value -> more pruning, not sure if negative values do anything
MIN_CHILD_WEIGHT = 2  # 0.0 - 10.0, log, higher value -> less options to choose from when selecting new nodes in trees

OBJECTIVE = 'binary:logistic'  # list at https://xgboost.readthedocs.io/en/stable/parameter.html, search for objective
SEED_PER_ITERATION = True

XGBRegressor settings

In [None]:
from math import floor, sqrt

# Hyper-parameters handed to XGBRegressor further down.
# NOTE: these assignments must not end with a comma. A trailing comma turns the
# value into a 1-tuple, e.g. `(0.2,)`, which then reaches XGBoost instead of
# the number and makes COLSAMPLE_BYNODE_REGRESSION uncallable.
LEARNING_RATE_REGRESSION = 0.2  # 0.0 - 1.0, log, not sure if each forest can have its own, it might lead to overfitting anyway

MAX_DEPTH_REGRESSION = 20  # 2 - 50, 6 - 20 is another more conservative option
SUBSAMPLE_REGRESSION = 0.63  # 0.0 - 1.0
# Only one of the colsample_by* options can be set:
# colsample_bytree: Optional[float] = None,
# colsample_bylevel: Optional[float] = None,


def COLSAMPLE_BYNODE_REGRESSION(m):
    """Return the per-node column fraction floor(sqrt(m)) / m for m features."""
    # m..num_of_features, 0 - m, log?
    return floor(sqrt(m)) / m


N_ESTIMATORS_REGRESSION = 10  # 100 - 500 = number of random forests in booster
NUM_PARALLEL_TREE_REGRESSION = 10  # 100 - 500 = number of trees in each random forest
REG_LAMBDA_REGRESSION = 0  # -10. - 10.0, log = pruning of trees, higher value -> more pruning, not sure if negative values do anything
MIN_CHILD_WEIGHT_REGRESSION = 2  # 0.0 - 10.0, log, higher value -> less options to choose from when selecting new nodes in trees

# NOTE(review): this is the same objective as the classifier uses -- confirm it
# is the intended one for the regressor.
OBJECTIVE_REGRESSION = 'binary:logistic'  # list at https://xgboost.readthedocs.io/en/stable/parameter.html, search for objective
SEED_PER_ITERATION_REGRESSION = True

Dataset loading

In [None]:
from sklearn.datasets import fetch_openml
import numpy as np


# (dataset name, OpenML data_id) pairs to download with fetch_openml.
# Commented-out entries are currently disabled; only the active ones are fetched.
# TODO: find all of the missing datasets
IDS = [
    # ("breast-w", 15),
    # ("credit-approval", 29),
    # ("credit-g", 31),
    # ("diabetes", 37),
    # ("sick", 38),
    # ("spambase", 44),
    # ("tic-tac-toe", 50),
    # ("electricity", 151),
    # ("vowel", 307),
    # ("pc4", 1049),
    # ("pc3", 1050),
    ("JM1", 1053),
    # ("KC2", 1063),
    # ("kc1", 1067),
    # ("pc1", 1068),
    # ("bank-marketing", 1461),
    # ("blood-transfusion-service-center", 1464),
    # ("ilpd", 1480),
    # ("madelon", 1485),
    # ("nomao", 1486),
    # ("ozone-level-8hr", 1487),
    # ("phoneme", 1489),
    # ("qsar-biodeg", 1494),
    # ("adult", 1590),
    # ("Bioresponse", 4134),
    # ("cylinder-bands", 6332),
    # ("dresses-sales", 23381),
    # ("numerai28.6", 23517),
    # ("churn", 40701),
    # ("wilt", 40983),
    # ("climate-model-simulation-crashes", 40994),
]


# Collected (name, X, y) triples for every dataset that passes the filters below.
DATASETS = []
for dataset_name, dataset_id in IDS:
    # Download the dataset as a pandas frame.
    data = fetch_openml(data_id=dataset_id, parser="auto", as_frame=True)

    # Skip datasets with fewer than 1000 rows.
    if data.frame.shape[0] < 1000:
        continue
    # Skip datasets that do not have exactly one target column.
    if len(data.target_names) != 1:
        continue
    target = data.target_names[0]
    # Skip datasets whose target is not binary.
    if len(data.frame[target].unique()) != 2:
        continue

    X = data.frame.drop(columns=[target])
    # Labels and their counts; the two-way unpacking relies on the binary check above.
    (l1, l2), (c1, c2) = np.unique(data.frame[target], return_counts=True)
    # Order the (count, label) pairs so that l1 is the rarer label (ties fall back to label order).
    (c1, l1), (c2, l2) = sorted(((c1, l1), (c2, l2)))
    # Boolean target: True marks the rarer label.
    y = data.frame[target] == l1
    DATASETS.append((dataset_name, X, y))

# Cell output: names of the datasets that were kept.
[name for name, _, _ in DATASETS]

Utils for validation

In [None]:
from sklearn.metrics import (
    recall_score, 
    precision_score, 
    f1_score, 
    fbeta_score, 
    roc_auc_score,
    roc_curve
)


def validate(y_test, y_prob):
    """Score positive-class probabilities against the true binary labels.

    The probabilities are turned into hard predictions with the ROC threshold
    that maximises ``tpr - fpr`` (Youden's J statistic).

    Parameters:
        y_test: true binary labels.
        y_prob: predicted score/probability of the positive class per sample.

    Returns:
        dict with ``auc_roc``, weighted and macro precision/recall/F1/F2 of
        the thresholded predictions, and ``y_pred``, ``y_prob``, ``y_test``
        as lists of floats.
    """
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    best_threshold = thresholds[(tpr - fpr).argmax()]
    # roc_curve computes each (fpr, tpr) point with "score >= threshold" as the
    # positive rule, so the comparison has to be inclusive. A strict comparison
    # would label the samples sitting exactly on the threshold as negative and
    # the predictions would no longer match the selected ROC point.
    y_pred = best_threshold <= y_prob
    return {
        "auc_roc": roc_auc_score(y_test, y_prob),
        "weighted_precision": precision_score(y_test, y_pred, average='weighted', zero_division=0.0),
        "macro_precision": precision_score(y_test, y_pred, average='macro', zero_division=0.0),
        "weighted_recall": recall_score(y_test, y_pred, average='weighted', zero_division=0.0),
        "macro_recall": recall_score(y_test, y_pred, average='macro', zero_division=0.0),
        "weighted_f1": f1_score(y_test, y_pred, average='weighted', zero_division=0.0),
        "macro_f1": f1_score(y_test, y_pred, average='macro', zero_division=0.0),
        "weighted_f2": fbeta_score(y_test, y_pred, beta=2, average='weighted', zero_division=0.0),
        "macro_f2": fbeta_score(y_test, y_pred, beta=2, average='macro', zero_division=0.0),
        # Plain floats so the result can be serialised without numpy types.
        "y_pred": [float(y) for y in y_pred],
        "y_prob": [float(y) for y in y_prob],
        "y_test": [float(y) for y in y_test],
    }

Data imbalance

In [None]:
from imblearn.datasets import make_imbalance


# (name, ratio, X, y) for every minority/majority ratio that could be produced.
IMBALANCED_DATSETS = []
ratios = [0.5, 0.25] + [r/100 for r in range(1, 21)]
for name, X, y in DATASETS:
    # Count the majority class on THIS dataset's target. y is boolean with
    # True marking the minority class, so the majority is where y == 0.
    # (Reading the counts from `data.frame[target]` would use whatever dataset
    # the loading cell fetched last.)
    majority_count = int((y == 0).sum())
    # Largest ratio first: X, y are narrowed cumulatively, so each smaller
    # ratio is drawn from the previous, already under-sampled, data.
    for ratio in sorted(ratios, reverse=True):
        try:
            X, y = make_imbalance(
                X,
                y,
                sampling_strategy={0: majority_count, 1: int(majority_count * ratio)},
                random_state=SEED,
            )
        except ValueError as e:
            # The minority class is too small for this ratio: skip it and try the next one.
            if "With under-sampling methods, the number of samples in a class should be less or equal to the original number of samples." in str(e):
                continue
            raise
        IMBALANCED_DATSETS.append((name, ratio, X, y))
# Cell output: the (dataset, ratio) combinations that were generated.
[(name, ratio) for name, ratio, _, _ in IMBALANCED_DATSETS]

In [None]:
# Import the auto-sklearn version that supports SMOTE; its source has to be downloaded locally.
# NOTE(review): the `if True:` wrapper presumably keeps import sorters/formatters
# from hoisting these imports above the sys.path change -- confirm.
if True:
    import sys
    # Put the local checkout first on the module search path.
    sys.path.insert(0, "../my_autosklearn")
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.metrics import roc_auc, recall_weighted

    import autosklearn.pipeline.components.data_preprocessing
    from no_data_preprocessor import NoPreprocessing
    # Register the custom data preprocessor with auto-sklearn
    # (presumably the component selected as "no_preprocessing" in the searches below -- confirm).
    autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)


from os.path import exists
import shutil
from sklearn.model_selection import StratifiedKFold


# Per-fold validation results, keyed by "<dataset>(ratio=<r>)" and then by fold index.
imbalance_scores = {}
for name, ratio, X, y in IMBALANCED_DATSETS:

    # Start every search from a clean scratch directory.
    if exists(TEMP):
        shutil.rmtree(TEMP, ignore_errors=True)

    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

    # Search space: XGBoost only, no preprocessing, three balancing options.
    search_space = {
        "data_preprocessor": ["no_preprocessing"],
        "balancing": ["none", "weighting", "SVMSMOTE"],
        "feature_preprocessor": ["no_preprocessing"],
        "classifier": ["xgboost"],
    }
    automl = AutoSklearnClassifier(
        time_left_for_this_task=TIME,
        metric=[roc_auc, recall_weighted],
        initial_configurations_via_metalearning=META,
        include=search_space,
        resampling_strategy=skf,
        seed=SEED,
        tmp_folder=TEMP,
        delete_tmp_folder_after_terminate=True,
        n_jobs=N_JOBS,
        memory_limit=MEM,
        per_run_time_limit=TIME_PER_RUN,
    )
    model = automl.fit(X, y)

    fold_scores = {}
    imbalance_scores[f"{name}(ratio={ratio:.2f})"] = fold_scores
    # Re-split with the same splitter, refit on each training part and score the held-out part.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        model = model.refit(X_train, y_train)
        positive_prob = model.predict_proba(X_test)[:, 1]
        fold_scores[fold] = validate(y_test, positive_prob)

imbalance_scores

Noisy data

In [None]:
# Placeholder: meant to hold (name, noise, X, y) entries; nothing is appended yet.
NOISY_DATSETS = []
# Noise levels 0.01 .. 0.10.
noise_amount = [percent / 100 for percent in range(1, 11)]
for name, X, y in DATASETS:
    for noise in sorted(noise_amount):
        # TODO: add 1% of noise to X each iteration, don't overwrite previous noise
        # X = ...
        continue

# Cell output: the (dataset, noise) combinations that were generated.
[(name, noise) for name, noise, _, _ in NOISY_DATSETS]

In [None]:
from xgboost import XGBRegressor, XGBClassifier


def transform(X, y):
    """Fit an XGBRegressor on (X, y) and return X with the model's predictions as labels.

    Parameters:
        X: 2-D feature table (anything exposing ``.shape``).
        y: target values aligned with the rows of X.

    Returns:
        ``(X_transformed, y_transformed)`` where ``X_transformed`` is ``X``
        unchanged and ``y_transformed`` is ``model.predict(X)``.
    """
    # Take the feature count from the data that was passed in, so the function
    # does not depend on a global `m` being set by the calling cell.
    n_features = X.shape[1]
    model = XGBRegressor(
        learning_rate=LEARNING_RATE_REGRESSION,
        max_depth=MAX_DEPTH_REGRESSION,
        subsample=SUBSAMPLE_REGRESSION,
        colsample_bynode=COLSAMPLE_BYNODE_REGRESSION(n_features),
        n_estimators=N_ESTIMATORS_REGRESSION,
        num_parallel_tree=NUM_PARALLEL_TREE_REGRESSION,
        reg_lambda=REG_LAMBDA_REGRESSION,
        min_child_weight=MIN_CHILD_WEIGHT_REGRESSION,
        objective=OBJECTIVE_REGRESSION,
        seed=SEED,
        seed_per_iteration=SEED_PER_ITERATION_REGRESSION,
    )
    model.fit(X, y)
    # TODO: create a new dataset instead of renaming labels
    X_transformed, y_transformed = X, model.predict(X)
    return X_transformed, y_transformed


from sklearn.model_selection import StratifiedKFold

# Per-fold validation results, keyed by "<dataset>(noise=<n>)" and then by fold index.
noisy_scores = {}
# Build the splitter here so this cell does not rely on an `skf` left over
# from another cell's loop.
skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)
for name, noise, X, y in NOISY_DATSETS:
    # Number of features. X is indexed with .iloc below, so it is a DataFrame
    # and `X[0]` would be a column lookup rather than the first row.
    m = X.shape[1]
    noisy_scores[f"{name}(noise={noise})"] = {}
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # Replace the training labels with the regressor's predictions; the test part stays untouched.
        X_train, y_train = transform(X_train, y_train)
        model = XGBClassifier(
            learning_rate=LEARNING_RATE,
            max_depth=MAX_DEPTH,
            subsample=SUBSAMPLE,
            colsample_bynode=COLSAMPLE_BYNODE(m),
            n_estimators=N_ESTIMATORS,
            num_parallel_tree=NUM_PARALLEL_TREE,
            reg_lambda=REG_LAMBDA,
            min_child_weight=MIN_CHILD_WEIGHT,
            objective=OBJECTIVE,
            seed=SEED,
            seed_per_iteration=SEED_PER_ITERATION,
        ).fit(X_train, y_train)
        noisy_scores[f"{name}(noise={noise})"][fold] = validate(y_test, model.predict_proba(X_test)[:, 1])

Semi-supervised

In [None]:
# (name, hidden, X, X_hidden, y) for every dataset / hidden-fraction combination.
HIDDEN_DATSETS = []
# Fractions 0.01 .. 0.10.
hidden_amount = [a/100 for a in range(1, 11)]
for name, X, y in DATASETS:
    # Iterate this cell's own list of fractions, not the noise levels of the noisy-data cell.
    for hidden in sorted(hidden_amount):
        # TODO: create hidden instances
        X, X_hidden = X, X
        HIDDEN_DATSETS.append((name, hidden, X, X_hidden, y))

# Cell output: the (dataset, hidden fraction) combinations that were generated.
[(name, hidden) for name, hidden, _, _, _ in HIDDEN_DATSETS]

In [None]:
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier


# Self-training skeleton: fit a bagged XGBoost ensemble on the labelled part and
# score the hidden part. The step that moves confident instances over is still a TODO.
for name, hidden, X, X_hidden, y in HIDDEN_DATSETS:
    # Feature count of this dataset, instead of whatever `m` an earlier cell left behind.
    m = X.shape[1]
    while True:  # TODO: while there are any hidden instances
        model = BaggingClassifier(
            estimator=XGBClassifier(
                learning_rate=LEARNING_RATE,
                max_depth=MAX_DEPTH,
                subsample=SUBSAMPLE,
                colsample_bynode=COLSAMPLE_BYNODE(m),
                n_estimators=N_ESTIMATORS,
                num_parallel_tree=NUM_PARALLEL_TREE,
                reg_lambda=REG_LAMBDA,
                min_child_weight=MIN_CHILD_WEIGHT,
                objective=OBJECTIVE,
                seed=SEED,
                seed_per_iteration=SEED_PER_ITERATION,
            ),
            n_estimators=10,
            n_jobs=N_JOBS,
            random_state=SEED,
        )
        model.fit(X, y)
        y_hidden = model.predict_proba(X_hidden)
        # TODO: select instances from X_hidden that most of the ensemble agrees on,
        # add them together with labels to X, y and remove from X_hidden
        # Nothing shrinks X_hidden until the TODO above is implemented, so stop
        # after one round instead of refitting the same model forever.
        break


Inadequate features

In [None]:
# Import the auto-sklearn version that supports SMOTE; it has to be downloaded locally.
# NOTE(review): the `if True:` wrapper presumably keeps import sorters/formatters
# from hoisting these imports above the sys.path change -- confirm.
if True:
    import sys
    # Put the local checkout first on the module search path.
    sys.path.insert(0, "../my_autosklearn")
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.metrics import roc_auc, recall_weighted

    import autosklearn.pipeline.components.data_preprocessing
    from no_data_preprocessor import NoPreprocessing
    # Register the custom data preprocessor with auto-sklearn
    # (presumably the component selected as "no_preprocessing" in the search below -- confirm).
    autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)


from os.path import exists
import shutil
from sklearn.model_selection import StratifiedKFold


# Feature preprocessors auto-sklearn may choose from in the search below.
feature_preprocessor = [
    'densifier',
    'extra_trees_preproc_for_classification',
    'fast_ica',
    'feature_agglomeration',
    'kernel_pca',
    'kitchen_sinks',
    'liblinear_svc_preprocessor',
    'no_preprocessing',
    'nystroem_sampler',
    'pca',
    'polynomial',
    'random_trees_embedding',
    'select_percentile_classification',
    'select_rates_classification',
    'truncatedSVD',
]


# Per-fold validation results, keyed by dataset name and then by fold index.
inadequate_features_scores = {}
for name, X, y in DATASETS:

    # Start every search from a clean scratch directory.
    if exists(TEMP):
        shutil.rmtree(TEMP, ignore_errors=True)

    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

    # Search space: XGBoost only, no balancing, any of the listed feature preprocessors.
    search_space = {
        "data_preprocessor": ["no_preprocessing"],
        "balancing": ["none"],
        "feature_preprocessor": feature_preprocessor,
        "classifier": ["xgboost"],
    }
    automl = AutoSklearnClassifier(
        time_left_for_this_task=TIME,
        metric=[roc_auc, recall_weighted],
        initial_configurations_via_metalearning=META,
        include=search_space,
        resampling_strategy=skf,
        seed=SEED,
        tmp_folder=TEMP,
        delete_tmp_folder_after_terminate=True,
        n_jobs=N_JOBS,
        memory_limit=MEM,
        per_run_time_limit=TIME_PER_RUN,
    )
    model = automl.fit(X, y)

    fold_scores = {}
    inadequate_features_scores[name] = fold_scores
    # Re-split with the same splitter, refit on each training part and score the held-out part.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        model = model.refit(X_train, y_train)
        positive_prob = model.predict_proba(X_test)[:, 1]
        fold_scores[fold] = validate(y_test, positive_prob)

inadequate_features_scores