Common settings

In [2]:
from utils import (
    Task, LearnerType, Params,
    get_AutoSklearnClassifier, get_XGBModel, 
    AutoSklearnClassifier, XGBClassifier,
    dump_pkl, dump_json, load_pkl, load_json
)

Dataset loading

In [3]:
from sklearn.datasets import fetch_openml
import numpy as np


# Candidate datasets as (display name, OpenML data_id) pairs. The data_id is
# what gets passed to fetch_openml; the name is only used as a label/key.
IDS = [
    ("credit-approval/credit-rating", 29),
    ("credit-g", 31),
    ("diabetes/pima-diabetes", 37),
    ("spambase", 44),
    ("tic-tac-toe", 50),
    ("electricity", 151),
    ("pc4", 1049),
    ("pc3", 1050),
    ("JM1", 1053),
    ("KC2", 1063),
    ("kc1", 1067),
    ("pc1", 1068),
    ("bank-marketing/bank-marketing-full", 1461),
    ("blood-transfusion-service-center", 1464),
    ("ilpd", 1480),
    ("madelon", 1485),
    ("ozone-level-8hr", 1487),
    ("phoneme", 1489),
    ("qsar-biodeg", 1494),
    ("cylinder-bands", 6332),
    ("dresses-sales", 23381),
    ("churn", 40701),
    ("climate-model-simulation-crashes", 40994),
]


# Download every candidate, keep the binary ones with at least 1000 rows, and
# store them as (name, features, boolean target) triples.
DATASETS = []
for dataset_name, dataset_id in IDS:
    data = fetch_openml(data_id=dataset_id, parser="auto", as_frame=True)
    assert len(data.target_names) == 1
    target = data.target_names[0]
    frame = data.frame

    # Skip datasets that are too small or whose target is not binary.
    if frame.shape[0] < 1000 or len(frame[target].unique()) != 2:
        continue

    (label_a, label_b), (count_a, count_b) = np.unique(frame[target], return_counts=True)
    # Comparing (count, label) pairs selects the rarer label; an exact tie in
    # counts falls back to the ordering of the labels themselves.
    minority_label = min((count_a, label_a), (count_b, label_b))[1]

    # NOTE: convention that minority label will always be 1
    y = frame[target] == minority_label
    X = frame.drop(columns=[target])
    DATASETS.append((dataset_name, X, y))


[name for name, _, _ in DATASETS]

['credit-g',
 'spambase',
 'electricity',
 'pc4',
 'pc3',
 'JM1',
 'kc1',
 'pc1',
 'bank-marketing/bank-marketing-full',
 'madelon',
 'ozone-level-8hr',
 'phoneme',
 'qsar-biodeg',
 'churn']

Utils for validation

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from time import perf_counter
from sklearn.metrics import (
    confusion_matrix,
    recall_score, 
    precision_score, 
    f1_score, 
    fbeta_score, 
    roc_auc_score,
    roc_curve
)


def validate(X, y, model, task, name, ratio):
    """Cross-validate ``model`` on ``(X, y)`` and persist per-fold metrics.

    Does nothing when ``Params.VALIDATE`` is falsy. Otherwise previously stored
    results are loaded with ``load_json``, every fold that is not yet present
    is evaluated, and the merged results are written back with ``dump_json``.

    Args:
        X: Feature table; rows are selected with ``.iloc``.
        y: Binary target aligned with ``X``; rows are selected with ``.iloc``.
            The metrics below treat label 1 as the minority class.
        model: Either an ``AutoSklearnClassifier`` (the pipeline at position 0
            of ``get_models_with_weights()`` is evaluated) or an
            ``XGBClassifier``.
        task: ``Task`` member; ``Task.DATA_IMBALANCE`` enables threshold tuning.
        name: Dataset name, forwarded to ``load_json`` / ``dump_json``.
        ratio: Experiment parameter, forwarded to ``load_json`` / ``dump_json``.

    Raises:
        AssertionError: If ``model`` is of an unsupported type or a pipeline
            step is neither a resampler nor a transformer.
    """
    if not Params.VALIDATE:
        return

    # Split the model into preprocessing steps and the final classifier.
    if isinstance(model, AutoSklearnClassifier):
        pipeline = model.get_models_with_weights()[0][1]
        # Read the steps instead of popping them so the pipeline owned by the
        # caller is left intact and validate() can be called again.
        steps = list(pipeline.steps[:-1])
        _, clf = pipeline.steps[-1]
    elif isinstance(model, XGBClassifier):
        # A bare classifier has no preprocessing steps.
        steps = []
        clf = model
    else:
        assert False, f"Unsupported model type: {type(model).__name__}"

    res = load_json(Params.BASE_LEARNER, task, name, ratio)
    skf = StratifiedKFold(n_splits=Params.K_FOLDS, shuffle=True, random_state=Params.SEED)
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        # JSON object keys are strings, so use the string form for both the
        # lookup and the store.
        fold_key = str(fold)
        if fold_key in res:
            continue
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for _, step in steps:
            if hasattr(step, "fit_resample"):
                # Resamplers only ever touch the training data.
                X_train, y_train = step.fit_resample(X_train, y_train)
            elif hasattr(step, "fit_transform"):
                X_train = step.fit_transform(X_train, y_train)
                # The classifier is trained on transformed features, so the
                # held-out fold must go through the same fitted transformer.
                X_test = step.transform(X_test)
            elif hasattr(step, "fit") and hasattr(step, "transform"):
                X_train = step.fit(X_train, y_train).transform(X_train)
                X_test = step.transform(X_test)
            else:
                assert False, f"This step is not a transformer or resampler: {step}."

        train_time = perf_counter()
        clf = clf.fit(X_train, y_train)
        train_time = perf_counter() - train_time

        inference_time = perf_counter()
        y_prob = clf.predict_proba(X_test)[:, 1]
        inference_time = perf_counter() - inference_time

        if task == Task.DATA_IMBALANCE:
            # Use one half of the fold to pick a decision threshold and report
            # all metrics on the other half.
            y_test_threshold, y_test, y_prob_threshold, y_prob = train_test_split(
                y_test, y_prob, test_size=0.5, random_state=Params.SEED, stratify=y_test
            )
            fpr, tpr, thresholds = roc_curve(y_test_threshold, y_prob_threshold)
            # Weighted harmonic mean of TPR and TNR with weight 2 on TPR.
            score = 3 * tpr * (1 - fpr) / (2 * (1 - fpr) + tpr)
            # roc_curve defines each (fpr, tpr) point by `score >= threshold`,
            # so the prediction must use the same inclusive comparison.
            y_pred = y_prob >= thresholds[score.argmax()]
        else:
            y_pred = clf.predict(X_test)

        res[fold_key] = {
            "train_time": train_time,
            "inference_time": inference_time,

            "auc_roc": roc_auc_score(y_test, y_prob),
            "confusion_matrix": [int(n) for n in confusion_matrix(y_test, y_pred).ravel()],

            "minority_precision": precision_score(y_test, y_pred, pos_label=1, average='binary', zero_division=0.0),
            "majority_precision": precision_score(y_test, y_pred, pos_label=0, average='binary', zero_division=0.0),

            "minority_recall": recall_score(y_test, y_pred, pos_label=1, average='binary', zero_division=0.0),
            "majority_recall": recall_score(y_test, y_pred, pos_label=0, average='binary', zero_division=0.0),

            "minority_f1": f1_score(y_test, y_pred, pos_label=1, average='binary', zero_division=0.0),
            "majority_f1": f1_score(y_test, y_pred, pos_label=0, average='binary', zero_division=0.0),
            "macro_f1": f1_score(y_test, y_pred, average='macro', zero_division=0.0),

            "minority_f2": fbeta_score(y_test, y_pred, beta=2, pos_label=1, average='binary', zero_division=0.0),
            "majority_f2": fbeta_score(y_test, y_pred, beta=2, pos_label=0, average='binary', zero_division=0.0),
            "macro_f2": fbeta_score(y_test, y_pred, beta=2, average='macro', zero_division=0.0),
        }

    dump_json(res, Params.BASE_LEARNER, task, name, ratio)

Data imbalance

In [4]:
from imblearn.datasets import make_imbalance

def sorted_class_count(y):
    """Return ``(minority_count, majority_count)`` for a two-class target.

    Unpacking into exactly two names raises ``ValueError`` when ``y`` does not
    contain exactly two distinct labels.
    """
    _, class_counts = np.unique(y, return_counts=True)
    minority_count, majority_count = sorted(class_counts)
    return minority_count, majority_count

# Build (name, ratio, X, y) entries: first the dataset at its natural
# minority/majority ratio, then progressively more imbalanced subsamples.
IMBALANCED_DATSETS = []
ratios = [0.5, 0.25] + [r/100 for r in range(1, 21)]
for name, X, y in DATASETS:
    c1, c2 = sorted_class_count(y)
    IMBALANCED_DATSETS.append((name, c1/c2, X, y))
    # Go from the mildest to the most extreme ratio; X and y are rebound each
    # round, so every subsample is drawn from the previous one.
    for ratio in sorted(ratios, reverse=True):
        c1, c2 = sorted_class_count(y)
        new_minority_count = int(c2 * ratio)
        # Skip ratios that would need more minority samples than are left.
        if c1 < new_minority_count:
            continue
        X, y = make_imbalance(X, y, sampling_strategy={0: c2, 1: new_minority_count}, random_state=Params.SEED)
        IMBALANCED_DATSETS.append((name, ratio, X, y))

# Sanity check: the achieved ratio must match the requested one. The check is
# two-sided; int() truncation above can drop at most one minority sample, which
# shifts the ratio by less than 1 / c2, so that is the allowed slack.
for _, ratio, _, y in IMBALANCED_DATSETS:
    c1, c2 = sorted_class_count(y)
    tolerance = max(0.001, 1 / c2)
    assert abs(c1 / c2 - ratio) <= tolerance, f"{c1 / c2} {ratio}"

[(name, ratio) for name, ratio, _, _ in IMBALANCED_DATSETS]


[('credit-g', 0.42857142857142855),
 ('credit-g', 0.25),
 ('credit-g', 0.2),
 ('credit-g', 0.19),
 ('credit-g', 0.18),
 ('credit-g', 0.17),
 ('credit-g', 0.16),
 ('credit-g', 0.15),
 ('credit-g', 0.14),
 ('credit-g', 0.13),
 ('credit-g', 0.12),
 ('credit-g', 0.11),
 ('credit-g', 0.1),
 ('credit-g', 0.09),
 ('credit-g', 0.08),
 ('credit-g', 0.07),
 ('credit-g', 0.06),
 ('credit-g', 0.05),
 ('credit-g', 0.04),
 ('credit-g', 0.03),
 ('credit-g', 0.02),
 ('credit-g', 0.01),
 ('spambase', 0.650286944045911),
 ('spambase', 0.5),
 ('spambase', 0.25),
 ('spambase', 0.2),
 ('spambase', 0.19),
 ('spambase', 0.18),
 ('spambase', 0.17),
 ('spambase', 0.16),
 ('spambase', 0.15),
 ('spambase', 0.14),
 ('spambase', 0.13),
 ('spambase', 0.12),
 ('spambase', 0.11),
 ('spambase', 0.1),
 ('spambase', 0.09),
 ('spambase', 0.08),
 ('spambase', 0.07),
 ('spambase', 0.06),
 ('spambase', 0.05),
 ('spambase', 0.04),
 ('spambase', 0.03),
 ('spambase', 0.02),
 ('spambase', 0.01),
 ('electricity', 0.7377564717162

In [5]:
# Train (or load from cache) one AutoSklearn model per imbalanced dataset and
# cross-validate it.
for name, ratio, X, y in IMBALANCED_DATSETS:
    try:
        model = load_pkl(Params.BASE_LEARNER, Task.DATA_IMBALANCE, name, ratio)
    except Exception:
        # Any failure to load is treated as a cache miss. Catching Exception
        # rather than using a bare `except:` lets KeyboardInterrupt/SystemExit
        # propagate instead of kicking off a long AutoML fit.
        model = get_AutoSklearnClassifier(X, y, Task.DATA_IMBALANCE)
        dump_pkl(model, Params.BASE_LEARNER, Task.DATA_IMBALANCE, name, ratio)
    validate(X, y, model, Task.DATA_IMBALANCE, name, ratio)

Fitting to the training data:   4%|[32m▍         [0m| 139/3600 [02:19<57:58,  1.00s/it, The total time budget for this task is 1:00:00]  

Noisy data

In [16]:
import numpy as np


# Build (name, noise, X, y) entries where, for every feature independently, a
# growing fraction of rows has its value replaced by a random one.
NOISY_DATASETS = []
noise_amount = [a / 100 for a in range(1, 11)]
# Dedicated seeded generator so the injected noise is reproducible.
noise_rng = np.random.default_rng(Params.SEED)

for name, X, y in DATASETS:
    # Work on a private copy: the .loc assignments below write in place and
    # would otherwise corrupt the frames stored in DATASETS for later cells.
    X = X.copy()
    NOISY_DATASETS.append((name, 0, X.copy(), y.copy()))
    # Each noise level perturbs another 1% of the rows of every feature.
    noise_to_add = int(len(X) / 100)
    # Per feature, the row labels that have not been perturbed yet.
    indices_left = [X.index.copy() for _ in range(X.shape[1])]
    for noise in sorted(noise_amount):
        for i, feature in enumerate(X.columns):
            noise_indices = noise_rng.choice(indices_left[i], noise_to_add, replace=False)
            if X[feature].dtype == "float64":
                X.loc[noise_indices, feature] = noise_rng.uniform(X[feature].min(), X[feature].max(), noise_to_add)
            elif X[feature].dtype == "int64":
                # Upper bound is exclusive, hence the +1 to include the maximum.
                X.loc[noise_indices, feature] = noise_rng.integers(X[feature].min(), X[feature].max()+1, noise_to_add)
            elif X[feature].dtype == "category":
                X.loc[noise_indices, feature] = noise_rng.choice(X[feature].unique(), noise_to_add)
            else:
                assert False, X[feature].dtype
            # Index.drop returns a new Index; keep it so the same rows are not
            # picked (and silently re-noised) at the next noise level.
            indices_left[i] = indices_left[i].drop(noise_indices)

        NOISY_DATASETS.append((name, noise, X.copy(), y.copy()))


# Sanity check: per feature, the share of values that differ from the clean
# copy should be close to the nominal noise level. Low-cardinality features
# get a loose bound because a random replacement often equals the old value.
for i in range(1, len(NOISY_DATASETS)):
    name, noise, X2, y2 = NOISY_DATASETS[i]
    _, _, X, y = [
        (name2, noise2, X2, y2) 
        for name2, noise2, X2, y2 in NOISY_DATASETS 
        if name2 == name and noise2 == 0
    ][0]
    for feature in X.columns:
        assert len(X[feature]) == len(X2[feature])
        sm = sum(a != a2 for a, a2 in zip(X[feature], X2[feature]))
        diff = 0.5 if len(X[feature].unique()) <= 10 else 0.02
        assert abs(sm / len(X[feature]) - noise) < diff, f"{sm / len(X[feature])} {noise} {len(X[feature].unique())}"


[(name, noise) for name, noise, _, _ in NOISY_DATASETS]

[('credit-g', 0),
 ('credit-g', 0.01),
 ('credit-g', 0.02),
 ('credit-g', 0.03),
 ('credit-g', 0.04),
 ('credit-g', 0.05),
 ('credit-g', 0.06),
 ('credit-g', 0.07),
 ('credit-g', 0.08),
 ('credit-g', 0.09),
 ('credit-g', 0.1),
 ('spambase', 0),
 ('spambase', 0.01),
 ('spambase', 0.02),
 ('spambase', 0.03),
 ('spambase', 0.04),
 ('spambase', 0.05),
 ('spambase', 0.06),
 ('spambase', 0.07),
 ('spambase', 0.08),
 ('spambase', 0.09),
 ('spambase', 0.1),
 ('electricity', 0),
 ('electricity', 0.01),
 ('electricity', 0.02),
 ('electricity', 0.03),
 ('electricity', 0.04),
 ('electricity', 0.05),
 ('electricity', 0.06),
 ('electricity', 0.07),
 ('electricity', 0.08),
 ('electricity', 0.09),
 ('electricity', 0.1),
 ('pc4', 0),
 ('pc4', 0.01),
 ('pc4', 0.02),
 ('pc4', 0.03),
 ('pc4', 0.04),
 ('pc4', 0.05),
 ('pc4', 0.06),
 ('pc4', 0.07),
 ('pc4', 0.08),
 ('pc4', 0.09),
 ('pc4', 0.1),
 ('pc3', 0),
 ('pc3', 0.01),
 ('pc3', 0.02),
 ('pc3', 0.03),
 ('pc3', 0.04),
 ('pc3', 0.05),
 ('pc3', 0.06),
 ('pc

In [None]:
class NoiseRemover:
    """Resampler-style helper that replaces the target with model predictions."""

    def fit_resample(self, X, y):
        """Fit a regression-type XGB model on ``(X, y)`` and return ``X``
        unchanged together with that model's predictions for ``X``."""
        regressor = get_XGBModel(X, y, Params.BASE_LEARNER, LearnerType.REGRESSION).fit(X, y)
        return X, regressor.predict(X)


# Denoise every noisy dataset, then build and cross-validate a classifier.
for name, noise, X, y in NOISY_DATASETS:
    # NoiseRemover only defines fit_resample; calling fit_transform raised
    # AttributeError.
    X_transformed, y_transformed = NoiseRemover().fit_resample(X, y)
    # NOTE(review): validate() indexes y with .iloc, so y_transformed needs to
    # be a pandas object — confirm what the regression model's predict returns.
    model = get_XGBModel(X_transformed, y_transformed, Params.BASE_LEARNER, LearnerType.CLASSIFICATION)
    validate(X_transformed, y_transformed, model, Task.NOISY_DATA, name, noise)

Semi-supervised learning

In [7]:
import numpy as np
import pandas as pd


# Build (name, hidden, X, X_hidden, y) entries where a growing share of rows
# is moved from the labelled set (X, y) into the unlabelled pool X_hidden.
HIDDEN_DATASETS = []
hidden_amount = [a / 100 for a in range(1, 11)]
# Dedicated seeded generator so the hidden rows are reproducible.
hide_rng = np.random.default_rng(Params.SEED)

for name, X, y in DATASETS:
    # Start from an empty frame that shares X's columns and dtypes. An empty
    # frame with default integer column labels would make pd.concat below
    # produce the union of both column sets (extra all-NaN columns).
    X_hidden = X.iloc[:0].copy()
    HIDDEN_DATASETS.append((name, 0, X.copy(), X_hidden.copy(), y.copy()))
    # Each level hides another 1% of the original rows.
    num_to_hide = int(len(X) / 100)
    indices_left = X.index
    for hidden in sorted(hidden_amount):
        indices_to_hide = hide_rng.choice(indices_left, num_to_hide, replace=False)
        X_hidden = pd.concat([X_hidden, X.loc[indices_to_hide]], ignore_index=True)
        indices_left = indices_left.drop(indices_to_hide)
        X = X.drop(indices_to_hide)
        y = y.drop(indices_to_hide)
        HIDDEN_DATASETS.append((name, hidden, X.copy(), X_hidden.copy(), y.copy()))

# Sanity check: the hidden share is measured against the rows that remain
# labelled, hence the slightly loose tolerance.
for _, hidden, X, X_hidden, y in HIDDEN_DATASETS:
    assert abs(len(X_hidden) / len(X) - hidden) < 0.015, f"{len(X_hidden) / len(X)}, {hidden}"
    assert len(X) == len(y)


[(name, hidden) for name, hidden, _, _, _ in HIDDEN_DATASETS]

[('credit-g', 0),
 ('credit-g', 0.01),
 ('credit-g', 0.02),
 ('credit-g', 0.03),
 ('credit-g', 0.04),
 ('credit-g', 0.05),
 ('credit-g', 0.06),
 ('credit-g', 0.07),
 ('credit-g', 0.08),
 ('credit-g', 0.09),
 ('credit-g', 0.1),
 ('spambase', 0),
 ('spambase', 0.01),
 ('spambase', 0.02),
 ('spambase', 0.03),
 ('spambase', 0.04),
 ('spambase', 0.05),
 ('spambase', 0.06),
 ('spambase', 0.07),
 ('spambase', 0.08),
 ('spambase', 0.09),
 ('spambase', 0.1),
 ('electricity', 0),
 ('electricity', 0.01),
 ('electricity', 0.02),
 ('electricity', 0.03),
 ('electricity', 0.04),
 ('electricity', 0.05),
 ('electricity', 0.06),
 ('electricity', 0.07),
 ('electricity', 0.08),
 ('electricity', 0.09),
 ('electricity', 0.1),
 ('pc4', 0),
 ('pc4', 0.01),
 ('pc4', 0.02),
 ('pc4', 0.03),
 ('pc4', 0.04),
 ('pc4', 0.05),
 ('pc4', 0.06),
 ('pc4', 0.07),
 ('pc4', 0.08),
 ('pc4', 0.09),
 ('pc4', 0.1),
 ('pc3', 0),
 ('pc3', 0.01),
 ('pc3', 0.02),
 ('pc3', 0.03),
 ('pc3', 0.04),
 ('pc3', 0.05),
 ('pc3', 0.06),
 ('pc

In [None]:
from sklearn.ensemble import BaggingClassifier
import pandas as pd


# Self-training: repeatedly pseudo-label the most confidently predicted hidden
# rows and move them into the labelled set, then train and validate a final
# classifier on the enlarged set.
for name, hidden, X, X_hidden, y in HIDDEN_DATASETS:
    try:
        X, y = load_pkl(Params.BASE_LEARNER, Task.SEMI_SUPERVISED, name, hidden)
    except Exception:
        # Any failure to load is treated as a cache miss; KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        # Label roughly a tenth of the pool per round; max(1, ...) guarantees
        # progress (and loop termination) for pools smaller than ten rows.
        num_indices_to_select = max(1, len(X_hidden) // 10)
        while len(X_hidden) != 0:
            model = BaggingClassifier(
                estimator=get_XGBModel(X, y, Params.BASE_LEARNER, LearnerType.CLASSIFICATION),
                n_estimators = 11,
                max_samples = 0.5,
                max_features = 0.5,
                bootstrap = True,
                bootstrap_features = True,
                n_jobs=1,
                random_state = Params.SEED
            ).fit(X, y)
            y_prob = model.predict_proba(X_hidden)[:,1]
            # Positions (not labels) of the rows whose predicted class has the
            # highest probability.
            confidence = np.maximum(y_prob, 1 - y_prob)
            max_indices = np.argsort(confidence)[::-1][:num_indices_to_select]
            # Keep y a Series: concatenating a Series with a DataFrame would
            # turn it into a multi-column frame padded with NaN.
            pseudo_labels = pd.Series(0.5 <= y_prob[max_indices], name=y.name)
            X = pd.concat([X, X_hidden.iloc[max_indices]], ignore_index=True)
            y = pd.concat([y, pseudo_labels], ignore_index=True)
            # DataFrame.drop is label-based, so translate positions to labels;
            # after the first round the two no longer coincide.
            X_hidden = X_hidden.drop(X_hidden.index[max_indices])
        dump_pkl((X, y), Params.BASE_LEARNER, Task.SEMI_SUPERVISED, name, hidden) 

    model = get_XGBModel(X, y, Params.BASE_LEARNER, LearnerType.CLASSIFICATION)
    validate(X, y, model, Task.SEMI_SUPERVISED, name, hidden)

Inadequate features

In [None]:
# Train (or load from cache) one AutoSklearn model per dataset for the
# feature-inadequacy task and cross-validate it.
for name, X, y in DATASETS:
    try:
        model = load_pkl(Params.BASE_LEARNER, Task.FEATURE_INADEQUACY, name, None)
    except Exception:
        # Any failure to load is treated as a cache miss; KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        # NOTE(review): this call originally used Task.INADEQUATE_FEATURES while
        # the load/validate calls use Task.FEATURE_INADEQUACY; unified on the
        # latter — confirm it is the member defined in utils.Task.
        model = get_AutoSklearnClassifier(X, y, Task.FEATURE_INADEQUACY)
        # Store under the same key that load_pkl reads above. The previous
        # call wrote to the DATA_IMBALANCE cache using a `ratio` value left
        # over from an earlier cell.
        dump_pkl(model, Params.BASE_LEARNER, Task.FEATURE_INADEQUACY, name, None)
    validate(X, y, model, Task.FEATURE_INADEQUACY, name, None)