In [9]:
from datasets import get_datasets
from experiments import get_experiments
from file_loader import Folders, load_json, load_pkl, dump_pkl, dump_json
from metrics import (
    imbalance_ratio,
    partial_roc_auc_score_,
    harmonic_mean_recall_,
    NEW_ASKL_METRICS
)
from sklearn.model_selection import StratifiedKFold

from collections import defaultdict, Counter
from pprint import pprint
from os.path import exists

import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    recall_score, 
    precision_score, 
    f1_score,
    fbeta_score, 
    roc_auc_score,
    roc_curve,
)

In [10]:
# --- Stage switches (truthy enables the stage) -------------------------------
FIT = 0            # when falsy, auto-sklearn jobs without a saved model file are skipped
VALIDATE = 1       # run the per-fold scoring in validate()
RUN_INFO = 0       # write the text summary of auto-sklearn runs via run_info()
IGNORE_ERRORS = 0  # driver loop: skip failing jobs instead of stopping at the first error

# --- Search budget and reproducibility ---------------------------------------
TIME = 8 * 60        # passed as time_left_for_this_task (presumably seconds -- confirm)
TIME_PER_RUN = 90    # passed as per_run_time_limit
K_FOLDS = 5          # n_splits of the StratifiedKFold used for search and validation
N_JOBS = 15          # passed as n_jobs
SEED = 0             # shared random_state / seed
REMOVE_TEMP = False  # passed as delete_tmp_folder_after_terminate
MEM = 9000           # passed as memory_limit (presumably MB -- confirm)

In [11]:
def run_info(model, dataset_name, ratio, experiment_name, run_info_file):
    """Write a plain-text summary of a fitted auto-sklearn run to ``run_info_file``.

    The file receives, in order: ``model.sprint_statistics()``, the
    leaderboard, the ensemble returned by ``model.show_models()`` (or ``None``
    when that call fails), a per-balancing-strategy count of run statuses and,
    when an ensemble is available, a dict with the ensemble weight grouped by
    balancing component.

    Args:
        model: fitted estimator exposing ``show_models``, ``sprint_statistics``,
            ``leaderboard`` and ``automl_.runhistory_``.
        dataset_name: dataset label, used only in the progress message.
        ratio: number formatted with two decimals in the progress message.
        experiment_name: experiment label, used only in the progress message.
        run_info_file: output path; missing parent directories are created.
    """
    print(f"Computing run summary, {dataset_name}, {ratio:.2f}, {experiment_name} ...")
    # os.makedirs("") raises, so only create a directory when the path has one.
    directory = os.path.dirname(run_info_file)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Best effort: a run without a usable ensemble still gets the first part of
    # the report. Only ordinary errors are absorbed, so Ctrl-C still interrupts.
    try:
        ensemble = model.show_models()
    except Exception:
        ensemble = None

    with open(run_info_file, 'w') as f:
        f.write(model.sprint_statistics())
        f.write("\n")
        pprint(model.leaderboard(), stream=f)
        f.write("\n")
        pprint(ensemble, stream=f)
        f.write("\n")

        # Count run statuses separately for every balancing strategy.
        run_history = model.automl_.runhistory_
        status_by_strategy = defaultdict(Counter)
        for key, val in run_history.data.items():
            # str(status) is expected to look like "<Enum>.<NAME>"; keep <NAME>.
            status = str(val.status).split(".")[1]
            balancing = run_history.ids_config[key.config_id]['balancing:strategy']
            status_by_strategy[balancing][status] += 1
        pprint(status_by_strategy, stream=f)
        f.write("\n")
        if ensemble is None:
            return

        # Pair each ensemble member with the name of its balancing component,
        # taken from the text before the first "(" of the component's str().
        members = [
            (member, str(member["balancing"]).split("(")[0])
            for member in ensemble.values()
        ]
        plain = ("NoPreprocessing", "Weighting")
        # Everything that is neither "no balancing" nor weighting is reported
        # under the "smote" keys.
        resampled = [(member, name) for member, name in members if name not in plain]
        summary = {
            "statuses": Counter(str(val.status) for val in run_history.data.values()),
            "none": sum(member["ensemble_weight"] for member, name in members if name == "NoPreprocessing"),
            "weighting": sum(member["ensemble_weight"] for member, name in members if name == "Weighting"),
            "smote": sum(member["ensemble_weight"] for member, _ in resampled),
            "smote_sampling_strategy": [
                (member["balancing"].choice.sampling_strategy, member["ensemble_weight"])
                for member, _ in resampled
            ],
            "ensemble_size": len(members)
        }
        pprint(summary, stream=f)
        f.write("\n")

In [12]:
def check_model_settings(model, experiment, X, y):
    """Assert that an auto-sklearn model is configured as the experiment demands.

    Does nothing for experiments whose ``"CLASSIFIER"`` is not
    ``"auto-sklearn"``. Otherwise the budget/seed/resampling attributes of
    ``model`` are compared with the notebook constants, and the metrics,
    meta-learning setting and the ``balancing:strategy`` hyperparameter
    (default, choices, presence of the SVMSMOTE sampling-strategy
    hyperparameter) are compared with ``experiment``.

    Args:
        model: auto-sklearn estimator (fitted or not).
        experiment: mapping with ``"CLASSIFIER"``, ``"DEFAULT"``, ``"INCLUDE"``,
            ``"METRIC"``, ``"META"`` and optionally ``"SAMPLING_STRATEGY"``.
        X, y: data forwarded to ``model.get_configuration_space``.

    Raises:
        AssertionError: on the first setting that does not match.
    """
    if experiment["CLASSIFIER"] != "auto-sklearn":
        return

    # Build the configuration space once and reuse it for all three lookups.
    config_space = model.get_configuration_space(X, y)
    balancing = config_space["balancing:strategy"]
    default = balancing.default_value
    choices = balancing.choices
    has_sampling_strategy = "balancing:SVMSMOTE:sampling_strategy" in config_space
    expected_default = experiment["DEFAULT"]
    expected_choices = sorted(experiment["INCLUDE"]["balancing"])

    # Metrics are compared as multisets of their string form, ignoring order.
    metric = Counter(str(m) for m in model.metric)
    expected_metric = Counter(str(m) for m in experiment["METRIC"])

    assert model.time_left_for_this_task == TIME, model.time_left_for_this_task
    assert model.per_run_time_limit == TIME_PER_RUN
    assert model.seed == SEED
    assert model.memory_limit == MEM
    assert model.n_jobs == N_JOBS, model.n_jobs
    assert model.resampling_strategy.n_splits == K_FOLDS
    assert model.resampling_strategy.shuffle == True
    assert model.resampling_strategy.random_state == SEED

    assert metric == expected_metric, f"Expected {expected_metric}, but got {metric}"
    assert model.initial_configurations_via_metalearning == experiment["META"]

    assert default == expected_default, f"expected:{expected_default}, but got: {default}"
    assert sorted(choices) == expected_choices, f"expected:{expected_choices}, but got: {choices}"
    if "SAMPLING_STRATEGY" in experiment:
        assert has_sampling_strategy == experiment["SAMPLING_STRATEGY"], f"Expected {experiment['SAMPLING_STRATEGY']}, but got {has_sampling_strategy}"
    else:
        # Without an explicit setting the hyperparameter must be absent.
        assert not has_sampling_strategy
    # data_prep_choices = model.get_configuration_space(X, y)["data_preprocessor:__choice__"].choices
    # assert list(data_prep_choices) == ["feature_type"], list(data_prep_choices)

In [13]:
def validate(skf, model, X, y, scores, validation_file, dataset_name, ratio, experiment_name):
    """Score ``model`` on every fold of ``skf`` and write the results to ``validation_file``.

    For each fold the positive-class probabilities are reused from ``scores``
    when the fold entry already holds ``"y_prob"`` and ``"y_test"``; otherwise
    ``model`` is refitted on the training part of the fold and asked for
    ``predict_proba`` on the test part. The fold entry is then rebuilt with
    ROC-AUC values, the raw probabilities/labels, metrics at the fixed 0.5
    threshold, and metrics at several data-driven thresholds. Those thresholds
    are chosen on one stratified half of the fold's test data and evaluated on
    the other half.

    Args:
        skf: splitter providing ``split(X, y)``.
        model: either an object with ``refit`` (used for auto-sklearn models) or
            a pipeline exposing ``steps`` and ``fit``.
        X, y: features and labels; both are indexed with ``.iloc``. The metric
            calls use ``pos_label`` 1 and 0, i.e. binary 0/1 labels are assumed.
        scores: dict keyed by the fold number as a string; updated in place.
        validation_file: path handed to ``dump_json`` once all folds are done.
        dataset_name, ratio, experiment_name: used only in the progress message.
    """
    def _metrics_at(y_true, y_pred, postfix):
        """Return all threshold-dependent metrics, each key suffixed with ``postfix``."""
        return {
            f"confusion_matrix_{postfix}": [int(val) for val in confusion_matrix(y_true, y_pred).ravel()],

            f"minority_precision_{postfix}": precision_score(y_true, y_pred, pos_label=1, average='binary', zero_division=0.0),
            f"majority_precision_{postfix}": precision_score(y_true, y_pred, pos_label=0, average='binary', zero_division=0.0),
            f"weighted_precision_{postfix}": precision_score(y_true, y_pred, average='weighted', zero_division=0.0),
            f"macro_precision_{postfix}": precision_score(y_true, y_pred, average='macro', zero_division=0.0),

            f"minority_recall_{postfix}": recall_score(y_true, y_pred, pos_label=1, average='binary', zero_division=0.0),
            f"majority_recall_{postfix}": recall_score(y_true, y_pred, pos_label=0, average='binary', zero_division=0.0),
            f"weighted_recall_{postfix}": recall_score(y_true, y_pred, average='weighted', zero_division=0.0),
            f"macro_recall_{postfix}": recall_score(y_true, y_pred, average='macro', zero_division=0.0),

            f"minority_f1_{postfix}": f1_score(y_true, y_pred, pos_label=1, average='binary', zero_division=0.0),
            f"majority_f1_{postfix}": f1_score(y_true, y_pred, pos_label=0, average='binary', zero_division=0.0),
            f"weighted_f1_{postfix}": f1_score(y_true, y_pred, average='weighted', zero_division=0.0),
            f"macro_f1_{postfix}": f1_score(y_true, y_pred, average='macro', zero_division=0.0),

            f"minority_f2_{postfix}": fbeta_score(y_true, y_pred, beta=2, pos_label=1, average='binary', zero_division=0.0),
            f"majority_f2_{postfix}": fbeta_score(y_true, y_pred, beta=2, pos_label=0, average='binary', zero_division=0.0),
            f"weighted_f2_{postfix}": fbeta_score(y_true, y_pred, beta=2, average='weighted', zero_division=0.0),
            f"macro_f2_{postfix}": fbeta_score(y_true, y_pred, beta=2, average='macro', zero_division=0.0),

            f"weighted_harmonic_mean_recall_{postfix}": harmonic_mean_recall_(y_true, y_pred, weight=1),
            f"weighted_harmonic_mean_recall_2_{postfix}": harmonic_mean_recall_(y_true, y_pred, weight=2),
        }

    print(f"Validating, {dataset_name}, {ratio:.2f}, {experiment_name} ...", end=" ", flush=True)
    IR = imbalance_ratio(y, mode="big")

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        fold = str(fold)  # string keys, matching what a JSON round-trip produces
        print(fold, end=" ", flush=True)

        if fold in scores and isinstance(scores[fold], dict) and all(metric in scores[fold] for metric in ("y_prob", "y_test")):
            # Predictions from an earlier run are available: skip the refit.
            y_prob = scores[fold]["y_prob"]
            y_test = scores[fold]["y_test"]
        else:
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            if hasattr(model, "refit"):
                model = model.refit(X_train, y_train)
            else:
                if model.steps[0][0] == "weighting":
                    # Per-class weight = 1 / (class frequency) / 2, expanded to
                    # one weight per training sample for the "RF" step.
                    unique, counts = np.unique(y_train, return_counts=True)
                    cw = 1 / (counts / np.sum(counts)) / 2
                    sample_weight = np.ones(y_train.shape)
                    for i, ue in enumerate(unique):
                        sample_weight[y_train == ue] *= cw[i]
                    model = model.fit(X_train, y_train, RF__sample_weight=sample_weight)
                else:
                    model = model.fit(X_train, y_train)
            y_prob = model.predict_proba(X_test)[:, 1]

        scores[fold] = {
            "auc_roc": roc_auc_score(y_test, y_prob),
            "pr_auc_roc": partial_roc_auc_score_(y_test, y_prob),
            "y_prob": [float(prob) for prob in y_prob],
            "y_test": [float(label) for label in y_test],
        }

        scores[fold] |= _metrics_at(y_test, [0.5 <= p for p in y_prob], "default")

        # From here on y_test / y_prob are only the evaluation half of the fold;
        # the "threshold" half is used solely to pick the cut-offs.
        y_threshold, y_test, y_prob_threshold, y_prob = train_test_split(y_test, y_prob, test_size=0.5, stratify=y_test, random_state=SEED, shuffle=True)
        fpr, tpr, thresholds = roc_curve(y_threshold, y_prob_threshold)

        # A ROC point with tpr == 0 and fpr == 1 makes the criterion 0/0 = NaN.
        # np.argmax would select that NaN entry, so NaNs are ignored instead
        # (the first ROC point is always finite, so a valid index exists).
        with np.errstate(divide="ignore", invalid="ignore"):
            harmonic_criterion = 3*tpr*(1-fpr)/(tpr+2*(1-fpr))
        scores[fold] |= _metrics_at(y_test, thresholds[np.nanargmax(harmonic_criterion)] <= y_prob, "weighted_harmonic_mean")

        scores[fold] |= _metrics_at(y_test, thresholds[np.argmax(tpr - fpr / IR)] <= y_prob, "tpr-fpr/IR")
        for k in [0.05, 0.067, 0.1, 0.2, 0.5, 0.75, 0.9, 1]:
            scores[fold] |= _metrics_at(y_test, thresholds[np.argmax(tpr - k * fpr)] <= y_prob, f"tpr-{k}*fpr")

    dump_json(scores, validation_file)
    print()
    print()

In [14]:
if True:
    # Make "../my_autosklearn" the first entry of the module search path before
    # auto-sklearn is imported (presumably so a local, modified copy of the
    # package is picked up instead of an installed one -- confirm).
    import sys
    sys.path.insert(0, "../my_autosklearn")
    from autosklearn.classification import AutoSklearnClassifier

    # Register the project's NoPreprocessing component as an additional
    # auto-sklearn data preprocessor. Must run after the path change above.
    import autosklearn.pipeline.components.data_preprocessing
    from no_data_preprocessor import NoPreprocessing
    autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)


from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SVMSMOTE


from sklearn.base import BaseEstimator, TransformerMixin

class IdentityTransformer(BaseEstimator, TransformerMixin):
    """Transformer that does nothing: fitting learns no state and transforming returns the input.

    Used as the ``"weighting"`` entry of ``MODEL_DICT``.
    """

    def __init__(self):
        """There is nothing to configure."""

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` itself (the same object, not a copy); ``y`` is ignored."""
        return X


# Step name -> estimator used when a "+"-separated CLASSIFIER string such as
# "SVMSMOTE+RF" is turned into a pipeline by get_model().
MODEL_DICT = dict(
    weighting=IdentityTransformer(),
    SVMSMOTE=SVMSMOTE(random_state=SEED),
    RF=RandomForestClassifier(random_state=SEED),
)


def stop_after_100_configurations_callback(smbo, run_info, result, time_left, max_successes=116):
    """Tell the optimizer whether it may continue, based on the number of successful runs.

    Counts the run-history entries whose status string contains ``"SUCCESS"``
    and returns ``True`` while that count is at most ``max_successes``.
    auto-sklearn is expected to stop the search once a trials callback returns
    ``False`` -- confirm against the ``get_trials_callback`` documentation.

    NOTE(review): despite the "100" in the name, the default limit is 116 (the
    value that was previously hard-coded); confirm which number is intended.

    Args:
        smbo: optimizer object exposing ``runhistory.data``.
        run_info, result, time_left: supplied by the caller of the callback;
            not used here.
        max_successes: largest number of successful runs for which the search
            is still allowed to continue.

    Returns:
        bool: ``True`` to continue, ``False`` once more than ``max_successes``
        successful runs are recorded.
    """
    successes = sum("SUCCESS" in str(val.status) for val in smbo.runhistory.data.values())
    return successes <= max_successes


def get_model(dataset_name, X, y, ratio, experiment_name, experiment, model_file, temp, skf):
    """Return the model for one (dataset, experiment) job.

    * If ``model_file`` exists, the stored model is loaded and returned.
    * Otherwise, for ``"auto-sklearn"`` experiments an ``AutoSklearnClassifier``
      is configured from the notebook constants and ``experiment``, verified
      with ``check_model_settings``, fitted on ``X``/``y`` and saved to
      ``model_file``.
    * For every other experiment an *unfitted* pipeline is built from the
      ``"+"``-separated step names in ``experiment["CLASSIFIER"]``.

    Args:
        dataset_name, ratio, experiment_name: used only in progress messages.
        X, y: training data for the auto-sklearn fit.
        experiment: experiment settings mapping.
        model_file: path of the stored model.
        temp: auto-sklearn temp folder; removed first when it already exists.
        skf: cross-validation splitter handed to auto-sklearn as
            ``resampling_strategy``.
    """
    if exists(model_file):
        print(f"Loading, {dataset_name}, {ratio:.2f}, {experiment_name} ...")
        # NOTE(review): load_pkl presumably unpickles the file; only load model
        # files produced by this notebook, never untrusted ones.
        return load_pkl(model_file)

    if experiment["CLASSIFIER"] == "auto-sklearn":
        print(f"Fitting, {dataset_name}, {ratio:.2f}, {experiment_name} ...")

        # Start from a clean temp folder left over from an earlier run.
        if exists(temp):
            shutil.rmtree(temp, ignore_errors=True)

        use_callback = "CALLBACK" in experiment and experiment["CALLBACK"]
        model = AutoSklearnClassifier(
            time_left_for_this_task=TIME,
            metric=[NEW_ASKL_METRICS[metric] for metric in experiment["METRIC"]],
            initial_configurations_via_metalearning=experiment["META"],
            include=experiment["INCLUDE"],
            resampling_strategy=skf,
            seed=SEED,
            tmp_folder=temp,
            delete_tmp_folder_after_terminate=REMOVE_TEMP,
            get_trials_callback=stop_after_100_configurations_callback if use_callback else None,
            n_jobs=N_JOBS,
            memory_limit=MEM,
            per_run_time_limit=TIME_PER_RUN
        )
        # Fail before spending the time budget if the configuration is off.
        check_model_settings(model, experiment, X, y)
        model = model.fit(X, y)
        dump_pkl(model, model_file)
        return model

    # Clone the template estimators so that each pipeline owns its steps;
    # otherwise every pipeline would share (and refit) the very same objects
    # stored in MODEL_DICT.
    from sklearn.base import clone

    return Pipeline([
        (primitive, clone(MODEL_DICT[primitive]))
        for primitive in experiment["CLASSIFIER"].split("+")
    ])

In [15]:
def run(run_id, dataset_name, X, y, ratio, experiment_name, experiment):
    """Execute the enabled stages (fit/load, run summary, validation) for one job.

    All artefact paths share the stem
    ``<experiment_name>/<dataset_name>/<ratio with two decimals>`` below the
    respective ``Folders`` directory. An auto-sklearn job whose model file is
    missing is skipped entirely while ``FIT`` is off.

    Args:
        run_id: job index; names the auto-sklearn temp folder.
        dataset_name, ratio, experiment_name: identify the job and its files.
        X, y: dataset features and labels.
        experiment: experiment settings mapping.
    """
    stem = f"{experiment_name}/{dataset_name}/{ratio:.2f}"
    validation_file = f"{Folders.VALIDATION_DIR}/{stem}.json"
    model_file = f"{Folders.MODELS_DIR}/{stem}.pkl"
    run_info_file = f"{Folders.RUN_INFO_DIR}/{stem}.txt"
    temp = f"{Folders.TEMP}/{run_id}"
    scores = load_json(validation_file)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

    # Nothing to do: no stored model, fitting disabled, and the model cannot be
    # built without fitting.
    model_missing = not exists(model_file)
    if model_missing and not FIT and experiment["CLASSIFIER"] == "auto-sklearn":
        return

    model = get_model(
        dataset_name, X, y, ratio, experiment_name, experiment, model_file, temp, skf
    )
    check_model_settings(model, experiment, X, y)

    if RUN_INFO and experiment["CLASSIFIER"] == "auto-sklearn":
        run_info(model, dataset_name, ratio, experiment_name, run_info_file)

    if VALIDATE:
        validate(
            skf, model, X, y, scores, validation_file, dataset_name, ratio, experiment_name
        )

In [16]:
# Driver: run every selected experiment on every dataset, one job at a time.
if __name__ == "__main__" and (VALIDATE or FIT or RUN_INFO):
    DATASETS = get_datasets()
    # Names of the experiments to run; uncomment entries to enable them.
    EXPERIMENTS = [
        # "SVMSMOTE-default=SVMSMOTE-META=25-metric=[harmonic_mean_recall_2]",
        # "weighting-default=weighting-META=25-metric=[harmonic_mean_recall_2]",
        # "none-default=none-META=25-metric=[harmonic_mean_recall_2]",

        # "RF",
        # "SVMSMOTE+RF",
        # "weighting+RF",

        # "SVMSMOTE-default=SVMSMOTE-META=25-sampling_strategy=False",
        # "SVMSMOTE-default=SVMSMOTE-META=25",
        # "weighting-default=weighting-META=25",
        # "none-default=none-META=25",

        # "SVMSMOTE-default=SVMSMOTE-META=25-callback",
        # "weighting-default=weighting-META=25-callback",
        # "none-default=none-META=25-callback",

        "all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25"
    ]
    # Look the experiment definitions up once and reuse them below.
    ALL_EXPERIMENTS = get_experiments()
    unknown_experiments = set(EXPERIMENTS) - set(ALL_EXPERIMENTS.keys())
    assert not unknown_experiments, unknown_experiments

    jobs = [
        (dataset_name, X, y, ratio, experiment_name, experiment)
        for experiment_name, experiment in ALL_EXPERIMENTS.items()
        for dataset_name, X, y, ratio in DATASETS
        if experiment_name in EXPERIMENTS
    ]

    for i, (dataset_name, X, y, ratio, experiment_name, experiment) in enumerate(jobs):
        if not IGNORE_ERRORS:
            run(i, dataset_name, X, y, ratio, experiment_name, experiment)
            continue

        try:
            run(i, dataset_name, X, y, ratio, experiment_name, experiment)
        except TypeError as e:
            # Serialization failures would lose results, so they always surface.
            if "is not JSON serializable" in str(e):
                raise
            print(f"Skipping, {dataset_name}, {ratio:.2f}, {experiment_name}: {e!r}")
        except Exception as e:
            # Only ordinary errors are skipped; KeyboardInterrupt/SystemExit
            # still stop the loop instead of silently moving to the next job.
            print(f"Skipping, {dataset_name}, {ratio:.2f}, {experiment_name}: {e!r}")

Loading, diabetes(id=37), 0.54, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ...
Validating, diabetes(id=37), 0.54, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ... 0 1 2 3 4 
Loading, diabetes(id=37), 0.15, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ...
Validating, diabetes(id=37), 0.15, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ... 0 1 2 3 4 
Loading, diabetes(id=37), 0.10, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ...
Validating, diabetes(id=37), 0.10, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ... 0 1 2 3 4 
Loading, diabetes(id=37), 0.05, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ...
Validating, diabetes(id=37), 0.05, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ... 0 1 2 3 4 
Loading, pc3(id=1050), 0.11, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ...
Validating, pc3(id=1050), 0.11, all_SMOTE_like+weighting+none-default=SVMSMOTE-META=25 ... 0 1 2 3 4 
Loading, pc3(id=1050), 0.1

KeyboardInterrupt: 