In [1]:
# Toggle: keep this True to put the local auto-sklearn checkout (the version
# that supports Resamplers) first on the module search path; set it to False
# to skip the path change.
if True:
    import sys

    # Slice assignment at [:0] prepends in place, so this entry is searched
    # before every other location already on sys.path.
    sys.path[:0] = ["../my_autosklearn"]

Create My_SVMSMOTE for balancing

In [2]:
from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, INPUT, SIGNED_DATA, SPARSE, UNSIGNED_DATA
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter, UniformFloatHyperparameter


class My_SVMSMOTE(AutoSklearnPreprocessingAlgorithm):
    """Balancing component that oversamples with imbalanced-learn's SVMSMOTE.

    Parameters
    ----------
    k_neighbors : int, default=5
        Forwarded to ``SVMSMOTE(k_neighbors=...)``.
    sampling_strategy : str, default="minority"
        Forwarded to ``SVMSMOTE(sampling_strategy=...)``.
    C : float, default=1.0
        ``C`` of the ``SVC`` that is handed to SVMSMOTE as ``svm_estimator``.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``SVMSMOTE`` so that repeated runs with the same
        seed generate the same synthetic samples.
    """

    def __init__(self, k_neighbors=5, sampling_strategy="minority", C=1.0, random_state=None):
        self.random_state = random_state
        self.k_neighbors = k_neighbors
        self.sampling_strategy = sampling_strategy
        self.C = C

    def fit_resample(self, X, y):
        """Oversample ``(X, y)`` and return the resampled ``(X, y)`` pair."""
        # Imported lazily so the heavy dependencies are only loaded when the
        # component is actually used.
        from imblearn.over_sampling import SVMSMOTE
        from sklearn.svm import SVC

        sampler = SVMSMOTE(
            k_neighbors=self.k_neighbors,
            sampling_strategy=self.sampling_strategy,
            svm_estimator=SVC(
                C=self.C,
            ),
            # Without the seed the sampler is not reproducible, which would
            # contradict "is_deterministic": True in get_properties().
            random_state=self.random_state,
        )
        return sampler.fit_resample(X, y)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {
            "shortname": "SVMSMOTE",
            "name": "SVMSMOTE",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": False,
            "handles_multilabel": False,
            "handles_multioutput": False,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(feat_type=None, dataset_properties=None):
        """Build the ConfigurationSpace over ``k_neighbors``, ``sampling_strategy`` and ``C``."""
        cs = ConfigurationSpace()
        # NOTE: This is just to showcase possibilities of ConfigSpace
        cs.add_hyperparameters([
            UniformIntegerHyperparameter(name="k_neighbors", lower=3, upper=10, default_value=5, log=False),
            CategoricalHyperparameter(name="sampling_strategy", choices=["all", "not minority", "not majority", "minority"], default_value="minority"),
            # C is searched on a log scale because its range spans several
            # orders of magnitude (0.03125 .. 32768).
            UniformFloatHyperparameter(name="C", lower=0.03125, upper=32768, default_value=1.0, log=True),
        ])
        return cs

Add My_SVMSMOTE to auto-sklearn

In [3]:
# Register My_SVMSMOTE with the balancing step so that it can later be selected
# by name through `include={"balancing": ["My_SVMSMOTE"]}`.
# NOTE(review): `add_preprocessor` is presumably provided by the local
# auto-sklearn checkout put on sys.path in the first cell -- confirm.
import autosklearn.pipeline.components.data_preprocessing.balancing
autosklearn.pipeline.components.data_preprocessing.balancing.add_preprocessor(My_SVMSMOTE)

Create LOF for anomaly detection

In [4]:
from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, INPUT, SIGNED_DATA, SPARSE, UNSIGNED_DATA
from ConfigSpace.hyperparameters import UniformFloatHyperparameter


class My_LOF(AutoSklearnPreprocessingAlgorithm):
    """Resampler that drops the most outlying samples according to LOF.

    Parameters
    ----------
    p : float, default=0.01
        Fraction of samples to remove; ``int((1 - p) * n_samples)`` samples
        are kept.
    random_state : optional
        Stored on the instance; not used by ``fit_resample``.
    """

    def __init__(self, p=0.01, random_state=None):
        self.random_state = random_state
        self.p = p

    def fit_resample(self, X, y):
        """Return ``(X, y)`` restricted to the most inlier-like samples.

        The kept rows are returned in their original relative order, so the
        step only removes samples and never permutes the training data.
        Both ``X`` and ``y`` must support integer-array indexing.
        """
        from sklearn.neighbors import LocalOutlierFactor
        import numpy as np

        clf = LocalOutlierFactor()
        clf.fit_predict(X)
        factors = clf.negative_outlier_factor_

        n_keep = int((1 - self.p) * len(factors))
        # Ascending argsort of -factors ranks samples from most to least
        # normal (larger negative_outlier_factor_ means more normal); keep
        # the first n_keep of that ranking.
        ranked = np.argsort(-factors)
        # Re-sort the kept indices so rows come back in input order instead
        # of outlier-score order.
        inliers = np.sort(ranked[:n_keep])
        return X[inliers], y[inliers]

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {
            "shortname": "LOF",
            "name": "LOF",
            "handles_regression": True,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": False,
            "handles_multioutput": False,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(feat_type=None, dataset_properties=None):
        """Build the ConfigurationSpace containing the single hyperparameter ``p``."""
        cs = ConfigurationSpace()
        # parameter that controls percentage of instances to remove
        cs.add_hyperparameters([
            UniformFloatHyperparameter(name="p", lower=0.0, upper=0.1, default_value=0.01, log=False)
        ])
        return cs

Create an AutoSklearnChoice subclass that can encapsulate several anomaly detection methods (currently only My_LOF)

In [5]:
from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import AutoSklearnChoice
from collections import OrderedDict
from ConfigSpace.hyperparameters import CategoricalHyperparameter


# You can also customize methods:
# - get_available_components
# - set_hyperparameters
# But it is unlikely that you would need to.
class My_AnomalyDetectionChoice(AutoSklearnChoice):
    """Choice node that selects one anomaly detection component.

    Only ``My_LOF`` is registered at the moment; further components can be
    offered by adding them to ``get_components``.
    """

    def __init__(self, feat_type, dataset_properties, random_state=None):
        # feat_type and dataset_properties are accepted so the constructor can
        # be called with these keywords, but only random_state is stored.
        self.random_state = random_state

    @classmethod
    def get_components(cls):
        """Return the ordered mapping of component name -> component class."""
        return OrderedDict([("My_LOF", My_LOF)])

    def get_hyperparameter_search_space(self, feat_type, dataset_properties=None, default=None, include=None, exclude=None):
        """Build the conditional search space over the available components.

        Parameters
        ----------
        feat_type
            Accepted for interface compatibility; not used here.
        dataset_properties : dict or None
            Passed to ``get_available_components`` and to each component's own
            search-space builder. ``None`` is treated as ``{}``.
        default : str or None
            Name of the component to use as the default choice. When ``None``,
            ``"My_LOF"`` is used if it is available, otherwise the first
            available component.
        include, exclude
            Forwarded unchanged to ``get_available_components``.

        Raises
        ------
        ValueError
            If no component is left after filtering.
        """
        if dataset_properties is None:
            dataset_properties = {}

        # NOTE(review): get_available_components is not defined in this class;
        # presumably inherited from AutoSklearnChoice -- confirm.
        available_components = self.get_available_components(dataset_properties=dataset_properties, include=include, exclude=exclude)
        if not available_components:
            raise ValueError("No anomaly detection component is available; check include/exclude.")

        if default is None:
            # Prefer My_LOF, but never name a default that was filtered out.
            if "My_LOF" in available_components:
                default = "My_LOF"
            else:
                default = next(iter(available_components))

        choice = CategoricalHyperparameter("__choice__", list(available_components.keys()), default_value=default)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(choice)

        # Attach every component's own space under its name, active only when
        # "__choice__" equals that name.
        for name, preprocessor in available_components.items():
            space = preprocessor.get_hyperparameter_search_space(dataset_properties=dataset_properties)
            cs.add_configuration_space(name, space, parent_hyperparameter={"parent": choice, "value": name})
        return cs

    def fit_resample(self, X, y):
        """Delegate resampling to the currently selected component."""
        # NOTE(review): self.choice is not assigned in this class; presumably
        # set by the inherited set_hyperparameters -- confirm.
        return self.choice.fit_resample(X, y)

Add My_AnomalyDetectionChoice to the classification pipeline of auto-sklearn

In [6]:
# Patch SimpleClassificationPipeline so that every pipeline gets an extra
# "my_anomaly_detection" step. This cell is safe to run more than once.
from autosklearn.pipeline.classification import SimpleClassificationPipeline

# The "already patched" marker lives on the installed function itself rather
# than in a notebook variable, so it survives a namespace reset for as long as
# the patched class stays imported. That prevents wrapping the wrapper again.
if not getattr(SimpleClassificationPipeline._get_pipeline_steps, "_anomaly_step_added", False):
    old_pipeline_steps = SimpleClassificationPipeline._get_pipeline_steps

    def _wrap_pipeline_steps(original_steps):
        """Return a replacement for ``_get_pipeline_steps`` that wraps ``original_steps``."""

        # original_steps is captured by closure: the wrapper must not depend on
        # a notebook global that can be deleted or rebound to the wrapper.
        def _get_pipeline_steps(self, dataset_properties, feat_type=None):
            default_dataset_properties = {"target_type": "classification"}
            if dataset_properties is not None and isinstance(dataset_properties, dict):
                default_dataset_properties.update(dataset_properties)

            steps = original_steps(self, dataset_properties, feat_type)
            choice = My_AnomalyDetectionChoice(feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state)
            # Index 1 places the new step directly after the first step that
            # the original method returns.
            steps.insert(1, ("my_anomaly_detection", choice))
            return steps

        _get_pipeline_steps._anomaly_step_added = True
        return _get_pipeline_steps

    _get_pipeline_steps = _wrap_pipeline_steps(old_pipeline_steps)
    SimpleClassificationPipeline._get_pipeline_steps = _get_pipeline_steps

# Kept for backward compatibility with code that checks this flag.
STEP_ADDED = True

Run auto-sklearn

In [7]:
from autosklearn.classification import AutoSklearnClassifier
from sklearn.datasets import load_breast_cancer


# Load the breast cancer dataset as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)
model = AutoSklearnClassifier(
    include = {
        "balancing": ["My_SVMSMOTE"],  # restrict balancing to only My_SVMSMOTE
    },
    # Relative path, so it is resolved against the notebook's working directory.
    tmp_folder="temp/anomaly",
    # Take 0 initial configurations from meta-learning.
    initial_configurations_via_metalearning=0,
    # Total time budget for the whole search; the progress bar in the output
    # reports it as 0:00:30.
    time_left_for_this_task=30,
    # Limit for a single run (presumably seconds, like the total budget).
    per_run_time_limit=10,
)
# NOTE(review): the model is fitted on the full dataset; no hold-out split is
# made in this cell.
model.fit(X, y)
# Last expression of the cell, so the returned description of the models is
# displayed as the cell output.
model.show_models()

Fitting to the training data: 100%|[32m██████████[0m| 30/30 [00:21<00:00,  1.41it/s, The total time budget for this task is 0:00:30]


{2: {'model_id': 2,
  'rank': 2,
  'cost': 0.04255319148936165,
  'ensemble_weight': 0.04,
  'data_preprocessor': FeatTypeSplit(numerical_transformer:imputation:strategy: mean,
  		numerical_transformer:rescaling:__choice__: standardize,),
  'my_anomaly_detection': My_LOF(random_state=1),
  'balancing': My_SVMSMOTE(random_state=1),
  'feature_preprocessor': NoPreprocessing(random_state=None),
  'classifier': RandomForest(bootstrap=True, criterion='gini', max_depth=None, max_features=0.5,
               max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1,
               min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=1),
  'sklearn_classifier': RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1,
                         random_state=1, warm_start=True)},
 3: {'model_id': 3,
  'rank': 3,
  'cost': 0.04255319148936165,
  'ensemble_weight': 0.32,
  'data_preprocessor': FeatTypeSplit(numerical_transformer:imputation:strategy: median,
  		numeric