-
Notifications
You must be signed in to change notification settings - Fork 83
/
smac.py
319 lines (288 loc) · 14.1 KB
/
smac.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# Copyright 2019 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
import sys
import time
import traceback
from sklearn.model_selection import train_test_split
from sklearn.model_selection._split import check_cv
from sklearn.metrics import log_loss
from sklearn.metrics.scorer import check_scoring
# Import ConfigSpace and different types of parameters
from smac.configspace import ConfigurationSpace
# Import SMAC-utilities
from smac.facade.smac_facade import SMAC as orig_SMAC
from smac.scenario.scenario import Scenario
from smac.tae.execute_ta_run import BudgetExhaustedException
from lale.helpers import cross_val_score_track_trials
from lale.lib.sklearn import LogisticRegression
import lale.operators
from lale.search.lale_smac import lale_op_smac_tae, get_smac_space, lale_trainable_op_from_config
import lale.sklearn_compat
import lale.docstrings
# Module-level logger, named after this module per standard logging convention.
logger = logging.getLogger(__name__)
class SMACImpl:
    """Lale wrapper around the SMAC optimizer: searches the configuration
    space of a planned Lale operator/pipeline and keeps the best trained
    instantiation found."""

    def __init__(self, estimator=None, max_evals=50, cv=5, handle_cv_failure=False,
                 scoring='accuracy', best_score=0.0, max_opt_time=None, lale_num_grids=None):
        """ Instantiate the SMAC that will use the given estimator and other parameters to select the
        best performing trainable instantiation of the estimator.

        Parameters
        ----------
        estimator : lale.operators.IndividualOp or lale.operators.Pipeline, optional
            A valid Lale individual operator or pipeline, by default LogisticRegression
        max_evals : int, optional
            Number of trials of SMAC search i.e. runcount_limit of SMAC, by default 50
        cv : an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
            Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
            Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.
            The fit method performs cross validation on the input dataset per trial,
            and uses the mean cross validation performance for optimization. This behavior is also impacted by the handle_cv_failure flag,
            by default 5
        handle_cv_failure : bool, optional
            A boolean flag indicating how to deal with cross validation failure for a trial.
            If True, the trial is continued by doing a 80-20 percent train-validation split of the dataset input to fit
            and reporting the score on the validation part.
            If False, the trial is terminated by assigning status to FAIL.
            By default, False.
        scoring : string or a scorer object created using
            https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
            A string from sklearn.metrics.SCORERS.keys() can be used or a scorer created from one of
            sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
            A completely custom scorer object can be created from a python function following the example at
            https://scikit-learn.org/stable/modules/model_evaluation.html
            The metric has to return a scalar value, and note that scikit-learn's scorer object always returns values such that
            higher score is better. Since SMAC solves a minimization problem, we pass (best_score - score) as the cost.
            By default, 'accuracy'.
        best_score : float, optional
            The best score for the specified scorer. This allows us to report a cost to SMAC that is
            greater than or equal to zero, where zero is the best cost. By default, zero.
        max_opt_time : float, optional
            Maximum amount of wall clock time in seconds for the optimization. By default, None, implying no runtime
            bound.
        lale_num_grids : None, float, or int, optional
            If None, keep all search grids; a float in (0, 1) keeps that fraction of the
            grids; an integer >= 1 keeps that many grids. By default, None.

        Examples
        --------
        >>> from sklearn.metrics import make_scorer, f1_score, accuracy_score
        >>> lr = LogisticRegression()
        >>> clf = SMAC(estimator=lr, scoring='accuracy', cv=5)
        >>> from sklearn import datasets
        >>> diabetes = datasets.load_diabetes()
        >>> X = diabetes.data[:150]
        >>> y = diabetes.target[:150]
        >>> trained = clf.fit(X, y)
        >>> predictions = trained.predict(X)

        Other scoring metrics:

        >>> clf = SMAC(estimator=lr, scoring=make_scorer(f1_score, average='macro'), cv=3, max_evals=2)
        """
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        self.search_space: ConfigurationSpace = get_smac_space(
            self.estimator, lale_num_grids=lale_num_grids)
        self.scoring = scoring
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self.max_opt_time = max_opt_time
        # Scenario object
        scenario_options = {"run_obj": "quality",  # we optimize quality (alternatively runtime)
                            "runcount-limit": self.max_evals,  # maximum function evaluations
                            "cs": self.search_space,  # configuration space
                            "deterministic": "true",
                            "abort_on_first_run_crash": False,
                            }
        if max_opt_time is not None:
            scenario_options["wallclock_limit"] = max_opt_time
        self.scenario = Scenario(scenario_options)
        self.trials = None
        # Fix: initialize eagerly so predict()/get_pipeline() do not raise
        # AttributeError when fit() was never called, or when fit() exits via
        # BudgetExhaustedException before any incumbent was trained.
        self._best_estimator = None

    def fit(self, X_train, y_train):
        """Run the SMAC search over the estimator's configuration space and
        refit the best configuration found on all of (X_train, y_train).

        Returns self; the trained incumbent (or None on failure) is kept in
        self._best_estimator, and the SMAC RunHistory in self.trials.
        """
        # Normalize cv (int -> StratifiedKFold, etc.); note this overwrites the
        # constructor value with a splitter object.
        self.cv = check_cv(self.cv, y=y_train, classifier=True)  # TODO: Replace the classifier flag value by using tags?

        def smac_train_test(trainable, X_train, y_train):
            # Evaluate one trainable via cross validation; optionally fall back
            # to a single 80-20 split when CV fails and handle_cv_failure is set.
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable, X_train, y_train, cv=self.cv, scoring=self.scoring)
                logger.debug("Successful trial of SMAC")
            except BaseException as e:
                # If there is any error in cross validation, use the score based
                # on a random train-test split as the evaluation criterion
                if self.handle_cv_failure:
                    X_train_part, X_validation, y_train_part, y_validation = train_test_split(
                        X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(trained, X_validation, y_validation)
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        # log loss is auxiliary information only; do not fail the trial
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
                    raise e
            return cv_score, logloss, execution_time

        def f(trainable):
            # Objective for SMAC: smaller is better, so report (best_score - score).
            return_dict = {}
            try:
                score, logloss, execution_time = smac_train_test(
                    trainable, X_train=X_train, y_train=y_train)
                return_dict = {
                    'loss': self.best_score - score,
                    'time': execution_time,
                    'log_loss': logloss
                }
            except BaseException as e:
                # Re-raise so SMAC records the trial as crashed.
                logger.warning(f"Exception caught in SMACCV:{type(e)}, {traceback.format_exc()}, SMAC will set a cost_for_crash to MAXINT.")
                raise e
            return return_dict['loss']

        try:
            smac = orig_SMAC(scenario=self.scenario, rng=np.random.RandomState(42),
                             tae_runner=lale_op_smac_tae(self.estimator, f))
            incumbent = smac.optimize()
            self.trials = smac.get_runhistory()
            trainable = lale_trainable_op_from_config(self.estimator, incumbent)
            # get the trainable corresponding to the best params and train it on
            # the entire training dataset.
            trained = trainable.fit(X_train, y_train)
            self._best_estimator = trained
        except BudgetExhaustedException:
            # _best_estimator keeps whatever was set before (None if nothing was).
            logger.warning('Maximum alloted optimization time exceeded. Optimization exited prematurely')
        except BaseException as e:
            logger.warning('Error during optimization: {}'.format(e))
            self._best_estimator = None
        return self

    def predict(self, X_eval):
        """Predict with the best estimator found by fit.

        Returns None when no trained estimator is available (fit failed or was
        never called) or when prediction raises a ValueError.
        """
        import warnings
        warnings.filterwarnings("ignore")
        trained = self._best_estimator
        if trained is None:
            # Fix: previously this crashed with AttributeError; degrade
            # gracefully, consistent with the ValueError path below.
            logger.warning("predict called on SMACCV without a trained estimator; returning None.")
            return None
        try:
            predictions = trained.predict(X_eval)
        except ValueError as e:
            logger.warning("ValueError in predicting using SMACCV:{}, the error is:{}".format(trained, e))
            predictions = None
        return predictions

    def get_trials(self):
        """Returns the trials i.e. RunHistory object.

        Returns
        -------
        smac.runhistory.runhistory.RunHistory
            RunHistory of all the trials executed during the optimization i.e. fit method of SMACCV.
        """
        return self.trials

    def get_pipeline(self, pipeline_name=None, astype='lale'):
        """Return the best pipeline found by fit, as a Lale operator or wrapped
        for scikit-learn compatibility.

        Parameters
        ----------
        pipeline_name : must be None; retrieval by name is not implemented yet.
        astype : 'lale' (default) or 'sklearn'.

        Raises
        ------
        NotImplementedError
            If pipeline_name is not None.
        """
        if pipeline_name is not None:
            raise NotImplementedError('Cannot get pipeline by name yet.')
        result = getattr(self, '_best_estimator', None)
        if result is None or astype == 'lale':
            return result
        assert astype == 'sklearn', astype
        return lale.sklearn_compat.make_sklearn_compat(result)
_hyperparams_schema = {
'allOf': [
{ 'type': 'object',
'required': [
'estimator', 'max_evals', 'cv', 'handle_cv_failure',
'max_opt_time', 'lale_num_grids'],
'relevantToOptimizer': ['estimator'],
'additionalProperties': False,
'properties': {
'estimator': {
'anyOf': [
{ 'laleType': 'operator',
'not': {'enum': [None]}},
{ 'enum': [None]}],
'default': None},
'max_evals': {
'type': 'integer',
'minimum': 1,
'default': 50},
'cv': {
'type': 'integer',
'minimum': 1,
'default': 5},
'handle_cv_failure': {
'type': 'boolean',
'default': False},
'scoring': {
'anyOf': [
{ 'description': 'Custom scorer object, see https://scikit-learn.org/stable/modules/model_evaluation.html',
'not': {'type': 'string'}},
{ 'enum': [
'accuracy', 'explained_variance', 'max_error',
'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted',
'balanced_accuracy', 'average_precision',
'neg_log_loss', 'neg_brier_score', 'r2', 'neg_mean_squared_error', 'neg_mean_absolute_error',
'neg_root_mean_squared_error', 'neg_mean_squared_log_error',
'neg_median_absolute_error']}],
'default': 'accuracy'},
'best_score': {
'type': 'number',
'default': 0.0},
'max_opt_time': {
'anyOf': [
{ 'type': 'number',
'minimum': 0.0},
{ 'enum': [None]}],
'default': None},
'lale_num_grids': {
'anyOf': [
{ 'description': 'If not set, keep all grids.',
'enum': [None]},
{ 'description': 'Fraction of grids to keep.',
'type': 'number',
'minimum': 0.0,
'exclusiveMinimum': True,
'maximum': 1.0,
'exclusiveMaximum': True},
{ 'description': 'Number of grids to keep.',
'type': 'integer',
'minimum': 1}],
'default': None}
}}]}
_input_fit_schema = {
'type': 'object',
'required': ['X', 'y'],
'properties': {
'X': {
'type': 'array',
'items': {
'anyOf': [
{ 'type': 'array', 'items': {'type': ['number', 'string']}},
{ 'type': 'string'}]}},
'y': {
'type': 'array', 'items': {'type': 'number'}}}}
_input_predict_schema = {
'type': 'object',
'properties': {
'X': {
'type': 'array',
'items': {
'anyOf': [
{ 'type': 'array', 'items': {'type': ['number', 'string']}},
{ 'type': 'string'}]}}}}
_output_predict_schema = {
'type': 'array', 'items': {'type': 'number'}}
# Combined schema bundle consumed by lale.docstrings / make_operator below.
_combined_schemas = {
    # Fix: URL previously pointed at the hyperopt_cv module page (copy-paste);
    # this module documents the SMAC operator.
    'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.smac.html',
    'description': 'SMAC, the optimizer used inside auto-weka and auto-sklearn.',
    'type': 'object',
    'tags': {
        'pre': [],
        'op': ['estimator'],
        'post': []},
    'properties': {
        'hyperparams': _hyperparams_schema,
        'input_fit': _input_fit_schema,
        'input_predict': _input_predict_schema,
        'output_predict': _output_predict_schema}}
# Generate docstrings for SMACImpl from the combined schemas.
lale.docstrings.set_docstrings(SMACImpl, _combined_schemas)
# The exported Lale operator; hyperparameters are validated against _hyperparams_schema.
SMAC = lale.operators.make_operator(SMACImpl, _combined_schemas)