-
Notifications
You must be signed in to change notification settings - Fork 83
/
smac.py
319 lines (288 loc) · 14.1 KB
/
smac.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# Copyright 2019 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
import sys
import time
import traceback
from sklearn.model_selection import train_test_split
from sklearn.model_selection._split import check_cv
from sklearn.metrics import log_loss
from sklearn.metrics.scorer import check_scoring
# Import ConfigSpace and different types of parameters
from smac.configspace import ConfigurationSpace
# Import SMAC-utilities
from smac.facade.smac_facade import SMAC as orig_SMAC
from smac.scenario.scenario import Scenario
from smac.tae.execute_ta_run import BudgetExhaustedException
from lale.helpers import cross_val_score_track_trials
from lale.lib.sklearn import LogisticRegression
import lale.operators
from lale.search.lale_smac import lale_op_smac_tae, get_smac_space, lale_trainable_op_from_config
import lale.sklearn_compat
import lale.docstrings
# Module-level logger, named after this module per standard logging convention.
logger = logging.getLogger(__name__)
class SMACImpl:
    """Lale wrapper around the SMAC optimizer: searches the configuration
    space of a planned Lale operator/pipeline and keeps the best trained
    instantiation found."""

    def __init__(self, estimator=None, max_evals=50, cv=5, handle_cv_failure=False,
                 scoring='accuracy', best_score=0.0, max_opt_time=None, lale_num_grids=None):
        """ Instantiate the SMAC that will use the given estimator and other parameters to select the
        best performing trainable instantiation of the estimator.

        Parameters
        ----------
        estimator : lale.operators.IndividualOp or lale.operators.Pipeline, optional
            A valid Lale individual operator or pipeline, by default LogisticRegression
        max_evals : int, optional
            Number of trials of SMAC search i.e. runcount_limit of SMAC, by default 50
        cv : an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
            Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
            Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.
            The fit method performs cross validation on the input dataset per trial,
            and uses the mean cross validation performance for optimization. This behavior is also impacted by the handle_cv_failure flag,
            by default 5
        handle_cv_failure : bool, optional
            A boolean flag indicating how to deal with cross validation failure for a trial.
            If True, the trial is continued by doing a 80-20 percent train-validation split of the dataset input to fit
            and reporting the score on the validation part.
            If False, the trial is terminated by assigning status to FAIL.
            By default, False.
        scoring : string or a scorer object created using
            https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
            A string from sklearn.metrics.SCORERS.keys() can be used or a scorer created from one of
            sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
            A completely custom scorer object can be created from a python function following the example at
            https://scikit-learn.org/stable/modules/model_evaluation.html
            The metric has to return a scalar value, and note that scikit-learn's scorer object always returns values such that
            higher score is better. Since SMAC solves a minimization problem, we pass (best_score - score) as the cost.
            By default, 'accuracy'.
        best_score : float, optional
            The best score for the specified scorer. This allows us to report a cost to SMAC that is
            greater than or equal to zero, where zero is the best cost. By default, zero.
        max_opt_time : float, optional
            Maximum amount of wall clock time in seconds for the optimization. By default, None, implying no runtime
            bound.
        lale_num_grids : None, float, or int, optional
            If None, keep all search grids; a float in (0, 1) keeps that fraction of the
            grids; an integer >= 1 keeps that many grids. By default, None.

        Examples
        --------
        >>> from sklearn.metrics import make_scorer, f1_score, accuracy_score
        >>> lr = LogisticRegression()
        >>> clf = SMAC(estimator=lr, scoring='accuracy', cv=5)
        >>> from sklearn import datasets
        >>> diabetes = datasets.load_diabetes()
        >>> X = diabetes.data[:150]
        >>> y = diabetes.target[:150]
        >>> trained = clf.fit(X, y)
        >>> predictions = trained.predict(X)

        Other scoring metrics:

        >>> clf = SMAC(estimator=lr, scoring=make_scorer(f1_score, average='macro'), cv=3, max_evals=2)
        """
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        self.search_space: ConfigurationSpace = get_smac_space(
            self.estimator, lale_num_grids=lale_num_grids)
        self.scoring = scoring
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self.max_opt_time = max_opt_time
        # Scenario object
        scenario_options = {"run_obj": "quality",  # we optimize quality (alternatively runtime)
                            "runcount-limit": self.max_evals,  # maximum function evaluations
                            "cs": self.search_space,  # configuration space
                            "deterministic": "true",
                            "abort_on_first_run_crash": False,
                            }
        if max_opt_time is not None:
            scenario_options["wallclock_limit"] = max_opt_time
        self.scenario = Scenario(scenario_options)
        self.trials = None
        # Fix: initialize eagerly so predict()/get_pipeline() do not raise
        # AttributeError when fit() was never called, or when fit() exits via
        # BudgetExhaustedException before any incumbent was trained.
        self._best_estimator = None

    def fit(self, X_train, y_train):
        """Run the SMAC search over the estimator's configuration space and
        refit the best configuration found on all of (X_train, y_train).

        Returns self; the trained incumbent (or None on failure) is kept in
        self._best_estimator, and the SMAC RunHistory in self.trials.
        """
        # Normalize cv (int -> StratifiedKFold, etc.); note this overwrites the
        # constructor value with a splitter object.
        self.cv = check_cv(self.cv, y=y_train, classifier=True)  # TODO: Replace the classifier flag value by using tags?

        def smac_train_test(trainable, X_train, y_train):
            # Evaluate one trainable via cross validation; optionally fall back
            # to a single 80-20 split when CV fails and handle_cv_failure is set.
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable, X_train, y_train, cv=self.cv, scoring=self.scoring)
                logger.debug("Successful trial of SMAC")
            except BaseException as e:
                # If there is any error in cross validation, use the score based
                # on a random train-test split as the evaluation criterion
                if self.handle_cv_failure:
                    X_train_part, X_validation, y_train_part, y_validation = train_test_split(
                        X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(trained, X_validation, y_validation)
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        # log loss is auxiliary information only; do not fail the trial
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
                    raise e
            return cv_score, logloss, execution_time

        def f(trainable):
            # Objective for SMAC: smaller is better, so report (best_score - score).
            return_dict = {}
            try:
                score, logloss, execution_time = smac_train_test(
                    trainable, X_train=X_train, y_train=y_train)
                return_dict = {
                    'loss': self.best_score - score,
                    'time': execution_time,
                    'log_loss': logloss
                }
            except BaseException as e:
                # Re-raise so SMAC records the trial as crashed.
                logger.warning(f"Exception caught in SMACCV:{type(e)}, {traceback.format_exc()}, SMAC will set a cost_for_crash to MAXINT.")
                raise e
            return return_dict['loss']

        try:
            smac = orig_SMAC(scenario=self.scenario, rng=np.random.RandomState(42),
                             tae_runner=lale_op_smac_tae(self.estimator, f))
            incumbent = smac.optimize()
            self.trials = smac.get_runhistory()
            trainable = lale_trainable_op_from_config(self.estimator, incumbent)
            # get the trainable corresponding to the best params and train it on
            # the entire training dataset.
            trained = trainable.fit(X_train, y_train)
            self._best_estimator = trained
        except BudgetExhaustedException:
            # _best_estimator keeps whatever was set before (None if nothing was).
            logger.warning('Maximum alloted optimization time exceeded. Optimization exited prematurely')
        except BaseException as e:
            logger.warning('Error during optimization: {}'.format(e))
            self._best_estimator = None
        return self

    def predict(self, X_eval):
        """Predict with the best estimator found by fit.

        Returns None when no trained estimator is available (fit failed or was
        never called) or when prediction raises a ValueError.
        """
        import warnings
        warnings.filterwarnings("ignore")
        trained = self._best_estimator
        if trained is None:
            # Fix: previously this crashed with AttributeError; degrade
            # gracefully, consistent with the ValueError path below.
            logger.warning("predict called on SMACCV without a trained estimator; returning None.")
            return None
        try:
            predictions = trained.predict(X_eval)
        except ValueError as e:
            logger.warning("ValueError in predicting using SMACCV:{}, the error is:{}".format(trained, e))
            predictions = None
        return predictions

    def get_trials(self):
        """Returns the trials i.e. RunHistory object.

        Returns
        -------
        smac.runhistory.runhistory.RunHistory
            RunHistory of all the trials executed during the optimization i.e. fit method of SMACCV.
        """
        return self.trials

    def get_pipeline(self, pipeline_name=None, astype='lale'):
        """Return the best pipeline found by fit, as a Lale operator or wrapped
        for scikit-learn compatibility.

        Parameters
        ----------
        pipeline_name : must be None; retrieval by name is not implemented yet.
        astype : 'lale' (default) or 'sklearn'.

        Raises
        ------
        NotImplementedError
            If pipeline_name is not None.
        """
        if pipeline_name is not None:
            raise NotImplementedError('Cannot get pipeline by name yet.')
        result = getattr(self, '_best_estimator', None)
        if result is None or astype == 'lale':
            return result
        assert astype == 'sklearn', astype
        return lale.sklearn_compat.make_sklearn_compat(result)
_hyperparams_schema = {
'allOf': [
{ 'type': 'object',
'required': [
'estimator', 'max_evals', 'cv', 'handle_cv_failure',
'max_opt_time', 'lale_num_grids'],
'relevantToOptimizer': ['estimator'],
'additionalProperties': False,
'properties': {
'estimator': {
'anyOf': [
{ 'laleType': 'operator',
'not': {'enum': [None]}},
{ 'enum': [None]}],
'default': None},
'max_evals': {
'type': 'integer',
'minimum': 1,
'default': 50},
'cv': {
'type': 'integer',
'minimum': 1,
'default': 5},
'handle_cv_failure': {
'type': 'boolean',
'default': False},
'scoring': {
'anyOf': [
{ 'description': 'Custom scorer object, see https://scikit-learn.org/stable/modules/model_evaluation.html',
'not': {'type': 'string'}},
{ 'enum': [
'accuracy', 'explained_variance', 'max_error',
'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted',
'balanced_accuracy', 'average_precision',
'neg_log_loss', 'neg_brier_score', 'r2', 'neg_mean_squared_error', 'neg_mean_absolute_error',
'neg_root_mean_squared_error', 'neg_mean_squared_log_error',
'neg_median_absolute_error']}],
'default': 'accuracy'},
'best_score': {
'type': 'number',
'default': 0.0},
'max_opt_time': {
'anyOf': [
{ 'type': 'number',
'minimum': 0.0},
{ 'enum': [None]}],
'default': None},
'lale_num_grids': {
'anyOf': [
{ 'description': 'If not set, keep all grids.',
'enum': [None]},
{ 'description': 'Fraction of grids to keep.',
'type': 'number',
'minimum': 0.0,
'exclusiveMinimum': True,
'maximum': 1.0,
'exclusiveMaximum': True},
{ 'description': 'Number of grids to keep.',
'type': 'integer',
'minimum': 1}],
'default': None}
}}]}
_input_fit_schema = {
'type': 'object',
'required': ['X', 'y'],
'properties': {
'X': {
'type': 'array',
'items': {
'anyOf': [
{ 'type': 'array', 'items': {'type': ['number', 'string']}},
{ 'type': 'string'}]}},
'y': {
'type': 'array', 'items': {'type': 'number'}}}}
_input_predict_schema = {
'type': 'object',
'properties': {
'X': {
'type': 'array',
'items': {
'anyOf': [
{ 'type': 'array', 'items': {'type': ['number', 'string']}},
{ 'type': 'string'}]}}}}
_output_predict_schema = {
'type': 'array', 'items': {'type': 'number'}}
# Combined schema bundle consumed by lale.docstrings / make_operator below.
_combined_schemas = {
    # Fix: URL previously pointed at the hyperopt_cv module page (copy-paste);
    # this module documents the SMAC operator.
    'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.smac.html',
    'description': 'SMAC, the optimizer used inside auto-weka and auto-sklearn.',
    'type': 'object',
    'tags': {
        'pre': [],
        'op': ['estimator'],
        'post': []},
    'properties': {
        'hyperparams': _hyperparams_schema,
        'input_fit': _input_fit_schema,
        'input_predict': _input_predict_schema,
        'output_predict': _output_predict_schema}}
# Generate docstrings for SMACImpl from the combined schemas.
lale.docstrings.set_docstrings(SMACImpl, _combined_schemas)
# The exported Lale operator; hyperparameters are validated against _hyperparams_schema.
SMAC = lale.operators.make_operator(SMACImpl, _combined_schemas)