In [2]:
#default_exp modeling.training

In [3]:
#hide
from nbdev.showdoc import *
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Training Functionality

These functions are designed to help with anything in the training stage of the ML life cycle.

In [4]:
#export
import xgboost
import os
import pickle
import logging

from hyperopt import fmin, tpe, STATUS_OK, Trials, hp, space_eval
from sdsde.wrapper.azurewrapper import blob_pusher
from sklearn import metrics
from fastai.basics import *
from fastai.tabular.all import *

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Hypertuning

### `HpOptMultilabel`

In [5]:
#export


class HpOptMultilabel:
    """Class that hypertunes an arbitrary model for multiclass classification

    Despite the class name, the objective scores a single-label, multiclass
    problem: every candidate is ranked by its one-vs-rest, macro-averaged
    ROC AUC on the held-out test split.

    After `optimize` has run, the hyperopt `Trials` history of the most recent
    search is available as `self.trials` (it is `None` before the first run).
    """

    def __init__(self, X_train, X_test, y_train, y_test, parameter_space=None, model=xgboost.XGBClassifier):
        """Initialization takes in a test and train set and an optional hyperparameter space

        Args:
        * X_train (array): training features
        * X_test (array): testing features
        * y_train (array): training labels
        * y_test (array): testing labels
        * parameter_space (dict): hyperopt compatible parameter space; when `None`
          a default XGBoost multiclass search space is used
        * model (callable): machine learning model class compatible with the parameter
          space; it is called as `model(**params)` and must offer `fit` and `predict_proba`
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model
        # Search history of the latest `optimize` call; nothing has been run yet.
        self.trials = None

        if parameter_space is None:
            # Default space targets the XGBoost keyword arguments. `gpu_id` and
            # `use_label_encoder` are single-option choices, i.e. fixed values.
            self.parameter_space = {
                'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
                'reg_alpha': hp.uniform('reg_alpha', 0, 5),
                'reg_lambda': hp.uniform('reg_lambda', 0, 5),
                'min_child_weight': hp.uniform('min_child_weight', 0, 5),
                'gamma': hp.uniform('gamma', 0, 5),
                'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                'tree_method': hp.choice('tree_method', ['hist', 'exact', 'approx']),
                'objective': hp.choice('objective', ['multi:softmax', 'multi:softprob']),
                'eval_metric': hp.choice('eval_metric', ['mlogloss', 'merror']),
                'gpu_id': hp.choice('gpu_id', [0]),
                'use_label_encoder': hp.choice('use_label_encoder', [False]),
            }
        else:
            self.parameter_space = parameter_space

    def objective(self, params):
        """Objective function for loss that is provided to perform the MINLP
        optimization in hyperopt

        A fresh model is built from `params`, fitted on the training split and
        scored on the test split.

        Args:
        * params (dict): hyperopt formatted dictionary of hyperparameters

        Returns:
        * dict: loss (1 - macro one-vs-rest ROC AUC) and status for hyperopt optimization
        """
        model = self.model(**params)
        model.fit(self.X_train, self.y_train)
        pred_proba = model.predict_proba(self.X_test)
        # hyperopt minimizes, so the AUC (higher is better) is turned into a loss.
        loss = 1 - metrics.roc_auc_score(self.y_test, pred_proba, multi_class='ovr', average='macro')
        return {'loss': loss, 'status': STATUS_OK}

    def optimize(self, max_evals=20):
        """optimizes the hyperparameter space in the object

        The `Trials` object of the search is stored on `self.trials` so the
        individual evaluations can be inspected afterwards.

        Args:
        * max_evals: number of hyperopt iterations

        Returns:
        * dict: best hyperparameters
        """
        # Start every search with a fresh history and keep it on the instance
        # instead of discarding it when the method returns.
        self.trials = Trials()
        best = fmin(fn=self.objective,
                    space=self.parameter_space,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=self.trials)
        # fmin reports `hp.choice` picks as indices; space_eval maps the result
        # back onto the actual parameter values.
        return space_eval(self.parameter_space, best)

In [6]:
# Display the generated documentation for the class and each of its methods.
show_doc(HpOptMultilabel)
show_doc(HpOptMultilabel.__init__)
show_doc(HpOptMultilabel.objective)
show_doc(HpOptMultilabel.optimize)

<h2 id="HpOptMultilabel" class="doc_header"><code>class</code> <code>HpOptMultilabel</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>HpOptMultilabel</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`parameter_space`**=*`None`*, **`model`**=*`XGBClassifier`*)

Class that hypertunes an arbitrary model to multilabel classification
    

<h4 id="HpOptMultilabel.__init__" class="doc_header"><code>HpOptMultilabel.__init__</code><a href="__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptMultilabel.__init__</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`parameter_space`**=*`None`*, **`model`**=*`XGBClassifier`*)

Initialization takes in a test and train set and an optional hyperparameter space

Args:
* X_train (array): training features
* X_test (array): testing features
* y_train (array): testing labels
* y_test (array): testing labels
* parameter_space (dict): hyperopt compatible parameter space
* model (module pointer): machine learning model compatiable with parameter space

<h4 id="HpOptMultilabel.objective" class="doc_header"><code>HpOptMultilabel.objective</code><a href="__main__.py#L44" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptMultilabel.objective</code>(**`params`**)

Objective function for loss that is provided to perform the MINLP
optimizaiton in hyperopt

Args:
* params (dict): hyperopt formated dictionary of hyperparameters

Returns:
* dict: loss and status for hyperopt optimization

<h4 id="HpOptMultilabel.optimize" class="doc_header"><code>HpOptMultilabel.optimize</code><a href="__main__.py#L60" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptMultilabel.optimize</code>(**`max_evals`**=*`20`*)

optimizes the hyperparameter space in the object

Args:
* max_evals: number of hyperopt iterations

Returns:
* dict: best hyperparameters

Example of passing in a custom parameter set to an XGBoost classifier

In [7]:
# Load the iris dataset and split it into train and test sets.
# NOTE: no random_state is passed to train_test_split, so the split (and the
# numbers printed below) can change from run to run.
df = datasets.load_iris()
X = df['data']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Custom hyperopt search space; every key is passed to XGBClassifier as a keyword argument.
parameter_space = {
                    'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
                    'gamma': hp.uniform('gamma', 0, 5),
                    'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                    'objective': hp.choice('objective', ['multi:softmax', 'multi:softprob']),
                    'eval_metric': hp.choice('eval_metric', ['mlogloss', 'merror']),
                    'use_label_encoder': hp.choice('use_label_encoder', [False])
                   }

# Run 10 hyperopt evaluations and build a classifier from the best parameters found.
opt = HpOptMultilabel(X_train, X_test, y_train, y_test, parameter_space=parameter_space)
best = opt.optimize(max_evals=10)
model = xgboost.XGBClassifier(**best)
print(model)

# Refit with the best parameters and report the one-vs-rest macro AUC on the test split.
model.fit(X_train, y_train)
pred_proba = model.predict_proba(X_test)
auc = metrics.roc_auc_score(y_test, pred_proba, multi_class='ovr', average='macro')
print(f'AUC: {auc}')

  0%|                                                     | 0/10 [00:00<?, ?trial/s, best loss=?]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002774 seconds
INFO:hyperopt.tpe:TPE using 0 trials


 10%|██▋                        | 1/10 [00:00<00:04,  2.16trial/s, best loss: 0.0270531400966183]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003591 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 0.027053


 20%|█████                    | 2/10 [00:00<00:03,  2.34trial/s, best loss: 0.022222222222222143]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002915 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 0.022222


 30%|███████▌                 | 3/10 [00:03<00:11,  1.62s/trial, best loss: 0.022222222222222143]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002641 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 0.022222


 40%|██████████               | 4/10 [00:04<00:07,  1.21s/trial, best loss: 0.009661835748792313]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.063942 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 0.009662


 50%|████████████▌            | 5/10 [00:05<00:05,  1.06s/trial, best loss: 0.009661835748792313]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002873 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 0.009662


 60%|███████████████          | 6/10 [00:05<00:03,  1.12trial/s, best loss: 0.009661835748792313]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003395 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 0.009662


 70%|█████████████████▌       | 7/10 [00:06<00:02,  1.19trial/s, best loss: 0.009661835748792313]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002627 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 0.009662


 80%|████████████████████     | 8/10 [00:07<00:01,  1.33trial/s, best loss: 0.009661835748792313]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002477 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 0.009662


 90%|██████████████████████▌  | 9/10 [00:07<00:00,  1.55trial/s, best loss: 0.009661835748792313]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003147 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 0.009662


100%|████████████████████████| 10/10 [00:07<00:00,  1.25trial/s, best loss: 0.009661835748792313]
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9400000000000001,
              enable_categorical=False, eval_metric='mlogloss',
              gamma=1.2432948057293043, gpu_id=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.18219904135916973,
              max_delta_step=None, max_depth=15, min_child_weight=None,
              missing=nan, monotone_constraints=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, objective='multi:softprob',
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None,
              subsample=0.7000000000000001, tree_method=None,
              use_label_encoder=False, validate_parameters=None, ...)
AUC: 0.9903381642512077


Example of passing in a custom set of parameters to a random forest classifier

In [8]:
# Search only over the tree depth; the key is passed to RandomForestClassifier as a keyword argument.
parameter_space = {
                    'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
                   }

# Reuses the iris train/test split created in the previous example cell.
opt = HpOptMultilabel(X_train, X_test, y_train, y_test, parameter_space=parameter_space, model=RandomForestClassifier)
best = opt.optimize(max_evals=10)
model = RandomForestClassifier(**best)
print(model)

# Refit with the best parameters and report the one-vs-rest macro AUC on the test split.
model.fit(X_train, y_train)
pred_proba = model.predict_proba(X_test)
auc = metrics.roc_auc_score(y_test, pred_proba, multi_class='ovr', average='macro')
print(f'AUC: {auc}')

  0%|                                                     | 0/10 [00:00<?, ?trial/s, best loss=?]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.001163 seconds
INFO:hyperopt.tpe:TPE using 0 trials


 10%|██▌                       | 1/10 [00:00<00:00,  9.33trial/s, best loss: 0.01207729468599028]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000951 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 0.012077


 20%|█████▏                    | 2/10 [00:00<00:00,  9.32trial/s, best loss: 0.01207729468599028]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000819 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 0.012077
INFO:hyperopt.tpe:build_posterior_wrapper took 0.000818 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 0.012077


 40%|██████████               | 4/10 [00:00<00:00,  9.98trial/s, best loss: 0.007729468599033784]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000851 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 0.007729


 50%|████████████▌            | 5/10 [00:00<00:00,  9.72trial/s, best loss: 0.007729468599033784]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000816 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 0.007729
INFO:hyperopt.tpe:build_posterior_wrapper took 0.000952 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 0.007729


 70%|█████████████████▌       | 7/10 [00:00<00:00, 10.05trial/s, best loss: 0.005797101449275366]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000877 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 0.005797


 80%|████████████████████     | 8/10 [00:00<00:00,  9.96trial/s, best loss: 0.005797101449275366]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000815 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 0.005797
INFO:hyperopt.tpe:build_posterior_wrapper took 0.000807 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 0.005797


100%|████████████████████████| 10/10 [00:01<00:00,  9.99trial/s, best loss: 0.005797101449275366]
RandomForestClassifier(max_depth=12)
AUC: 0.9898550724637681


### `HpOptBinary`

In [9]:
#export


class HpOptBinary:
    """Class that hypertunes an arbitrary model to binary classification

    Every candidate is ranked by its ROC AUC on the held-out test split, using
    the second column of `predict_proba` as the score of the positive class.

    After `optimize` has run, the hyperopt `Trials` history of the most recent
    search is available as `self.trials` (it is `None` before the first run).
    """

    def __init__(self, X_train, X_test, y_train, y_test, parameter_space=None, model=xgboost.XGBClassifier):
        """Initialization takes in a test and train set and an optional hyperparameter space

        Args:
        * X_train (array): training features
        * X_test (array): testing features
        * y_train (array): training labels
        * y_test (array): testing labels
        * parameter_space (dict): hyperopt compatible parameter space; when `None`
          a default XGBoost binary search space is used
        * model (callable): machine learning model class compatible with the parameter
          space; it is called as `model(**params)` and must offer `fit` and `predict_proba`
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model
        # Search history of the latest `optimize` call; nothing has been run yet.
        self.trials = None

        if parameter_space is None:
            # Default space targets the XGBoost keyword arguments. `gpu_id` and
            # `use_label_encoder` are single-option choices, i.e. fixed values.
            self.parameter_space = {
                'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
                'reg_alpha': hp.uniform('reg_alpha', 0, 5),
                'reg_lambda': hp.uniform('reg_lambda', 0, 5),
                'min_child_weight': hp.uniform('min_child_weight', 0, 5),
                'gamma': hp.uniform('gamma', 0, 5),
                'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                'tree_method': hp.choice('tree_method', ['hist', 'exact', 'approx']),
                'objective': hp.choice('objective', ['binary:logistic', 'binary:logitraw', 'binary:hinge']),
                'eval_metric': hp.choice('eval_metric', ['logloss', 'error', 'auc', 'aucpr', 'map']),
                'gpu_id': hp.choice('gpu_id', [0]),
                'use_label_encoder': hp.choice('use_label_encoder', [False]),
            }
        else:
            self.parameter_space = parameter_space

    def objective(self, params):
        """Objective function for loss that is provided to perform the MINLP
        optimization in hyperopt

        A fresh model is built from `params`, fitted on the training split and
        scored on the test split.

        Args:
        * params (dict): hyperopt formatted dictionary of hyperparameters

        Returns:
        * dict: loss (1 - ROC AUC) and status for hyperopt optimization
        """
        model = self.model(**params)
        model.fit(self.X_train, self.y_train)
        pred_proba = model.predict_proba(self.X_test)
        # Column 1 is used as the positive-class score; hyperopt minimizes, so
        # the AUC (higher is better) is turned into a loss.
        loss = 1 - metrics.roc_auc_score(self.y_test, pred_proba[:, 1])
        return {'loss': loss, 'status': STATUS_OK}

    def optimize(self, max_evals=20):
        """optimizes the hyperparameter space in the object

        The `Trials` object of the search is stored on `self.trials` so the
        individual evaluations can be inspected afterwards.

        Args:
        * max_evals: number of hyperopt iterations

        Returns:
        * dict: best hyperparameters
        """
        # Start every search with a fresh history and keep it on the instance
        # instead of discarding it when the method returns.
        self.trials = Trials()
        best = fmin(fn=self.objective,
                    space=self.parameter_space,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=self.trials)
        # fmin reports `hp.choice` picks as indices; space_eval maps the result
        # back onto the actual parameter values.
        return space_eval(self.parameter_space, best)

In [10]:
# Display the generated documentation for the class and each of its methods.
show_doc(HpOptBinary)
show_doc(HpOptBinary.__init__)
show_doc(HpOptBinary.objective)
show_doc(HpOptBinary.optimize)

<h2 id="HpOptBinary" class="doc_header"><code>class</code> <code>HpOptBinary</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>HpOptBinary</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`parameter_space`**=*`None`*, **`model`**=*`XGBClassifier`*)

Class that hypertunes an arbitrary model to binary classification
    

<h4 id="HpOptBinary.__init__" class="doc_header"><code>HpOptBinary.__init__</code><a href="__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptBinary.__init__</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`parameter_space`**=*`None`*, **`model`**=*`XGBClassifier`*)

Initialization takes in a test and train set and an optional hyperparameter space

Args:
* X_train (array): training features
* X_test (array): testing features
* y_train (array): testing labels
* y_test (array): testing labels
* parameter_space (dict): hyperopt compatible parameter space
* model (module pointer): machine learning model compatiable with parameter space

<h4 id="HpOptBinary.objective" class="doc_header"><code>HpOptBinary.objective</code><a href="__main__.py#L44" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptBinary.objective</code>(**`params`**)

Objective function for loss that is provided to perform the MINLP
optimizaiton in hyperopt

Args:
* params (dict): hyperopt formated dictionary of hyperparameters

Returns:
* dict: loss and status for hyperopt optimization

<h4 id="HpOptBinary.optimize" class="doc_header"><code>HpOptBinary.optimize</code><a href="__main__.py#L60" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptBinary.optimize</code>(**`max_evals`**=*`20`*)

optimizes the hyperparameter space in the object

Args:
* max_evals: number of hyperopt iterations

Returns:
* dict: best hyperparameters

Example of passing in a custom parameter set to an XGBoost classifier

In [11]:
# Load the breast cancer dataset and split it into train and test sets.
# NOTE: no random_state is passed to train_test_split, so the split (and the
# numbers printed below) can change from run to run.
df = datasets.load_breast_cancer()
X = df['data']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Custom hyperopt search space; every key is passed to XGBClassifier as a keyword argument.
parameter_space = {
                    'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
                    'gamma': hp.uniform('gamma', 0, 5),
                    'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                    'objective': hp.choice('objective', ['binary:logistic', 'binary:logitraw', 'binary:hinge']),
                    'eval_metric': hp.choice('eval_metric', ['logloss', 'error', 'auc', 'aucpr', 'map']),
                    'use_label_encoder': hp.choice('use_label_encoder', [False])
                   }

# Run 10 hyperopt evaluations and build a classifier from the best parameters found.
opt = HpOptBinary(X_train, X_test, y_train, y_test, parameter_space=parameter_space)
best = opt.optimize(max_evals=10)
model = xgboost.XGBClassifier(**best)
print(model)

# Refit with the best parameters and report the test AUC, scoring with column 1 of predict_proba.
model.fit(X_train, y_train)
pred_proba = model.predict_proba(X_test)
auc = metrics.roc_auc_score(y_test, pred_proba[:, 1])
print(f'AUC: {auc}')

  0%|                                                     | 0/10 [00:00<?, ?trial/s, best loss=?]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002559 seconds
INFO:hyperopt.tpe:TPE using 0 trials


 10%|██▌                       | 1/10 [00:00<00:03,  2.33trial/s, best loss: 0.01552323056882321]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002964 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 0.015523


 20%|█████                    | 2/10 [00:01<00:04,  1.66trial/s, best loss: 0.005644811115935733]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003888 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 0.005645


 30%|███████▌                 | 3/10 [00:01<00:03,  1.85trial/s, best loss: 0.001085540599218393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002863 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 0.001086


 40%|██████████               | 4/10 [00:02<00:02,  2.04trial/s, best loss: 0.001085540599218393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003050 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 0.001086


 50%|████████████▌            | 5/10 [00:02<00:02,  2.12trial/s, best loss: 0.001085540599218393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003161 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 0.001086


 60%|███████████████          | 6/10 [00:03<00:02,  1.53trial/s, best loss: 0.001085540599218393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003774 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 0.001086


 70%|█████████████████▌       | 7/10 [00:04<00:02,  1.32trial/s, best loss: 0.001085540599218393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002774 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 0.001086


 80%|████████████████████     | 8/10 [00:04<00:01,  1.50trial/s, best loss: 0.001085540599218393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002884 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 0.001086


 90%|██████████████████████▌  | 9/10 [00:05<00:00,  1.78trial/s, best loss: 0.001085540599218393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003308 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 0.001086


100%|████████████████████████| 10/10 [00:05<00:00,  1.76trial/s, best loss: 0.001085540599218393]
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.68,
              enable_categorical=False, eval_metric='error',
              gamma=0.01197826414525871, gpu_id=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06734041838405497,
              max_delta_step=None, max_depth=20, min_child_weight=None,
              missing=nan, monotone_constraints=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, objective='binary:logitraw',
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=0.55,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, ...)
AUC: 0.9989144594007816


In [12]:
# Example of passing a custom parameter space to a random forest classifier.
# The breast cancer data is reloaded and split again; no random_state is passed,
# so this split differs from the one used in the XGBoost example.
df = datasets.load_breast_cancer()
X = df['data']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Search only over the tree depth; the key is passed to RandomForestClassifier as a keyword argument.
parameter_space = {
                    'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
                   }

# Run 10 hyperopt evaluations and build a classifier from the best parameters found.
opt = HpOptBinary(X_train, X_test, y_train, y_test, parameter_space=parameter_space, model=RandomForestClassifier)
best = opt.optimize(max_evals=10)
model = RandomForestClassifier(**best)
print(model)

# Refit with the best parameters and report the test AUC, scoring with column 1 of predict_proba.
model.fit(X_train, y_train)
pred_proba = model.predict_proba(X_test)
auc = metrics.roc_auc_score(y_test, pred_proba[:, 1])
print(f'AUC: {auc}')

  0%|                                                     | 0/10 [00:00<?, ?trial/s, best loss=?]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.001014 seconds
INFO:hyperopt.tpe:TPE using 0 trials


 10%|██▌                      | 1/10 [00:00<00:01,  6.73trial/s, best loss: 0.003752535496957532]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000902 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 0.003753


 20%|█████                    | 2/10 [00:00<00:01,  7.34trial/s, best loss: 0.003752535496957532]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000851 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 0.003753


 30%|███████▌                 | 3/10 [00:00<00:00,  7.21trial/s, best loss: 0.003752535496957532]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000872 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 0.003753


 40%|██████████               | 4/10 [00:00<00:00,  7.24trial/s, best loss: 0.003752535496957532]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000787 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 0.003753


 50%|████████████            | 5/10 [00:00<00:00,  7.18trial/s, best loss: 0.0034482758620689724]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000821 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 0.003448


 60%|██████████████▍         | 6/10 [00:00<00:00,  7.00trial/s, best loss: 0.0034482758620689724]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000946 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 0.003448


 70%|████████████████▊       | 7/10 [00:00<00:00,  7.04trial/s, best loss: 0.0034482758620689724]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000826 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 0.003448


 80%|███████████████████▏    | 8/10 [00:01<00:00,  7.06trial/s, best loss: 0.0034482758620689724]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000828 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 0.003448


 90%|█████████████████████▌  | 9/10 [00:01<00:00,  7.14trial/s, best loss: 0.0034482758620689724]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000816 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 0.003448


100%|███████████████████████| 10/10 [00:01<00:00,  7.12trial/s, best loss: 0.0034482758620688614]
RandomForestClassifier(max_depth=11)
AUC: 0.9962474645030426


### `HpOptRegression`

In [13]:
#export


class HpOptRegression:
    """Class that hypertunes an arbitrary regression model with hyperopt,
    scoring every candidate by its mean squared error on the test set
    """

    def __init__(self, X_train, X_test, y_train, y_test, parameter_space=None, model=xgboost.XGBRegressor):
        """Stores the train/test split, the model to tune, and the search space

        Args:
        * X_train (array): training features
        * X_test (array): testing features
        * y_train (array): training labels
        * y_test (array): testing labels
        * parameter_space (dict): hyperopt compatible parameter space; when None a
          space of XGBoost regression parameters is built
        * model (module pointer): machine learning model compatible with parameter space
        """
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.model = model

        if parameter_space is not None:
            self.parameter_space = parameter_space
            return

        # Nothing supplied by the caller: fall back to the XGBoost regression space
        self.parameter_space = dict(
            max_depth=hp.choice('max_depth', np.arange(21, dtype=int) + 2),
            reg_alpha=hp.uniform('reg_alpha', 0, 5),
            reg_lambda=hp.uniform('reg_lambda', 0, 5),
            min_child_weight=hp.uniform('min_child_weight', 0, 5),
            gamma=hp.uniform('gamma', 0, 5),
            learning_rate=hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
            colsample_bytree=hp.quniform('colsample_bytree', 0.1, 1, 0.01),
            subsample=hp.quniform('subsample', 0.5, 1, 0.05),
            tree_method=hp.choice('tree_method', ['hist', 'exact', 'approx']),
            objective=hp.choice('objective', ['reg:squarederror', 'reg:squaredlogerror']),
            eval_metric=hp.choice('eval_metric', ['rmse', 'mae', 'mape', 'rmsle']),
            gpu_id=hp.choice('gpu_id', [0]),
        )

    def objective(self, params):
        """Loss function handed to hyperopt: builds the model from `params`, fits it
        on the training split and scores it on the test split

        Args:
        * params (dict): hyperopt formatted dictionary of hyperparameters

        Returns:
        * dict: test-set mean squared error as the loss, plus the hyperopt status
        """
        estimator = self.model(**params)
        estimator.fit(self.X_train, self.y_train)
        test_mse = metrics.mean_squared_error(self.y_test, estimator.predict(self.X_test))
        return dict(loss=test_mse, status=STATUS_OK)

    def optimize(self, max_evals=20):
        """Searches the stored parameter space with TPE

        Args:
        * max_evals: number of hyperopt iterations

        Returns:
        * dict: best hyperparameters
        """
        search_space = self.parameter_space
        best_point = fmin(
            fn=self.objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=Trials(),
        )
        # fmin reports hp.choice entries as indices; map them back to real values
        return space_eval(search_space, best_point)

In [14]:
# Render the nbdev documentation for the class and each of its methods
show_doc(HpOptRegression)
show_doc(HpOptRegression.__init__)
show_doc(HpOptRegression.objective)
show_doc(HpOptRegression.optimize)

<h2 id="HpOptRegression" class="doc_header"><code>class</code> <code>HpOptRegression</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>HpOptRegression</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`parameter_space`**=*`None`*, **`model`**=*`XGBRegressor`*)

Class that hypertunes an arbitrary model to regression classification
    

<h4 id="HpOptRegression.__init__" class="doc_header"><code>HpOptRegression.__init__</code><a href="__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptRegression.__init__</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`parameter_space`**=*`None`*, **`model`**=*`XGBRegressor`*)

Initialization takes in a test and train set and an optional hyperparameter space

Args:
* X_train (array): training features
* X_test (array): testing features
* y_train (array): testing labels
* y_test (array): testing labels
* parameter_space (dict): hyperopt compatible parameter space
* model (module pointer): machine learning model compatiable with parameter space

<h4 id="HpOptRegression.objective" class="doc_header"><code>HpOptRegression.objective</code><a href="__main__.py#L43" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptRegression.objective</code>(**`params`**)

Objective function for loss that is provided to perform the MINLP
optimizaiton in hyperopt

Args:
* params (dict): hyperopt formated dictionary of hyperparameters

Returns:
* dict: loss and status for hyperopt optimization

<h4 id="HpOptRegression.optimize" class="doc_header"><code>HpOptRegression.optimize</code><a href="__main__.py#L59" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptRegression.optimize</code>(**`max_evals`**=*`20`*)

optimizes the hyperparameter space in the object

Args:
* max_evals: number of hyperopt iterations

Returns:
* dict: best hyperparameters

Example of passing in a custom parameter set to an XGBoost regressor

In [15]:
# NOTE(review): `load_boston` is deprecated (see the warning captured in this cell's
# output) and was removed in scikit-learn 1.2. A bundled regression dataset such as
# `datasets.load_diabetes()` looks like a drop-in replacement -- confirm before switching.
boston = datasets.load_boston()
X = boston['data']
y = boston['target']
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Custom search space covering only a subset of the XGBoost parameters
parameter_space = {
    'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
    'gamma': hp.uniform('gamma', 0, 5),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'objective': hp.choice('objective', ['reg:squarederror', 'reg:squaredlogerror']),
    'eval_metric': hp.choice('eval_metric', ['rmse', 'mae', 'mape', 'rmsle']),
    'use_label_encoder': hp.choice('use_label_encoder', [False]),
}

opt = HpOptRegression(X_train, X_test, y_train, y_test, parameter_space=parameter_space)
best = opt.optimize(max_evals=10)
model = xgboost.XGBRegressor(**best)
print(model)

# Refit the best configuration and report the root mean squared error on the test split
model.fit(X_train, y_train)
pred = model.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
print(f'RMSE: {rmse}')

  0%|                                                     | 0/10 [00:00<?, ?trial/s, best loss=?]


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

 10%|██▊                         | 1/10 [00:00<00:03,  2.42trial/s, best loss: 382.1420797347393]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003195 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 382.142080


 20%|█████▌                      | 2/10 [00:00<00:03,  2.24trial/s, best loss: 83.14360105909299]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003565 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 83.143601


 30%|████████▍                   | 3/10 [00:02<00:05,  1.25trial/s, best loss: 22.26331950990619]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.004754 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 22.263320


 40%|███████████▏                | 4/10 [00:02<00:04,  1.50trial/s, best loss: 22.26331950990619]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.007802 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 22.263320


 50%|██████████████              | 5/10 [00:02<00:02,  1.73trial/s, best loss: 22.26331950990619]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003551 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 22.263320


 60%|████████████████▊           | 6/10 [00:21<00:27,  6.80s/trial, best loss: 22.26331950990619]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003100 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 22.263320


 70%|███████████████████▌        | 7/10 [00:29<00:21,  7.21s/trial, best loss: 22.26331950990619]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.005583 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 22.263320


 80%|█████████████████████▌     | 8/10 [00:34<00:12,  6.36s/trial, best loss: 14.797843019213886]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003923 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 14.797843


 90%|████████████████████████▎  | 9/10 [00:35<00:04,  4.61s/trial, best loss: 14.797843019213886]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.004712 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 14.797843


100%|███████████████████████████| 10/10 [00:37<00:00,  3.75s/trial, best loss: 7.480335479601239]
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.34,
             enable_categorical=False, eval_metric='mae',
             gamma=1.9238588058422383, gpu_id=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.08056615747341234,
             max_delta_step=None, max_depth=19, min_child_weight=None,
             missing=nan, monotone_constraints=None, n_estimators=100,
             n_jobs=None, num_parallel_tree=None, predictor=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=0.75, tree_method=None,
             use_label_encoder=False, validate_parameters=None, verbosity=None)
RMSE: 2.7350201972931094


In [16]:
# Same tuner, but driving a scikit-learn RandomForestRegressor over a single parameter.
# NOTE(review): `load_boston` is deprecated (see the warning captured in this cell's
# output) and was removed in scikit-learn 1.2. A bundled regression dataset such as
# `datasets.load_diabetes()` looks like a drop-in replacement -- confirm before switching.
boston = datasets.load_boston()
X = boston['data']
y = boston['target']
X_train, X_test, y_train, y_test = train_test_split(X, y)

parameter_space = {
    'max_depth': hp.choice('max_depth', np.arange(21, dtype=int) + 2),
}

opt = HpOptRegression(X_train, X_test, y_train, y_test, parameter_space=parameter_space, model=RandomForestRegressor)
best = opt.optimize(max_evals=10)
model = RandomForestRegressor(**best)
print(model)

# Refit the best configuration and report the root mean squared error on the test split
model.fit(X_train, y_train)
pred = model.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
print(f'RMSE: {rmse}')

  0%|                                                     | 0/10 [00:00<?, ?trial/s, best loss=?]


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

 10%|██▋                        | 1/10 [00:00<00:01,  4.55trial/s, best loss: 10.652023839411436]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000835 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 10.652024


 20%|█████▍                     | 2/10 [00:00<00:01,  5.02trial/s, best loss: 10.652023839411436]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.001177 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 10.652024


 30%|████████                   | 3/10 [00:00<00:01,  5.52trial/s, best loss: 10.652023839411436]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000838 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 10.652024


 40%|██████████▊                | 4/10 [00:00<00:00,  6.24trial/s, best loss: 10.652023839411436]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000847 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 10.652024


 50%|█████████████▌             | 5/10 [00:00<00:00,  6.32trial/s, best loss: 10.652023839411436]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000809 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 10.652024


 60%|████████████████▏          | 6/10 [00:01<00:00,  5.98trial/s, best loss: 10.652023839411436]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003137 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 10.652024


 70%|██████████████████▉        | 7/10 [00:01<00:00,  5.32trial/s, best loss: 10.652023839411436]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.000967 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 10.652024


 80%|█████████████████████▌     | 8/10 [00:01<00:00,  5.43trial/s, best loss: 10.366898348544648]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.001141 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 10.366898


 90%|█████████████████████████▏  | 9/10 [00:01<00:00,  5.13trial/s, best loss: 9.564085201909078]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.001695 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 9.564085


100%|███████████████████████████| 10/10 [00:01<00:00,  5.49trial/s, best loss: 9.564085201909078]
RandomForestRegressor(max_depth=13)
RMSE: 3.3667505227667256


## Feature Selection

### `HpOptFeatureSelection`

In [17]:
#export


class HpOptFeatureSelection:
    """Uses hyperopt to remove features while minimizing a test-set loss for a given problem

    Each key of the search space is a feature (column) name mapped to a 0/1 choice. A trial
    fits the model on the columns flagged 1 and is scored on the same columns of the test set.
    """
    def __init__(self, X_train, X_test, y_train, y_test, space, model, problem_type):
        """Initialize data, model, and problem type

        Args:
        * X_train (DataFrame): training dataframe of features
        * X_test (DataFrame): testing dataframe of features
        * y_train (array-like): training labels
        * y_test (array-like): testing labels
        * space (dict): dictionary with each feature corresponding to a hyperopt choice object
        * model (object): model instance with "fit" and "predict" methods ("predict_proba"
          for the "binary" and "multilabel" problems); it is refit in place on every trial
        * problem_type (str): one of "binary", "regression", or "multilabel"
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.parameter_space = space
        self.model = model
        self.problem_type = problem_type

    @staticmethod
    def _selected_columns(params):
        """Names of the features whose flag is 1 in a sampled point of the space"""
        return [name for name, flag in params.items() if flag == 1]

    def _run_trial(self, params, loss_fn):
        """Fits the model on the selected columns and scores it with `loss_fn`

        Args:
        * params (dict): which columns to use as features
        * loss_fn (callable): takes (fitted model, test features restricted to the
          selected columns) and returns the loss to minimize

        Returns:
        * dict: loss and status for hyperopt optimization
        """
        cols = self._selected_columns(params)
        if not cols:
            # A model cannot be fit on zero features: report a failed trial so the
            # search carries on instead of aborting on the estimator's exception.
            from hyperopt import STATUS_FAIL
            return {'status': STATUS_FAIL}
        self.model.fit(self.X_train[cols], self.y_train)
        return {'loss': loss_fn(self.model, self.X_test[cols]), 'status': STATUS_OK}

    def objective_binary(self, params):
        """binary loss objective that grabs the columns for feature selection. uses AUC metric

        Args:
        * params (dict): which columns to use as features

        Returns:
        * dict: loss (1 - AUC of the positive-class probability) and status for hyperopt optimization
        """
        def auc_loss(model, X_eval):
            pred_proba = model.predict_proba(X_eval)
            return 1 - metrics.roc_auc_score(self.y_test, pred_proba[:, 1])

        return self._run_trial(params, auc_loss)

    def objective_multi(self, params):
        """multilabel loss objective that grabs the columns for feature selection. uses AUC metric

        Args:
        * params (dict): which columns to use as features

        Returns:
        * dict: loss (1 - macro one-vs-rest AUC) and status for hyperopt optimization
        """
        def macro_ovr_auc_loss(model, X_eval):
            pred_proba = model.predict_proba(X_eval)
            return 1 - metrics.roc_auc_score(self.y_test, pred_proba, multi_class='ovr', average='macro')

        return self._run_trial(params, macro_ovr_auc_loss)

    def objective_regression(self, params):
        """regression loss objective that grabs the columns for feature selection. uses MSE metric

        Args:
        * params (dict): which columns to use as features

        Returns:
        * dict: loss (test-set mean squared error) and status for hyperopt optimization
        """
        def mse_loss(model, X_eval):
            return metrics.mean_squared_error(self.y_test, model.predict(X_eval))

        return self._run_trial(params, mse_loss)

    def optimize(self, max_evals=20):
        """optimizes a feature space for each type of problem

        Args:
        * max_evals (int, optional): number of hyperopt evaluations. Defaults to 20.

        Returns:
        * object: best point of the space, i.e. a dict mapping every feature to its 0/1
          flag; None when `problem_type` is not one of the supported values
        """
        objectives = {
            'binary': self.objective_binary,
            'multilabel': self.objective_multi,
            'regression': self.objective_regression,
        }
        objective = objectives.get(self.problem_type)
        if objective is None:
            logger.info('Not an acceptable problem type to solve')
            return None
        best = fmin(fn=objective,
                    space=self.parameter_space,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=Trials())
        # fmin reports hp.choice entries as indices; map them back to the 0/1 flags
        return space_eval(self.parameter_space, best)

In [18]:
# Render the nbdev documentation for the class and each of its methods
show_doc(HpOptFeatureSelection)
show_doc(HpOptFeatureSelection.__init__)
show_doc(HpOptFeatureSelection.objective_binary)
show_doc(HpOptFeatureSelection.objective_multi)
show_doc(HpOptFeatureSelection.objective_regression)
show_doc(HpOptFeatureSelection.optimize)

<h2 id="HpOptFeatureSelection" class="doc_header"><code>class</code> <code>HpOptFeatureSelection</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>HpOptFeatureSelection</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`space`**, **`model`**, **`problem_type`**)

Uses hyperopt to remove features while maximizing an objective for a given problem
    

<h4 id="HpOptFeatureSelection.__init__" class="doc_header"><code>HpOptFeatureSelection.__init__</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptFeatureSelection.__init__</code>(**`X_train`**, **`X_test`**, **`y_train`**, **`y_test`**, **`space`**, **`model`**, **`problem_type`**)

Initialize data, model, and problem type

Args:
* X_train (DataFrame): training dataframe of features
* X_test (DataFrame): testing dataframe of labels
* y_train (DataFrame): training dataframe of features
* y_test (DataFrame): testing dataframe of labels
* space (dict): dictionary with each feature corresponding to a hyperopt choice object
* model (object): model with "fit" and "predict" functions that are callable
* problem_type (str): one of "binary", "regression", or "multilabel"

<h4 id="HpOptFeatureSelection.objective_binary" class="doc_header"><code>HpOptFeatureSelection.objective_binary</code><a href="__main__.py#L27" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptFeatureSelection.objective_binary</code>(**`params`**)

binary loss objective that grabs the columns for feature selection. uses AUC metric

Args:
* params (dict): which columns to use as features

Returns:
* dict: loss and status for hyperopt optimization

<h4 id="HpOptFeatureSelection.objective_multi" class="doc_header"><code>HpOptFeatureSelection.objective_multi</code><a href="__main__.py#L42" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptFeatureSelection.objective_multi</code>(**`params`**)

multilabel loss objective that grabs the columns for feature selection. uses AUC metric

Args:
* params (dict): which columns to use as features

Returns:
* dict: loss and status for hyperopt optimization

<h4 id="HpOptFeatureSelection.objective_regression" class="doc_header"><code>HpOptFeatureSelection.objective_regression</code><a href="__main__.py#L57" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptFeatureSelection.objective_regression</code>(**`params`**)

regression loss objective that grabs the columns for feature selection. uses MSE metric

Args:
* params (dict): which columns to use as features

Returns:
* dict: loss and status for hyperopt optimization

<h4 id="HpOptFeatureSelection.optimize" class="doc_header"><code>HpOptFeatureSelection.optimize</code><a href="__main__.py#L72" class="source_link" style="float:right">[source]</a></h4>

> <code>HpOptFeatureSelection.optimize</code>(**`max_evals`**=*`20`*)

optimizes a feature space for each type of problem

Args:
* max_evals (int, optional): number of hyperopt evaluations. Defaults to 20.

Returns:
* object: hyperopt optimized object of parameters which are features

In [19]:
# Load the breast cancer data and give the feature columns string names,
# so each column name can double as a hyperopt label
cancer = datasets.load_breast_cancer()
X = pd.DataFrame(cancer['data'])
X.columns = [str(idx) for idx in range(X.shape[1])]
y = cancer['target']
X_train, X_test, y_train, y_test = train_test_split(X, y)

# One 0/1 choice per feature: 1 keeps the column, 0 drops it
space = {col: hp.choice(col, [0, 1]) for col in X.columns}

# Search for the feature subset with the best test-set AUC
selector = HpOptFeatureSelection(
    X_train,
    X_test,
    y_train,
    y_test,
    space,
    RandomForestClassifier(),
    problem_type='binary',
)
best = selector.optimize(max_evals=10)
kept = [name for name, flag in best.items() if flag == 1]
print(f'Original number of features {X.shape[1]}')
print(f'Final number of features {len(kept)}')

  0%|                                                     | 0/10 [00:00<?, ?trial/s, best loss=?]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.007170 seconds
INFO:hyperopt.tpe:TPE using 0 trials


 10%|██▌                      | 1/10 [00:00<00:02,  4.45trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.008813 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 0.002063


 20%|█████                    | 2/10 [00:00<00:01,  4.70trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.007937 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 0.002063


 30%|███████▌                 | 3/10 [00:00<00:01,  4.79trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.014538 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 0.002063


 40%|██████████               | 4/10 [00:00<00:01,  5.12trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.008785 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 0.002063


 50%|████████████▌            | 5/10 [00:00<00:00,  5.49trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.006144 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 0.002063


 60%|███████████████          | 6/10 [00:01<00:00,  5.78trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.006808 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 0.002063


 70%|█████████████████▌       | 7/10 [00:01<00:00,  6.17trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.006119 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 0.002063


 80%|████████████████████     | 8/10 [00:01<00:00,  6.41trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.010131 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 0.002063


 90%|██████████████████████▌  | 9/10 [00:01<00:00,  6.38trial/s, best loss: 0.002062527138515069]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.007144 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 0.002063


100%|███████████████████████| 10/10 [00:01<00:00,  5.79trial/s, best loss: 0.0018454190186713015]
Original number of features 30
Final number of features 17


## Model Movement

### `save_sklearn_object_to_data_lake`

In [20]:
#export


def save_sklearn_object_to_data_lake(model, file_name, path, container, connection_str, overwrite=False):
    """moves a sklearn object to azure data lake as a pickle file at a given path

    The object is pickled to a local staging file named `file_name`, pushed with
    `blob_pusher`, and the staging file is removed afterwards, even when pickling
    or the upload raises.

    Args:
    * model (sklearn object): model, pipeline, transformer in sklearn format
    * file_name (str): name of file; also used as the local staging file path
    * path (str): data lake path
    * container (str): data lake container
    * connection_str (str): azure connection string for the account
    * overwrite (bool, optional): set to overwrite a current file if there. Defaults to False.
    """
    logger.info(f'Pushing Sklearn Object to Azure: {os.path.join(path, file_name)}')
    try:
        with open(file_name, 'wb') as f:
            pickle.dump(model, f)
        blob_pusher(container_name=container,
                    connection_str=connection_str,
                    file_path=[file_name],
                    blob_dest=[path],
                    overwrite=overwrite)
    finally:
        # Never leave the staging pickle behind; the existence check keeps a failed
        # open() from being masked by a FileNotFoundError raised here.
        if os.path.exists(file_name):
            os.unlink(file_name)

In [21]:
# Render the nbdev documentation for the function
show_doc(save_sklearn_object_to_data_lake)

<h4 id="save_sklearn_object_to_data_lake" class="doc_header"><code>save_sklearn_object_to_data_lake</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>save_sklearn_object_to_data_lake</code>(**`model`**, **`file_name`**, **`path`**, **`container`**, **`connection_str`**, **`overwrite`**=*`False`*)

moves a sklearn object to azure data lake as a pickle file at a given path

Args:
* model (sklearn object): model, pipeline, transformer in sklearn format
* file_name (str): name of file
* path (str): data lake path
* container (str): data lake container
* connection_str (str): azure connection string for the account
* overwrite (bool, optional): set to overwrite a current file if there`. Defaults to False.

In [22]:
#skip
# Uploads `model` as left in the notebook namespace by an earlier cell (the
# RandomForestRegressor example when the notebook is run top to bottom).
# The connection string is read from the environment, never hardcoded.
save_sklearn_object_to_data_lake(model=model,
                                 file_name='RandomForestExample.pickle',
                                 path='sdsde_library/testing/models/', 
                                 container='sdsdetesting', 
                                 connection_str=os.environ['DATALAKE_CONN_STR_SECRET'], 
                                 overwrite=True)

INFO:__main__:Pushing Sklearn Object to Azure: sdsde_library/testing/models/RandomForestExample.pickle
INFO:sdsde.azure.filehandling:sdsdetesting is a valid
INFO:sdsde.azure.filehandling:ContainerAlreadyExists
INFO:sdsde.azure.filehandling:Uploading RandomForestExample.pickle, to to Azure Storage sdsde_library/testing/models/RandomForestExample.pickle
INFO:sdsde.azure.filehandling:Azure Upload Complete


# Create

In [None]:
#hide
# Run nbdev's notebook-to-script export; the output below lists each converted notebook
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_azure.ipynb.
Converted 02_utils_dataframes.ipynb.
Converted 02_utils_fastai_helpers.ipynb.
Converted 02_utils_parseyaml.ipynb.
Converted 02_utils_stfp.ipynb.
Converted 02_utils_traininghelpers.ipynb.
Converted 03_dstools_preparedata.ipynb.
Converted 04_snowflake_copyinto.ipynb.
Converted 04_snowflake_copyinto2.ipynb.
Converted 04_snowflake_query.ipynb.
Converted 05_azure_wrappers.ipynb.
Converted 06_modeling_inference.ipynb.
Converted 06_modeling_inference_fastai.ipynb.
Converted 06_modeling_premodel.ipynb.
Converted 06_modeling_preprocessing.ipynb.
Converted 06_modeling_preprocessing_fastai.ipynb.
Converted 06_modeling_training.ipynb.
Converted 06_modeling_training_fastai.ipynb.
Converted 07_Binary_Classification_Fastai_Example_Notebook.ipynb.
Converted 08_yaml_ingestion_binary_classification.ipynb.
Converted index.ipynb.
