# Format DataFrame

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
train_df = pd.DataFrame(data.data, columns=data.feature_names)
train_df["diagnosis"] = data.target

print(train_df.shape)
train_df.head()

(569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


# Set Up Environment

In [2]:
from hyperparameter_hunter import Environment, CrossValidationExperiment

env = Environment(
    train_dataset=train_df,
    root_results_path="HyperparameterHunterAssets",
    target_column="diagnosis",
    metrics_map=["roc_auc_score"],
    cross_validation_type="StratifiedKFold",
    cross_validation_params=dict(n_splits=5, random_state=32),
    verbose=1,
)

Cross-Experiment Key:   'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='


We're initializing our `Environment` with `verbose=1` to tell our experiments to only log the essentials because we're about to run lots of experiments.

Now that HyperparameterHunter has an active `Environment`, we can do two things:

# 1. Perform Experiments

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

experiment_0 = CrossValidationExperiment(KNeighborsClassifier, {})

<18:06:27> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:27> Initialized Experiment: '9eb9e706-c8f7-482d-b75c-9061e2abfb84'
<18:06:27> Hyperparameter Key:     '-lUbOmxw4PBQbJTgKXe2SQFMLkdgvTCnmTl9Wnz3CUc='
<18:06:27> 
<18:06:27> 
<18:06:27> FINAL:    OOF(roc_auc_score=0.91671)  |  Time Elapsed: 0.04464 s
<18:06:27> 
<18:06:27> Saving results for Experiment: '9eb9e706-c8f7-482d-b75c-9061e2abfb84'


In [4]:
experiment_1 = CrossValidationExperiment(SVC, {})

<18:06:27> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:27> Initialized Experiment: '7b7669a0-6ee5-4365-ac35-fa97e9e912db'
<18:06:27> Hyperparameter Key:     'zSm0a8Zxw7EfGD4Ai8Xz63NasLSEOwkx_bV1rNK5_W8='
<18:06:27> 
<18:06:27> 
<18:06:27> FINAL:    OOF(roc_auc_score=0.50000)  |  Time Elapsed: 0.11852 s
<18:06:27> 
<18:06:27> Saving results for Experiment: '7b7669a0-6ee5-4365-ac35-fa97e9e912db'


In [5]:
experiment_2 = CrossValidationExperiment(LinearSVC, {})

<18:06:27> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:27> Initialized Experiment: 'bfcc8fe6-c4a9-4178-a81d-70288168e238'
<18:06:27> Hyperparameter Key:     'umSzvriJIkgXYrenXLwiFwAGcPH-z8u5r8rnMRmK12Y='
<18:06:27> 
<18:06:27> 
<18:06:27> FINAL:    OOF(roc_auc_score=0.89917)  |  Time Elapsed: 0.15575 s
<18:06:27> 
<18:06:27> Saving results for Experiment: 'bfcc8fe6-c4a9-4178-a81d-70288168e238'


In [6]:
experiment_3 = CrossValidationExperiment(NuSVC, {})

<18:06:27> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:27> Initialized Experiment: '9d2464c6-8a29-44df-918b-f91be6c81a1b'
<18:06:27> Hyperparameter Key:     'Db1ESJoQdnzn1dzxKdDO_yR4PniOYG7W6vMrybE6mxI='
<18:06:27> 
<18:06:27> 
<18:06:27> FINAL:    OOF(roc_auc_score=0.50000)  |  Time Elapsed: 0.10904 s
<18:06:27> 
<18:06:27> Saving results for Experiment: '9d2464c6-8a29-44df-918b-f91be6c81a1b'


In [7]:
experiment_4 = CrossValidationExperiment(DecisionTreeClassifier, {})

<18:06:27> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:27> Initialized Experiment: '3c44b2e1-8d17-49bf-87de-7600a957aba4'
<18:06:27> Hyperparameter Key:     'Em2R2LoQaueLGw5QJS3sUe6Gpv9voIAGrND_DQmerFM='
<18:06:27> 
<18:06:27> 
<18:06:27> FINAL:    OOF(roc_auc_score=0.92209)  |  Time Elapsed: 0.06151 s
<18:06:27> 
<18:06:27> Saving results for Experiment: '3c44b2e1-8d17-49bf-87de-7600a957aba4'


In [8]:
experiment_5 = CrossValidationExperiment(RandomForestClassifier, {})

<18:06:27> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:27> Initialized Experiment: 'da111588-06a5-4a5d-84f1-5b01be330dda'
<18:06:27> Hyperparameter Key:     'OdIHXd6SeFQi6gFbjUKsKR_KplScHEjsAppJ4Yhxpys='
<18:06:27> 
<18:06:27> 
<18:06:27> FINAL:    OOF(roc_auc_score=0.95777)  |  Time Elapsed: 0.09244 s
<18:06:27> 
<18:06:27> Saving results for Experiment: 'da111588-06a5-4a5d-84f1-5b01be330dda'


In [9]:
experiment_6 = CrossValidationExperiment(AdaBoostClassifier, {})

<18:06:28> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:28> Initialized Experiment: '2e7ead68-c40f-4c39-a49b-55f668fb3a63'
<18:06:28> Hyperparameter Key:     '65vlVQBglPtLRvLoMPQ4vo6V11XNuBPSVdGa3gfiga8='
<18:06:28> 
<18:06:28> 
<18:06:28> FINAL:    OOF(roc_auc_score=0.96425)  |  Time Elapsed: 0.50103 s
<18:06:28> 
<18:06:28> Saving results for Experiment: '2e7ead68-c40f-4c39-a49b-55f668fb3a63'


In [10]:
experiment_7 = CrossValidationExperiment(GradientBoostingClassifier, {})

<18:06:28> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:28> Initialized Experiment: '8f7b1c7d-4b9a-4e6c-af3c-54b49e7309e5'
<18:06:28> Hyperparameter Key:     'QsW5PTxiq440XcDtxUQDwRZ2k6utNZByjmD0aDq4viM='
<18:06:28> 
<18:06:29> 
<18:06:29> FINAL:    OOF(roc_auc_score=0.95393)  |  Time Elapsed: 0.64604 s
<18:06:29> 
<18:06:29> Saving results for Experiment: '8f7b1c7d-4b9a-4e6c-af3c-54b49e7309e5'


In [11]:
experiment_8 = CrossValidationExperiment(GaussianNB, {})

<18:06:29> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:29> Initialized Experiment: 'be5ae00f-416a-436b-bff7-d4f244a598c0'
<18:06:29> Hyperparameter Key:     'rKKFZX71cT7Hyreg02mNi_12YKCWQWNX9X9Dm1y1mH8='
<18:06:29> 
<18:06:29> 
<18:06:29> FINAL:    OOF(roc_auc_score=0.93035)  |  Time Elapsed: 0.03867 s
<18:06:29> 
<18:06:29> Saving results for Experiment: 'be5ae00f-416a-436b-bff7-d4f244a598c0'


In [12]:
experiment_9 = CrossValidationExperiment(LinearDiscriminantAnalysis, {})

<18:06:29> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:29> Initialized Experiment: '1059a381-6dcf-46c8-a208-678e13d906fb'
<18:06:29> Hyperparameter Key:     '0oh263u3lfeNrwqq0jzCrNw1M5eMWldb8CIw6ML7qmI='
<18:06:29> 
<18:06:29> 
<18:06:29> FINAL:    OOF(roc_auc_score=0.95003)  |  Time Elapsed: 0.04898 s
<18:06:29> 
<18:06:29> Saving results for Experiment: '1059a381-6dcf-46c8-a208-678e13d906fb'


In [13]:
experiment_10 = CrossValidationExperiment(QuadraticDiscriminantAnalysis, {})

<18:06:29> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:29> Initialized Experiment: '7996ba40-0e56-43ed-82c8-8b5f4c824348'
<18:06:29> Hyperparameter Key:     'qdHf2hC76V9AcGks6LKQjZoi1kM-_Z5bQBskGQ_FM5M='
<18:06:29> 
<18:06:29> 
<18:06:29> FINAL:    OOF(roc_auc_score=0.95821)  |  Time Elapsed: 0.04282 s
<18:06:29> 
<18:06:29> Saving results for Experiment: '7996ba40-0e56-43ed-82c8-8b5f4c824348'


In [14]:
experiment_11 = CrossValidationExperiment(MLPClassifier, {})

<18:06:29> Validated Environment:  'aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60='
<18:06:29> Initialized Experiment: 'dbefabbf-9cb0-4abc-a57a-a0d606f63687'
<18:06:29> Hyperparameter Key:     'JYz6ECsGC_S2dHnO3Gxcn4-jC5odreQXXF13WwoeW_0='
<18:06:29> 
<18:06:30> 
<18:06:30> FINAL:    OOF(roc_auc_score=0.92003)  |  Time Elapsed: 0.82599 s
<18:06:30> 
<18:06:30> Saving results for Experiment: 'dbefabbf-9cb0-4abc-a57a-a0d606f63687'


Of course, SKLearn has many more algorithms than those shown here, but I think you get the idea.

Notice that in all the above experiments, we gave `CrossValidationExperiment` `model_init_params={}`. Passing an empty dict tells it to use the default hyperparameters for the `model_initializer`, which it'll figure out on its own.

# 2. Hyperparameter Optimization

We're just going to do optimization on one of the algorithms used above (`AdaBoostClassifier`); although, HyperparameterHunter can certainly do consecutive optimization rounds.

Notice below that `optimizer` correctly identifies `experiment_6` as being the only saved experiment it can learn from because it's optimizing `AdaBoostClassifier`.

In [15]:
from hyperparameter_hunter import RandomForestOptimization, Real, Integer, Categorical

optimizer = RandomForestOptimization(iterations=12, random_state=42)

optimizer.set_experiment_guidelines(
    model_initializer=AdaBoostClassifier,
    model_init_params=dict(
        n_estimators=Integer(25, 100),
        learning_rate=Real(0.5, 1.0),
        algorithm=Categorical(["SAMME", "SAMME.R"]),
    ),
)

optimizer.go()

Validated Environment with key: "aZlwCukh8RVC3A3BIv1DK27sZYyUCsegMFpr2R4JB60="
[31mSaved Result Files[0m
[31m_________________________________________________________________________________________[0m
 Step |       ID |   Time |      Value |   algorithm |   learning_rate |   n_estimators | 
Experiments matching cross-experiment key/algorithm: 1
Experiments fitting in the given space: 1
Experiments matching current guidelines: 1
    0 | 2e7ead68 | 00m00s | [35m   0.96425[0m | [32m    SAMME.R[0m | [32m         1.0000[0m | [32m            50[0m | 
[31mHyperparameter Optimization[0m
[31m_________________________________________________________________________________________[0m
 Step |       ID |   Time |      Value |   algorithm |   learning_rate |   n_estimators | 
    1 | f70db9be | 00m00s |    0.96425 |     SAMME.R |          0.6864 |             68 | 
    2 | 83f81d30 | 00m00s |    0.95438 |     SAMME.R |          0.8683 |             44 | 
    3 | 262dbae0 | 00m00s |