# Format DataFrame

Be advised, this dataset (SKLearn's Forest Cover Types) can take a little while to download...

This is a multi-class classification task, in which the target is label-encoded.

In [1]:
import pandas as pd
from sklearn.datasets import fetch_covtype

data = fetch_covtype(shuffle=True, random_state=32)
train_df = pd.DataFrame(data.data, columns=["x_{}".format(_) for _ in range(data.data.shape[1])])
train_df["y"] = data.target

print(train_df.shape)
train_df.head()

(581012, 55)


Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_45,x_46,x_47,x_48,x_49,x_50,x_51,x_52,x_53,y
0,3247.0,289.0,12.0,268.0,40.0,1624.0,186.0,238.0,193.0,2525.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,3200.0,46.0,17.0,162.0,45.0,1592.0,223.0,200.0,105.0,2254.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2368.0,48.0,19.0,277.0,121.0,1260.0,224.0,196.0,99.0,1237.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,2828.0,50.0,11.0,417.0,73.0,1252.0,225.0,215.0,123.0,962.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2932.0,32.0,11.0,618.0,55.0,638.0,218.0,217.0,134.0,1092.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


# Set Up Environment

In [2]:
from hyperparameter_hunter import Environment, CVExperiment
from sklearn.metrics import f1_score

env = Environment(
    train_dataset=train_df,
    results_path="HyperparameterHunterAssets",
    target_column="y",
    metrics=dict(f1=lambda y_true, y_pred: f1_score(y_true, y_pred, average="micro")),
    cv_type="StratifiedKFold",
    cv_params=dict(n_splits=5, random_state=32),
)

Cross-Experiment Key:   'WQMO2i1RnEaE7cguWwpBywkh25UKTgtwR12Z0LqWIUM='


Now that HyperparameterHunter has an active `Environment`, we can do two things:

# 1. Perform Experiments

In [3]:
from lightgbm import LGBMClassifier

experiment = CVExperiment(
    model_initializer=LGBMClassifier,
    model_init_params=dict(boosting_type="gbdt", num_leaves=31, max_depth=-1, subsample=0.5),
    model_extra_params=dict(
        fit=dict(
            feature_name=train_df.columns.values[:-1].tolist(),
            categorical_feature=train_df.columns.values[11:-1].tolist(),
        ),
    ),
)

<15:06:35> Validated Environment:  'WQMO2i1RnEaE7cguWwpBywkh25UKTgtwR12Z0LqWIUM='
<15:06:35> Initialized Experiment: 'f2096258-17fd-47b4-a384-362a43cc8cbd'
<15:06:35> Hyperparameter Key:     'Hyx-Jo5QIqiXxDRLIQjh5_uQ2JVsViCjaGhWzzoYpy4='
<15:06:35> 
<15:06:44> F0.0 AVG:   OOF(f1=0.83622)  |  Time Elapsed: 9.24039 s
<15:06:54> F0.1 AVG:   OOF(f1=0.83796)  |  Time Elapsed: 9.17901 s
<15:07:03> F0.2 AVG:   OOF(f1=0.83635)  |  Time Elapsed: 9.43469 s
<15:07:12> F0.3 AVG:   OOF(f1=0.83682)  |  Time Elapsed: 9.32817 s
<15:07:22> F0.4 AVG:   OOF(f1=0.83370)  |  Time Elapsed: 9.22297 s
<15:07:22> 
<15:07:22> FINAL:    OOF(f1=0.83621)  |  Time Elapsed: 46.77216 s
<15:07:22> 
<15:07:22> Saving results for Experiment: 'f2096258-17fd-47b4-a384-362a43cc8cbd'


# 2. Hyperparameter Optimization

In [4]:
from hyperparameter_hunter import RandomForestOptPro, Real, Integer, Categorical

optimizer = RandomForestOptPro(iterations=10, random_state=32)

optimizer.set_experiment_guidelines(
    model_initializer=LGBMClassifier,
    model_init_params=dict(
        boosting_type=Categorical(["gbdt", "dart"]),
        num_leaves=Integer(10, 40),
        max_depth=-1,
        subsample=Real(0.3, 0.7),
    ),
    model_extra_params=dict(
        fit=dict(
            feature_name=train_df.columns.values[:-1].tolist(),
            categorical_feature=train_df.columns.values[11:-1].tolist(),
        ),
    ),
)

optimizer.go()

Validated Environment with key: "WQMO2i1RnEaE7cguWwpBywkh25UKTgtwR12Z0LqWIUM="
[31mSaved Result Files[0m
[31m_______________________________________________________________________________________[0m
 Step |       ID |   Time |      Value |   boosting_type |   num_leaves |   subsample | 
Experiments matching cross-experiment key/algorithm: 1
Experiments fitting in the given space: 1
Experiments matching current guidelines: 1
    0 | f2096258 | 00m00s | [35m   0.83621[0m | [32m           gbdt[0m | [32m          31[0m | [32m     0.5000[0m | 
[31mHyperparameter Optimization[0m
[31m_______________________________________________________________________________________[0m
 Step |       ID |   Time |      Value |   boosting_type |   num_leaves |   subsample | 
    1 | 00708066 | 01m23s |    0.76716 |            dart |           15 |      0.4684 | 
    2 | dd0307d2 | 00m52s |    0.83191 |            gbdt |           29 |      0.5947 | 
    3 | e3b29434 | 01m20s |    0.76080 | 

Notice, `optimizer` recognizes our earlier `experiment`'s hyperparameters fit inside the search space/guidelines set for `optimizer`.

Then, when optimization is started, it automatically learns from `experiment`'s results - without any extra work for us!