In [1]:
import optuna

# Print the installed Optuna version so the results below can be tied to a release.
print(optuna.__version__)

2.10.0


In [2]:
import warnings
# Install a blanket "ignore" filter: no category, message or module restriction
# is given, so every warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the scikit-learn breast-cancer dataset into one frame: a column per
# feature, with the label appended as the final 'target' column.
dataset = load_breast_cancer()
cancer_df = pd.DataFrame(dataset.data, columns=dataset.feature_names).assign(
    target=dataset.target
)

# Separate the predictors from the label column.
x_features = cancer_df.drop(columns='target')
y_label = cancer_df['target']

# Hold out 20% of the rows as the test split ...
x_train, x_test, y_train, y_test = train_test_split(
    x_features, y_label, test_size=0.2, random_state=156
)
# ... then carve 10% of the remaining training rows off as a validation split.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=156
)

In [4]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Return the mean 3-fold cross-validated accuracy for one Optuna trial.

    Samples an XGBoost configuration from ``trial``, builds an
    ``XGBClassifier`` from it and scores it with ``cross_val_score`` on the
    notebook-level ``x_train`` / ``y_train``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters
            (``suggest_int`` and ``suggest_float`` are called on it).

    Returns:
        The mean of the per-fold accuracy scores.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        # suggest_float is used for both continuous parameters; the older
        # suggest_uniform spelling is deprecated in later Optuna releases.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # Number of boosting rounds is searched in steps of 100.
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        # Fixed (not tuned) setting forwarded to the classifier.
        'eval_metric': 'logloss',
    }

    xgb_clf = XGBClassifier(**params)
    accuracy = cross_val_score(xgb_clf, x_train, y_train, scoring='accuracy', cv=3)

    return np.mean(accuracy)

In [5]:
# Seed the sampler so repeated runs propose the same sequence of trials.
# TPESampler is the same algorithm as the default sampler; 156 matches the
# random_state used for the train/test splits above.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=156),
)
# Stop after 100 trials or 600 seconds, whichever is reached first.
study.optimize(objective, n_trials=100, timeout=600, show_progress_bar=True)

[32m[I 2022-04-23 23:24:49,295][0m A new study created in memory with name: no-name-83e8cb00-c1f9-404c-82d8-e9e2bd280706[0m


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))

[32m[I 2022-04-23 23:24:49,402][0m Trial 0 finished with value: 0.9604827466016034 and parameters: {'max_depth': 8, 'min_child_weight': 4, 'learning_rate': 0.14240606184014973, 'colsample_bytree': 0.5751694136896937, 'n_estimators': 100}. Best is trial 0 with value: 0.9604827466016034.[0m
[32m[I 2022-04-23 23:24:49,813][0m Trial 1 finished with value: 0.967047170907401 and parameters: {'max_depth': 7, 'min_child_weight': 1, 'learning_rate': 0.11711193164449787, 'colsample_bytree': 0.5771231471103424, 'n_estimators': 1000}. Best is trial 1 with value: 0.967047170907401.[0m
[32m[I 2022-04-23 23:24:50,073][0m Trial 2 finished with value: 0.9626757290577438 and parameters: {'max_depth': 7, 'min_child_weight': 5, 'learning_rate': 0.1784565712962714, 'colsample_bytree': 0.5158566802978718, 'n_estimators': 800}. Best is trial 1 with value: 0.967047170907401.[0m
[32m[I 2022-04-23 23:24:50,324][0m Trial 3 finished with value: 0.9626466829324968 and parameters: {'max_depth': 6, 'min_c

[32m[I 2022-04-23 23:24:58,959][0m Trial 28 finished with value: 0.9670616939700244 and parameters: {'max_depth': 8, 'min_child_weight': 1, 'learning_rate': 0.1425081796536227, 'colsample_bytree': 0.547987229442102, 'n_estimators': 900}. Best is trial 16 with value: 0.9692546764261647.[0m
[32m[I 2022-04-23 23:24:59,231][0m Trial 29 finished with value: 0.9604537004763564 and parameters: {'max_depth': 8, 'min_child_weight': 3, 'learning_rate': 0.13945072547334747, 'colsample_bytree': 0.5443966180859763, 'n_estimators': 700}. Best is trial 16 with value: 0.9692546764261647.[0m
[32m[I 2022-04-23 23:24:59,582][0m Trial 30 finished with value: 0.9604537004763566 and parameters: {'max_depth': 10, 'min_child_weight': 2, 'learning_rate': 0.15354935247676682, 'colsample_bytree': 0.5910497810986433, 'n_estimators': 900}. Best is trial 16 with value: 0.9692546764261647.[0m
[32m[I 2022-04-23 23:24:59,970][0m Trial 31 finished with value: 0.9626612059951203 and parameters: {'max_depth': 

[32m[I 2022-04-23 23:25:08,608][0m Trial 56 finished with value: 0.960439177413733 and parameters: {'max_depth': 4, 'min_child_weight': 2, 'learning_rate': 0.1517238117316874, 'colsample_bytree': 0.6421784741619132, 'n_estimators': 800}. Best is trial 16 with value: 0.9692546764261647.[0m
[32m[I 2022-04-23 23:25:08,974][0m Trial 57 finished with value: 0.9604682235389799 and parameters: {'max_depth': 8, 'min_child_weight': 1, 'learning_rate': 0.10790213096497502, 'colsample_bytree': 0.5169738662481778, 'n_estimators': 900}. Best is trial 16 with value: 0.9692546764261647.[0m
[32m[I 2022-04-23 23:25:09,164][0m Trial 58 finished with value: 0.9626466829324968 and parameters: {'max_depth': 8, 'min_child_weight': 2, 'learning_rate': 0.13306731344919615, 'colsample_bytree': 0.5576373590059223, 'n_estimators': 400}. Best is trial 16 with value: 0.9692546764261647.[0m
[32m[I 2022-04-23 23:25:09,572][0m Trial 59 finished with value: 0.9692691994887882 and parameters: {'max_depth': 9

[32m[I 2022-04-23 23:25:18,851][0m Trial 84 finished with value: 0.9648541884512606 and parameters: {'max_depth': 10, 'min_child_weight': 1, 'learning_rate': 0.17541058435636292, 'colsample_bytree': 0.5691485596948413, 'n_estimators': 1000}. Best is trial 59 with value: 0.9692691994887882.[0m
[32m[I 2022-04-23 23:25:19,206][0m Trial 85 finished with value: 0.9626612059951203 and parameters: {'max_depth': 9, 'min_child_weight': 1, 'learning_rate': 0.14263408645622594, 'colsample_bytree': 0.5302129269811056, 'n_estimators': 900}. Best is trial 59 with value: 0.9692691994887882.[0m
[32m[I 2022-04-23 23:25:19,554][0m Trial 86 finished with value: 0.9560532125014524 and parameters: {'max_depth': 6, 'min_child_weight': 4, 'learning_rate': 0.12778244709065203, 'colsample_bytree': 0.5560197430041094, 'n_estimators': 1000}. Best is trial 59 with value: 0.9692691994887882.[0m
[32m[I 2022-04-23 23:25:19,960][0m Trial 87 finished with value: 0.9582752410828395 and parameters: {'max_dept

In [6]:
# Total number of trials recorded on the study.
print(f'Number of finished trials: {len(study.trials)}')

trial = study.best_trial

# Emit the whole summary with a single call; sep='\n' puts each item on its
# own line, followed by one tab-indented line per tuned hyperparameter.
print(
    'Best trial:',
    f'\tAccuracy: {trial.value}',
    '\tBest Hyperparameters:',
    *(f'\t\t{key}: {value}' for key, value in trial.params.items()),
    sep='\n',
)

Number of finished trials: 100
Best trial:
	Accuracy: 0.9692691994887882
	Best Hyperparameters:
		max_depth: 9
		min_child_weight: 1
		learning_rate: 0.1237459933232244
		colsample_bytree: 0.6064401654451399
		n_estimators: 1000


In [7]:
# Log-loss is tracked on both the fitting subset and the validation subset.
# Per the training log, the second pair (validation_1) drives early stopping.
evals = [(x_tr, y_tr), (x_val, y_val)]

# Rebuild the classifier from the best hyperparameters found by the study and
# fit it, stopping once the monitored log-loss has not improved for 100 rounds.
best_clf = XGBClassifier(**study.best_params)
best_clf.fit(
    x_tr,
    y_tr,
    eval_set=evals,
    eval_metric='logloss',
    early_stopping_rounds=100,
    verbose=True,
)

[0]	validation_0-logloss:0.588882	validation_1-logloss:0.623807
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.508719	validation_1-logloss:0.577291
[2]	validation_0-logloss:0.442136	validation_1-logloss:0.52227
[3]	validation_0-logloss:0.386536	validation_1-logloss:0.477589
[4]	validation_0-logloss:0.340778	validation_1-logloss:0.44554
[5]	validation_0-logloss:0.301169	validation_1-logloss:0.428629
[6]	validation_0-logloss:0.266524	validation_1-logloss:0.399554
[7]	validation_0-logloss:0.23714	validation_1-logloss:0.377767
[8]	validation_0-logloss:0.211496	validation_1-logloss:0.358936
[9]	validation_0-logloss:0.189292	validation_1-logloss:0.339962
[10]	validation_0-logloss:0.169644	validation_1-logloss:0.324557
[11]	validation_0-logloss:0.153248	validation_1-logloss:0.310906
[12]	validation_0-logloss:0.138716	validation_1-logloss:0.30202
[13

[124]	validation_0-logloss:0.007865	validation_1-logloss:0.250808
[125]	validation_0-logloss:0.007838	validation_1-logloss:0.250808
[126]	validation_0-logloss:0.007809	validation_1-logloss:0.249374
[127]	validation_0-logloss:0.007781	validation_1-logloss:0.250589
[128]	validation_0-logloss:0.007754	validation_1-logloss:0.250091
[129]	validation_0-logloss:0.007727	validation_1-logloss:0.251142
[130]	validation_0-logloss:0.007701	validation_1-logloss:0.251081
[131]	validation_0-logloss:0.007675	validation_1-logloss:0.251316
[132]	validation_0-logloss:0.007648	validation_1-logloss:0.249942
[133]	validation_0-logloss:0.007623	validation_1-logloss:0.249961
[134]	validation_0-logloss:0.007597	validation_1-logloss:0.251004
[135]	validation_0-logloss:0.007572	validation_1-logloss:0.250528
[136]	validation_0-logloss:0.007547	validation_1-logloss:0.250418
Stopping. Best iteration:
[36]	validation_0-logloss:0.027208	validation_1-logloss:0.237965



XGBClassifier(colsample_bytree=0.6064401654451399,
              learning_rate=0.1237459933232244, max_depth=9, n_estimators=1000)

In [8]:
# Predicted class labels for the held-out test split.
preds = best_clf.predict(x_test)
# Second column of predict_proba, kept as the score later passed to ROC AUC
# (presumably the probability of label 1 — confirm against best_clf.classes_).
pred_proba = best_clf.predict_proba(x_test)[:, 1]

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, \
                            recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print a classification report: confusion matrix and summary metrics.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Must be supplied even though the
            signature defaults it to ``None``: the label-based metric
            functions are called with it unconditionally.
        pred_proba: Scores passed to ``roc_auc_score``. When omitted
            (``None``), the AUC is left out of the report instead of
            failing inside ``roc_auc_score``.

    Returns:
        None. The report is written to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # Compute every metric before printing, so a failure in any of them
    # produces no partial report.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print(f'confusion matrix\n{confusion}')
    print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}')
    if roc_auc is None:
        print(f'F1: {f1:.4f}')
    else:
        print(f'F1: {f1:.4f}, AUC: {roc_auc:.4f}')

In [10]:
# Report the test-split metrics for the tuned model's labels and scores.
get_clf_eval(y_test, preds, pred_proba)

confusion matrix
[[34  3]
 [ 2 75]]
accuracy: 0.9561, precision: 0.9615, recall: 0.9740
F1: 0.9677, AUC: 0.9926
