In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,cohen_kappa_score, classification_report
from xgboost import XGBClassifier
import polars as pl
import optuna

In [4]:
iris = load_iris()

data = pl.DataFrame(iris.data)
data.columns = iris.feature_names
target = pl.Series(iris.target)

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [6]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((120, 4), (120,), (30, 4), (30,))

In [65]:
def objective(trial):
    params = {
        'booster': 'gbtree', # gbtree, gblinear or dart
        'verbosity': 3, # 2 (silent) - 3 (debug)
        'objective': 'multi:softmax', # multi:softmax, multi:softprob, binary:logistic, binary:logitraw, count:poisson, reg:linear, reg:logistic, reg:gamma, reg:tweedie
        'nthread': -1, # -1 means use all processors
        
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5), # default 0.3
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000), # default 100
        'max_depth': trial.suggest_int('max_depth', 3, 10), # default 6
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10), # default 1
        'gamma': trial.suggest_float('gamma', 0.0, 10.0), # default 0.0
        'subsample': trial.suggest_float('subsample', 0.5, 1.0), # default 1.0
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42,
    }
    # 设置不打印信息
    bst = XGBClassifier(**params)
    bst.fit(X_train, Y_train)
    accuracy = accuracy_score(Y_test, bst.predict(X_test))
    
    # 减枝
    trial.report(accuracy, step=trial.number)
    
    # 如果准确率低于 90%，则剪枝
    if accuracy < 0.9:
        raise optuna.exceptions.TrialPruned()
    
    return accuracy

In [66]:
# 对 RandomSampler 而言 MedianPruner 是最好的
# 对 TPESampler 而言 HyperbandPruner 是最好的
optuna.logging.set_verbosity(optuna.logging.INFO)
study = optuna.create_study(
    study_name='XGBoost', direction='maximize', 
    sampler=optuna.samplers.TPESampler(seed=42), pruner=optuna.pruners.HyperbandPruner()
)

[I 2023-12-01 00:05:42,824] A new study created in memory with name: XGBoost


In [67]:
study.optimize(objective, n_trials = 100, show_progress_bar = True, n_jobs=4)

  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-01 00:05:44,266] Trial 2 finished with value: 1.0 and parameters: {'learning_rate': 0.18317382168230945, 'n_estimators': 312, 'max_depth': 8, 'min_child_weight': 5, 'gamma': 0.6738872117175809, 'subsample': 0.9463704537788391, 'colsample_bytree': 0.9127282983408311, 'colsample_bylevel': 0.7342874129645383, 'reg_alpha': 5.320742550913302, 'reg_lambda': 9.44111873072055}. Best is trial 2 with value: 1.0.
[I 2023-12-01 00:05:44,643] Trial 0 finished with value: 1.0 and parameters: {'learning_rate': 0.15620424018985526, 'n_estimators': 374, 'max_depth': 3, 'min_child_weight': 2, 'gamma': 8.215960491453142, 'subsample': 0.9703551475080667, 'colsample_bytree': 0.6276920477602974, 'colsample_bylevel': 0.7176649628070166, 'reg_alpha': 2.782940976524184, 'reg_lambda': 1.1783168493214435}. Best is trial 2 with value: 1.0.
[I 2023-12-01 00:05:45,716] Trial 1 finished with value: 1.0 and parameters: {'learning_rate': 0.1611419810833, 'n_estimators': 735, 'max_depth': 8, 'min_child_weigh

In [68]:
best_params = study.best_params
best_value = study.best_value
print(f'Best params: {best_params}')
print(f'Best value: {best_value}\n')

Best params: {'learning_rate': 0.18317382168230945, 'n_estimators': 312, 'max_depth': 8, 'min_child_weight': 5, 'gamma': 0.6738872117175809, 'subsample': 0.9463704537788391, 'colsample_bytree': 0.9127282983408311, 'colsample_bylevel': 0.7342874129645383, 'reg_alpha': 5.320742550913302, 'reg_lambda': 9.44111873072055}
Best value: 1.0


In [69]:
Y_pred = XGBClassifier(**best_params).fit(X_train, Y_train).predict(X_test)

print("Accuracy : %.4g" % accuracy_score(Y_test, Y_pred))
print("Precision : %.4g" % precision_score(Y_test, Y_pred, average='macro'))
print("Recall : %.4g" % recall_score(Y_test, Y_pred, average='macro'))
print("F1 : %.4g" % f1_score(Y_test, Y_pred, average='macro'))
print("Cohen's kappa : %.4g" % cohen_kappa_score(Y_test, Y_pred))

Accuracy : 1
Precision : 1
Recall : 1
F1 : 1
Cohen's kappa : 1


In [70]:
print('\n clasification report:\n', classification_report(Y_test, Y_pred))


 clasification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


In [71]:
# save log
study.trials_dataframe().to_csv('./Result/study.csv', index=False)