## Imports

In [97]:
import os

import numpy as np
import sklearn
import sklearn.metrics
import xgboost as xgb

from data.Data import Data

### SEED

In [98]:
import random

# Set SEED to a fixed integer to reproduce an earlier run; leave it as None to
# draw a fresh seed, which is printed so the run can be repeated later.
SEED = None
if SEED is None:  # explicit None check so that a deliberate SEED = 0 is honoured
  SEED = random.randint(0, 10000)
print(f"SEED: {SEED}")

# Seed both global RNGs (NumPy and the stdlib `random` module).
np.random.seed(seed=SEED)
random.seed(SEED)

SEED: 9538


## Datasets

In [99]:
# Name of the dataset to load; it is also embedded in saved-model file names.
dataset = "breast-cancer"

# Columns handed to Data(...) as `distinct_columns` when the breast-cancer
# dataset is selected.
COLUMNS_FOR_BREAST_CANCER = [
    'node-caps',
    'inv-nodes',
    'tumor-size',
    'deg-malig',
    'irradiat',
    'class',
]

In [100]:
# Only the breast-cancer dataset restricts the column set; every other dataset
# is loaded with distinct_columns=None.
distinct_columns = COLUMNS_FOR_BREAST_CANCER if dataset == "breast-cancer" else None

data = Data(dataset, distinct_columns=distinct_columns)

In [101]:
# Split into a training and a validation frame.
# NOTE(review): 0.7 is presumably the training fraction — confirm against
# Data.get_train_and_valid_set.
# The second frame is named `valid` (not `eval`) so the builtin eval() is not
# shadowed for the rest of the kernel session.
train, valid = data.get_train_and_valid_set(0.7)

# 'class' is the target column; everything else is used as features.
train_x = train.drop('class', axis=1)
train_y = train['class']

eval_x = valid.drop('class', axis=1)
eval_y = valid['class']

## Hyperparameter tuning

In [102]:
import optuna

In [103]:
xgb_models_saves_dir = "./model_saves/xgb_models_saves/"

def objective(trial):
    """Optuna objective: fit an XGBoost classifier and return its eval-set accuracy.

    One hyperparameter configuration is sampled from `trial`, an
    `xgb.XGBClassifier` is fitted on `train_x`/`train_y` and scored on
    `eval_x`/`eval_y`. Models whose accuracy exceeds 0.8 are additionally
    written to `xgb_models_saves_dir`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on the evaluation set.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        # NOTE(review): fit() below is called without an eval_set, so this
        # metric is not reported during training — confirm it is still wanted.
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Create model
    model = xgb.XGBClassifier(**params)

    # Fit model
    model.fit(train_x, train_y)

    # Eval model
    y_pred = model.predict(eval_x)
    accuracy = sklearn.metrics.accuracy_score(eval_y, y_pred)

    if accuracy > 0.8:
        # The save directory may be missing on a fresh checkout; create it so
        # that saving the model does not fail in the middle of the study.
        os.makedirs(xgb_models_saves_dir, exist_ok=True)
        # The file name carries the accuracy rounded to two decimals, so trials
        # with the same rounded score overwrite each other.
        model.save_model(f"{xgb_models_saves_dir}model_{dataset}_{accuracy:4.2f}.json")

    return accuracy

In [104]:
# Maximize the accuracy returned by objective(); the sampler is seeded so the
# sequence of suggested hyperparameters is repeatable for a given SEED.
# NOTE: objective() never reports intermediate values, so the MedianPruner has
# nothing to act on and no trial is expected to end up pruned.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=SEED),
                            pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Trial states are members of optuna.trial.TrialState. optuna.TrialPruned is
# the exception class raised to prune a trial, so comparing a state against it
# never matches.
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))

[I 2023-06-08 16:38:42,046] A new study created in memory with name: no-name-7ac5ae85-0336-4fac-9f10-54ff029e56e1


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-06-08 16:38:42,114] Trial 0 finished with value: 0.7325581395348837 and parameters: {'max_depth': 8, 'learning_rate': 0.1715316363710388, 'n_estimators': 130, 'min_child_weight': 8, 'gamma': 1.959626199541495e-06, 'subsample': 0.011628475188418726, 'colsample_bytree': 0.4392432498579906, 'reg_alpha': 0.00048057372155045636, 'reg_lambda': 4.988539792254573e-06}. Best is trial 0 with value: 0.7325581395348837.
[I 2023-06-08 16:38:42,335] Trial 1 finished with value: 0.7325581395348837 and parameters: {'max_depth': 2, 'learning_rate': 0.12774703656070946, 'n_estimators': 417, 'min_child_weight': 8, 'gamma': 3.937853162885529e-07, 'subsample': 0.06165492670347774, 'colsample_bytree': 0.13211299655617792, 'reg_alpha': 0.029363474426732285, 'reg_lambda': 1.4425211945762475e-08}. Best is trial 0 with value: 0.7325581395348837.
[I 2023-06-08 16:38:42,461] Trial 2 finished with value: 0.7325581395348837 and parameters: {'max_depth': 6, 'learning_rate': 0.10323161516984816, 'n_estimators

In [105]:
# Report the best trial found by the study: its objective value followed by
# every sampled hyperparameter.
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for name, setting in trial.params.items():
    print(f"    {name}: {setting}")

Best trial:
  Value:  0.8023255813953488
  Params: 
    max_depth: 9
    learning_rate: 0.021293532451690887
    n_estimators: 366
    min_child_weight: 2
    gamma: 2.5049861036687497e-06
    subsample: 0.16750648356581507
    colsample_bytree: 0.04229281082115818
    reg_alpha: 3.969227080574277e-07
    reg_lambda: 3.744263342403242e-05


## Visualize hyperparameter tuning

In [106]:
# Plot the study's optimization history; as the last expression of the cell,
# the returned figure is what the notebook displays.
optuna.visualization.plot_optimization_history(study)

In [107]:
# Plot the hyperparameter importances estimated from the study's trials; as the
# last expression of the cell, the returned figure is what the notebook displays.
optuna.visualization.plot_param_importances(study)

## Model

In [111]:
clf_xgb = xgb.XGBClassifier()

# Prefer the hand-picked checkpoint `model_<dataset>.json`. objective() only
# writes accuracy-suffixed files, so on a fresh run fall back to the checkpoint
# of the best trial. That fallback file exists only if the best accuracy
# exceeded the 0.8 save threshold used in objective().
model_path = os.path.join(xgb_models_saves_dir, f"model_{dataset}.json")
if not os.path.exists(model_path):
    model_path = os.path.join(
        xgb_models_saves_dir, f"model_{dataset}_{study.best_value:4.2f}.json"
    )
clf_xgb.load_model(model_path)

## Evaluate

### Train set

In [114]:
# Accuracy of the loaded classifier on the training split.
result_test = clf_xgb.score(train_x, train_y)
print(f"Accuracy : {result_test}")

Accuracy : 0.73


### Eval set

In [115]:
# Accuracy of the loaded classifier on the held-out evaluation split.
result_test = clf_xgb.score(eval_x, eval_y)
print(f"Accuracy : {result_test}")

Accuracy : 0.8023255813953488


# Wnioski

Dla breast_cancer accuracy modelu waha się w granicach 0.8. Jest to związane ze słabą jakością zbioru danych.
Dla breast_cancer_wisconsin accuracy modelu jest bliskie 100%. Pokazuje to różnicę w jakości zbiorów danych.