In [10]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

output_directory = '/home/marco/DataspellProjects/thesis/output/pls/'

palette = sns.color_palette(["#2176AB", "#F97662", "#FFBF00", "#50C878", "#B284BE"])
sns.set(style="whitegrid", font_scale=1.5)
sns.set_palette(palette)

df = pd.read_csv('../../../data/ST002498_z.csv')

Performing extreme gradient boost model

In [11]:

# Split the data into features (X) and target variable (y)
X = df.drop(columns=['Diagnosis'])
y = df['Diagnosis']

In [13]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

space = {
    'max_depth': hp.quniform("max_depth", 3, 30, 1),  # Aumentato il range di max_depth
    'gamma': hp.uniform('gamma', 0, 20),  # Aumentato il range di gamma
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),  # Aumentato il range di n_estimators
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),  # Aggiunto learning_rate
    'subsample': hp.uniform('subsample', 0.5, 1.0),  # Aggiunto subsample
    'seed': 0
}

def objective(space):
    clf = xgb.XGBClassifier(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        learning_rate=space['learning_rate'],
        subsample=space['subsample'],
        eval_metric="auc",
        callbacks=[xgb.callback.EarlyStopping(rounds=10, metric_name='auc', save_best=True)]
    )

    clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=False)

    pred = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred)
    return {'loss': -auc, 'status': STATUS_OK}


trials = Trials()

best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=1000,
                        trials=trials)

best_params = space_eval(space, best_hyperparams)

best_clf = xgb.XGBClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    gamma=best_params['gamma'],
    seed=0
)

best_clf.fit(X_train, y_train)

pred = best_clf.predict_proba(X_test)[:, 1]
best_auc = roc_auc_score(y_test, pred)
print("Miglior score AUC:", best_auc)
print("Miglior modello:", best_clf)
print("Migliori parametri:", best_params)



100%|██████████| 1000/1000 [01:21<00:00, 12.20trial/s, best loss: -0.6818181818181818]
Miglior score AUC: 0.5
Miglior modello: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=16.237646658034798, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None, ...)
Migliori parametri: {'gamma': 16.237646658034798, 'learning_rate': 0.2054963925732792, 'max_depth': 3.0, 'n_es