In [1]:
import numpy as np
import pandas as pd

from skimpy import skim

In [2]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [3]:
from xgboost import XGBClassifier

In [4]:
N_JOBS = -1

In [5]:
df = pd.read_csv('XGBoost_Book/Chapter06/heart_disease.csv')

In [6]:
skim(df)

In [7]:
# Base Line Accuracy
df.loc[:, 'target'].value_counts(normalize=True).max()

0.5445544554455446

In [8]:
transformer = ColumnTransformer([
    ("ohe", OneHotEncoder(), ["sex", "cp", "restecg", "slope", "thal"]),
    ("scaler", StandardScaler(), ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"])
], n_jobs=N_JOBS, remainder='passthrough')

In [9]:
X = transformer.fit_transform(df.iloc[:, :-1])

In [10]:
clf = XGBClassifier(n_jobs=N_JOBS)

In [11]:
params = {"n_estimators": range(1, 50),
          "max_depth": range(1, 64),
          "learning_rate": np.linspace(0, 10, 100),
          "gamma": np.linspace(0, 10, 100),
          "subsample": np.linspace(0, 1, 100),
          "colsample_bytree": np.linspace(0, 1, 100),
          "colsample_bylevel": np.linspace(0, 1, 100),
          "colsample_bynode": np.linspace(0, 1, 100),
          "lambda": np.logspace(0, 2, 10)}

In [12]:
CV = 10

In [13]:
skf = StratifiedKFold(n_splits=CV, shuffle=True)

In [14]:
N_ITER = 1_000

In [15]:
grid = RandomizedSearchCV(clf, params, n_iter=N_ITER, scoring="accuracy", n_jobs=N_JOBS, cv=skf)

In [16]:
_ = grid.fit(X, df.loc[:, 'target'])

In [17]:
grid.best_score_, grid.best_params_

(0.8579569892473119,
 {'subsample': 0.5656565656565657,
  'n_estimators': 36,
  'max_depth': 45,
  'learning_rate': 0.20202020202020202,
  'lambda': 4.641588833612778,
  'gamma': 1.7171717171717171,
  'colsample_bytree': 0.5656565656565657,
  'colsample_bynode': 0.06060606060606061,
  'colsample_bylevel': 0.9191919191919192})

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, df.loc[:, 'target'], stratify=df.loc[:, 'target'])

In [23]:
clf = XGBClassifier(**grid.best_params_)

In [24]:
# Train
clf.fit(X_train, y_train)
# Predict
y_pred = clf.predict(X_test)
# Scores
acc_score = accuracy_score(y_test, y_pred)
f_score = f1_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)
# Show
print(f'Accuracy: {acc_score:.2%} F1: {f_score:.2%} AUC: {auc_score:.2%}')

Accuracy: 81.58% F1: 81.58% AUC: 82.09%
