In [1]:
import numpy as np
import pandas as pd 
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Load the dataset; every column except the last is treated as a feature
# and the last column is used as the target.
df = pd.read_csv('./heart_disease.csv')
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Untuned gradient-boosted tree classifier, seeded so reruns match.
model = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    random_state=2,
)

# Score the model with 5-fold cross-validation and report per-fold and
# mean results.
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print(f"Accuracy: {scores.round(2)}")
print(f"Accuracy mean: {np.mean(scores):.2f}")


Accuracy: [0.84 0.85 0.82 0.8  0.77]
Accuracy mean: 0.81


In [2]:
from sklearn.model_selection import StratifiedKFold
# Shuffled, stratified 5-fold splitter with a fixed seed so that every
# experiment that uses it is scored on the same folds.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=2,
)

# Baseline: the untuned model evaluated on the shuffled stratified folds.
scores = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
print(f"Accuracy: {scores.round(2)}")
print(f"Accuracy mean: {np.mean(scores):.2f}")


Accuracy: [0.72 0.82 0.75 0.8  0.82]
Accuracy mean: 0.78


In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def grid_search(params, random=False):
  """Cross-validated hyperparameter search for an XGBClassifier.

  Fits a fresh, seeded XGBClassifier on the notebook-level ``X`` / ``y``
  using the notebook-level ``kfold`` splitter, then prints the winning
  parameter combination and its score.

  Args:
    params: Mapping of hyperparameter name to the candidate values to try.
    random: If True, sample 20 candidates with RandomizedSearchCV;
      otherwise try every combination with GridSearchCV.

  Returns:
    None. The results are only printed.
  """
  xgb = XGBClassifier(booster='gbtree', objective='binary:logistic',
                      random_state=2)
  if random:
    # Seed the candidate sampler as well; without it the randomized search
    # draws different candidates on every run even though the model and the
    # folds are seeded.
    grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=20, n_jobs=-1,
                              random_state=2)
  else:
    grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)

  grid.fit(X, y)
  best_params = grid.best_params_
  print(f"Best params: {best_params}")

  # NOTE: best_score_ is the mean cross-validated score of the best
  # candidate (not a score on the training data); the printed label is kept
  # as-is so it matches the outputs already recorded in the notebook.
  best_score = grid.best_score_
  print(f"Training score: {best_score:.3f}")

  

In [4]:
# Tune the number of boosting rounds with an exhaustive grid search.
grid_search(
    {'n_estimators': [100, 200, 400, 800]},
    random=False,
)


Best params: {'n_estimators': 100}
Training score: 0.782


In [5]:
# Tune the learning rate (step-size shrinkage) with an exhaustive grid search.
grid_search(
    {'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]},
    random=False,
)

Best params: {'learning_rate': 0.05}
Training score: 0.796


In [6]:
# Tune the maximum tree depth with an exhaustive grid search.
grid_search(
    {'max_depth': [2, 3, 5, 6, 8]},
    random=False,
)


Best params: {'max_depth': 2}
Training score: 0.799


In [7]:
# Tune gamma (minimum loss reduction required to make a split) with an
# exhaustive grid search.
grid_search(
    {'gamma': [0, 0.1, 0.5, 1, 2, 5]},
    random=False,
)


Best params: {'gamma': 0.5}
Training score: 0.796


In [8]:
# Tune min_child_weight with an exhaustive grid search.
grid_search(
    {'min_child_weight': [1, 2, 3, 4, 5]},
    random=False,
)

Best params: {'min_child_weight': 5}
Training score: 0.812


In [9]:
# Tune the row subsampling ratio with an exhaustive grid search.
grid_search(
    {'subsample': [0.5, 0.7, 0.8, 0.9, 1]},
    random=False,
)

Best params: {'subsample': 0.8}
Training score: 0.796


In [10]:
# Tune the per-tree column subsampling ratio with an exhaustive grid search.
grid_search(
    {'colsample_bytree': [0.5, 0.7, 0.8, 0.9, 1]},
    random=False,
)

Best params: {'colsample_bytree': 0.9}
Training score: 0.799


In [11]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

# Seeded train/test split; the later early-stopping cells reuse these
# four variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# The held-out split is monitored after every boosting round.
eval_set = [(X_test, y_test)]
eval_metric = 'error'

# eval_metric is configured on the estimator rather than passed to fit():
# the fit() keyword has been deprecated since XGBoost 1.6 and is rejected
# by recent releases, while the constructor form works on both.
model = XGBClassifier(booster='gbtree', objective='binary:logistic',
                      random_state=2, eval_metric=eval_metric)

# No early stopping here: all boosting rounds are trained and the
# per-round validation error is logged.
model.fit(X_train, y_train, eval_set=eval_set)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100.0:.2f}")



[0]	validation_0-error:0.15789
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15789
[12]	validation_0-error:0.15789
[13]	validation_0-error:0.17105
[14]	validation_0-error:0.17105
[15]	validation_0-error:0.17105
[16]	validation_0-error:0.15789
[17]	validation_0-error:0.17105
[18]	validation_0-error:0.15789
[19]	validation_0-error:0.17105
[20]	validation_0-error:0.17105
[21]	validation_0-error:0.17105
[22]	validation_0-error:0.18421
[23]	validation_0-error:0.18421
[24]	validation_0-error:0.17105
[25]	validation_0-error:0.18421
[26]	validation_0-error:0.18421
[27]	validation_0-error:0.18421
[28]	validation_0-error:0.18421
[29]	validation_0-error:0.18421
[30]	validation_0-error:0.18421
[31]	validation_0-



[42]	validation_0-error:0.18421
[43]	validation_0-error:0.17105
[44]	validation_0-error:0.18421
[45]	validation_0-error:0.17105
[46]	validation_0-error:0.18421
[47]	validation_0-error:0.18421
[48]	validation_0-error:0.17105
[49]	validation_0-error:0.15789
[50]	validation_0-error:0.17105
[51]	validation_0-error:0.17105
[52]	validation_0-error:0.15789
[53]	validation_0-error:0.17105
[54]	validation_0-error:0.17105
[55]	validation_0-error:0.17105
[56]	validation_0-error:0.17105
[57]	validation_0-error:0.17105
[58]	validation_0-error:0.17105
[59]	validation_0-error:0.17105
[60]	validation_0-error:0.17105
[61]	validation_0-error:0.17105
[62]	validation_0-error:0.17105
[63]	validation_0-error:0.17105
[64]	validation_0-error:0.17105
[65]	validation_0-error:0.17105
[66]	validation_0-error:0.18421
[67]	validation_0-error:0.18421
[68]	validation_0-error:0.18421
[69]	validation_0-error:0.18421
[70]	validation_0-error:0.18421
[71]	validation_0-error:0.18421
[72]	validation_0-error:0.18421
[73]	val

In [12]:
# Early stopping: halt training once the validation error has not improved
# for 10 consecutive rounds.
#
# eval_metric and early_stopping_rounds are set on the estimator rather than
# passed to fit(): the fit() keywords have been deprecated since
# XGBoost 1.6 and are rejected by recent releases (requires XGBoost >= 1.6).
model = XGBClassifier(booster='gbtree', objective='binary:logistic',
                      random_state=2, eval_metric='error',
                      early_stopping_rounds=10)

# NOTE(review): the split that drives early stopping is the same split that
# is scored below, so the reported accuracy is optimistic; a separate
# validation split would give an unbiased estimate.
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}")

[0]	validation_0-error:0.15789
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15789
Accuracy: 89.47




In [13]:
# Allow up to 5000 boosting rounds, but stop once the validation error has
# not improved for 100 consecutive rounds.
eval_set = [(X_test, y_test)]
eval_metric = "error"

# eval_metric and early_stopping_rounds are set on the estimator rather than
# passed to fit(): the fit() keywords have been deprecated since
# XGBoost 1.6 and are rejected by recent releases (requires XGBoost >= 1.6).
model = XGBClassifier(random_state=2, n_estimators=5000,
                      eval_metric=eval_metric, early_stopping_rounds=100)

# NOTE(review): the split that drives early stopping is the same split that
# is scored below, so the reported accuracy is optimistic.
model.fit(X_train, y_train, eval_set=eval_set)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy * 100:.2f}")


[0]	validation_0-error:0.15789
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15789
[12]	validation_0-error:0.15789
[13]	validation_0-error:0.17105
[14]	validation_0-error:0.17105
[15]	validation_0-error:0.17105
[16]	validation_0-error:0.15789
[17]	validation_0-error:0.17105
[18]	validation_0-error:0.15789
[19]	validation_0-error:0.17105
[20]	validation_0-error:0.17105
[21]	validation_0-error:0.17105




[22]	validation_0-error:0.18421
[23]	validation_0-error:0.18421
[24]	validation_0-error:0.17105
[25]	validation_0-error:0.18421
[26]	validation_0-error:0.18421
[27]	validation_0-error:0.18421
[28]	validation_0-error:0.18421
[29]	validation_0-error:0.18421
[30]	validation_0-error:0.18421
[31]	validation_0-error:0.18421
[32]	validation_0-error:0.18421
[33]	validation_0-error:0.18421
[34]	validation_0-error:0.18421
[35]	validation_0-error:0.18421
[36]	validation_0-error:0.18421
[37]	validation_0-error:0.18421
[38]	validation_0-error:0.18421
[39]	validation_0-error:0.18421
[40]	validation_0-error:0.18421
[41]	validation_0-error:0.18421
[42]	validation_0-error:0.18421
[43]	validation_0-error:0.17105
[44]	validation_0-error:0.18421
[45]	validation_0-error:0.17105
[46]	validation_0-error:0.18421
[47]	validation_0-error:0.18421
[48]	validation_0-error:0.17105
[49]	validation_0-error:0.15789
[50]	validation_0-error:0.17105
[51]	validation_0-error:0.17105
[52]	validation_0-error:0.15789
[53]	val