Objective:
- learn how to use early stopping to avoid overfitting

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# read in data
df = pd.read_csv("../../data/diabetes.csv", sep=",")

In [3]:
# prepare training and testing set
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, shuffle=True, random_state=89
)

In [4]:
# train the model
clf = XGBClassifier()
eval_set = [(X_train, y_train), (X_test, y_test)]
clf.fit(
    X_train, y_train, 
    eval_set=eval_set, 
    eval_metric="logloss", 
    early_stopping_rounds=20, 
    verbose=True
)

[0]	validation_0-logloss:0.55541	validation_1-logloss:0.61137
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 20 rounds.
[1]	validation_0-logloss:0.46299	validation_1-logloss:0.55319
[2]	validation_0-logloss:0.40203	validation_1-logloss:0.52379
[3]	validation_0-logloss:0.35649	validation_1-logloss:0.50181
[4]	validation_0-logloss:0.32238	validation_1-logloss:0.49589
[5]	validation_0-logloss:0.30001	validation_1-logloss:0.48972
[6]	validation_0-logloss:0.27803	validation_1-logloss:0.48230
[7]	validation_0-logloss:0.25759	validation_1-logloss:0.48017
[8]	validation_0-logloss:0.24287	validation_1-logloss:0.48426
[9]	validation_0-logloss:0.23216	validation_1-logloss:0.48511
[10]	validation_0-logloss:0.21984	validation_1-logloss:0.49243
[11]	validation_0-logloss:0.21085	validation_1-logloss:0.49724
[12]	validation_0-logloss:0.20525	validation_1-logloss:0.49429
[13]	validation_0-logloss:

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [5]:
# evaluate the performance of the model
preds_test = clf.predict(X_test)
acc_test = accuracy_score(y_test, preds_test)
print("[INFO] The test accuracy of XGBoost is: %.2f%%" % (acc_test * 100))

[INFO] The test accuracy of XGBoost is: 75.76%
