Objective:
- Learn how to perform feature selection using the feature importances of a trained model.

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the CSV into a DataFrame; a comma is already pandas' default separator.
csv_path = "../../data/diabetes.csv"
df = pd.read_csv(csv_path)

In [3]:
# Features are every column except the last; the target is the last column.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Hold out 30% of the rows as a test set, shuffled with a fixed seed and
# stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=89,
    shuffle=True,
    stratify=y,
)

In [4]:
class MyXGBClassifier(XGBClassifier):
    """XGBClassifier subclass whose ``coef_`` property always returns ``None``.

    NOTE(review): the notebook instantiates this class "to fix bug in
    xgboost 1.0.2". Presumably the stock ``coef_`` property breaks
    scikit-learn's ``SelectFromModel`` for tree boosters, and returning
    ``None`` makes it fall back to ``feature_importances_`` -- confirm before
    removing this shim on newer xgboost versions.
    """

    @property
    def coef_(self):
        """Always ``None``: this wrapper exposes no linear coefficients."""
        return None

In [5]:
# Fit a baseline classifier on all features and report its test accuracy.
# The custom subclass is used to fix a bug in xgboost 1.0.2.
clf = MyXGBClassifier().fit(X_train, y_train)

preds_test = clf.predict(X_test)
acc_test = accuracy_score(y_true=y_test, y_pred=preds_test)
print("[INFO] The test accuracy of XGBoost is: %.2f%%" % (100 * acc_test,))

[INFO] The test accuracy of XGBoost is: 74.03%


In [6]:
# Feature selection driven by the fitted model's feature importances.
#
# Each distinct importance value is tried as a selection threshold: the
# features that SelectFromModel keeps at that threshold are used to train a
# fresh classifier, whose accuracy on the test split is then printed.
#
# np.unique returns the values in ascending order (like np.sort) but drops
# repeats, so features that tie on importance (e.g. several unused features
# at 0) do not cause the same model to be retrained and reported twice.
#
# NOTE(review): accuracy is measured on the test split, so picking the "best"
# threshold from this table gives an optimistic estimate; consider a separate
# validation split or cross-validation for the final choice.
thresholds = np.unique(clf.feature_importances_)
for threshold in thresholds:
    # prefit=True reuses the already-trained clf rather than refitting it.
    selecter = SelectFromModel(clf, threshold=threshold, prefit=True)
    X_train_selected = selecter.transform(X_train)
    X_test_selected = selecter.transform(X_test)

    # Retrain from scratch on the reduced feature set.
    model = MyXGBClassifier()
    model.fit(X_train_selected, y_train)

    preds = model.predict(X_test_selected)
    acc = accuracy_score(y_test, preds)
    print(
        "threshold = %f, n = %d, accuracy = %.2f%%"
        % (threshold, X_train_selected.shape[1], acc * 100)
    )

threshold = 0.086725, n = 8, accuracy = 74.03%
threshold = 0.094203, n = 7, accuracy = 73.16%
threshold = 0.097505, n = 6, accuracy = 76.19%
threshold = 0.098924, n = 5, accuracy = 73.59%
threshold = 0.099591, n = 4, accuracy = 73.16%
threshold = 0.134071, n = 3, accuracy = 75.32%
threshold = 0.164807, n = 2, accuracy = 71.86%
threshold = 0.224174, n = 1, accuracy = 72.73%
