In [1]:
# use feature importance for feature selection
# from https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/
from numpy import loadtxt
from numpy import sort
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

In [None]:
# load data
import glob

# File names and format: every file matched below is tab-separated, has no
# header row, and carries four fields per line:
#   (1) Date in MM-DD-YYYY format
#   (2) Time in XX:YY format
#   (3) Code
#   (4) Value
cols = ["date","time","code", "val_label"]


def load_diabetes_logs(pattern):
    """Read every file matching ``pattern`` into a single DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern of the tab-separated log files to read. Files are
        processed in sorted path order.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime``, ``code`` and ``val_label`` with a fresh
        0..n-1 index. Rows with a missing ``code`` or ``val_label`` are
        dropped. When no file matches, an empty frame with the same three
        columns is returned.
    """
    frames = []
    for path in sorted(glob.glob(pattern)):
        raw = pd.read_csv(path, delimiter="\t",
                          header=None,
                          names=cols,
                          # keep the two timestamp parts as text so they can be joined
                          dtype={"date": str, "time": str})
        # Combine date + time explicitly instead of relying on the deprecated
        # dict form of ``parse_dates``; unparseable stamps become NaT rather
        # than aborting the whole load.
        stamps = pd.to_datetime(raw["date"] + " " + raw["time"],
                                format="%m-%d-%Y %H:%M",
                                errors="coerce")
        parsed = pd.DataFrame({"datetime": stamps,
                               "code": raw["code"],
                               "val_label": raw["val_label"]})
        # Only code/value decide whether a row is usable; a bad timestamp
        # alone must not discard the reading.
        frames.append(parsed.dropna(subset=["code", "val_label"]))

    if not frames:
        return pd.DataFrame(columns=["datetime","code", "val_label"])
    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)


# dataset = loadtxt('data/pima-indians-diabetes.csv', delimiter=",")
dataset = load_diabetes_logs("data/Diabetes-Data/data-*")


In [86]:
# Keep only rows whose value is already an int/float or a digit-only string;
# everything else cannot be cast to an integer label below.
is_numeric_value = dataset.val_label.apply(
    lambda x: True if type(x) in [int, float] else str.isnumeric(x)
)

# Build the cleaned frame in one non-mutating chain:
#  * no column assignment on a filtered slice (avoids SettingWithCopyWarning),
#  * `astype` takes no `inplace` keyword (it raises TypeError on pandas >= 1.0),
#  * a failed cast now raises instead of silently leaving an object column,
#  * `errors="ignore"` on drop lets the cell be re-run after `datetime` is gone.
# dataset.datetime = pd.to_datetime(dataset.datetime)
dataset = (
    dataset[is_numeric_value]
    .drop(columns="datetime", errors="ignore")
    .astype({"code": "int64", "val_label": "int64"})
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29278 entries, 0 to 29296
Data columns (total 2 columns):
code         29278 non-null int64
val_label    29278 non-null int64
dtypes: int64(2)
memory usage: 686.2 KB


In [88]:
# Features: the first column, kept two-dimensional (a one-column DataFrame).
X = dataset.iloc[:, :1]
# Target: the second column, as a Series.
Y = dataset.iloc[:, 1]
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=7,
)

In [89]:
# Train a default-configured XGBoost classifier on the full training split.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# fit was interrupted in the recorded run - confirm it completes before
# relying on the cells below.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

KeyboardInterrupt: 

In [None]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
# Round each prediction before comparing it with the true labels.
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % (accuracy_pct))

In [None]:
# Use every feature importance of the fitted model, in ascending order, as a
# selection threshold, then retrain and score on the features that pass it.
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # Reuse the already-fitted model to decide which features are kept.
    selection = SelectFromModel(estimator=model, threshold=thresh, prefit=True)
    select_X_train, select_X_test = (
        selection.transform(split) for split in (X_train, X_test)
    )

    # Fresh classifier trained only on the retained features.
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)

    # Evaluate on the equally reduced test split.
    y_pred = selection_model.predict(select_X_test)
    predictions = list(map(round, y_pred))
    accuracy = accuracy_score(y_test, predictions)
    n_kept = select_X_train.shape[1]
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, n_kept, accuracy*100.0))