# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# Data

In [2]:
dev = pd.read_csv('Data/dev.csv')
X_dev = dev.drop(columns=['hospital_death'])
y_dev = dev['hospital_death']

dev_us = pd.read_csv('Data/dev_us.csv')
X_dev_us = dev_us.drop(columns=['hospital_death'])
y_dev_us = dev_us['hospital_death']

dev_os = pd.read_csv('Data/dev_os.csv')
X_dev_os = dev_os.drop(columns=['hospital_death'])
y_dev_os = dev_os['hospital_death']

dev_smote = pd.read_csv('Data/dev_smote.csv')
X_dev_smote = dev_smote.drop(columns=['hospital_death'])
y_dev_smote = dev_smote['hospital_death']

dev_pca95 = pd.read_pickle('Data/dev_pca95.pkl')
X_dev_pca95 = dev_pca95.drop(columns=['hospital_death'])
y_dev_pca95 = dev_pca95['hospital_death']

dev_us_pca95 = pd.read_pickle('Data/dev_us_pca95.pkl')
X_dev_us_pca95 = dev_us_pca95.drop(columns=['hospital_death'])
y_dev_us_pca95 = dev_us_pca95['hospital_death']

dev_os_pca95 = pd.read_pickle('Data/dev_os_pca95.pkl')
X_dev_os_pca95 = dev_os_pca95.drop(columns=['hospital_death'])
y_dev_os_pca95 = dev_os_pca95['hospital_death']

dev_smote_pca95 = pd.read_pickle('Data/dev_smote_pca95.pkl')
X_dev_smote_pca95 = dev_smote_pca95.drop(columns=['hospital_death'])
y_dev_smote_pca95 = dev_smote_pca95['hospital_death']

test = pd.read_csv("Data/test.csv")
X_test = test.drop(columns=['hospital_death'])
y_test = test['hospital_death']

test_pca95 = pd.read_pickle("Data/test_pca95.pkl")
X_test_pca95 = test_pca95.drop(columns=['hospital_death'])
y_test_pca95 = test_pca95['hospital_death']

test_us_pca95 = pd.read_pickle("Data/test_us_pca95.pkl")
X_test_us_pca95 = test_us_pca95.drop(columns=['hospital_death'])
y_test_us_pca95 = test_us_pca95['hospital_death']

test_os_pca95 = pd.read_pickle("Data/test_os_pca95.pkl")
X_test_os_pca95 = test_os_pca95.drop(columns=['hospital_death'])
y_test_os_pca95 = test_os_pca95['hospital_death']

test_smote_pca95 = pd.read_pickle("Data/test_smote_pca95.pkl")
X_test_smote_pca95 = test_smote_pca95.drop(columns=['hospital_death'])
y_test_smote_pca95 = test_smote_pca95['hospital_death']

data = {
    "original": (X_dev, y_dev, X_test, y_test),
    "us": (X_dev_us, y_dev_us, X_test, y_test),
    "os": (X_dev_os, y_dev_os, X_test, y_test),
    "smote": (X_dev_smote, y_dev_smote, X_test, y_test)
}

pca_data = {
    "pca95": (X_dev_pca95, y_dev_pca95, X_test_pca95, y_test_pca95),
    "us_pca95": (X_dev_us_pca95, y_dev_us_pca95, X_test_us_pca95, y_test_us_pca95),
    "os_pca95": (X_dev_os_pca95, y_dev_os_pca95, X_test_os_pca95, y_test_os_pca95),
    "smote_pca95": (X_dev_smote_pca95, y_dev_smote_pca95, X_test_smote_pca95, y_test_smote_pca95)
}


# XGB

##Default Settings

In [4]:
xgb_pipe = make_pipeline(StandardScaler(), XGBClassifier())

for key in data.keys():
    xgb_pipe.fit(data[key][0], data[key][1])
    y_pred=xgb_pipe.predict(data[key][2])
    print("XGB / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], y_pred)))
    print("XGB / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], y_pred)))
    print("XGB / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], y_pred)))
    print("XGB / {} [F1]: {}".format(key, f1_score(data[key][3], y_pred)))

XGB / original [Accuracy]: 0.9277653600828654
XGB / original [Average Precision]: 0.26314106578370905
XGB / original [ROC AUC]: 0.6392674930685068
XGB / original [F1]: 0.4097995545657016
XGB / us [Accuracy]: 0.8106634683530503
XGB / us [Average Precision]: 0.24746886226323092
XGB / us [ROC AUC]: 0.8074370511867591
XGB / us [F1]: 0.4228020608276549
XGB / os [Accuracy]: 0.8140980210434499
XGB / os [Average Precision]: 0.25017306240647774
XGB / os [ROC AUC]: 0.8084584570247423
XGB / os [F1]: 0.42669804976462683
XGB / smote [Accuracy]: 0.9161533009867524
XGB / smote [Average Precision]: 0.2771021941440256
XGB / smote [ROC AUC]: 0.70213065205035
XGB / smote [F1]: 0.477226376614548


###PCA Dataset

In [6]:
xgb_pca_pipe = make_pipeline(StandardScaler(), XGBClassifier())

for key in pca_data.keys():
    xgb_pca_pipe.fit(pca_data[key][0], pca_data[key][1])
    y_pred=xgb_pca_pipe.predict(pca_data[key][2])
    print("Histgradient Boosting with PCA / {} [Accuracy]: {}".format(key, accuracy_score(pca_data[key][3], y_pred)))
    print("Histgradient Boosting with PCA / {} [Average Precision]: {}".format(key, average_precision_score(pca_data[key][3], y_pred)))
    print("Histgradient Boosting with PCA / {} [ROC AUC]: {}".format(key, roc_auc_score(pca_data[key][3], y_pred)))
    print("Histgradient Boosting with PCA / {} [F1]: {}".format(key, f1_score(pca_data[key][3], y_pred)))

Histgradient Boosting with PCA / pca95 [Accuracy]: 0.9254211415798942
Histgradient Boosting with PCA / pca95 [Average Precision]: 0.23043925615748748
Histgradient Boosting with PCA / pca95 [ROC AUC]: 0.611670557700629
Histgradient Boosting with PCA / pca95 [F1]: 0.34980988593155893
Histgradient Boosting with PCA / us_pca95 [Accuracy]: 0.9137000490650384
Histgradient Boosting with PCA / us_pca95 [Average Precision]: 0.08629995093496157
Histgradient Boosting with PCA / us_pca95 [ROC AUC]: 0.5
Histgradient Boosting with PCA / us_pca95 [F1]: 0.0




ValueError: ignored

##Hyperparmeter Tuning

###Tuning Original

In [7]:
xgb_params = {
    'xgbclassifier__max_depth': [10, 50, 100],
    'xgbclassifier__min_samples_leaf': [1, 2, 4],
    'xgbclassifier__n_estimators': [50, 100, 500],
    'xgbclassifier__learning_rate': [0.01,0.1,1,10]
}

xgb_pipe = make_pipeline(StandardScaler(), XGBClassifier())
xgb_grid = RandomizedSearchCV(xgb_pipe, xgb_params, n_iter=5 ,cv = None, scoring='accuracy', refit=True, n_jobs = -1)

for key in data.keys():
    if key=='original':
      xgb_grid.fit(data[key][0], data[key][1])
      y_pred=xgb_grid.predict(data[key][2])
      print("XGB / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], y_pred)))
      print("XGB / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], y_pred)))
      print("XGB / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], y_pred)))
      print("XGB / {} [F1]: {}".format(key, f1_score(data[key][3], y_pred)))


XGB / original [Accuracy]: 0.9300005451670937
XGB / original [Average Precision]: 0.2803232024904499
XGB / original [ROC AUC]: 0.6462111041088413
XGB / original [F1]: 0.42780748663101603


In [8]:
for key in data.keys():
  if key=='original':
    print("XGB / {} : {}".format(key, xgb_grid.best_params_))

XGB / original : {'xgbclassifier__n_estimators': 500, 'xgbclassifier__min_samples_leaf': 2, 'xgbclassifier__max_depth': 50, 'xgbclassifier__learning_rate': 0.01}


###Tuning Smote

In [4]:
xgb_params = {
    'xgbclassifier__max_depth': [10, 50, 100],
    'xgbclassifier__min_samples_leaf': [1, 2, 4],
    'xgbclassifier__n_estimators': [50, 100, 500],
    'xgbclassifier__learning_rate': [0.01,0.1,1,10]
}
smote_xgb_pipe = make_pipeline(StandardScaler(), XGBClassifier())
smote_xgb_grid = RandomizedSearchCV(smote_xgb_pipe, xgb_params, n_iter=5 ,cv = None, scoring='accuracy', refit=True, n_jobs = -1)

xgb_params = {
    'xgbclassifier__max_depth': [10, 50, 70],
    'xgbclassifier__min_samples_leaf': [1, 2, 4],
    'xgbclassifier__n_estimators': [50, 100, 500, 700],
    'xgbclassifier__learning_rate': [0.01,0.1,1,10]
}

for key in data.keys():
    if key== ('smote'):
      smote_xgb_grid.fit(data[key][0], data[key][1])
      smote_y_pred=smote_xgb_grid.predict(data[key][2])
      print("XGB / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], smote_y_pred)))
      print("XGB / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], smote_y_pred)))
      print("XGB / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], smote_y_pred)))
      print("XGB / {} [F1]: {}".format(key, f1_score(data[key][3], smote_y_pred)))

XGB / smote [Accuracy]: 0.9247124243580658
XGB / smote [Average Precision]: 0.2703417126082993
XGB / smote [ROC AUC]: 0.6647690369182107
XGB / smote [F1]: 0.4456041750301084
