# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import plot_importance
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# Data

In [2]:
dev = pd.read_csv('Data/dev.csv')
X_dev = dev.drop(columns=['hospital_death'])
y_dev = dev['hospital_death']

dev_us = pd.read_csv('Data/dev_us.csv')
X_dev_us = dev_us.drop(columns=['hospital_death'])
y_dev_us = dev_us['hospital_death']

dev_os = pd.read_csv('Data/dev_os.csv')
X_dev_os = dev_os.drop(columns=['hospital_death'])
y_dev_os = dev_os['hospital_death']

dev_smote = pd.read_csv('Data/dev_smote.csv')
X_dev_smote = dev_smote.drop(columns=['hospital_death'])
y_dev_smote = dev_smote['hospital_death']

dev_pca95 = pd.read_pickle('Data/dev_pca95.pkl')
X_dev_pca95 = dev_pca95.drop(columns=['hospital_death'])
y_dev_pca95 = dev_pca95['hospital_death']

dev_us_pca95 = pd.read_pickle('Data/dev_us_pca95.pkl')
X_dev_us_pca95 = dev_us_pca95.drop(columns=['hospital_death'])
y_dev_us_pca95 = dev_us_pca95['hospital_death']

dev_os_pca95 = pd.read_pickle('Data/dev_os_pca95.pkl')
X_dev_os_pca95 = dev_os_pca95.drop(columns=['hospital_death'])
y_dev_os_pca95 = dev_os_pca95['hospital_death']

dev_smote_pca95 = pd.read_pickle('Data/dev_smote_pca95.pkl')
X_dev_smote_pca95 = dev_smote_pca95.drop(columns=['hospital_death'])
y_dev_smote_pca95 = dev_smote_pca95['hospital_death']

test = pd.read_csv("Data/test.csv")
X_test = test.drop(columns=['hospital_death'])
y_test = test['hospital_death']

test_pca95 = pd.read_pickle("Data/test_pca95.pkl")
X_test_pca95 = test_pca95.drop(columns=['hospital_death'])
y_test_pca95 = test_pca95['hospital_death']

test_us_pca95 = pd.read_pickle("Data/test_us_pca95.pkl")
X_test_us_pca95 = test_us_pca95.drop(columns=['hospital_death'])
y_test_us_pca95 = test_us_pca95['hospital_death']

test_os_pca95 = pd.read_pickle("Data/test_os_pca95.pkl")
X_test_os_pca95 = test_os_pca95.drop(columns=['hospital_death'])
y_test_os_pca95 = test_os_pca95['hospital_death']

test_smote_pca95 = pd.read_pickle("Data/test_smote_pca95.pkl")
X_test_smote_pca95 = test_smote_pca95.drop(columns=['hospital_death'])
y_test_smote_pca95 = test_smote_pca95['hospital_death']

data = {
    "original": (X_dev, y_dev, X_test, y_test),
    "us": (X_dev_us, y_dev_us, X_test, y_test),
    "os": (X_dev_os, y_dev_os, X_test, y_test),
    "smote": (X_dev_smote, y_dev_smote, X_test, y_test)
}

pca_data = {
    "pca95": (X_dev_pca95, y_dev_pca95, X_test_pca95, y_test_pca95),
    "us_pca95": (X_dev_us_pca95, y_dev_us_pca95, X_test_us_pca95, y_test_us_pca95),
    "os_pca95": (X_dev_os_pca95, y_dev_os_pca95, X_test_os_pca95, y_test_os_pca95),
    "smote_pca95": (X_dev_smote_pca95, y_dev_smote_pca95, X_test_smote_pca95, y_test_smote_pca95)
}


# Gradient Boosting

##Default Settings

In [4]:
gb_pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier())

for key in data.keys():
    gb_pipe.fit(data[key][0], data[key][1])
    y_pred=gb_pipe.predict(data[key][2])
    print("Gradient Boosting / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], y_pred)))
    print("Gradient Boosting / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], y_pred)))
    print("Gradient Boosting / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], y_pred)))
    print("Gradient Boosting / {} [F1]: {}".format(key, f1_score(data[key][3], y_pred)))

Gradient Boosting / original [Accuracy]: 0.928092460339094
Gradient Boosting / original [Average Precision]: 0.27164052804278593
Gradient Boosting / original [ROC AUC]: 0.648313204739498
Gradient Boosting / original [F1]: 0.42677096914385054
Gradient Boosting / us [Accuracy]: 0.8085373166875648
Gradient Boosting / us [Average Precision]: 0.24404213061429064
Gradient Boosting / us [ROC AUC]: 0.8042714054610668
Gradient Boosting / us [F1]: 0.41873551804038406
Gradient Boosting / os [Accuracy]: 0.8149702883933926
Gradient Boosting / os [Average Precision]: 0.24999826026430988
Gradient Boosting / os [ROC AUC]: 0.8075056688231311
Gradient Boosting / os [F1]: 0.4268828098615332
Gradient Boosting / smote [Accuracy]: 0.9163713678242381
Gradient Boosting / smote [Average Precision]: 0.27464739917508646
Gradient Boosting / smote [ROC AUC]: 0.6988177073831898
Gradient Boosting / smote [F1]: 0.4735758407687028


###PCA Dataset

In [6]:
gb_pca_pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier())

for key in pca_data.keys():
    gb_pca_pipe.fit(pca_data[key][0], pca_data[key][1])
    y_pred=gb_pca_pipe.predict(pca_data[key][2])
    print("Gradient Boosting with PCA / {} [Accuracy]: {}".format(key, accuracy_score(pca_data[key][3], y_pred)))
    print("Gradient Boosting with PCA / {} [Average Precision]: {}".format(key, average_precision_score(pca_data[key][3], y_pred)))
    print("Gradient Boosting with PCA / {} [ROC AUC]: {}".format(key, roc_auc_score(pca_data[key][3], y_pred)))
    print("Gradient Boosting with PCA / {} [F1]: {}".format(key, f1_score(pca_data[key][3], y_pred)))

Gradient Boosting with PCA / pca95 [Accuracy]: 0.9252030747424086
Gradient Boosting with PCA / pca95 [Average Precision]: 0.2378708463591891
Gradient Boosting with PCA / pca95 [ROC AUC]: 0.6229921473230641
Gradient Boosting with PCA / pca95 [F1]: 0.3729433272394881
Gradient Boosting with PCA / us_pca95 [Accuracy]: 0.9105925966308673
Gradient Boosting with PCA / us_pca95 [Average Precision]: 0.08812779697877184
Gradient Boosting with PCA / us_pca95 [ROC AUC]: 0.5045920294236044
Gradient Boosting with PCA / us_pca95 [F1]: 0.026128266033254157




ValueError: ignored

##Hyperparameter Tuning

###Original Dataset

In [3]:
gb_params = {
    'gradientboostingclassifier__max_depth': [10, 50, 70],
    'gradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'gradientboostingclassifier__n_estimators': [50, 100, 250],
    'gradientboostingclassifier__learning_rate': [0.01,0.1,1,10]
}

gb_pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier())
gb_grid = RandomizedSearchCV(gb_pipe, gb_params, n_iter=3,cv = None, scoring='accuracy', refit=True, n_jobs = -1)

for key in data.keys():
    if key== ('original'):
      gb_grid.fit(data[key][0], data[key][1])
      y_pred=gb_grid.predict(data[key][2])
      print("Gradient Boosting / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], y_pred)))
      print("Gradient Boosting / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], y_pred)))
      print("Gradient Boosting / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], y_pred)))
      print("Gradient Boosting / {} [F1]: {}".format(key, f1_score(data[key][3], y_pred)))


Gradient Boosting / original [Accuracy]: 0.9196968870958949
Gradient Boosting / original [Average Precision]: 0.20314909628296596
Gradient Boosting / original [ROC AUC]: 0.6088241225008556
Gradient Boosting / original [F1]: 0.3337856173677069
