# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import plot_importance
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# Data

In [2]:
dev = pd.read_csv('Data/dev.csv')
X_dev = dev.drop(columns=['hospital_death'])
y_dev = dev['hospital_death']

dev_us = pd.read_csv('Data/dev_us.csv')
X_dev_us = dev_us.drop(columns=['hospital_death'])
y_dev_us = dev_us['hospital_death']

dev_os = pd.read_csv('Data/dev_os.csv')
X_dev_os = dev_os.drop(columns=['hospital_death'])
y_dev_os = dev_os['hospital_death']

dev_smote = pd.read_csv('Data/dev_smote.csv')
X_dev_smote = dev_smote.drop(columns=['hospital_death'])
y_dev_smote = dev_smote['hospital_death']

dev_pca95 = pd.read_pickle('Data/dev_pca95.pkl')
X_dev_pca95 = dev_pca95.drop(columns=['hospital_death'])
y_dev_pca95 = dev_pca95['hospital_death']

dev_us_pca95 = pd.read_pickle('Data/dev_us_pca95.pkl')
X_dev_us_pca95 = dev_us_pca95.drop(columns=['hospital_death'])
y_dev_us_pca95 = dev_us_pca95['hospital_death']

dev_os_pca95 = pd.read_pickle('Data/dev_os_pca95.pkl')
X_dev_os_pca95 = dev_os_pca95.drop(columns=['hospital_death'])
y_dev_os_pca95 = dev_os_pca95['hospital_death']

dev_smote_pca95 = pd.read_pickle('Data/dev_smote_pca95.pkl')
X_dev_smote_pca95 = dev_smote_pca95.drop(columns=['hospital_death'])
y_dev_smote_pca95 = dev_smote_pca95['hospital_death']

test = pd.read_csv("Data/test.csv")
X_test = test.drop(columns=['hospital_death'])
y_test = test['hospital_death']

test = pd.read_csv("Data/test_kmeans.csv")
X_test_kmeans = test.drop(columns=['hospital_death'])
y_test_kmeans = test['hospital_death']

test_pca95 = pd.read_pickle("Data/test_pca95.pkl")
X_test_pca95 = test_pca95.drop(columns=['hospital_death'])
y_test_pca95 = test_pca95['hospital_death']

test_us_pca95 = pd.read_pickle("Data/test_us_pca95.pkl")
X_test_us_pca95 = test_us_pca95.drop(columns=['hospital_death'])
y_test_us_pca95 = test_us_pca95['hospital_death']

test_os_pca95 = pd.read_pickle("Data/test_os_pca95.pkl")
X_test_os_pca95 = test_os_pca95.drop(columns=['hospital_death'])
y_test_os_pca95 = test_os_pca95['hospital_death']

test_smote_pca95 = pd.read_pickle("Data/test_smote_pca95.pkl")
X_test_smote_pca95 = test_smote_pca95.drop(columns=['hospital_death'])
y_test_smote_pca95 = test_smote_pca95['hospital_death']

data = {
    "original": (X_dev, y_dev, X_test, y_test),
    "us": (X_dev_us, y_dev_us, X_test, y_test),
    "os": (X_dev_os, y_dev_os, X_test, y_test),
    "smote": (X_dev_smote, y_dev_smote, X_test, y_test)
}

pca_data = {
    "pca95": (X_dev_pca95, y_dev_pca95, X_test_pca95, y_test_pca95),
    "us_pca95": (X_dev_us_pca95, y_dev_us_pca95, X_test_us_pca95, y_test_us_pca95),
    "os_pca95": (X_dev_os_pca95, y_dev_os_pca95, X_test_os_pca95, y_test_os_pca95),
    "smote_pca95": (X_dev_smote_pca95, y_dev_smote_pca95, X_test_smote_pca95, y_test_smote_pca95)
}


# HistGradient Boosting

##Default Settings

In [None]:
hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier())

for key in data.keys():
    hgb_pipe.fit(data[key][0], data[key][1])
    y_pred=hgb_pipe.predict(data[key][2])
    print("Histgradient Boosting / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], y_pred)))
    print("Histgradient Boosting / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], y_pred)))
    print("Histgradient Boosting / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], y_pred)))
    print("Histgradient Boosting / {} [F1]: {}".format(key, f1_score(data[key][3], y_pred)))

Histgradient Boosting / original [Accuracy]: 0.9297279616202366
Histgradient Boosting / original [Average Precision]: 0.2871380829602554
Histgradient Boosting / original [ROC AUC]: 0.6577888838298328
Histgradient Boosting / original [F1]: 0.44701844701844706
Histgradient Boosting / us [Accuracy]: 0.8103363680968216
Histgradient Boosting / us [Average Precision]: 0.2459358395703593
Histgradient Boosting / us [ROC AUC]: 0.8055419153686921
Histgradient Boosting / us [F1]: 0.4212277491266012
Histgradient Boosting / os [Accuracy]: 0.8369405222700758
Histgradient Boosting / os [Average Precision]: 0.26523049387709935
Histgradient Boosting / os [ROC AUC]: 0.8083734435235956
Histgradient Boosting / os [F1]: 0.45028487410402496
Histgradient Boosting / smote [Accuracy]: 0.9281469770484654
Histgradient Boosting / smote [Average Precision]: 0.29493170981878775
Histgradient Boosting / smote [ROC AUC]: 0.6772313641208724
Histgradient Boosting / smote [F1]: 0.47322142286171054


###PCA dataset

In [None]:
hgb_pca_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier())

for key in pca_data.keys():
    hgb_pca_pipe.fit(pca_data[key][0], pca_data[key][1])
    y_pred=hgb_pca_pipe.predict(pca_data[key][2])
    print("Histgradient Boosting with PCA / {} [Accuracy]: {}".format(key, accuracy_score(pca_data[key][3], y_pred)))
    print("Histgradient Boosting with PCA / {} [Average Precision]: {}".format(key, average_precision_score(pca_data[key][3], y_pred)))
    print("Histgradient Boosting with PCA / {} [ROC AUC]: {}".format(key, roc_auc_score(pca_data[key][3], y_pred)))
    print("Histgradient Boosting with PCA / {} [F1]: {}".format(key, f1_score(pca_data[key][3], y_pred)))

Histgradient Boosting with PCA / pca95 [Accuracy]: 0.9265659924766941
Histgradient Boosting with PCA / pca95 [Average Precision]: 0.2548639013800929
Histgradient Boosting with PCA / pca95 [ROC AUC]: 0.6363229842132322
Histgradient Boosting with PCA / pca95 [F1]: 0.40159928920479787
Histgradient Boosting with PCA / us_pca95 [Accuracy]: 0.9137000490650384
Histgradient Boosting with PCA / us_pca95 [Average Precision]: 0.08629995093496157
Histgradient Boosting with PCA / us_pca95 [ROC AUC]: 0.5
Histgradient Boosting with PCA / us_pca95 [F1]: 0.0




ValueError: ignored

##Hyperparameter Tuning

###Tuning all 4 datasets

In [None]:
hgb_params = {
    'histgradientboostingclassifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'histgradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'histgradientboostingclassifier__learning_rate': [0.01,0.1,1,10],
    'histgradientboostingclassifier__max_bins': [100, 150, 200, 255]
}

hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier())
hgb_grid= RandomizedSearchCV(hgb_pipe, hgb_params, n_iter = 50, cv = 2, scoring='accuracy', refit=True, n_jobs = -1)

for key in data.keys():
    hgb_grid.fit(data[key][0], data[key][1])
    y_pred=hgb_grid.predict(data[key][2])
    print("Histgradient Boosting / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], y_pred)))
    print("Histgradient Boosting / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], y_pred)))
    print("Histgradient Boosting / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], y_pred)))
    print("Histgradient Boosting / {} [F1]: {}".format(key, f1_score(data[key][3], y_pred)))

Histgradient Boosting / original [Accuracy]: 0.9294553780733795
Histgradient Boosting / original [Average Precision]: 0.28734214720573076
Histgradient Boosting / original [ROC AUC]: 0.6599279034249643
Histgradient Boosting / original [F1]: 0.4498299319727891
Histgradient Boosting / us [Accuracy]: 0.8103363680968216
Histgradient Boosting / us [Average Precision]: 0.24756494485229902
Histgradient Boosting / us [ROC AUC]: 0.8078300996416278
Histgradient Boosting / us [F1]: 0.42276422764227645
Histgradient Boosting / os [Accuracy]: 0.8415744425666467
Histgradient Boosting / os [Average Precision]: 0.21725292408360292
Histgradient Boosting / os [ROC AUC]: 0.7402615536193776
Histgradient Boosting / os [F1]: 0.402303578774167
Histgradient Boosting / smote [Accuracy]: 0.9278743935016083
Histgradient Boosting / smote [Average Precision]: 0.29481109920233084
Histgradient Boosting / smote [ROC AUC]: 0.6785123146136531
Histgradient Boosting / smote [F1]: 0.4743742550655543


#####Tuning with different scoring

In [None]:
hgb_params = {
    'histgradientboostingclassifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'histgradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'histgradientboostingclassifier__learning_rate': [0.01,0.1,1,10],
    'histgradientboostingclassifier__max_bins': [100, 150, 200, 255]
}

hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier())
hgb_grid_accuracy = RandomizedSearchCV(hgb_pipe, hgb_params, n_iter = 50, cv = 2, scoring='accuracy', refit=True, n_jobs = -1)
hgb_grid_ap = RandomizedSearchCV(hgb_pipe, hgb_params, n_iter = 50, cv = 2, scoring='average_precision', refit=True, n_jobs = -1)
hgb_grid_auc = RandomizedSearchCV(hgb_pipe, hgb_params, n_iter = 50, cv = 2, scoring='roc_auc', refit=True, n_jobs = -1)
hgb_grid_f1 = RandomizedSearchCV(hgb_pipe, hgb_params, n_iter = 50, cv = 2, scoring='f1', refit=True, n_jobs = -1)

for key in data.keys():
    hgb_grid_accuracy.fit(data[key][0], data[key][1])
    print("Histgradient Boosting / {} [Accuracy]: {}".format(key, hgb_grid_accuracy.score(data[key][2], data[key][3])))
    hgb_grid_ap.fit(data[key][0], data[key][1])
    print("Histgradient Boosting / {} [Average Precision]: {}".format(key, hgb_grid_ap.score(data[key][2], data[key][3])))
    hgb_grid_auc.fit(data[key][0], data[key][1])
    print("Histgradient Boosting / {} [ROC AUC]: {}".format(key, hgb_grid_auc.score(data[key][2], data[key][3])))
    hgb_grid_f1.fit(data[key][0], data[key][1])
    print("Histgradient Boosting / {} [F1]: {}".format(key, hgb_grid_f1.score(data[key][2], data[key][3])))


Histgradient Boosting / original [Accuracy]: 0.9284195605953225
Histgradient Boosting / original [Average Precision]: 0.5500386574240925
Histgradient Boosting / original [ROC AUC]: 0.8930389565747041
Histgradient Boosting / original [F1]: 0.4453746337379657
Histgradient Boosting / us [Accuracy]: 0.8075014992095078
Histgradient Boosting / us [Average Precision]: 0.5396137523235867
Histgradient Boosting / us [ROC AUC]: 0.890015031427292
Histgradient Boosting / us [F1]: 0.4197080291970803
Histgradient Boosting / os [Accuracy]: 0.8391757073543041
Histgradient Boosting / os [Average Precision]: 0.5402645394611757
Histgradient Boosting / os [ROC AUC]: 0.895087308922215
Histgradient Boosting / os [F1]: 0.4021121039805036
Histgradient Boosting / smote [Accuracy]: 0.927056642861037
Histgradient Boosting / smote [Average Precision]: 0.5396141268502195
Histgradient Boosting / smote [ROC AUC]: 0.8909065518629472
Histgradient Boosting / smote [F1]: 0.4814090019569472


In [None]:
for key in data.keys():
  print("Histgradient Boosting / {} [Accuracy]: {}".format(key, hgb_grid_accuracy.best_params_))
  print("Histgradient Boosting / {} [Average Precision]: {}".format(key, hgb_grid_ap.best_params_))
  print("Histgradient Boosting / {} [ROC AUC]: {}".format(key, hgb_grid_auc.best_params_))
  print("Histgradient Boosting / {} [F1]: {}".format(key, hgb_grid_f1.best_params_))

for key in data.keys():
  print("Histgradient Boosting / {} : {}".format(key, hgb_grid.best_params_))


Histgradient Boosting / original [Accuracy]: {'histgradientboostingclassifier__min_samples_leaf': 1, 'histgradientboostingclassifier__max_depth': 10, 'histgradientboostingclassifier__max_bins': 200, 'histgradientboostingclassifier__learning_rate': 0.1}
Histgradient Boosting / original [Average Precision]: {'histgradientboostingclassifier__min_samples_leaf': 2, 'histgradientboostingclassifier__max_depth': 10, 'histgradientboostingclassifier__max_bins': 255, 'histgradientboostingclassifier__learning_rate': 0.1}
Histgradient Boosting / original [ROC AUC]: {'histgradientboostingclassifier__min_samples_leaf': 4, 'histgradientboostingclassifier__max_depth': 50, 'histgradientboostingclassifier__max_bins': 255, 'histgradientboostingclassifier__learning_rate': 0.1}
Histgradient Boosting / original [F1]: {'histgradientboostingclassifier__min_samples_leaf': 2, 'histgradientboostingclassifier__max_depth': 10, 'histgradientboostingclassifier__max_bins': 255, 'histgradientboostingclassifier__learnin

###Tuning Original

In [4]:
hgb_params = {
    'histgradientboostingclassifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'histgradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'histgradientboostingclassifier__learning_rate': [0.01,0.1,1,10],
    'histgradientboostingclassifier__max_bins': [100, 150, 200, 255]
}

original_hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier())
original_hgb_grid= RandomizedSearchCV(original_hgb_pipe, hgb_params, n_iter = 100, cv = None, scoring='accuracy', refit=True, n_jobs = -1)

for key in data.keys():
  if key=='original':
      original_hgb_grid.fit(data[key][0], data[key][1])
      original_y_pred=original_hgb_grid.predict(data[key][2])
      print("Histgradient Boosting / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], original_y_pred)))
      print("Histgradient Boosting / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], original_y_pred)))
      print("Histgradient Boosting / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], original_y_pred)))
      print("Histgradient Boosting / {} [F1]: {}".format(key, f1_score(data[key][3], original_y_pred)))

Histgradient Boosting / original [Accuracy]: 0.9292918279452652
Histgradient Boosting / original [Average Precision]: 0.286013297707298
Histgradient Boosting / original [ROC AUC]: 0.6592663585500478
Histgradient Boosting / original [F1]: 0.44831986388770734


In [5]:
for key in data.keys():
  if key=='original':
    print("Histgradient Boosting / {} : {}".format(key, original_hgb_grid.best_params_))

Histgradient Boosting / original : {'histgradientboostingclassifier__min_samples_leaf': 4, 'histgradientboostingclassifier__max_depth': 90, 'histgradientboostingclassifier__max_bins': 200, 'histgradientboostingclassifier__learning_rate': 0.1}


###Tuning original using F1 scoring

In [3]:
hgb_params = {
    'histgradientboostingclassifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'histgradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'histgradientboostingclassifier__learning_rate': [0.01,0.1,1,10],
    'histgradientboostingclassifier__max_bins': [100, 150, 200, 255]
}

original_hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier())
original_hgb_grid= RandomizedSearchCV(original_hgb_pipe, hgb_params, n_iter = 50, cv = None, scoring='f1', refit=True, n_jobs = -1)

for key in data.keys():
  if key=='original':
      original_hgb_grid.fit(data[key][0], data[key][1])
      original_y_pred=original_hgb_grid.predict(data[key][2])
      print("Histgradient Boosting / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], original_y_pred)))
      print("Histgradient Boosting / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], original_y_pred)))
      print("Histgradient Boosting / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], original_y_pred)))
      print("Histgradient Boosting / {} [F1]: {}".format(key, f1_score(data[key][3], original_y_pred)))

Histgradient Boosting / original [Accuracy]: 0.9295644114921223
Histgradient Boosting / original [Average Precision]: 0.28859025679411854
Histgradient Boosting / original [ROC AUC]: 0.6608456383984369
Histgradient Boosting / original [F1]: 0.4516129032258065


In [4]:
for key in data.keys():
  if key=='original':
    print("Histgradient Boosting / {} : {}".format(key, original_hgb_grid.best_params_))

Histgradient Boosting / original : {'histgradientboostingclassifier__min_samples_leaf': 4, 'histgradientboostingclassifier__max_depth': 10, 'histgradientboostingclassifier__max_bins': 150, 'histgradientboostingclassifier__learning_rate': 0.1}


###Tuning Smote

In [6]:
smote_hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier())
smote_hgb_grid= RandomizedSearchCV(smote_hgb_pipe, hgb_params, n_iter = 100, cv = None, scoring='average', refit=True, n_jobs = -1)

for key in data.keys():
  if key=='smote':
      smote_hgb_grid.fit(data[key][0], data[key][1])
      smote_y_pred=smote_hgb_grid.predict(data[key][2])
      print("Histgradient Boosting / {} [Accuracy]: {}".format(key, accuracy_score(data[key][3], smote_y_pred)))
      print("Histgradient Boosting / {} [Average Precision]: {}".format(key, average_precision_score(data[key][3], smote_y_pred)))
      print("Histgradient Boosting / {} [ROC AUC]: {}".format(key, roc_auc_score(data[key][3], smote_y_pred)))
      print("Histgradient Boosting / {} [F1]: {}".format(key, f1_score(data[key][3], smote_y_pred)))

Histgradient Boosting / smote [Accuracy]: 0.9276018099547512
Histgradient Boosting / smote [Average Precision]: 0.2971090133590154
Histgradient Boosting / smote [ROC AUC]: 0.6826534954476033
Histgradient Boosting / smote [F1]: 0.47962382445141066


In [7]:
for key in data.keys():
  if key=='smote':
    print("Histgradient Boosting / {} : {}".format(key, smote_hgb_grid.best_params_))

Histgradient Boosting / smote : {'histgradientboostingclassifier__min_samples_leaf': 2, 'histgradientboostingclassifier__max_depth': 10, 'histgradientboostingclassifier__max_bins': 255, 'histgradientboostingclassifier__learning_rate': 0.1}


###Training using optimized hyperparameters

In [None]:
original_hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier(original_hgb_grid.best_params_))
original_hgb_grid.fit(data['original'][0], data['original'][1])
original_y_pred=original_hgb_grid.predict(data['original'][2])

print("Histgradient Boosting / {} [Accuracy]: {}".format('original', accuracy_score(data['original'][3], original_y_pred)))
print("Histgradient Boosting / {} [Average Precision]: {}".format('original', average_precision_score(data['original'][3], original_y_pred)))
print("Histgradient Boosting / {} [ROC AUC]: {}".format('original', roc_auc_score(data['original'][3], original_y_pred)))
print("Histgradient Boosting / {} [F1]: {}".format('original', f1_score(data['original'][3], original_y_pred)))

Histgradient Boosting / original [Accuracy]: 0.9285831107234367
Histgradient Boosting / original [Average Precision]: 0.28087791639359233
Histgradient Boosting / original [ROC AUC]: 0.6571623921830547
Histgradient Boosting / original [F1]: 0.44302721088435376


In [None]:
smote_hgb_pipe = make_pipeline(StandardScaler(), HistGradientBoostingClassifier(smote_hgb_grid.best_params_))
smote_hgb_grid.fit(data['smote'][0], data['smote'][1])
smote_y_pred=smote_hgb_grid.predict(data['smote'][2])

print("Histgradient Boosting / {} [Accuracy]: {}".format('smote' , accuracy_score(data['smote'][3], smote_y_pred)))
print("Histgradient Boosting / {} [Average Precision]: {}".format('smote', average_precision_score(data['smote'][3], smote_y_pred)))
print("Histgradient Boosting / {} [ROC AUC]: {}".format('smote', roc_auc_score(data['smote'][3], smote_y_pred)))
print("Histgradient Boosting / {} [F1]: {}".format('smote', f1_score(data['smote'][3], smote_y_pred)))

Histgradient Boosting / smote [Accuracy]: 0.9274927765360083
Histgradient Boosting / smote [Average Precision]: 0.29349510398205947
Histgradient Boosting / smote [ROC AUC]: 0.6788755301329611
Histgradient Boosting / smote [F1]: 0.47389240506329117
