# gonna use XGB now to combine the categorical, continuous and pre-embedded free text stuff

#### Online, we import the usual packages. **xgboost** needs to be installed (with conda install xgboost or pip install -U xgboost).

In [6]:
#!pip install xgboost

Collecting xgboost
  Using cached https://files.pythonhosted.org/packages/96/84/4e2cae6247f397f83d8adc5c2a2a0c5d7d790a14a4c7400ff6574586f589/xgboost-0.90.tar.gz
Building wheels for collected packages: xgboost
  Running setup.py bdist_wheel for xgboost ... [?25ldone
[?25h  Stored in directory: /Users/jjaskolkambp/Library/Caches/pip/wheels/e9/48/4d/de4187b5270dff71d3697c5a7857a1e2d9a0c63a28b3462eeb
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.90
[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


####  the usual imports

In [1]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics
from collections import Counter
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import random as rn
from IPython.display import clear_output

### load helper functions

In [2]:
def thresholding_analysis(preds, targets, admission_thresholds = [0.01, 0.05, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]):

    i = 0
    for thresh in admission_thresholds:
        thresholded_predictions = [0 if prob[0] >= thresh else 1 for prob in preds]
        
        f1_w, f1, acc, prec, rec,= get_metrics(targets,thresholded_predictions, print_output = False)

        if i == 0:
            output_df = pd.DataFrame([thresh, f1_w, f1, acc, prec, rec, rec[0], rec[1]]).T
            output_df.columns = ['admission_threshold', 'weighted f1', 'f1', 'accuracy', 'precision', 'recall', 'admission sensitivity', 'admission specificity']
        else:
            output_df.loc[len(output_df)] = [thresh, f1_w, f1, acc, prec, rec, rec[0], rec[1]]
        i+=1
    
    return output_df

In [3]:
def show_confusion_matrix(targets, predictions, labels):
    LABELS = labels

    confusion_matrix = sklearn.metrics.confusion_matrix(targets, predictions)

    plt.figure(figsize=(3, 3))
    sns.heatmap(confusion_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d", annot_kws={"size": 15});
    plt.title("Confusion matrix", fontsize=10)
    plt.ylabel('True label', fontsize=10)
    plt.xlabel('Predicted label', fontsize=10)
    plt.show()

In [4]:
def get_metrics(y_test, preds, print_output = True):
    f1_w = sklearn.metrics.f1_score(y_test, preds, average='weighted')
    f1 = sklearn.metrics.f1_score(y_test, preds, average=None)
    acc = sklearn.metrics.accuracy_score(y_test, preds)
    prec = sklearn.metrics.precision_score(y_test,preds, average=None) 
    rec = sklearn.metrics.recall_score(y_test,preds, average=None)
    
    if print_output:
        print ('weighted f1: ', f1_w)
        print ('f1:          ', f1)
        print ('accuracy:    ', acc)
        print ('precision:   ', prec)
        print ('recall:      ', rec)
        print ('admission sens: ', rec[0])
        print ('admission spec: ', rec[1])
            
    return f1_w, f1, acc, prec, rec


In [49]:
def JJ_gridsearch(weights, print_output = False):
    epoch = 0
    for weight in weights:
        
        xgc = xgb.XGBClassifier(scale_pos_weight=weight)
        xgc.fit(X_train, y_train)
        preds = xgc.predict(X_test)
        probas = xgc.predict_proba(X_test)
        f1_w, f1, acc, prec, rec = get_metrics(y_test, preds, print_output)
        
        if epoch == 0:
            results_df = pd.DataFrame([epoch+1, 1/weight, f1_w, f1, acc, prec, rec, rec[0], rec[1]]).T
            results_df.columns = ['trial number', 'class penalty', 'weighted f1', 'f1', 'accuracy', 'precision', 'recall', 'admission sensitivity', 'admission specificity']
        else:
            results_df.loc[len(results_df)] = [epoch+1, 1/weight, f1_w, f1, acc, prec, rec, rec[0], rec[1]]
        
        epoch +=1
        
    return xgc, results_df, preds, probas

In [6]:
#use this cell when working from home
path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage'
data_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/data/ED triage project/combo'
model_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage/models'

In [12]:
clin = pd.read_csv(data_path + '/complete_clean_combo_data.csv', index_col = 0,low_memory = False)

In [13]:
dx_code_dict = {code:i for i,code in enumerate(set(clin['MainDiagnosisCode']))}

def convert_dxcode(s):
    code = dx_code_dict[s]
    return (code)

clin['recoded_diagnosis'] = clin['MainDiagnosisCode'].map(convert_dxcode)

In [10]:
medhx = np.load(data_path + '/medhx_embeds.npy')

#this is the admit vs d/c target
target = np.load(data_path + '/admit_dc_target.npy')

subjnotes = np.load(data_path + '/subj_emeds.npy')


In [15]:
#this cell will define the various groupings of variables from the dataframe
dx_vars = ['recoded_diagnosis', 'PresentingComplaint']

cont_vars = [ 'Triage Date & TimeYear', 'Triage Date & TimeMonth', 'Triage Date & TimeWeek', 'Triage Date & TimeDay',
 'Triage Date & TimeDayofweek', 'Triage Date & TimeDayofyear', 'Triage Date & TimeHour', 'Triage Date & TimeMinute',
 'Triage Date & TimeSecond', 'Triage Date & TimeElapsed',
 'num_comorbids','systolic', 'diastolic', 'o2sat', 'pulse', 'temp', 'AgeInYrs']


cos_date_vars = ['Triage Date & Timeweekday_cos',
       'Triage Date & Timeweekday_sin', 'Triage Date & Timeday_month_cos',
       'Triage Date & Timeday_month_sin', 'Triage Date & Timemonth_year_cos',
       'Triage Date & Timemonth_year_sin', 'Triage Date & Timeday_year_cos',
       'Triage Date & Timeday_year_sin', 'Triage Date & Timehour_cos',
       'Triage Date & Timehour_sin', 'Triage Date & Timeclock_cos',
       'Triage Date & Timeclock_sin', 'Triage Date & Timemin_cos',
       'Triage Date & Timemin_sin', 'Triage Date & Timesec_cos']

cat_vars = ['Triage Date & TimeIs_month_end',
 'Triage Date & TimeIs_month_start',
 'Triage Date & TimeIs_quarter_end',
 'Triage Date & TimeIs_quarter_start',
 'Triage Date & TimeIs_year_end',
 'Triage Date & TimeIs_year_start',
 'GenderDesc']

inf_control_vars = ['Are you feeling feverish or have had shakes or chills in the last 24 hours?',
 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?',
 'Do you have a new Rash?',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?',
 'Have you travelled outside of Canada/USA in the last 3 weeks?',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?',
 'Have you received Health Care in another country in the last 2 years?',
 'Do you have a new/worse cough or shortness of breath?',
 'If so, select all countries that apply',
 'If so, select all infectious diseases that apply']

In [9]:
#sanity checking output variables
Counter(target)

Counter({1.0: 149218, 0.0: 16215})

### preprocessing variables to use in XGB

In [19]:
#need to preprocess cat vars for xgb
X = clin[cat_vars + inf_control_vars].values.astype(str)

features = []
for i in range(0, X.shape[1]):
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:,i])
    features.append(feature)
encoded_x = np.array(features)
encoded_x = encoded_x.reshape(X.shape[0], X.shape[1])

In [20]:
encoded_x.shape

(165433, 17)

In [27]:
X2 = clin[dx_vars].values.astype(str)
encoded_x2 = None
for i in range(0, X2.shape[1]):
	label_encoder = LabelEncoder()
	feature = label_encoder.fit_transform(X2[:,i])
	feature = feature.reshape(X2.shape[0], 1)
	onehot_encoder = OneHotEncoder(sparse=False)
	feature = onehot_encoder.fit_transform(feature)
	if encoded_x2 is None:
		encoded_x2 = feature
	else:
		encoded_x2 = np.concatenate((encoded_x2, feature), axis=1)
print("X shape: : ", encoded_x2.shape)

X shape: :  (165433, 3702)


In [38]:
#splitting this one hot encoded matrix into one for the presenting complaint and one for the medical history
X_pres = encoded_x2[:,:169]

X_dx = encoded_x2[:,169:]

### gonna do this first pass with just tabular type variables

In [122]:
features = np.concatenate((encoded_x,X_pres, X_dx, clin[cont_vars].values), axis =1)
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1)

xgc = xgb.XGBClassifier(scale_pos_weight = 1/9)
%time xgc.fit(X_train, y_train)

preds = xgc.predict(X_test)
predictions = xgc.predict_proba(X_test)

f1_w, f1, acc, prec, rec = get_metrics(y_test, preds, print_output=True)

thresholding_analysis(predictions, y_test)

CPU times: user 11min 7s, sys: 10.7 s, total: 11min 17s
Wall time: 11min 23s


  if diff:


weighted f1:  0.825825525531334
f1:           [0.40011    0.87200023]
accuracy:     0.7890181097221887
precision:    [0.27716925 0.96315619]
recall:       [0.71905115 0.79660699]
admission sens:  0.7190511489992587
admission spec:  0.7966069897084048


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,admission_threshold,weighted f1,f1,accuracy,precision,recall,admission sensitivity,admission specificity
0,0.01,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
1,0.05,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
2,0.1,0.0355414,"[0.1797428438187027, 0.019900761535808102]",0.106917,"[0.09874585203982042, 1.0]","[1.0, 0.010050385934819897]",1.0,0.0100504
3,0.2,0.239234,"[0.20033322557381944, 0.243453711328079]",0.222491,"[0.11137532489078139, 0.9963412285769305]","[0.9953051643192489, 0.13866852487135506]",0.995305,0.138669
4,0.3,0.550023,"[0.2571572411478436, 0.5817886363206923]",0.464857,"[0.14878825539847756, 0.9861636025879188]","[0.9466271312083024, 0.4126018439108062]",0.946627,0.412602
5,0.4,0.732108,"[0.3310391298069981, 0.7756090473192231]",0.663943,"[0.20555887627017333, 0.9753136546347801]","[0.8497652582159625, 0.6437875214408233]",0.849765,0.643788
6,0.5,0.825826,"[0.40010999587515467, 0.8720002347004635]",0.789018,"[0.27716925421468713, 0.9631561892417368]","[0.7190511489992587, 0.7966069897084048]",0.719051,0.796607
7,0.6,0.876093,"[0.45319910947633335, 0.9219611256164781]",0.863415,"[0.372533418204965, 0.9513613684960799]","[0.5784531751914999, 0.8943235420240138]",0.578453,0.894324
8,0.7,0.899054,"[0.4527260179434093, 0.94746465623468]",0.904132,"[0.5128205128205128, 0.9369251329891775]","[0.4052384482332592, 0.9582439965694682]",0.405238,0.958244
9,0.8,0.893984,"[0.3373358348968105, 0.9543598490722076]",0.914601,"[0.700701480904131, 0.9214492464317796]","[0.22213985668396344, 0.9897084048027445]",0.22214,0.989708


### now gonna repeat the process with the embedded subjective notes

In [123]:
#X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(subjnotes, target, random_state=1)

xgc2 = xgb.XGBClassifier(scale_pos_weight = 1/9)
%time xgc2.fit(X_train2, y_train2)

preds2 = xgc2.predict(X_test2)
predictions2 = xgc2.predict_proba(X_test2)

f1_w, f1, acc, prec, rec = get_metrics(y_test2, preds2, print_output=True)

thresholding_analysis(predictions2, y_test)

CPU times: user 10min 2s, sys: 1.93 s, total: 10min 4s
Wall time: 10min 7s


  if diff:


weighted f1:  0.7358339767372062
f1:           [0.28169458 0.78509165]
accuracy:     0.669165115210716
precision:    [0.17884282 0.94824904]
recall:       [0.66296022 0.66983812]
admission sens:  0.662960217445021
admission spec:  0.6698381217838765


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,admission_threshold,weighted f1,f1,accuracy,precision,recall,admission sensitivity,admission specificity
0,0.01,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
1,0.05,0.0187091,"[0.17836051123843102, 0.0013926830574749584]",0.0984792,"[0.09791207993612852, 1.0]","[1.0, 0.0006968267581475129]",1.0,0.000696827
2,0.1,0.0499213,"[0.18067978533094814, 0.035738723090688984]",0.114099,"[0.09932879305681902, 0.9897959183673469]","[0.9982703236965653, 0.018197898799313893]",0.99827,0.0181979
3,0.2,0.200174,"[0.1940120923682102, 0.2008426628144938]",0.197442,"[0.10757755277897459, 0.9876864788065356]","[0.9871509760316284, 0.11178709262435678]",0.987151,0.111787
4,0.3,0.370156,"[0.21283177286160135, 0.3872202274730709]",0.310888,"[0.11980721393034825, 0.9789107511686053]","[0.9520632567333828, 0.24134326758147512]",0.952063,0.241343
5,0.4,0.559031,"[0.2428121648271806, 0.5933289974914058]",0.470853,"[0.1411731573865465, 0.9673998666909047]","[0.8670620212503088, 0.4278784305317324]",0.867062,0.427878
6,0.5,0.735834,"[0.2816945771431571, 0.7850916458559111]",0.669165,"[0.17884282095720572, 0.9482490420002276]","[0.662960217445021, 0.6698381217838765]",0.66296,0.669838
7,0.6,0.836772,"[0.2957351290684624, 0.8954544192374976]",0.817936,"[0.23792325056433408, 0.9289623782911793]","[0.3906597479614529, 0.8642795883361921]",0.39066,0.86428
8,0.7,0.864684,"[0.21349029589564109, 0.9353150512874189]",0.880461,"[0.2996873604287628, 0.9137014314928426]","[0.16580182851494935, 0.9579759862778731]",0.165802,0.957976
9,0.8,0.860009,"[0.05535224153705398, 0.9472851198529599]",0.900143,"[0.3723076923076923, 0.9043232441390067]","[0.029898690387941684, 0.994532590051458]",0.0298987,0.994533


###  now gonna do this with just the embedded medical history column

In [124]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(medhx, target, random_state=1)

xgc3 = xgb.XGBClassifier(scale_pos_weight = 1/9)
%time xgc3.fit(X_train3, y_train3)

preds3 = xgc3.predict(X_test3)
predictions3 = xgc3.predict_proba(X_test3)

f1_w, f1, acc, prec, rec = get_metrics(y_test3, preds3, print_output=True)

thresholding_analysis(predictions3, y_test)

CPU times: user 5min 7s, sys: 1.45 s, total: 5min 9s
Wall time: 5min 10s


  if diff:


weighted f1:  0.7382562301967925
f1:           [0.2735758  0.78865722]
accuracy:     0.6725742885466283
precision:    [0.17471737 0.94406666]
recall:       [0.63009637 0.6771816 ]
admission sens:  0.6300963676797627
admission spec:  0.6771816037735849


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,admission_threshold,weighted f1,f1,accuracy,precision,recall,admission sensitivity,admission specificity
0,0.01,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
1,0.05,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
2,0.1,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
3,0.2,0.01793,"[0.17829764736981232, 0.0005358769626493757]",0.0980923,"[0.09787419284625988, 1.0]","[1.0, 0.00026801029159519727]",1.0,0.00026801
4,0.3,0.586613,"[0.23719123359964817, 0.6245129167268004]",0.496748,"[0.13924867679332156, 0.955242825607064]","[0.7996046454163578, 0.46389901372212694]",0.799605,0.463899
5,0.4,0.646507,"[0.2509383378016086, 0.6894122073435603]",0.560894,"[0.15060897118526587, 0.9525069703700203]","[0.7516679021497406, 0.5402015437392796]",0.751668,0.540202
6,0.5,0.738256,"[0.27357579658834885, 0.7886572195517821]",0.672574,"[0.1747173689619733, 0.9440666567030339]","[0.6300963676797627, 0.6771816037735849]",0.630096,0.677182
7,0.6,0.817183,"[0.28613091380427735, 0.8747833006508086]",0.786939,"[0.21284801735567072, 0.9310084084447402]","[0.4363726216950828, 0.8249624785591767]",0.436373,0.824962
8,0.7,0.860283,"[0.2519536513069253, 0.9262643433914153]",0.865761,"[0.277037037037037, 0.9180707666385847]","[0.231035334815913, 0.9346054888507719]",0.231035,0.934605
9,0.8,0.861893,"[0.08625921109666233, 0.9460206903615692]",0.898063,"[0.3509700176366843, 0.9056677779956854]","[0.049172226340499135, 0.9901372212692967]",0.0491722,0.990137


### this now the entire dataset combined

In [125]:
all_features = np.concatenate((features,subjnotes,medhx), axis = 1)
all_features.shape

(165433, 5272)

In [126]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(all_features, target, random_state=1)

xgc4 = xgb.XGBClassifier(scale_pos_weight = 1/9)
%time xgc4.fit(X_train4, y_train4)

preds4 = xgc4.predict(X_test4)
predictions4 = xgc4.predict_proba(X_test4)

f1_w, f1, acc, prec, rec = get_metrics(y_test4, preds4, print_output=True)

thresholding_analysis(predictions4, y_test)

CPU times: user 26min 20s, sys: 44.8 s, total: 27min 4s
Wall time: 27min 35s


  if diff:


weighted f1:  0.8218600615566528
f1:           [0.40026738 0.86758759]
accuracy:     0.783070190285065
precision:    [0.27435169 0.96541418]
recall:       [0.73980726 0.78776265]
admission sens:  0.7398072646404744
admission spec:  0.7877626500857633


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,admission_threshold,weighted f1,f1,accuracy,precision,recall,admission sensitivity,admission specificity
0,0.01,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
1,0.05,0.0181735,"[0.1783172875680201, 0.0008037077718541539]",0.0982132,"[0.0978860294117647, 1.0]","[1.0, 0.0004020154373927959]",1.0,0.000402015
2,0.1,0.0724242,"[0.18282097649186257, 0.060450127345496124]",0.125898,"[0.10061453486925584, 0.9974271012006861]","[0.9992587101556709, 0.03116959691252144]",0.999259,0.0311696
3,0.2,0.360148,"[0.21757960395500234, 0.3756113465927617]",0.305472,"[0.1222678013837017, 0.9939031404578397]","[0.986903879416852, 0.23156089193825044]",0.986904,0.231561
4,0.3,0.591511,"[0.27239377426363215, 0.626123240789883]",0.506057,"[0.1591344153141906, 0.987131398234174]","[0.9448974549048678, 0.4584584048027444]",0.944897,0.458458
5,0.4,0.734796,"[0.3346871675853399, 0.7781933071119994]",0.667303,"[0.20805530507965134, 0.9762983336029769]","[0.8552013837410427, 0.6469232418524872]",0.855201,0.646923
6,0.5,0.82186,"[0.40026737967914433, 0.8675875911331503]",0.78307,"[0.27435169064418585, 0.9654141759180188]","[0.7398072646404744, 0.7877626500857633]",0.739807,0.787763
7,0.6,0.873085,"[0.4515282205137821, 0.9188086382057403]",0.858556,"[0.36380117842574405, 0.9528209556706966]","[0.5950086483815171, 0.8871408662092625]",0.595009,0.887141
8,0.7,0.897826,"[0.45442359249329756, 0.9459193706981318]",0.901593,"[0.4966305303252271, 0.938017182311706]","[0.41882876204596, 0.9539558319039451]",0.418829,0.953956
9,0.8,0.895266,"[0.35051546391752575, 0.9543513702352302]",0.914698,"[0.6873646209386282, 0.9225746735377995]","[0.23523597726711143, 0.988395154373928]",0.235236,0.988395


### manual ensembling

remember that admit is class 0 and discharge is class 1

In [131]:
combined_predictions = (predictions + predictions2 + predictions3)/3

In [132]:
thresholding_analysis(combined_predictions, y_test)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,admission_threshold,weighted f1,f1,accuracy,precision,recall,admission sensitivity,admission specificity
0,0.01,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
1,0.05,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
2,0.1,0.0174427,"[0.17825837994978638, 0.0]",0.0978505,"[0.09785052830097439, 0.0]","[1.0, 0.0]",1.0,0.0
3,0.2,0.0632888,"[0.18196547921776898, 0.050416655782241845]",0.121086,"[0.10009903441445903, 0.9958720330237358]","[0.9990116135408945, 0.025862993138936537]",0.999012,0.025863
4,0.3,0.343016,"[0.21498563719831404, 0.356902505003409]",0.292995,"[0.12059514487079091, 0.994728454088513]","[0.9893748455646157, 0.21746355060034306]",0.989375,0.217464
5,0.4,0.651932,"[0.29079172809499143, 0.6911022023220702]",0.569646,"[0.17334916864608077, 0.9804027770938992]","[0.9016555473190018, 0.5336352915951973]",0.901656,0.533635
6,0.5,0.81228,"[0.38223436077513334, 0.8589248589248589]",0.770304,"[0.2593769305445239, 0.9631011056347409]","[0.7262169508277737, 0.7750857632933105]",0.726217,0.775086
7,0.6,0.88188,"[0.42543064369900274, 0.9313882047256882]",0.877415,"[0.3929244295583002, 0.9406812093379258]","[0.4638003459352607, 0.9222770154373928]",0.4638,0.922277
8,0.7,0.884325,"[0.27220739611132294, 0.9507176786451881]",0.907686,"[0.5954962468723937, 0.9170069721115538]","[0.17642698295033357, 0.9870015008576329]",0.176427,0.987002
9,0.8,0.85986,"[0.036214389183969097, 0.9491956831602525]",0.903479,"[0.7894736842105263, 0.9037417603722373]","[0.018532246108228317, 0.9994639794168096]",0.0185322,0.999464


### now gonna try to save these models

In [133]:
import pickle

In [134]:
pickle.dump(xgc, open(data_path + "/tabular_xgb_model.pkl", "wb"))
pickle.dump(xgc2, open(data_path + "/subjnotes_xgb_model.pkl", "wb"))
pickle.dump(xgc3, open(data_path + "/med_hx_xgb_model.pkl", "wb"))
pickle.dump(xgc4, open(data_path + "/combined_xgb_model.pkl", "wb"))

In [188]:
# for reloading later
with open(data_path + "/tabular_xgb.pkl", mode = 'rb') as pkl:
    xgc3 = pickle.load(pkl)