Returning to the XGBoost approach after some time spent working on neural-network approaches.

#### First, we import the usual packages. **xgboost** needs to be installed (with `conda install xgboost` or `pip install -U xgboost`).

In [None]:
# Uncomment the next line to install xgboost from inside the notebook.
#!pip install xgboost

#### now the usual imports

In [1]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics
from collections import Counter
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import sys
import random as rn

from IPython.display import clear_output

### load helper functions

In [3]:
def thresholding_analysis(preds, targets, admission_thresholds=None):
    """Tabulate classification metrics across a sweep of admission thresholds.

    For every threshold, a row is labelled class 0 when element ``[0]`` of its
    score row is at least the threshold and class 1 otherwise. The resulting
    labels are scored against ``targets`` with ``get_metrics``.

    Parameters
    ----------
    preds : iterable of indexable rows
        Per-row class scores; only element ``[0]`` of each row is read.
        Presumably the output of ``predict_proba`` with class 0 in the first
        column -- confirm against the caller.
    targets : array-like
        True labels, passed unchanged to ``get_metrics``.
    admission_thresholds : iterable of float, optional
        Cut-offs to evaluate. ``None`` (the default) uses the original fixed
        grid 0.01, 0.05, 0.1, 0.2, ..., 0.9.

    Returns
    -------
    pandas.DataFrame
        One row per threshold. ``'admission sensitivity'`` and
        ``'admission specificity'`` are elements 0 and 1 of the recall value
        returned by ``get_metrics``. The frame is empty (but has all columns)
        when no thresholds are given.
    """
    if admission_thresholds is None:
        admission_thresholds = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    columns = ['admission_threshold', 'weighted f1', 'f1', 'accuracy', 'precision',
               'recall', 'admission sensitivity', 'admission specificity']

    # The class-0 score does not depend on the threshold, so pull it out once.
    admit_scores = [prob[0] for prob in preds]

    rows = []
    for thresh in admission_thresholds:
        thresholded_predictions = [0 if score >= thresh else 1 for score in admit_scores]
        f1_w, f1, acc, prec, rec = get_metrics(targets, thresholded_predictions, print_output=False)
        rows.append([thresh, f1_w, f1, acc, prec, rec, rec[0], rec[1]])

    # Build the frame once instead of growing it row by row, so scalar columns
    # get numeric dtypes and an empty sweep still returns a well-formed frame.
    return pd.DataFrame(rows, columns=columns)

In [4]:
def show_confusion_matrix(targets, predictions, labels):
    """Plot the confusion matrix of ``predictions`` against ``targets``.

    The matrix is computed with ``sklearn.metrics.confusion_matrix`` and drawn
    as an annotated seaborn heatmap on a 3x3-inch figure. ``labels`` is used
    only as the tick labels on both axes. The figure is shown immediately and
    nothing is returned.
    """
    cm = sklearn.metrics.confusion_matrix(targets, predictions)

    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        annot=True,
        fmt="d",
        annot_kws={"size": 15},
    )

    plt.figure(figsize=(3, 3))
    sns.heatmap(cm, **heatmap_options)

    # Title first, then the y and x axis captions, all at the same font size.
    for apply_text, text in (
        (plt.title, "Confusion matrix"),
        (plt.ylabel, 'True label'),
        (plt.xlabel, 'Predicted label'),
    ):
        apply_text(text, fontsize=10)

    plt.show()

In [5]:
def get_metrics(y_test, preds, print_output = False):
    """Score predicted labels against true labels.

    Returns a 5-tuple ``(weighted f1, per-class f1, accuracy, per-class
    precision, per-class recall)``. The per-class values come from the sklearn
    scorers called with ``average=None``. When ``print_output`` is true the
    five values are also printed, followed by recall elements 0 and 1 labelled
    as admission sensitivity and admission specificity.
    """
    scorers = sklearn.metrics

    weighted_f1 = scorers.f1_score(y_test, preds, average='weighted')
    per_class_f1 = scorers.f1_score(y_test, preds, average=None)
    accuracy = scorers.accuracy_score(y_test, preds)
    per_class_precision = scorers.precision_score(y_test, preds, average=None)
    per_class_recall = scorers.recall_score(y_test, preds, average=None)

    results = (weighted_f1, per_class_f1, accuracy, per_class_precision, per_class_recall)

    if not print_output:
        return results

    captions = ('weighted f1: ', 'f1:          ', 'accuracy:    ', 'precision:   ', 'recall:      ')
    for caption, value in zip(captions, results):
        print (caption, value)
    # Recall of class 0 and class 1, reported under their clinical names.
    print ('admission sens: ', per_class_recall[0])
    print ('admission spec: ', per_class_recall[1])

    return results


In [6]:
def JJ_gridsearch(weights):
    """Fit one XGBoost classifier per ``scale_pos_weight`` and tabulate test metrics.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; they must exist before this is called.

    Parameters
    ----------
    weights : iterable of numbers
        Candidate ``scale_pos_weight`` values, tried in order. Each must be
        non-zero because its reciprocal is reported as the class penalty.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per weight: 1-based trial number, ``1/weight`` as
        ``'class penalty'``, and the metrics from ``get_metrics``. Empty (but
        with all columns) when ``weights`` is empty.
    preds : array-like or None
        Test-set predictions of the LAST model fitted only; ``None`` when
        ``weights`` is empty.
    """
    columns = ['trial number', 'class penalty', 'weighted f1', 'f1', 'accuracy',
               'precision', 'recall', 'admission sensitivity', 'admission specificity']

    rows = []
    preds = None  # stays None if there is nothing to fit
    for trial, weight in enumerate(weights, start=1):
        xgc = xgb.XGBClassifier(scale_pos_weight=weight)
        xgc.fit(X_train, y_train)
        preds = xgc.predict(X_test)
        f1_w, f1, acc, prec, rec = get_metrics(y_test, preds)

        rows.append([trial, 1/weight, f1_w, f1, acc, prec, rec, rec[0], rec[1]])

    # Build the table once rather than appending with .loc inside the loop.
    results_df = pd.DataFrame(rows, columns=columns)
    return results_df, preds

In [9]:
#use this cell when working from home
#use this cell when working from home
# NOTE(review): these are absolute paths on one specific machine; anyone else
# re-running the notebook has to edit them first.
path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage'
data_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/data/ED triage project/combo'
# model_path is not referenced again in the part of the notebook visible here;
# the pickle cells read and write under data_path.
model_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage/models'

#embeds = pd.read_csv(data_path + '/subjnote_embeds.csv', index_col = 0,low_memory = False)
# Cleaned combined clinical table; the first CSV column becomes the index.
# `clin` is not used by the modelling cells visible in this notebook.
clin = pd.read_csv(data_path + '/complete_clean_combo_data.csv', index_col = 0,low_memory = False)

In [10]:
# Medical-history embeddings. Later split together with `target`, so the two
# must have the same number of rows -- presumably one row per record, in the
# same order (confirm).
medhx = np.load(data_path + '/medhx_embeds.npy')

In [11]:
# Outcome labels for every row. The metric helpers report recall of class 0
# as "admission sensitivity", i.e. class 0 is treated as the admission class.
target = np.load(data_path + '/admit_dc_target.npy')

In [12]:
# Subjective-note embeddings used as the model inputs for the first set of models.
# NOTE(review): the file name is spelled 'subj_emeds' (not 'embeds') -- confirm
# that this matches the file on disk.
features = np.load(data_path + '/subj_emeds.npy')

#### sanity checking the outcome variables

In [13]:
# Class balance check: the recorded output shows label 1.0 outnumbering
# label 0.0 by roughly 9 to 1 (149218 vs 16215).
Counter(target)

Counter({1.0: 149218, 0.0: 16215})

### `features` is an array holding one 768-element embedding vector for each subjective-note entry

In [None]:
# Hold out a test set (train_test_split's default proportions) with a fixed seed
# so the split is reproducible.
# NOTE(review): the split is not stratified even though the classes are heavily
# imbalanced -- confirm that is intended before changing it, because the
# medical-history split further down relies on using the same seed.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1)

In [None]:
# Tick labels for the confusion-matrix plots, in class order: position 0 is
# shown for class 0 and position 1 for class 1.
LABELS = ['admit', 'discharge'] 

#### first we are going to train xgb without class penalties

In [None]:
# Baseline: default hyperparameters, i.e. no correction for the class imbalance.
xgc = xgb.XGBClassifier()

# %time reports how long the fit takes.
%time xgc.fit(X_train, y_train)

In [None]:
# Hard class labels for the held-out rows.
preds = xgc.predict(X_test)

# Per-class probabilities for the same rows; thresholding_analysis reads
# element [0] of each row as the admission score.
predictions = xgc.predict_proba(X_test)

In [None]:
# Print and keep the test-set metrics of the baseline model at its default decision rule.
f1_w, f1, acc, prec, rec = get_metrics(y_test, preds, print_output=True)

In [None]:
# Heatmap of true (y axis) vs predicted (x axis) classes for the baseline model.
show_confusion_matrix(y_test, preds, LABELS)

In [None]:
# Sweep the admission cut-off over the baseline model's probabilities and
# tabulate the metrics obtained at each threshold.
thresholding_analysis(predictions, y_test)

#### now we are going to try with an empiric class penalty

In [None]:
# scale_pos_weight rescales the weight of the positive class (label 1).
# Label 1 outnumbers label 0 by roughly 9 to 1 in `target`, so 1/9 down-weights
# it to roughly balance the two classes.
# Note that this rebinds `xgc`: the unweighted baseline model is no longer
# reachable after this cell.
xgc = xgb.XGBClassifier(scale_pos_weight=1/9)

%time xgc.fit(X_train, y_train)

In [None]:
# Labels and per-class probabilities from the class-weighted model; these
# overwrite the baseline model's `preds` and `predictions`.
preds = xgc.predict(X_test)
predictions = xgc.predict_proba(X_test)

In [None]:
# Same metrics as for the baseline, now for the class-weighted model.
f1_w, f1, acc, prec, rec = get_metrics(y_test, preds, print_output=True)

In [None]:
# Confusion matrix of the class-weighted model on the held-out rows.
show_confusion_matrix(y_test, preds, LABELS)

In [None]:
# Threshold sweep over the class-weighted model's probabilities.
thresholding_analysis(predictions, y_test)

### now we are going to repeat the process with medical history

#### first we are going to train xgb without class penalties

In [None]:
xgc = xgb.XGBClassifier()

%time xgc.fit(X_train[:10000], y_train[:10000])

In [None]:
preds = xgc.predict(X_test)

predictions = xgc.predict_proba(X_test)

In [None]:
f1_w, f1, acc, prec, rec = get_metrics(y_test, preds, print_output=True)

In [None]:
show_confusion_matrix(y_test, preds, LABELS)

In [None]:
thresholding_analysis(predictions, y_test)

#### now we are going to try with an empiric class penalty

In [None]:
# Split the medical-history embeddings with the same random_state as the
# subjective-note split, so both splits are meant to select the same rows.
X_train2, X_test2, y_train2, y_test2 = train_test_split(medhx, target, random_state=1)

In [None]:
# The two splits must yield identical label vectors, otherwise the two models'
# predictions cannot be scored or averaged row by row. Assert it so a
# Run All stops here instead of relying on someone reading a displayed result.
assert np.array_equal(y_train2, y_train), "medhx and subjective-note train labels differ"
assert np.array_equal(y_test2, y_test), "medhx and subjective-note test labels differ"

In [None]:
xgc2 = xgb.XGBClassifier(scale_pos_weight=1/9)

%time xgc2.fit(X_train2, y_train)

In [None]:
# Hard labels and per-class probabilities from the medical-history model,
# kept under separate names from the subjective-note model's outputs.
preds2 = xgc2.predict(X_test2)
predictions2 = xgc2.predict_proba(X_test2)

In [None]:
# Score the medical-history predictions against the labels from their own split.
f1_w, f1, acc, prec, rec = get_metrics(y_test2, preds2, print_output=True)

In [None]:
# Confusion matrix of the medical-history model, using the labels from its own split.
show_confusion_matrix(y_test2, preds2, LABELS)

In [None]:
# Threshold sweep over the medical-history model's probabilities, scored
# against the labels from its own split.
thresholding_analysis(predictions2, y_test2)

#### ensembling the two models

In [None]:
# Simple ensemble: element-wise mean of the two models' probability arrays.
# This only makes sense if both arrays cover the same test rows in the same order.
# NOTE(review): `predictions` is whatever was most recently assigned to that
# name -- confirm it holds the subjective-note model that is meant to be ensembled.
combined_predictions = (predictions + predictions2)/2

In [None]:
# Threshold sweep over the averaged probabilities of the two models.
thresholding_analysis(combined_predictions, y_test)

In [None]:
# Display the medical-history classifier object (its repr) for inspection.
xgc2

In [None]:
import pickle

In [None]:
# Persist both fitted models under data_path. The context managers close each
# file handle (flushing the pickle to disk) as soon as it has been written;
# passing a bare open(...) to pickle.dump left the handles open.
# NOTE(review): `xgc` is whichever classifier was most recently bound to that
# name -- confirm it is the model meant for "subj_hx_xgb.pkl".
with open(data_path + "/subj_hx_xgb.pkl", "wb") as pkl:
    pickle.dump(xgc, pkl)
with open(data_path + "/medhx_xgb.pkl", "wb") as pkl:
    pickle.dump(xgc2, pkl)

In [None]:
# Load a previously pickled model from data_path (presumably one trained on
# the tabular clinical data, going by the file name -- confirm).
# NOTE(review): pickle.load can execute code embedded in the file, so only
# load pickles that you created yourself.
# xgc3 is not used again in the part of the notebook visible here.
with open(data_path + "/tabular_xgb.pkl", mode = 'rb') as pkl:
    xgc3 = pickle.load(pkl)

**TODO:** revisit this. Need to figure out how to get all three models trained and tested together. Probably easiest just to do it all one more time (!) in a new notebook.