In [141]:
import pickle

In [111]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Data Prep

In [5]:
# Directory holding the input CSV files. The PE_DATA_DIR environment variable
# overrides the location so the notebook can run outside the original cluster;
# when it is unset the original absolute path is used unchanged.
strDataDir = os.environ.get(
    'PE_DATA_DIR',
    '/project/hackathon/hackers09/hack095/NOVEL-CLINICAL-PREDICTION-APPROACHES-TO-MANAGING-CARE-FOR-ACUTE-PULMONARY-EMBOLISM-PATIENTS/')
# Full paths of the two CSV inputs read by the loading cell.
strMergedCsv = os.path.join(strDataDir, 'enc_outcome_echo.csv')
strEchoLabeledCsv = os.path.join(strDataDir, 'echo_tag.csv')

In [44]:
# Load enc_outcome_echo.csv; the first CSV column becomes the index.
dfMerged = pd.read_csv(strMergedCsv, index_col=0)
# Load echo_tag.csv, decoded as ISO-8859-1; the first CSV column becomes the index.
dfEchoLabeled = pd.read_csv(strEchoLabeledCsv, encoding='ISO-8859-1', index_col=0)

In [45]:
def preproc_labeled_echo(dfEchoLabeled):
    """
    Encode the 'function' and 'dilation' severity labels as numbers.

    For both columns the misspelling 'mlid' is first corrected to 'mild',
    missing values become 0, and then 'mild' -> 1, 'moderate' -> 2,
    'severe' -> 3. Any other value is left as it is.

    Parameters
    ----------
    dfEchoLabeled : pandas.DataFrame
        Labeled echo table containing 'function' and 'dilation' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with the two columns encoded. The frame passed
        in is not modified, so re-running the calling cell always starts
        from the same raw labels.
    """
    dictReplace = {'mild': 1, 'moderate': 2, 'severe': 3}

    # Work on a copy so the caller's frame keeps its raw labels
    dfEncoded = dfEchoLabeled.copy()
    for strCol in ('function', 'dilation'):
        # Changing mlid to mild
        srsLabels = dfEncoded[strCol].replace('mlid', 'mild')
        # na to 0, mild to 1, moderate to 2, severe to 3
        srsLabels = srsLabels.fillna(0)
        dfEncoded[strCol] = srsLabels.replace(dictReplace)

    return dfEncoded

In [46]:
def merge_labeled_echo(dfMerged, dfEchoLabeled):
    """
    Attach labeled echo severities to the encounter table.

    For every row of dfMerged, the rows of dfEchoLabeled whose 'HSP_ENC'
    equals the row's index label and whose 'ORDER_PROC_ID' equals the row's
    'ORDER_PROC_ID' are looked up. Several labeled rows can match one
    encounter/order pair (one per line of the report), so the maximum of
    'dilation' and of 'function' is kept. Rows without a match but with a
    non-null 'NARRATIVE_compiled' get 0 for both values; all other rows stay
    NaN.

    Parameters
    ----------
    dfMerged : pandas.DataFrame
        Encounter table indexed by the encounter id, with 'ORDER_PROC_ID'
        and 'NARRATIVE_compiled' columns. Index labels are assumed to be
        unique (scalar .at lookups are used) -- TODO confirm.
    dfEchoLabeled : pandas.DataFrame
        Labeled echo table with 'HSP_ENC', 'ORDER_PROC_ID', 'dilation' and
        'function' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of dfMerged with 'echo_dilation' and 'echo_function' columns.
        The columns always exist in the result, even when no row receives a
        value, and the frame passed in is not modified.
    """
    dfOut = dfMerged.copy()

    # Create the output columns up front so they exist even if nothing matches
    for strCol in ('echo_dilation', 'echo_function'):
        if strCol not in dfOut.columns:
            dfOut[strCol] = float('nan')

    srsHasNarrative = dfOut['NARRATIVE_compiled'].notnull()
    for nEnc in dfOut.index:
        nOrder = dfOut.at[nEnc, 'ORDER_PROC_ID']
        # Single combined mask; a NaN order id compares unequal to everything
        srsMatch = ((dfEchoLabeled['HSP_ENC'] == nEnc)
                    & (dfEchoLabeled['ORDER_PROC_ID'] == nOrder))
        if srsMatch.any():
            dfOut.at[nEnc, 'echo_dilation'] = dfEchoLabeled.loc[srsMatch, 'dilation'].max()
            dfOut.at[nEnc, 'echo_function'] = dfEchoLabeled.loc[srsMatch, 'function'].max()
        elif srsHasNarrative[nEnc]:
            # A narrative exists but has no labeled rows: both values are 0
            dfOut.at[nEnc, 'echo_dilation'] = 0
            dfOut.at[nEnc, 'echo_function'] = 0

    return dfOut

In [47]:
# Encode the severity labels numerically, then attach them to the encounter
# table as the echo_dilation / echo_function columns.
dfEchoLabeled = preproc_labeled_echo(dfEchoLabeled)
dfMerged = merge_labeled_echo(dfMerged, dfEchoLabeled)

In [48]:
# Display (rows, columns) of the merged table.
dfMerged.shape

(1642, 71)

In [49]:
# Missing b48hr values are filled with 0 before the cast, so they end up False.
dfMerged['b48hr'] = dfMerged['b48hr'].fillna(0).astype(bool)

In [51]:
# Preview the first rows of the merged table.
dfMerged.head()

Unnamed: 0_level_0,PATIENT_ID_x,ED_EPISODE_ID,ED_DISP,DISCH_DISP,ADT_ARRIVAL_TIME_DIFFSEC,ED_DISP_TIME_DIFFSEC,HOSP_DISCH_TIME_DIFFSEC,ADMIT_SOURCE,ADT_PAT_CLASS,HOSP_SERVICE,...,PATIENT_ID,ORDER_PROC_ID,NAME,ORDER_INST_DIFFSEC,PROC_START_TIME_DIFFSEC,RESULT_TIME_DIFFSEC,ECHO_TYPE,NARRATIVE_compiled,echo_dilation,echo_function
HSP_ENC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260755660,1305,34382396,Admitted,To Home Or Self Care,-420,10500,627600,Home & Outside Location,Inpatient,Pulmonary,...,1305.0,330967554.0,CV ECHO,27060.0,27060.0,105180.0,echo_old,STUDY DATE: 04/08/2017 * REASON FOR STUDY: Sho...,0.0,0.0
192470437,1261,27636517,Admitted,To Home With Home Health,-120,9120,516300,Home & Outside Location,Inpatient,Cardiology,...,,,,,,,,,,
258754156,785,34189980,Admitted,Expired,0,40200,899640,Home & Outside Location,Inpatient,Cardiology,...,,,,,,,,,,
306050512,504,38409274,Admitted,To Home Or Self Care,0,7860,388440,Home & Outside Location,Inpatient,Internal Medicine,...,504.0,355830397.0,CV ECHO,15060.0,15120.0,21000.0,echo_old,STUDY DATE: 06/25/2018 * REASON FOR STUDY: Tac...,0.0,0.0
163297609,631,22833060,Admitted,To Rehab Unit Zale Lipshy,0,18300,758880,Home & Outside Location,Inpatient,Hematology-Oncology,...,,,,,,,,,,


# Split Data

In [13]:
# Single seed reused for NumPy's global RNG, both StratifiedKFold splitters
# and the randomized search.
nSeed = 42
np.random.seed(nSeed)

In [27]:
# Maps a split name (e.g. 'outer_train_0', 'outer_0_inner_test_1') to an
# array of row positions in dfMerged; filled in by the splitting cell.
dictSplitIndices = {}

# Number of folds for the outer (evaluation) and inner (tuning) loops.
nOuterFolds = 3
nInnerFolds = 3

nSamples = dfMerged.shape[0]

# Both splitters shuffle and share the same seed.
objOuterStrat = StratifiedKFold(n_splits=nOuterFolds, shuffle=True,
                                random_state=nSeed)
objInnerStrat = StratifiedKFold(n_splits=nInnerFolds, shuffle=True,
                                    random_state=nSeed)

In [66]:
# Outer folds over all rows, stratified on b48hr. Only the labels are passed
# for stratification; a zero array of the right length stands in for X.
lsOuterSplits = list(objOuterStrat.split(np.zeros(nSamples), dfMerged['b48hr']))
# Collects the inner (train, test) index pairs of EVERY outer fold, expressed
# as row positions in dfMerged.
lsTupInnerSplits = []
for nOuterIdx, tupOuterSplits in enumerate(lsOuterSplits):
    arrOuterTrain = tupOuterSplits[0]
    arrOuterTest = tupOuterSplits[1]
    
    dictSplitIndices['outer_train_{}'.format(nOuterIdx)] = arrOuterTrain
    dictSplitIndices['outer_test_{}'.format(nOuterIdx)] = arrOuterTest
    
    # Inner folds are computed on the outer-train subset only, so the indices
    # returned here are positions within arrOuterTrain.
    nInnerTrainSamples = len(arrOuterTrain)
    lsInnerSplits = list(objInnerStrat.split(np.zeros(nInnerTrainSamples),
                                             dfMerged['b48hr'].iloc[arrOuterTrain]))
    for nInnerIdx, tupInnerSplits in enumerate(lsInnerSplits):
        arrInnerTrain = tupInnerSplits[0]
        arrInnerTest = tupInnerSplits[1]
        
        # Map positions within arrOuterTrain back to row positions in dfMerged.
        arrInnerTrain = arrOuterTrain[arrInnerTrain]
        arrInnerTest = arrOuterTrain[arrInnerTest]
        dictSplitIndices['outer_{}_inner_train_{}'.format(nOuterIdx, nInnerIdx)] = arrInnerTrain
        dictSplitIndices['outer_{}_inner_test_{}'.format(nOuterIdx, nInnerIdx)] = arrInnerTest
        
        lsTupInnerSplits.append((arrInnerTrain, arrInnerTest))

In [41]:
# Sizes of the first outer train/test split and of its first inner split.
print(len(dictSplitIndices['outer_train_0']))
print(len(dictSplitIndices['outer_test_0']))
print(len(dictSplitIndices['outer_0_inner_train_0']))
print(len(dictSplitIndices['outer_0_inner_test_0']))

1094
548
729
365


# Conducting random search on labeled echo data

In [55]:
# Modelling table: the two labeled-echo features plus the b48hr target.
lsFeatures = ['echo_dilation', 'echo_function']
strTarget = 'b48hr'
dfData = dfMerged[lsFeatures+[strTarget]]

In [59]:
# Count rows with / without a missing echo_dilation value.
dfData.isna()['echo_dilation'].value_counts()

True     1208
False     434
Name: echo_dilation, dtype: int64

In [60]:
# Count rows with / without a missing echo_function value.
dfData.isna()['echo_function'].value_counts()

True     1208
False     434
Name: echo_function, dtype: int64

In [120]:
# Scratch check: repeating a 1-tuple builds the multi-layer hidden_layer_sizes
# tuples used in the MLP search space.
(100,)*5

(100, 100, 100, 100, 100)

In [121]:
# Search table: one row per candidate model family (index 0..nModelConfigs-1),
# holding an unfitted estimator and the hyper-parameter values to sample from.
# Row order: 0 = SVC, 1 = DecisionTreeClassifier,
# 2 = GradientBoostingClassifier, 3 = MLPClassifier.
nModelConfigs = 4
dfModelSearch = pd.DataFrame(
    {
        'estimator': [
            SVC(),
            DecisionTreeClassifier(),
            GradientBoostingClassifier(),
            MLPClassifier(),
        ],
        'param_distributions': [
            # SVC
            {
                'C': [0.03, 0.3, 0.6, 0.9],
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'degree': [3, 10, 50],
            },
            # DecisionTreeClassifier
            {
                'criterion': ['gini', 'entropy'],
                'splitter': ['best', 'random'],
                'max_depth': [None, 100, 500, 1000],
                'max_leaf_nodes': [None, 100, 500, 1000],
                'min_samples_split': [0.1, 0.5, 0.9],
                'max_features': ['auto', 'sqrt', 'log2', None],
            },
            # GradientBoostingClassifier
            {
                'loss': ['deviance', 'exponential'],
                'learning_rate': [0.001, 0.01, 0.1, 0.5, 0.9],
                'n_estimators': [100, 300, 500, 1000, 1500, 2000],
                'max_depth': [None, 100, 500, 1000],
            },
            # MLPClassifier: 1, 5, 10 or 20 hidden layers of 100 units each
            {
                'hidden_layer_sizes': [(100,),
                                       (100,)*5,
                                       (100,)*10,
                                       (100,)*20],
                'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'solver': ['lbfgs', 'sgd', 'adam'],
                'learning_rate': ['constant', 'invscaling', 'adaptive'],
            },
        ],
    },
    index=range(nModelConfigs),
)

In [122]:
# Row of dfModelSearch to tune; row 3 holds the MLPClassifier.
nConfig = 3
# Randomized search over 50 sampled parameter settings, scored by ROC AUC and
# refit on all data passed to fit() once the best setting is chosen.
# NOTE(review): cv receives lsTupInnerSplits, which accumulates the inner
# splits of all outer folds. Rows belonging to each outer test fold therefore
# also take part in model selection, so the outer-fold scores computed later
# are not independent of the search.
objRandomSearch = RandomizedSearchCV(dfModelSearch.at[nConfig, 'estimator'],
                                     dfModelSearch.at[nConfig, 'param_distributions'],
                                     n_iter=50,
                                     scoring='roc_auc',
                                     refit=True,
                                     random_state=nSeed,
                                     return_train_score=True,
                                     n_jobs=-1,
                                     cv=lsTupInnerSplits)

In [123]:
# Preprocessing objects; the same instances are reused later when
# objTestPipeline is assembled.
objScaler = StandardScaler()
objImpute = SimpleImputer(strategy='median')

# Scaling is applied before median imputation, and the randomized search is
# the final step.
# NOTE(review): because the search sits inside the pipeline, the scaler and
# imputer are fit once on everything passed to objPipeline.fit rather than
# separately within each CV split.
lsSteps = [('std_scaler', objScaler),
           ('simple_imputer', objImpute),
           ('random_search', objRandomSearch)]

objPipeline = Pipeline(lsSteps)

In [124]:
# Fit preprocessing and run the randomized search on all rows of dfData;
# the trailing semicolon suppresses the estimator repr.
objPipeline.fit(dfData[lsFeatures], dfData[strTarget]);

In [125]:
# Utility function to report best scores
def report(results, n_top=3):
    """
    Print a short summary of the best-ranked search candidates.

    Parameters
    ----------
    results : mapping
        Search results with 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' entries (e.g. cv_results_).
    n_top : int, default 3
        Highest rank to report; ranks 1..n_top are printed, and every
        candidate tied at a rank gets its own entry.
    """
    arrRanks = results['rank_test_score']
    for nRank in range(1, n_top + 1):
        # Positions of all candidates sharing this rank
        for nCandidate in np.flatnonzero(arrRanks == nRank):
            lsLines = [
                "Model with rank: {0}".format(nRank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][nCandidate],
                    results['std_test_score'][nCandidate]),
                "Parameters: {0}".format(results['params'][nCandidate]),
                "",
            ]
            # The trailing empty entry yields the blank separator line
            print("\n".join(lsLines))

In [126]:
# Show the raw search results dictionary (timings, scores and parameters for
# every sampled candidate).
objPipeline.named_steps['random_search'].cv_results_

{'mean_fit_time': array([ 5.1675992 ,  2.13013816,  1.65441272,  6.59320988,  2.89472959,
         0.93995463, 12.94307439,  4.5310177 ,  1.76930759,  0.75738557,
         0.4207868 ,  0.42197516,  8.33695801,  8.37882966,  8.84187696,
         5.29660726,  2.07746341,  1.66893975,  2.46641697,  0.46807239,
         0.7060229 ,  2.60600734,  0.30869211,  0.30248494,  2.58991382,
         0.42833596,  0.28189434,  0.46511465,  1.71089212,  5.0106744 ,
         1.44883174,  4.12227027,  0.13093943,  2.20434297,  0.89783878,
        10.21030074,  0.30488226,  0.39303409,  3.54673719,  5.66507771,
         9.35799432,  0.94195016,  0.01957276,  5.82281046,  3.91937009,
         1.615026  ,  2.27070657,  0.84002622,  3.42520409,  1.660127  ]),
 'std_fit_time': array([9.05327930e-01, 1.82218161e-01, 2.26406907e-01, 7.65689379e-01,
        4.21847493e-01, 2.75193881e-02, 6.96332083e-01, 8.25291145e-01,
        3.74389103e-01, 1.30282512e-01, 7.26491848e-02, 1.69760377e-01,
        2.55190730e

In [130]:
# Display (rows, columns) of the modelling table.
dfData.shape

(1642, 3)

In [138]:
# Best estimator found by the search; available because the search was
# created with refit=True.
objBestModel = objPipeline.named_steps['random_search'].best_estimator_

In [139]:
# Evaluation pipeline: same preprocessing followed directly by the selected
# model. objScaler and objImpute are the very instances already used inside
# objPipeline, not copies.
objTestPipeline = Pipeline([('std_scaler', objScaler),
                            ('simple_imputer', objImpute),
                            ('best_model', objBestModel)])

In [140]:
# Score the selected model on every outer fold.
#
# A fresh clone of objTestPipeline is fit per fold. objTestPipeline is built
# from the same scaler, imputer and best-estimator objects that objPipeline
# holds, so fitting it directly would overwrite objPipeline's fitted state and
# leave objTestPipeline trained only on the last outer-train fold. With clones,
# objTestPipeline keeps the steps that were fitted on the full data set.
for nOuterFold in range(nOuterFolds):
    arrTrain = dictSplitIndices['outer_train_{}'.format(nOuterFold)]
    arrTest = dictSplitIndices['outer_test_{}'.format(nOuterFold)]

    # Train and test inputs use the same container type (plain arrays) so the
    # pipeline is fit and scored on consistently formatted data.
    arrTrainX = dfData[lsFeatures].iloc[arrTrain].values
    arrTrainY = dfData[strTarget].iloc[arrTrain].values

    arrTestX = dfData[lsFeatures].iloc[arrTest].values
    arrTestY = dfData[strTarget].iloc[arrTest].values

    # clone() copies the hyper-parameters but none of the fitted state
    objFoldPipeline = clone(objTestPipeline)
    objFoldPipeline.fit(arrTrainX, arrTrainY)
    # Pipeline.score delegates to the final classifier's default score
    print(objFoldPipeline.score(arrTestX, arrTestY))

0.8959854014598541
0.9014598540145985
0.9029304029304029


In [142]:
# Serialize objTestPipeline to bestModel.p; the path is relative to the
# kernel's working directory.
with open("bestModel.p", "wb") as objFile:
    pickle.dump(objTestPipeline, objFile)

In [127]:
# Print the three best-ranked candidates of the randomized search.
report(objPipeline.named_steps['random_search'].cv_results_)

Model with rank: 1
Mean validation score: 0.597 (std: 0.037)
Parameters: {'solver': 'lbfgs', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (100, 100, 100, 100, 100, 100, 100, 100, 100, 100), 'activation': 'identity'}

Model with rank: 2
Mean validation score: 0.597 (std: 0.037)
Parameters: {'solver': 'sgd', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (100, 100, 100, 100, 100, 100, 100, 100, 100, 100), 'activation': 'identity'}

Model with rank: 3
Mean validation score: 0.597 (std: 0.037)
Parameters: {'solver': 'sgd', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (100, 100, 100, 100, 100, 100, 100, 100, 100, 100), 'activation': 'tanh'}



# Keras and DNN search and stuff