In [141]:
import pickle

In [173]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Data Prep

In [145]:
# Locations of the raw CSV extracts: one file per clinical table, all in data_path.
data_path = '/project/hackathon/hackers09/shared/data/'
(cancer_path, echo_path, encounter_path,
 labs_path, outcome_path, problist_path,
 radiology_path, registry_path, vitals_path) = (
    os.path.join(data_path, csv_name)
    for csv_name in ('df_cancer.csv', 'df_echo.csv', 'df_encounter.csv',
                     'df_labs.csv', 'df_outcome.csv', 'df_problist.csv',
                     'df_radiology.csv', 'df_registry.csv', 'df_vitals.csv')
)

In [146]:
# Load each raw table and key it by the hospital-encounter id (HSP_ENC).
# The files are read as ISO-8859-1 (Latin-1).
cancer_df = pd.read_csv(cancer_path, encoding='ISO-8859-1').set_index("HSP_ENC")
echo_df = pd.read_csv(echo_path, encoding='ISO-8859-1').set_index("HSP_ENC")
encounter_df = pd.read_csv(encounter_path, encoding='ISO-8859-1').set_index("HSP_ENC")
labs_df = pd.read_csv(labs_path, encoding='ISO-8859-1').set_index("HSP_ENC")
outcome_df = pd.read_csv(outcome_path, encoding='ISO-8859-1').set_index("HSP_ENC")
problist_df = pd.read_csv(problist_path, encoding='ISO-8859-1').set_index("HSP_ENC")
radiology_df = pd.read_csv(radiology_path, encoding='ISO-8859-1').set_index("HSP_ENC")
registry_df = pd.read_csv(registry_path, encoding='ISO-8859-1').set_index("HSP_ENC")
vitals_df = pd.read_csv(vitals_path, encoding='ISO-8859-1').set_index("HSP_ENC")

In [147]:
#function to merge cancer data
def merge_cancer(enc_df, cnc_df):
    """Attach a per-encounter cancer rank (``CANCER_RANK``) to the encounter table.

    Each ``cancer_at_enc`` status is converted to an integer rank:
    ``'No Cancer'`` -> 1, any of the ``'Unknown...'`` variants -> 2, ``'Cancer'`` -> 3.
    When an encounter has several cancer rows, the highest rank is kept.
    Encounters with no cancer row at all get rank 0.

    Parameters
    ----------
    enc_df : pandas.DataFrame
        Encounter-level table with ``HSP_ENC`` either as its index or as a column.
    cnc_df : pandas.DataFrame
        Cancer table with ``HSP_ENC`` (index or column) and a ``cancer_at_enc``
        column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``enc_df`` (one row per ``HSP_ENC``, first occurrence kept) plus a
        ``CANCER_RANK`` column, indexed by ``HSP_ENC``.

    Raises
    ------
    ValueError
        If ``cancer_at_enc`` holds a value that is neither a known status nor
        convertible to ``int`` (raised by ``astype(int)``).
    """
    status_to_rank = {
        'No Cancer': 1,
        'Unknown': 2,
        'Unknown, Missing Remission Date': 2,
        'Unknown, Previously Positive': 2,
        'Unknown, Not Documented': 2,
        'Unknown, Not documented': 2,
        'Cancer': 3,
    }

    # Work with HSP_ENC as an ordinary column on both sides.
    cancer = cnc_df.reset_index() if 'HSP_ENC' not in cnc_df.columns else cnc_df
    enc = enc_df.reset_index() if 'HSP_ENC' not in enc_df.columns else enc_df

    # Unknown labels are passed through unchanged so that astype(int) still
    # rejects them, exactly as the previous string-replacement code did.
    ranks = cancer['cancer_at_enc'].map(lambda status: status_to_rank.get(status, status)).astype(int)

    # One row per encounter holding the highest rank seen for it.
    max_rank = ranks.groupby(cancer['HSP_ENC']).max().rename('cancer_at_enc').reset_index()

    mergeRes = (
        pd.merge(enc, max_rank, on='HSP_ENC', how='left')
        .drop_duplicates('HSP_ENC')
        # Encounters without any cancer record: rank 0.
        .fillna({'cancer_at_enc': 0})
        .rename(columns={'cancer_at_enc': 'CANCER_RANK'})
        .set_index('HSP_ENC')
    )
    return mergeRes

In [148]:
def merge_registry(enc_df, reg_df):
    """Add one boolean column per registry telling whether the encounter is in it.

    The registry table is in long format (one row per encounter/registry pair).
    It is pivoted into one row per ``HSP_ENC`` with a True/False column for every
    distinct ``REGISTRY_NAME`` (columns ordered by first appearance), then
    left-joined onto the encounter table. Encounters absent from the registry
    table get False in every registry column.

    Columns are addressed by name (``HSP_ENC``, ``REGISTRY_NAME``) rather than by
    position, and missing registry names are ignored.

    Parameters
    ----------
    enc_df : pandas.DataFrame
        Encounter-level table with ``HSP_ENC`` as index or column.
    reg_df : pandas.DataFrame
        Registry table with ``HSP_ENC`` (index or column) and ``REGISTRY_NAME``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``enc_df`` (one row per ``HSP_ENC``, first occurrence kept) plus the
        boolean registry columns, indexed by ``HSP_ENC``.
    """
    reg = reg_df.reset_index() if 'HSP_ENC' not in reg_df.columns else reg_df
    enc = enc_df.reset_index() if 'HSP_ENC' not in enc_df.columns else enc_df

    reg_list = reg['REGISTRY_NAME'].dropna().unique().tolist()
    if not reg_list:
        # Nothing to flag: return the de-duplicated encounter table as-is.
        return enc.drop_duplicates('HSP_ENC').set_index('HSP_ENC')

    # Distinct (encounter, registry) pairs -> wide True/False table.
    pairs = reg[['HSP_ENC', 'REGISTRY_NAME']].dropna().drop_duplicates()
    flags = (
        pairs.assign(_present=True)
        .pivot(index='HSP_ENC', columns='REGISTRY_NAME', values='_present')
        .reindex(columns=reg_list)   # keep first-appearance column order
        .notna()                     # pairs that never occurred -> False
        .rename_axis(columns=None)
        .reset_index()
    )

    mergeRes = pd.merge(enc, flags, on='HSP_ENC', how='left')
    # Encounters with no registry rows come out of the left join as NaN;
    # comparing with True turns those into False and yields a bool dtype.
    mergeRes[reg_list] = mergeRes[reg_list].eq(True)
    mergeRes = mergeRes.drop_duplicates('HSP_ENC').set_index('HSP_ENC')
    return mergeRes

In [149]:
def merge_vitals(enc_df, data_path):
    """Left-join the vitals summary table onto the encounter table.

    Reads ``vitals.csv`` from ``data_path`` (ISO-8859-1 encoded), discards the
    ``'Unnamed: 0'`` column if present (the column pandas produces when a CSV
    was written with an unnamed index), and merges on ``HSP_ENC``.

    Parameters
    ----------
    enc_df : pandas.DataFrame
        Encounter-level table; ``HSP_ENC`` must be usable as a merge key.
    data_path : str
        Directory containing ``vitals.csv``.

    Returns
    -------
    pandas.DataFrame
        All rows of ``enc_df`` with the vitals columns appended, indexed by
        ``HSP_ENC``.
    """
    vital_data_path = os.path.join(data_path, 'vitals.csv')
    vital_data_df = pd.read_csv(vital_data_path, encoding='ISO-8859-1')
    # The axis is selected by keyword: the positional form `drop(label, 1)` is
    # rejected by pandas >= 2.0. errors='ignore' tolerates a file that was
    # saved without the stray index column.
    vital_data_df = vital_data_df.drop(columns='Unnamed: 0', errors='ignore')

    mergeRes = pd.merge(enc_df, vital_data_df, on='HSP_ENC', how='left')
    mergeRes.set_index("HSP_ENC", inplace=True)
    return mergeRes

In [150]:
def merge_co_morbid(enc_df, data_path):
    """Left-join the comorbidity score table onto the encounter table.

    Reads ``comorbitidity_score_sm2.csv`` from ``data_path`` (ISO-8859-1
    encoded) and merges it on ``HSP_ENC``; every row of ``enc_df`` is kept.
    Returns the merged frame indexed by ``HSP_ENC``.
    """
    scores_df = pd.read_csv(
        os.path.join(data_path, 'comorbitidity_score_sm2.csv'),
        encoding='ISO-8859-1',
    )
    merged = pd.merge(enc_df, scores_df, how='left', on='HSP_ENC')
    return merged.set_index("HSP_ENC")

In [151]:
def merge_labs(enc_df, data_path):
    """Left-join the cleaned lab table onto the encounter table.

    Reads ``cleaned_lab_data.csv`` from ``data_path`` (ISO-8859-1 encoded) and
    merges it on ``HSP_ENC``; every row of ``enc_df`` is kept.
    Returns the merged frame indexed by ``HSP_ENC``.
    """
    labs_table = pd.read_csv(
        os.path.join(data_path, 'cleaned_lab_data.csv'),
        encoding='ISO-8859-1',
    )
    merged = pd.merge(enc_df, labs_table, how='left', on='HSP_ENC')
    return merged.set_index("HSP_ENC")

In [152]:
def merge_encounter_radiology(df_encounter, df_radiology, cutoff_OrderTime=12, cutoff_ED_Disp=12):
    """Left-join early CT-angiogram orders onto early-disposition encounters.

    Radiology rows are kept when ``NAME`` starts with ``'CT AN'`` and
    ``ORDER_TIME_DIFFSEC`` is at most ``cutoff_OrderTime`` hours; if an
    encounter still has several such rows, only the first one (in row order)
    is kept. Encounters are kept when ``ED_DISP_TIME_DIFFSEC`` is at most
    ``cutoff_ED_Disp`` hours.

    Rows whose ``NAME`` is missing or not a string are treated as "not a CT
    angiogram" instead of raising.

    Parameters
    ----------
    df_encounter : pandas.DataFrame
        Encounter table with ``HSP_ENC`` and ``ED_DISP_TIME_DIFFSEC``.
    df_radiology : pandas.DataFrame
        Radiology table with ``HSP_ENC``, ``NAME`` and ``ORDER_TIME_DIFFSEC``.
    cutoff_OrderTime : int or float, default 12
        Maximum order time, in hours.
    cutoff_ED_Disp : int or float, default 12
        Maximum ED disposition time, in hours.

    Returns
    -------
    pandas.DataFrame
        Filtered encounters with the matching CT row's columns appended
        (NaN where no CT qualifies), indexed by ``HSP_ENC``.
    """
    # isinstance guard: a missing NAME is a float NaN, which has no .startswith.
    is_ct_angio = df_radiology['NAME'].map(
        lambda name: isinstance(name, str) and name.startswith('CT AN')
    ).astype(bool)
    ordered_in_window = df_radiology['ORDER_TIME_DIFFSEC'] <= cutoff_OrderTime * 3600
    df_CT = df_radiology[is_ct_angio & ordered_in_window].drop_duplicates('HSP_ENC', keep='first')

    # combine df_encounter and df_CT based on 'HSP_ENC' id
    df_encounter = df_encounter[df_encounter['ED_DISP_TIME_DIFFSEC'] <= cutoff_ED_Disp * 3600]
    df_enc_CT = pd.merge(df_encounter, df_CT, how='left', on='HSP_ENC')
    df_enc_CT.set_index('HSP_ENC', inplace=True)

    return df_enc_CT

In [153]:
def expand_outcome(dfOutcome):
    """Pivot the long outcome table into one row per encounter.

    Expects ``HSP_ENC``, ``PATIENT_ID``, ``name_gen`` and ``ORDER_TIME_DIFFSEC``
    as regular columns. Every column from the third position onward is
    "retained" and, for each distinct ``name_gen`` value, copied into a new
    column named ``<column>_<name_gen value>``.
    (NOTE(review): this presumably assumes the first two columns are
    ``HSP_ENC`` and ``PATIENT_ID`` -- confirm against the caller.)

    Returns a frame indexed by ``HSP_ENC`` with the expanded columns,
    ``PATIENT_ID``, a ``b48hr`` flag and ``MinOrderTime``, sorted by
    ``MinOrderTime``.
    """
    ## Outcome expansion   
    
    # Drop any outcomes after 48 hr
    dfOutcome = dfOutcome[dfOutcome['ORDER_TIME_DIFFSEC']<48*60*60]
    
    # Create columns for each unique outcome
    lsExpandedOutcomeCols = []
    # Positional selection: everything from the third column onward.
    lsOutcomeColsToRetain = dfOutcome.columns[2:].tolist()
    lsUniqueOutcomes = dfOutcome['name_gen'].value_counts().index.tolist()
    for strOutcome in lsUniqueOutcomes:
        lsExpandedOutcomeCols += [strCol + '_' + strOutcome for strCol in lsOutcomeColsToRetain]

    # Empty frame with one row per distinct encounter; cells are filled in below.
    dfExpandedOutcome = pd.DataFrame(columns=lsExpandedOutcomeCols+['PATIENT_ID'])
    dfExpandedOutcome['HSP_ENC'] = dfOutcome['HSP_ENC'].value_counts().index
    dfExpandedOutcome = dfExpandedOutcome.set_index('HSP_ENC')

    # Copy each outcome row into the columns belonging to its name_gen value.
    # If an encounter has the same name_gen more than once, the row processed
    # last overwrites the earlier one.
    for nEnc in dfOutcome['HSP_ENC'].value_counts().index:
        for nIdx in dfOutcome[dfOutcome['HSP_ENC']==nEnc].index:
            lsTempCols = [strCol + '_' + dfOutcome.at[nIdx, 'name_gen'] for strCol in lsOutcomeColsToRetain]
            lsTempCols.append('PATIENT_ID')
            dfExpandedOutcome.loc[nEnc, lsTempCols] = dfOutcome.loc[nIdx, lsOutcomeColsToRetain+['PATIENT_ID']].values

    # Create boolean column for order time < 48 hrs for any outcome
    lsOrderTimeCols = [strCol for strCol in dfExpandedOutcome.columns if 'ORDER_TIME' in strCol]

    # b48hr starts at 0 and becomes 1 as soon as any order-time column of the
    # encounter is below 172800 s (= 48 h); comparisons against NaN are False.
    # NOTE(review): rows at or beyond 48 h were already removed at the top, so
    # this looks like it flags every encounter that has a filled order time.
    dfExpandedOutcome['b48hr'] = np.zeros(dfExpandedOutcome.shape[0])
    for nEnc in dfExpandedOutcome.index:
        for nVal in dfExpandedOutcome.loc[nEnc, lsOrderTimeCols].values:
            if nVal < 172800:
                dfExpandedOutcome.at[nEnc, 'b48hr'] = 1

    # Column for minimum order time from all outcomes
    dfExpandedOutcome['MinOrderTime'] = dfExpandedOutcome.loc[:, lsOrderTimeCols].min(axis=1)
    dfExpandedOutcome = dfExpandedOutcome.sort_values('MinOrderTime')
    
    return dfExpandedOutcome


def compile_echo(dfEcho):    
    """Collapse the line-per-row echo table into one row per encounter.

    Expects ``HSP_ENC``, ``ORDER_PROC_ID``, ``ORDER_INST_DIFFSEC``,
    ``NARRATIVE`` and ``new_line`` as regular columns. For each encounter one
    echo order is selected; its ``NARRATIVE`` rows are joined with single
    spaces into ``NARRATIVE_compiled`` and the remaining columns are taken from
    that order's first row.

    Returns a frame indexed by ``HSP_ENC`` holding the original columns except
    ``new_line`` and ``NARRATIVE``, plus ``NARRATIVE_compiled``.
    """
    # Keep only 12 hr echos
    dfEcho = dfEcho[dfEcho['ORDER_INST_DIFFSEC'] < 12*60*60]

    # Add all narratives together
    lsUniqueEchoEnc = dfEcho['HSP_ENC'].value_counts().index.tolist()

    # Output columns: everything except the per-line fields.
    lsEchoCols = dfEcho.columns.tolist()
    lsEchoCols.remove('new_line')
    lsEchoCols.remove('NARRATIVE')
    dfCompiledEcho = pd.DataFrame(columns=lsEchoCols)
    dfCompiledEcho['HSP_ENC'] = lsUniqueEchoEnc
    dfCompiledEcho = dfCompiledEcho.set_index('HSP_ENC')
    # HSP_ENC is now the index, so it is no longer one of the columns to fill.
    lsEchoCols.remove('HSP_ENC')

    lsCompiledNarratives = []
    for nEnc in lsUniqueEchoEnc:
        lsUniqueEchoOrderId = dfEcho[dfEcho['HSP_ENC']==nEnc]['ORDER_PROC_ID'].value_counts().index.tolist()
        #if len(lsUniqueEchoOrderId)>1:
            #print(nEnc)
        # NOTE(review): value_counts() orders ids by how many rows they have, so
        # element 0 is the order with the most rows for this encounter, which is
        # not necessarily the earliest one -- confirm that this is intended.
        nFirstEchoOrderId = lsUniqueEchoOrderId[0] # Keep first one only
        
        # Compile echo data
        # Rows are selected by ORDER_PROC_ID alone (not additionally by encounter).
        strCompiled = dfEcho[dfEcho['ORDER_PROC_ID']==nFirstEchoOrderId]['NARRATIVE'].str.cat(sep=' ')
        dfCompiledEcho.loc[nEnc, lsEchoCols] = dfEcho[dfEcho['ORDER_PROC_ID']==nFirstEchoOrderId][lsEchoCols].iloc[0]
        lsCompiledNarratives.append(strCompiled)
    dfCompiledEcho['NARRATIVE_compiled'] = lsCompiledNarratives

    # Keep only 12 hr echos
    # (same threshold as the filter at the top, applied to the compiled rows)
    dfCompiledEcho = dfCompiledEcho[dfCompiledEcho['ORDER_INST_DIFFSEC'] < 12*60*60]
    
    return dfCompiledEcho

def get_merge_dfs(dfOutcome, dfEcho, dfEncounter):
    """Combine encounters with their expanded outcomes and compiled echo report.

    ``dfOutcome`` is passed through ``expand_outcome`` and ``dfEcho`` through
    ``compile_echo``; both results are then left-joined, in that order, onto
    ``dfEncounter`` re-indexed by ``HSP_ENC``. Every encounter row is kept.
    """
    lsRightTables = [expand_outcome(dfOutcome), compile_echo(dfEcho)]

    dfMerged = dfEncounter.set_index('HSP_ENC')
    for dfRight in lsRightTables:
        dfMerged = dfMerged.merge(dfRight, on='HSP_ENC', how='left')
    return dfMerged

In [154]:
def preproc_labeled_echo(dfEchoLabeled):
    """Turn the hand-labelled echo severities into numeric grades, in place.

    For both the ``function`` and ``dilation`` columns: the typo ``'mlid'`` is
    corrected to ``'mild'``, missing values become 0, and
    ``'mild'``/``'moderate'``/``'severe'`` become 1/2/3.
    The input frame itself is modified and returned.
    """
    dictSeverityGrade = {'mild': 1, 'moderate': 2, 'severe': 3}

    for strCol in ('function', 'dilation'):
        srsGrade = dfEchoLabeled[strCol].replace('mlid', 'mild')   # fix label typo
        srsGrade = srsGrade.fillna(0)                               # unlabeled -> 0
        dfEchoLabeled[strCol] = srsGrade.replace(dictSeverityGrade)

    return dfEchoLabeled


def merge_labeled_echo(dfMerged, dfEchoLabeled):
    """
    Write echo_dilation / echo_function grades into dfMerged (modified in place).

    For every encounter in dfMerged.index, labelled rows are matched on both
    HSP_ENC and the encounter's ORDER_PROC_ID. When several labelled rows match,
    the maximum grade of each column is used. When nothing matches but the
    encounter does have a compiled narrative, both grades are set to 0.
    Encounters with neither a match nor a narrative are left untouched.
    """
    srsNarrativeMissing = dfMerged['NARRATIVE_compiled'].isnull()

    for nEncId in dfMerged.index:
        nOrderId = dfMerged.at[nEncId, 'ORDER_PROC_ID']
        dfMatch = dfEchoLabeled[dfEchoLabeled['HSP_ENC'] == nEncId]
        dfMatch = dfMatch[dfMatch['ORDER_PROC_ID'] == nOrderId]

        if len(dfMatch.index) > 0:
            nDilation = dfMatch['dilation'].max()
            nFunction = dfMatch['function'].max()
        elif not srsNarrativeMissing[nEncId]:
            # An echo report exists but nothing was tagged in it.
            nDilation = 0
            nFunction = 0
        else:
            continue

        dfMerged.at[nEncId, 'echo_dilation'] = nDilation
        dfMerged.at[nEncId, 'echo_function'] = nFunction

    return dfMerged

In [163]:
def merge_echo(out_df, ech_df, enc_df, rad_df, echo_tag_path=None):
    """Build the outcome / narrative table that the other merges start from.

    Steps:
      1. Move ``HSP_ENC`` from the index to a column on copies of the four inputs.
      2. ``get_merge_dfs`` joins encounters with expanded outcomes and compiled
         echo narratives; ``merge_encounter_radiology`` joins encounters with
         CT-angiogram orders; the two results are left-joined on ``HSP_ENC``.
      3. Only the columns whose name starts with ``name_gen`` or ``NARR`` are
         kept (sorted by name) plus ``b48hr``, and renamed to the fixed list
         ``CT, Echo, CPR, DEATH, INTUBATION, PPV, THROMBOLYSIS,
         THROMBOLYSIS_PROC, VASOPRESSORS, B48hr``.
         NOTE(review): this assumes exactly nine matching columns in exactly
         that sorted order -- pandas raises a length-mismatch error otherwise.
      4. Missing ``CT`` / ``Echo`` become ``'No_CT'`` / ``'No_Echo'``; all other
         missing values become 0; the six outcome columns listed in the loop
         below are mapped to 1/0 (``CPR`` is not part of that loop).
      5. The hand-labelled echo grades are read from ``echo_tag_path`` and
         joined as ``echo_dilation`` / ``echo_function``.

    Parameters
    ----------
    out_df, ech_df, enc_df, rad_df : pandas.DataFrame
        Outcome, echo, encounter and radiology tables, each indexed by
        ``HSP_ENC``. They are not modified.
    echo_tag_path : str, optional
        CSV with the labelled echo data. Defaults to the project location that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The cleaned table with the two echo grade columns appended.
    """
    if echo_tag_path is None:
        echo_tag_path = '/project/hackathon/hackers09/hack095/NOVEL-CLINICAL-PREDICTION-APPROACHES-TO-MANAGING-CARE-FOR-ACUTE-PULMONARY-EMBOLISM-PATIENTS/echo_tag.csv'

    # The helpers below expect HSP_ENC as a regular column; reset_index returns
    # new frames, so the caller's inputs stay untouched.
    mod_out_df = out_df.reset_index(level=0)
    mod_ech_df = ech_df.reset_index(level=0)
    mod_enc_df = enc_df.reset_index(level=0)
    mod_rad_df = rad_df.reset_index(level=0)

    df_enc_echo_outcome = get_merge_dfs(mod_out_df, mod_ech_df, mod_enc_df)
    df_enc_CT = merge_encounter_radiology(mod_enc_df, mod_rad_df)

    # clean the dataset
    df_cleaned = df_enc_echo_outcome.merge(df_enc_CT, how='left', on='HSP_ENC')

    # raw column names
    name_gen = sorted(
        col for col in df_cleaned.columns.values
        if col.startswith('name_gen') or col.startswith('NARR')
    ) + ['b48hr']

    # rename the column names
    # .copy(): the column subset must be an independent frame so the fills
    # below are guaranteed to land in df_cleaned.
    df_cleaned = df_cleaned[name_gen].copy()
    new_columns = ['CT', 'Echo', 'CPR', 'DEATH', 'INTUBATION', 'PPV', 'THROMBOLYSIS', 'THROMBOLYSIS_PROC', 'VASOPRESSORS', 'B48hr']
    df_cleaned.columns = new_columns

    # fill Nan
    # Plain assignment instead of `df[col].fillna(..., inplace=True)`: the
    # in-place form operates on a temporary column object and is not guaranteed
    # to write back into the frame.
    df_cleaned['CT'] = df_cleaned['CT'].fillna('No_CT')
    df_cleaned['Echo'] = df_cleaned['Echo'].fillna('No_Echo')
    df_cleaned = df_cleaned.fillna(0)

    # convert to 1
    # Each outcome column holds either its own name (event recorded) or 0.
    map_dict = {'DEATH': 1,
                'INTUBATION': 1,
                'PPV': 1,
                'THROMBOLYSIS': 1,
                'THROMBOLYSIS_PROC': 1,
                'VASOPRESSORS': 1,
                0: 0}
    for col in ['DEATH', 'INTUBATION', 'PPV', 'THROMBOLYSIS', 'THROMBOLYSIS_PROC', 'VASOPRESSORS']:
        df_cleaned[col] = df_cleaned[col].map(map_dict)

    df_echo_label = pd.read_csv(echo_tag_path, encoding='ISO-8859-1')

    dfEchoLabeled = preproc_labeled_echo(df_echo_label)
    # merge_labeled_echo adds the two grade columns to df_enc_echo_outcome in place.
    dfMerged = merge_labeled_echo(df_enc_echo_outcome, dfEchoLabeled)

    df_new_features_echo_tags = dfMerged[['echo_dilation', 'echo_function']]
    mergeRes = pd.merge(df_cleaned, df_new_features_echo_tags, on='HSP_ENC', how='left')

    return mergeRes

In [164]:
def drop_rows(enc_df, data_path):
    """Restrict the encounter table to the encounters in ``final_enc_list.csv``.

    The list is read from ``data_path`` (ISO-8859-1 encoded) and joined with
    ``enc_df`` on ``HSP_ENC`` using pandas' default (inner) merge, so only
    encounters present in both survive. The result is indexed by ``HSP_ENC``.
    """
    keep_list_df = pd.read_csv(
        os.path.join(data_path, "final_enc_list.csv"),
        encoding='ISO-8859-1',
    )
    kept = pd.merge(enc_df, keep_list_df, on='HSP_ENC')
    return kept.set_index("HSP_ENC")

In [198]:
def merge_age_gender(master_df, enc_df):
    """Left-join the ``ENC_AGE`` and ``GENDER`` columns of ``enc_df`` onto ``master_df``.

    The join key is ``HSP_ENC``; every row of ``master_df`` is kept and no other
    column of ``enc_df`` is carried over.
    """
    demographics = enc_df.loc[:, ['ENC_AGE', 'GENDER']].copy()
    return pd.merge(master_df, demographics, how='left', on='HSP_ENC')

In [199]:
# Build the modelling table by chaining the merge helpers; each call takes the
# previous result and adds one group of features.
# NOTE(review): this first assignment is overwritten by the very next line
# (merge_echo receives encounter_df directly), so it has no effect.
output_data = encounter_df
output_data = merge_echo(outcome_df, echo_df, encounter_df, radiology_df)
output_data = merge_cancer(output_data, cancer_df)
output_data = merge_registry(output_data, registry_df)
output_data = merge_vitals(output_data, data_path)
output_data = merge_co_morbid(output_data, data_path)
output_data = merge_labs(output_data, data_path)
output_data = merge_age_gender(output_data, encounter_df)
# NOTE(review): drop_rows() is defined in this notebook but is not called here.
# Move the target column B48hr to the last position (pop removes it, the
# assignment appends it again).
outcome = output_data.pop('B48hr')
output_data['B48hr']=outcome

In [333]:
output_data.columns

Index(['CT', 'Echo', 'CPR', 'DEATH', 'INTUBATION', 'PPV', 'THROMBOLYSIS',
       'THROMBOLYSIS_PROC', 'VASOPRESSORS', 'echo_dilation', 'echo_function',
       'CANCER_RANK', 'OBESITY REGISTRY', 'DIABETES REGISTRY', 'COPD REGISTRY',
       'CONGESTIVE HEART FAILURE REGISTRY', 'GENERAL MALIGNANCY REGISTRY',
       'CHRONIC LUNG REGISTRY', 'ASTHMA REGISTRY',
       'CORONARY ARTERY DISEASE REGISTRY', 'CHRONIC HEART REGISTRY',
       'SLEEP APNEA REGISTRY', 'LUNG CANCER REGISTRY',
       'LIVER CIRRHOSIS REGISTRY', 'CRANIOTOMY REGISTRY',
       'CHRONIC RENAL FAILURE REGISTRY', 'PULSE OXIMETRY_min',
       'PULSE OXIMETRY_max', 'PULSE OXIMETRY_std', 'PULSE OXIMETRY_average',
       'TEMPERATURE_min', 'TEMPERATURE_max', 'TEMPERATURE_std',
       'TEMPERATURE_average', 'PULSE_min', 'PULSE_max', 'PULSE_std',
       'PULSE_average', 'RESPIRATIONS_min', 'RESPIRATIONS_max',
       'RESPIRATIONS_std', 'RESPIRATIONS_average',
       'CPM F16 R AS OXYGEN AMOUNT_min', 'CPM F16 R AS OXYGEN AMOUNT_max

In [334]:
# Columns of output_data used to build the modelling frame. Commented-out
# entries are available columns that are currently excluded.
# The order matters: a later cell slices the frame by label from
# 'echo_dilation' through 'B48hr', so the names listed before 'echo_dilation'
# are dropped there, and 'B48hr' (the target) must stay last.
cols_to_keep = [
    #'CT',
    #'Echo',
    'CPR',
    'DEATH',
    'INTUBATION',
    'PPV',
    'THROMBOLYSIS',
    'THROMBOLYSIS_PROC',
    'VASOPRESSORS',
    'echo_dilation',
    'echo_function',
    'CANCER_RANK',
#     'OBESITY REGISTRY',
#     'DIABETES REGISTRY',
#     'COPD REGISTRY',
#     'CONGESTIVE HEART FAILURE REGISTRY',
#     #'GENERAL MALIGNANCY REGISTRY',
#     'CHRONIC LUNG REGISTRY',
#     #'ASTHMA REGISTRY',
#     'CORONARY ARTERY DISEASE REGISTRY',
#     #'CHRONIC HEART REGISTRY',
#     'SLEEP APNEA REGISTRY',
#     #'LUNG CANCER REGISTRY',
#     'LIVER CIRRHOSIS REGISTRY',
#     #'CRANIOTOMY REGISTRY',
#     #'CHRONIC RENAL FAILURE REGISTRY',
    'PULSE OXIMETRY_min',
    #'PULSE OXIMETRY_max',
    #'PULSE OXIMETRY_std',
    #'PULSE OXIMETRY_average',
    #'TEMPERATURE_min',
    #'TEMPERATURE_max',
    #'TEMPERATURE_std',
    #'TEMPERATURE_average',
    #'PULSE_min',
    'PULSE_max',
    #'PULSE_std',
    #'PULSE_average',
    #'RESPIRATIONS_min',
    'RESPIRATIONS_max',
    #'RESPIRATIONS_std',
    #'RESPIRATIONS_average',
    #'CPM F16 R AS OXYGEN AMOUNT_min',
    #'CPM F16 R AS OXYGEN AMOUNT_max',
    #'CPM F16 R AS OXYGEN AMOUNT_std',
    #'CPM F16 R AS OXYGEN AMOUNT_average',
    #'CPM F16 R INV OXYGEN CONCENTRATION (%)_min',
    #'CPM F16 R INV OXYGEN CONCENTRATION (%)_max',
    #'CPM F16 R INV OXYGEN CONCENTRATION (%)_std',
    #'CPM F16 R INV OXYGEN CONCENTRATION (%)_average',
    'BLOOD PRESSURE SBP_min',
    #'BLOOD PRESSURE SBP_max',
    #'BLOOD PRESSURE SBP_std',
    #'BLOOD PRESSURE SBP_average',
    'BLOOD PRESSURE DBP_min',
    #'BLOOD PRESSURE DBP_max',
    #'BLOOD PRESSURE DBP_std',
    #'BLOOD PRESSURE DBP_average',
    #'score',
    #'index',
    'wscore',
    #'windex',
    'BNP_STATUS',
    'TROPONIN_MAX',
    'TROPONIN_STATUS',
    'ENC_AGE', 'GENDER',
    'B48hr'
]

In [335]:
# Restrict to the selected columns. .copy() gives an independent frame, so the
# column edits in the following cells do not write back into output_data.
min_data = output_data[cols_to_keep].copy()
min_data

Unnamed: 0_level_0,CPR,DEATH,INTUBATION,PPV,THROMBOLYSIS,THROMBOLYSIS_PROC,VASOPRESSORS,echo_dilation,echo_function,CANCER_RANK,...,RESPIRATIONS_max,BLOOD PRESSURE SBP_min,BLOOD PRESSURE DBP_min,wscore,BNP_STATUS,TROPONIN_MAX,TROPONIN_STATUS,ENC_AGE,GENDER,B48hr
HSP_ENC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260755660,0,0,0,0,0,0,0,0.0,0.0,2.0,...,28.0,106.0,72.0,6,,,,60,FEMALE,0.0
192470437,0,0,0,0,0,0,1,,,0.0,...,30.0,96.0,54.0,5,1.0,,,74,FEMALE,1.0
258754156,0,0,0,0,0,0,0,,,0.0,...,28.0,119.0,49.0,3,,,,96,MALE,0.0
306050512,0,0,0,0,1,1,0,0.0,0.0,0.0,...,25.0,112.0,74.0,1,1.0,0.004695,0.0,78,FEMALE,1.0
163297609,0,0,0,0,0,0,0,,,3.0,...,28.0,92.0,51.0,9,,,,47,MALE,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178796884,0,0,0,0,0,0,0,,,0.0,...,20.0,134.0,74.0,8,,,,57,FEMALE,0.0
1829587297,0,0,1,0,0,0,1,,,0.0,...,39.0,140.0,69.0,2,,,,80,MALE,1.0
1802565292,0,0,0,0,0,0,0,,,0.0,...,20.0,98.0,57.0,0,,,,31,FEMALE,0.0
1829448073,0,0,0,0,0,0,0,1.0,1.0,0.0,...,20.0,115.0,54.0,1,,,,78,MALE,0.0


In [336]:
min_data['GENDER'].value_counts()

FEMALE    862
MALE      780
Name: GENDER, dtype: int64

In [338]:
# Encode gender numerically: 'MALE' -> 1, 'FEMALE' -> 0 (any other value is left as is).
min_data['GENDER'] = min_data['GENDER'].replace({'MALE':1, 'FEMALE':0})

In [339]:
# Replace the string 'CPR' with 1 in the CPR column.
# NOTE(review): the CPR column is discarded by the later
# `.loc[:, 'echo_dilation':'B48hr']` slice, so this does not reach the modelling frame.
min_data['CPR'] = min_data['CPR'].replace('CPR', 1)

In [340]:
# Keep the contiguous label range from echo_dilation through B48hr. This drops
# the columns listed before echo_dilation in cols_to_keep (CPR, DEATH,
# INTUBATION, PPV, THROMBOLYSIS, THROMBOLYSIS_PROC, VASOPRESSORS).
min_data = min_data.loc[:, 'echo_dilation':'B48hr']

In [341]:
min_data.shape

(1642, 15)

In [342]:
min_data

Unnamed: 0_level_0,echo_dilation,echo_function,CANCER_RANK,PULSE OXIMETRY_min,PULSE_max,RESPIRATIONS_max,BLOOD PRESSURE SBP_min,BLOOD PRESSURE DBP_min,wscore,BNP_STATUS,TROPONIN_MAX,TROPONIN_STATUS,ENC_AGE,GENDER,B48hr
HSP_ENC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
260755660,0.0,0.0,2.0,88.0,140.0,28.0,106.0,72.0,6,,,,60,0,0.0
192470437,,,0.0,98.0,127.0,30.0,96.0,54.0,5,1.0,,,74,0,1.0
258754156,,,0.0,85.0,71.0,28.0,119.0,49.0,3,,,,96,1,0.0
306050512,0.0,0.0,0.0,91.0,145.0,25.0,112.0,74.0,1,1.0,0.004695,0.0,78,0,1.0
163297609,,,3.0,94.0,103.0,28.0,92.0,51.0,9,,,,47,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178796884,,,0.0,96.0,114.0,20.0,134.0,74.0,8,,,,57,0,0.0
1829587297,,,0.0,94.0,132.0,39.0,140.0,69.0,2,,,,80,1,1.0
1802565292,,,0.0,97.0,123.0,20.0,98.0,57.0,0,,,,31,0,0.0
1829448073,1.0,1.0,0.0,87.0,108.0,20.0,115.0,54.0,1,,,,78,1,0.0


# Split Data

In [343]:
# Single seed for the notebook: it seeds NumPy's global random state here and is
# also passed as random_state to the StratifiedKFold splitters and to
# RandomizedSearchCV.
nSeed = 42
np.random.seed(nSeed)

In [344]:
# Nested cross-validation set-up: number of outer and inner folds, and one
# shuffled, seeded stratified splitter for each level.
nOuterFolds, nInnerFolds = 3, 3
nSamples = len(min_data)

# Filled by the split cell: maps a split name to an array of row positions.
dictSplitIndices = dict()

objOuterStrat, objInnerStrat = (
    StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=nSeed)
    for nFolds in (nOuterFolds, nInnerFolds)
)

In [345]:
# Outer level: stratified split of all rows on the B48hr target.
lsOuterSplits = list(objOuterStrat.split(np.zeros(nSamples), min_data['B48hr']))

# Flat list of (train, test) position arrays for every inner fold of every
# outer fold, in outer-major order; passed to RandomizedSearchCV as `cv`.
lsTupInnerSplits = []

for nOuterIdx, (arrOuterTrain, arrOuterTest) in enumerate(lsOuterSplits):
    dictSplitIndices['outer_train_{}'.format(nOuterIdx)] = arrOuterTrain
    dictSplitIndices['outer_test_{}'.format(nOuterIdx)] = arrOuterTest

    # Inner level: split only the rows of this outer training portion.
    srsOuterTrainTarget = min_data['B48hr'].iloc[arrOuterTrain]
    nInnerTrainSamples = len(arrOuterTrain)
    lsInnerSplits = list(objInnerStrat.split(np.zeros(nInnerTrainSamples), srsOuterTrainTarget))

    for nInnerIdx, (arrInnerTrainLocal, arrInnerTestLocal) in enumerate(lsInnerSplits):
        # split() returns positions within the outer-train subset; translate
        # them back to row positions in min_data.
        arrInnerTrain = arrOuterTrain[arrInnerTrainLocal]
        arrInnerTest = arrOuterTrain[arrInnerTestLocal]

        dictSplitIndices['outer_{}_inner_train_{}'.format(nOuterIdx, nInnerIdx)] = arrInnerTrain
        dictSplitIndices['outer_{}_inner_test_{}'.format(nOuterIdx, nInnerIdx)] = arrInnerTest

        lsTupInnerSplits.append((arrInnerTrain, arrInnerTest))

# Conducting random search on min_data

In [346]:
# Search table: one row per model configuration. 'estimator' and
# 'param_distributions' are filled here; the remaining columns are filled by
# run_random_search(). nModelConfigs must equal the length of the two lists
# assigned below.
nModelConfigs = 4
dfModelSearch = pd.DataFrame(columns=['estimator', 'param_distributions', 
                                      'best_inner_models', 'cv_results',
                                      'test_results', 'test_predictions'],
                             index=range(nModelConfigs))
# Object dtype so that a cell can hold a Python list.
dfModelSearch['test_predictions'] = dfModelSearch['test_predictions'].astype('object')
dfModelSearch['test_results'] = dfModelSearch['test_results'].astype('object')
dfModelSearch['estimator'] = [SVC(), DecisionTreeClassifier(), 
                              GradientBoostingClassifier(), MLPClassifier()]
# One sampling space per estimator, in the same order as the list above:
# SVC, DecisionTreeClassifier, GradientBoostingClassifier, MLPClassifier.
# NOTE(review): 'auto' (max_features) and 'deviance' (loss) look like values
# only accepted by older scikit-learn releases -- confirm against the
# installed version.
dfModelSearch['param_distributions'] = [{'C': np.arange(0.1, 1, 0.1),
                                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                                         'degree': np.arange(3, 100, 3),
                                         'gamma': ['scale']},
                                        {'criterion': ['gini', 'entropy'],
                                         'splitter': ['best', 'random'],
                                         'max_depth': [None, 100, 500, 1000],
                                         'max_leaf_nodes': [None, 100, 500, 1000],
                                         'min_samples_split': np.arange(0.1, 1, 0.1),
                                         'max_features':['auto', 'sqrt', 'log2', None]},
                                        {'loss':['deviance', 'exponential'],
                                         'learning_rate':np.arange(0.1, 1, 0.1),
                                         'n_estimators': np.arange(100, 5000, 100),
                                         'max_depth': [None, 100, 500, 1000]},
                                        {'hidden_layer_sizes': [(100,),
                                                                (100,)*5,
                                                                (100,)*10,
                                                                (100,)*20],
                                         'activation': ['identity', 'logistic', 'tanh', 'relu'],
                                         'solver': ['lbfgs', 'sgd', 'adam'],
                                         'learning_rate': ['constant', 'invscaling', 'adaptive']}]

In [347]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the candidates holding ranks 1..n_top of a CV search.

    ``results`` is a ``cv_results_``-style mapping with the keys
    ``rank_test_score``, ``mean_test_score``, ``std_test_score`` and
    ``params``. Candidates tied on a rank are all printed.
    """
    arrRanks = results['rank_test_score']
    for nRank in range(1, n_top + 1):
        for nCandidate in np.flatnonzero(arrRanks == nRank):
            nMean = results['mean_test_score'][nCandidate]
            nStd = results['std_test_score'][nCandidate]
            print("Model with rank: {0}".format(nRank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(nMean, nStd))
            print("Parameters: {0}".format(results['params'][nCandidate]))
            print("")

In [348]:
def get_ls_best_model_params(dictCvResults, nOuterFolds, lsSummaryMetrics):
    """Return, for each summary metric, the params of the best-scoring candidate.

    ``dictCvResults`` is converted to a DataFrame (one row per candidate). For
    every column name in ``lsSummaryMetrics`` the row with the highest value is
    located and its ``'params'`` entry collected, in the order of
    ``lsSummaryMetrics``. ``nOuterFolds`` is accepted but not used.
    """
    dfCandidates = pd.DataFrame(dictCvResults)
    return [
        dfCandidates.at[dfCandidates[strMetric].idxmax(), 'params']
        for strMetric in lsSummaryMetrics
    ]

In [349]:
def add_summary_measures_dictCv(dictCvResults, nOuterFolds, nInnerFolds):
    """Add one pessimistic summary score per outer fold to a cv_results mapping.

    The ``split<k>_test_score`` columns are taken in consecutive groups of
    ``nInnerFolds`` (group ``i`` covers splits ``i*nInnerFolds`` up to
    ``(i+1)*nInnerFolds - 1``). For each group a column
    ``outer<i>_test_score_summary`` is added, holding
    ``mean - 1.645 * std`` across the group's splits (pandas' default sample
    standard deviation).

    Returns the extended table via ``DataFrame.to_dict()``, i.e. as
    ``{column: {row_index: value}}``.
    """
    dfScores = pd.DataFrame(dictCvResults)

    for nOuter in range(nOuterFolds):
        nFirstSplit = nOuter * nInnerFolds
        lsSplitCols = ['split{}_test_score'.format(nSplit)
                       for nSplit in range(nFirstSplit, nFirstSplit + nInnerFolds)]
        dfInnerScores = dfScores[lsSplitCols]
        # Lower bound on the score: 1.645 standard deviations below the mean
        # (under a normal approximation this is about the 5th percentile).
        srsSummary = dfInnerScores.mean(axis=1) - dfInnerScores.std(axis=1)*1.645
        dfScores['outer{}_test_score_summary'.format(nOuter)] = srsSummary

    return dfScores.to_dict()

In [350]:
def run_random_search(nConfig, dfModelSearch, dfData, lsFeatures, strTarget):
    """Run the randomized search for one model configuration and test it per outer fold.

    1. A pipeline (standard scaling -> median imputation -> RandomizedSearchCV)
       is fit on all of ``dfData``; the search samples 50 candidates, scores
       them with ROC AUC and uses the pre-computed inner folds
       (``lsTupInnerSplits``) as its CV splits.
    2. The search results get one summary score per outer fold
       (``add_summary_measures_dictCv``); for each outer fold the candidate
       with the best summary is selected (``get_ls_best_model_params``).
    3. For each outer fold, the configuration's estimator is set to that fold's
       selected params, fit on the fold's outer-train rows and evaluated on its
       outer-test rows. The pipeline's ``score`` and the predictions are
       appended to ``test_results`` / ``test_predictions``, and the score is
       printed.

    Uses the notebook-level names ``nSeed``, ``lsTupInnerSplits``,
    ``nOuterFolds``, ``nInnerFolds`` and ``dictSplitIndices``.

    NOTE(review): in step 1 the scaler and imputer are fit on every row of
    ``dfData`` before the inner CV runs, so their statistics also see the
    validation rows -- confirm that this is acceptable.

    Parameters
    ----------
    nConfig : int
        Row label of ``dfModelSearch`` to run.
    dfModelSearch : pandas.DataFrame
        Search table; the row ``nConfig`` is updated in place.
    dfData : pandas.DataFrame
        Modelling data containing the feature and target columns.
    lsFeatures : list-like of str
        Feature column names.
    strTarget : str
        Target column name.

    Returns
    -------
    pandas.DataFrame
        ``dfModelSearch`` (the same object) with the row's result columns filled.
    """
    objRandomSearch = RandomizedSearchCV(dfModelSearch.at[nConfig, 'estimator'],
                                         dfModelSearch.at[nConfig, 'param_distributions'],
                                         n_iter=50,
                                         scoring='roc_auc',
                                         random_state=nSeed,
                                         return_train_score=True,
                                         n_jobs=-1,
                                         cv=lsTupInnerSplits)

    objScaler = StandardScaler()
    objImpute = SimpleImputer(strategy='median')

    lsSteps = [('std_scaler', objScaler),
               ('simple_imputer', objImpute),
               ('random_search', objRandomSearch)]

    objPipeline = Pipeline(lsSteps)

    objPipeline.fit(dfData[lsFeatures], dfData[strTarget])

    dictCvResults = objPipeline.named_steps['random_search'].cv_results_
    dictCvResults = add_summary_measures_dictCv(dictCvResults, nOuterFolds, nInnerFolds)
    dfModelSearch.at[nConfig, 'cv_results'] = dictCvResults

    lsSummaryMetrics = ['outer{}_test_score_summary'.format(nOuter) for nOuter in range(nOuterFolds)]
    dfModelSearch.at[nConfig, 'best_inner_models'] = get_ls_best_model_params(dictCvResults,
                                                                              nOuterFolds,
                                                                              lsSummaryMetrics)

    dfModelSearch.at[nConfig, 'test_results'] = []
    dfModelSearch.at[nConfig, 'test_predictions'] = []
    # best_inner_models follows the order of lsSummaryMetrics (outer fold 0, 1,
    # ...), so the position of each entry is the outer fold it belongs to.
    for nOuterFold, dictParams in enumerate(dfModelSearch.at[nConfig, 'best_inner_models']):
        objBestModel = dfModelSearch.at[nConfig, 'estimator']
        objBestModel.set_params(**dictParams)
        objTestPipeline = Pipeline([('std_scaler', objScaler),
                                    ('simple_imputer', objImpute),
                                    ('best_model', objBestModel)])
        arrTrain = dictSplitIndices['outer_train_{}'.format(nOuterFold)]
        arrTest = dictSplitIndices['outer_test_{}'.format(nOuterFold)]

        # Plain arrays on both sides so fit and predict receive the same input type.
        arrTrainX = dfData[lsFeatures].iloc[arrTrain].values
        arrTrainY = dfData[strTarget].iloc[arrTrain].values

        arrTestX = dfData[lsFeatures].iloc[arrTest].values
        arrTestY = dfData[strTarget].iloc[arrTest].values

        objTestPipeline.fit(arrTrainX, arrTrainY)

        # Score once and reuse the value for both storage and display.
        nTestScore = objTestPipeline.score(arrTestX, arrTestY)
        dfModelSearch.at[nConfig, 'test_results'].append(nTestScore)
        dfModelSearch.at[nConfig, 'test_predictions'].append(objTestPipeline.predict(arrTestX))
        print(nTestScore)
    return dfModelSearch

In [None]:
# Run the random search once for every configured estimator. The row label i
# selects the configuration (a literal 0 here would re-run only the first one).
for i in dfModelSearch.index:
    dfModelSearch = run_random_search(i, dfModelSearch, min_data, min_data.columns[:-1], min_data.columns[-1])

In [142]:
# Serialize the fitted test pipeline to bestModel.p in the working directory.
# NOTE(review): objTestPipeline is only assigned inside run_random_search(),
# where it is a local variable; none of the visible cells defines it at
# notebook scope, so on a fresh kernel this cell would presumably raise
# NameError -- confirm which fitted model is meant to be saved.
with open("bestModel.p", "wb") as objFile:
    pickle.dump(objTestPipeline, objFile)

# Keras and DNN search