In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
%matplotlib inline

In [2]:
# Directory holding the CSV exports, and the full path to each export used below.
strDataDir = '/project/hackathon/hackers09/shared/data/'
strEchoCsv, strRadCsv, strOutcomeCsv, strEncounterCsv = (
    os.path.join(strDataDir, strFileName)
    for strFileName in (
        'df_echo.csv',
        'df_radiology.csv',
        'df_outcome.csv',
        'df_encounter.csv',
    )
)

In [3]:
# Load the four exports in a fixed order; each file is decoded as ISO-8859-1.
dfEcho, dfRad, dfOutcome, dfEncounter = (
    pd.read_csv(strCsvPath, encoding='ISO-8859-1')
    for strCsvPath in (strEchoCsv, strRadCsv, strOutcomeCsv, strEncounterCsv)
)

In [4]:
# Dimensions of the encounter table as (rows, columns).
dfEncounter.shape

(1642, 24)

In [5]:
# Dimensions of the long-format outcome table as (rows, columns).
dfOutcome.shape

(1128, 7)

In [9]:
def expand_outcome(dfOutcome, nWindowSec=48*60*60):
    """Widen the long outcome table to one row per hospital encounter.

    Every column of ``dfOutcome`` from the third one onwards is replicated
    once per distinct ``name_gen`` value and named ``<column>_<name_gen>``.
    Each input row is written into the cells of its encounter (``HSP_ENC``)
    and outcome. When several rows share the same encounter and outcome, the
    row that appears last in ``dfOutcome`` overwrites the earlier ones; the
    same holds for ``PATIENT_ID``.

    Parameters
    ----------
    dfOutcome : pandas.DataFrame
        Long-format outcomes. Must contain ``HSP_ENC``, ``PATIENT_ID`` and
        ``name_gen``. The first two columns are skipped positionally when
        choosing which columns to expand (assumed to be the two identifier
        columns -- TODO confirm against the CSV layout).
    nWindowSec : int or float, optional
        Threshold compared against the expanded ``ORDER_TIME`` columns when
        building ``b48hr``. Defaults to 172800 (48 hours in seconds), the
        value that was previously hard-coded. The output column keeps the
        name ``b48hr`` even when a different window is passed.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``HSP_ENC`` and sorted by ``MinOrderTime``. Holds the
        expanded columns, ``PATIENT_ID``, ``b48hr`` (1.0 when any expanded
        order-time column is below ``nWindowSec``, else 0.0) and
        ``MinOrderTime`` (row-wise minimum of those columns, ignoring
        missing values). The expanded columns stay object-typed.
    """
    # Positional split: everything after the first two columns gets expanded.
    lsOutcomeColsToRetain = dfOutcome.columns[2:].tolist()
    lsSourceCols = lsOutcomeColsToRetain + ['PATIENT_ID']

    # One group of expanded columns per outcome, most frequent outcome first.
    lsUniqueOutcomes = dfOutcome['name_gen'].value_counts().index.tolist()
    lsExpandedOutcomeCols = [
        strCol + '_' + strOutcome
        for strOutcome in lsUniqueOutcomes
        for strCol in lsOutcomeColsToRetain
    ]

    # Empty (all-NaN, object-typed) frame with one row per encounter.
    idxEncounters = pd.Index(dfOutcome['HSP_ENC'].value_counts().index, name='HSP_ENC')
    dfExpandedOutcome = pd.DataFrame(
        index=idxEncounters,
        columns=lsExpandedOutcomeCols + ['PATIENT_ID'],
    )

    # Single pass over the encounters instead of re-filtering dfOutcome for
    # each one. Rows inside a group keep their original order, so the
    # "last row wins" overwrite behaviour is unchanged.
    for nEnc, dfEncRows in dfOutcome.groupby('HSP_ENC', sort=False):
        arrRows = dfEncRows[lsSourceCols].to_numpy(dtype=object)
        for strOutcome, arrRow in zip(dfEncRows['name_gen'], arrRows):
            lsTempCols = [strCol + '_' + strOutcome for strCol in lsOutcomeColsToRetain]
            lsTempCols.append('PATIENT_ID')
            dfExpandedOutcome.loc[nEnc, lsTempCols] = arrRow

    # Derive the flag and the minimum from a numeric copy of the order-time
    # columns; the expanded columns themselves are left untouched.
    lsOrderTimeCols = [strCol for strCol in dfExpandedOutcome.columns if 'ORDER_TIME' in strCol]
    dfOrderTimes = dfExpandedOutcome[lsOrderTimeCols].apply(pd.to_numeric)

    # Missing order times compare as False, so they never set the flag.
    dfExpandedOutcome['b48hr'] = dfOrderTimes.lt(nWindowSec).any(axis=1).astype(float)

    # Column for minimum order time from all outcomes
    dfExpandedOutcome['MinOrderTime'] = dfOrderTimes.min(axis=1)

    return dfExpandedOutcome.sort_values('MinOrderTime')

In [10]:
# Build the one-row-per-encounter outcome table from the long outcome records.
dfExpandedOutcome = expand_outcome(dfOutcome)

In [18]:
# Preview the first rows of the widened outcome table.
dfExpandedOutcome.head()

Unnamed: 0_level_0,NAME_VASOPRESSORS,ORDER_TIME_DIFFSEC_VASOPRESSORS,PROC_START_TIME_DIFFSEC_VASOPRESSORS,PROC_ENDING_TIME_DIFFSEC_VASOPRESSORS,name_gen_VASOPRESSORS,NAME_DEATH,ORDER_TIME_DIFFSEC_DEATH,PROC_START_TIME_DIFFSEC_DEATH,PROC_ENDING_TIME_DIFFSEC_DEATH,name_gen_DEATH,...,PROC_ENDING_TIME_DIFFSEC_THROMBOLYSIS,name_gen_THROMBOLYSIS,NAME_PPV,ORDER_TIME_DIFFSEC_PPV,PROC_START_TIME_DIFFSEC_PPV,PROC_ENDING_TIME_DIFFSEC_PPV,name_gen_PPV,PATIENT_ID,b48hr,MinOrderTime
HSP_ENC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
178271953,"DOPAMINE 800 MG/250 ML (3,200 MCG/ML) IN 5 % D...",157680.0,158040.0,,VASOPRESSORS,Expired,676740.0,,,DEATH,...,,,,,,,,55,1.0,-480.0
195511480,,,,,,,,,,,...,,,IP RT BIPAP,180.0,240.0,2400.0,PPV,303,1.0,180.0
299237590,EPINEPHRINE 1 MG/ML INJECTION,180.0,,,VASOPRESSORS,,,,,,...,,,,,,,,1237,1.0,180.0
305160067,EPINEPHRINE HCL 4 MG/250 ML (16 MCG/ML) IN 5 %...,23400.0,23520.0,,VASOPRESSORS,Expired,34440.0,,,DEATH,...,,,,,,,,221,1.0,240.0
302730841,VASOPRESSIN STANDARD CONCENTRATION INFUSION,4560.0,4860.0,,VASOPRESSORS,Expired,160200.0,,,DEATH,...,,,,,,,,1202,1.0,540.0


In [20]:
# List the echo table's columns before reshaping it.
dfEcho.columns

Index(['PATIENT_ID', 'HSP_ENC', 'ORDER_PROC_ID', 'NAME', 'ORDER_INST_DIFFSEC',
       'PROC_START_TIME_DIFFSEC', 'new_line', 'RESULT_TIME_DIFFSEC',
       'ECHO_TYPE', 'NARRATIVE'],
      dtype='object')

In [29]:
# Number of rows per echo order (ORDER_PROC_ID), restricted to rows whose
# ORDER_INST_DIFFSEC is below 12*60*60 (12 hours, presumably in seconds).
dfEcho[dfEcho['ORDER_INST_DIFFSEC'] < 12*60*60]['ORDER_PROC_ID'].value_counts()

291302410    16
289100082    16
339472793    16
290659621    16
292323530    16
             ..
342923419     4
371951252     4
355830397     4
358692685     3
354936942     3
Name: ORDER_PROC_ID, Length: 436, dtype: int64

In [24]:
# Size of the 12-hour echo subset as (rows, columns). Showing the shape,
# rather than the whole filtered frame, matches the recorded output below.
dfEcho[dfEcho['ORDER_INST_DIFFSEC'] < 12*60*60].shape

(5616, 10)

In [11]:
def expand_echo(dfEcho, nMaxOrderSec=12*60*60):
    """Widen the long echo table to one row per hospital encounter.

    Rows whose ``ORDER_INST_DIFFSEC`` is not below ``nMaxOrderSec`` are
    dropped first. Every column of ``dfEcho`` from the fourth one onwards is
    then replicated once per distinct ``new_line`` value and named
    ``<column>_<new_line>``. Each remaining row is written into the cells of
    its encounter (``HSP_ENC``) and line. When several rows share the same
    encounter and ``new_line`` value, the row that appears last in ``dfEcho``
    overwrites the earlier ones; the same holds for ``PATIENT_ID``.

    Parameters
    ----------
    dfEcho : pandas.DataFrame
        Long-format echo rows. Must contain ``PATIENT_ID``, ``HSP_ENC``,
        ``ORDER_INST_DIFFSEC`` and ``new_line``. The first three columns are
        skipped positionally when choosing which columns to expand. The
        caller's frame is not modified.
    nMaxOrderSec : int or float, optional
        Exclusive upper bound on ``ORDER_INST_DIFFSEC``. Defaults to 43200
        (12 hours in seconds), the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``HSP_ENC``, with ``PATIENT_ID`` followed by the expanded
        columns. Cells with no matching echo row stay NaN.
    """
    # Keep only echoes ordered inside the window.
    dfEcho = dfEcho[dfEcho['ORDER_INST_DIFFSEC'] < nMaxOrderSec]

    lsUniqueEchoCols = dfEcho.columns[3:].tolist()
    lsSourceCols = ['PATIENT_ID'] + lsUniqueEchoCols

    # One group of expanded columns per distinct line, most frequent first.
    # str() is used here and in the fill loop below so the labels always
    # agree, whatever the dtype of new_line is.
    lsLineLabels = [str(xLine) for xLine in dfEcho['new_line'].value_counts().index]
    lsExpandedEchoCols = [
        strCol + '_' + strLine
        for strLine in lsLineLabels
        for strCol in lsUniqueEchoCols
    ]

    # Empty (all-NaN, object-typed) frame with one row per encounter.
    idxEncounters = pd.Index(dfEcho['HSP_ENC'].value_counts().index, name='HSP_ENC')
    dfExpandedEcho = pd.DataFrame(
        index=idxEncounters,
        columns=['PATIENT_ID'] + lsExpandedEchoCols,
    )

    # Single pass over the encounters instead of re-filtering dfEcho for
    # each one. Rows inside a group keep their original order.
    for nEnc, dfEncRows in dfEcho.groupby('HSP_ENC', sort=False):
        arrRows = dfEncRows[lsSourceCols].to_numpy(dtype=object)
        for xLine, arrRow in zip(dfEncRows['new_line'], arrRows):
            # Plain str() works for numpy and Python scalars alike; calling
            # .astype(str) on the scalar failed for string-valued new_line.
            strLine = str(xLine)
            lsTempCols = ['PATIENT_ID'] + [strCol + '_' + strLine for strCol in lsUniqueEchoCols]
            dfExpandedEcho.loc[nEnc, lsTempCols] = arrRow

    return dfExpandedEcho

In [12]:
# Build the one-row-per-encounter echo table from the long echo records.
dfExpandedEcho = expand_echo(dfEcho)

In [15]:
# Index encounters by HSP_ENC. The guard makes re-running this cell a no-op:
# once HSP_ENC is the index it is no longer a column, and an unguarded
# set_index would raise KeyError on the second run.
if 'HSP_ENC' in dfEncounter.columns:
    dfEncounter = dfEncounter.set_index('HSP_ENC')

In [None]:
# Both frames are indexed by HSP_ENC, so join on the index. Without explicit
# keys, merge falls back to whatever column names the two frames share,
# which is not the encounter key. The left join keeps every encounter, with
# NaN echo columns where no echo row exists.
# Columns present in both frames keep their name for the encounter side and
# get an '_ECHO' suffix for the echo side.
dfMerged = dfEncounter.merge(
    dfExpandedEcho,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('', '_ECHO'),
)