In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
%matplotlib inline

In [3]:
# Shared data directory; each source table lives in its own CSV beneath it.
strDataDir = '/project/hackathon/hackers09/shared/data/'

# Build the four CSV paths in one pass (echo, radiology, outcome, encounter).
strEchoCsv, strRadCsv, strOutcomeCsv, strEncounterCsv = (
    os.path.join(strDataDir, strCsvName)
    for strCsvName in ('df_echo.csv', 'df_radiology.csv', 'df_outcome.csv', 'df_encounter.csv')
)

In [4]:
# Load the four source tables into DataFrames.
# The encoding is passed explicitly as ISO-8859-1 (Latin-1) rather than relying on
# read_csv's default — presumably the files contain non-UTF-8 bytes; confirm.
# NOTE(review): dfRad is loaded here but not referenced by any cell visible in this section.
dfEcho = pd.read_csv(strEchoCsv, encoding='ISO-8859-1')
dfRad = pd.read_csv(strRadCsv, encoding='ISO-8859-1')
dfOutcome = pd.read_csv(strOutcomeCsv, encoding='ISO-8859-1')
dfEncounter = pd.read_csv(strEncounterCsv, encoding='ISO-8859-1')

In [4]:
# (rows, columns) of the encounter table.
dfEncounter.shape

(1642, 24)

In [5]:
# (rows, columns) of the outcome table.
dfOutcome.shape

(1128, 7)

In [18]:
# Column names of the outcome table, in file order.
dfOutcome.columns

Index(['PATIENT_ID', 'HSP_ENC', 'NAME', 'ORDER_TIME_DIFFSEC',
       'PROC_START_TIME_DIFFSEC', 'PROC_ENDING_TIME_DIFFSEC', 'name_gen'],
      dtype='object')

In [25]:
def expand_outcome(dfOutcome, nMaxOrderSec=48 * 60 * 60):
    """Pivot the long outcome table into one row per hospital encounter.

    Each distinct ``name_gen`` value becomes its own group of columns named
    ``<source column>_<name_gen>``. When an encounter has several rows for the
    same ``name_gen``, the row with the smallest ``ORDER_TIME_DIFFSEC`` is
    kept (ties resolved by file order). Previously such rows overwrote each
    other in file order, so ``MinOrderTime`` could be larger than the true
    minimum order time of the encounter.

    Parameters
    ----------
    dfOutcome : pandas.DataFrame
        Long-format outcomes. Must contain ``PATIENT_ID``, ``HSP_ENC``,
        ``ORDER_TIME_DIFFSEC`` and ``name_gen``; every column other than
        ``PATIENT_ID`` and ``HSP_ENC`` is expanded per outcome.
    nMaxOrderSec : int or float, optional
        Only rows whose ``ORDER_TIME_DIFFSEC`` is strictly below this value
        are kept. Defaults to 48 hours expressed in seconds.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``HSP_ENC`` and sorted by ``MinOrderTime``. Columns are the
        per-outcome groups (most frequent ``name_gen`` first), followed by
        ``PATIENT_ID``, ``b48hr`` (1.0 when any retained order time is below
        ``nMaxOrderSec``; the name is kept for compatibility even if a
        different window is passed) and ``MinOrderTime``.
    """
    lsKeyCols = ['PATIENT_ID', 'HSP_ENC']

    # Rows with a missing order time compare False and are therefore dropped too.
    dfOutcome = dfOutcome[dfOutcome['ORDER_TIME_DIFFSEC'] < nMaxOrderSec]

    # Select the columns to expand by name rather than by position, so the
    # result does not depend on the key columns being the first two.
    lsOutcomeColsToRetain = [strCol for strCol in dfOutcome.columns if strCol not in lsKeyCols]
    lsUniqueOutcomes = dfOutcome['name_gen'].value_counts().index.tolist()
    idxEnc = dfOutcome['HSP_ENC'].value_counts().index.rename('HSP_ENC')

    # Earliest order per (encounter, outcome); the stable sort keeps file order on ties.
    dfEarliest = (
        dfOutcome
        .sort_values('ORDER_TIME_DIFFSEC', kind='mergesort')
        .drop_duplicates(subset=['HSP_ENC', 'name_gen'], keep='first')
    )

    # One block of suffixed columns per outcome, aligned on the full encounter index
    # so encounters lacking that outcome get NaN.
    lsPieces = []
    for strOutcome in lsUniqueOutcomes:
        dfPiece = (
            dfEarliest.loc[dfEarliest['name_gen'] == strOutcome]
            .set_index('HSP_ENC')[lsOutcomeColsToRetain]
            .add_suffix('_' + strOutcome)
            .reindex(idxEnc)
        )
        lsPieces.append(dfPiece)

    # PATIENT_ID is taken from the encounter's last row in file order, as before.
    sPatientId = (
        dfOutcome
        .drop_duplicates(subset='HSP_ENC', keep='last')
        .set_index('HSP_ENC')['PATIENT_ID']
        .reindex(idxEnc)
    )

    dfExpandedOutcome = pd.concat(lsPieces + [sPatientId], axis=1)
    dfExpandedOutcome.index.name = 'HSP_ENC'

    # Flag encounters with any order inside the window, and record the earliest order time.
    lsOrderTimeCols = ['ORDER_TIME_DIFFSEC_' + strOutcome for strOutcome in lsUniqueOutcomes]
    dfOrderTimes = dfExpandedOutcome[lsOrderTimeCols]
    dfExpandedOutcome['b48hr'] = (dfOrderTimes < nMaxOrderSec).any(axis=1).astype(float)
    dfExpandedOutcome['MinOrderTime'] = dfOrderTimes.min(axis=1)

    return dfExpandedOutcome.sort_values('MinOrderTime')

In [6]:
# Preview the first rows of the echo table; each row holds one NARRATIVE line of an order.
dfEcho.head()

Unnamed: 0,PATIENT_ID,HSP_ENC,ORDER_PROC_ID,NAME,ORDER_INST_DIFFSEC,PROC_START_TIME_DIFFSEC,new_line,RESULT_TIME_DIFFSEC,ECHO_TYPE,NARRATIVE
0,1193,182324191,287214096,CV ECHO STUDY,16200,37260,1,81720,echo_old,STUDY DATE: 10/06/2014
1,1193,182324191,287214096,CV ECHO STUDY,16200,37260,2,81720,echo_old,REASON FOR STUDY: PULMONARY EMBOLISM
2,1193,182324191,287214096,CV ECHO STUDY,16200,37260,3,81720,echo_old,"ORDERING PHYSICIAN: LANCE TERADA,"
3,1193,182324191,287214096,CV ECHO STUDY,16200,37260,4,81720,echo_old,PERFORMED BY: * *** * InterpretingPhysician:KA...
4,1193,182324191,287214096,CV ECHO STUDY,16200,37260,5,81720,echo_old,INTERPRETATION SUMMARY * A complete two-dimens...


In [7]:
# Concatenate every NARRATIVE line of a single echo order to read the report as one string.
# Summing strings joins them with no separator, so adjacent lines run together in the output.
dfEcho[dfEcho['ORDER_PROC_ID']==287214096]['NARRATIVE'].sum()

'STUDY DATE: 10/06/2014REASON FOR STUDY: PULMONARY EMBOLISMORDERING PHYSICIAN: LANCE TERADA,PERFORMED BY: * *** * InterpretingPhysician:KATY LONERGAN,MD electronically signed on 10-06-2014 18:21:39 * ***INTERPRETATION SUMMARY * A complete two-dimensional transthoracic echocardiogram was performed * (2D, M-mode, Doppler and color flow Doppler). Normal LV size and * systolic function. LV EF 55% (biplane). Mildly dilated right ventricle * with mildly reduced systolic function. Flattened interventricular septum * during systole suggests RV pressure overload. Normal atrial size. No * significant valve dysfunction. Peak TR jet velocity estimates RVSP 33 * mmhg plus CVP. Normal IVC size/collapse suggests normal CVP. trace * pericardial effusion. Compared to prior exam 10/3/2014, estimated LV EF * is higher. RV appears larger. RV hypokinesis and septal flattening more * evident. * ***MMODE/2D MEASUREMENTS & CALCULATIONS * RVDd: 4.2 cm             LVIDd: 5.0 cm            IVS/LVPW: 0.9 * IVSd: 

In [26]:
def compile_echo(dfEcho, nMaxOrderSec=12 * 60 * 60, strSep=' '):
    """Collapse the line-per-row echo table into one row per hospital encounter.

    For every encounter, the echo order with the smallest
    ``ORDER_INST_DIFFSEC`` is kept (ties resolved by the smaller
    ``ORDER_PROC_ID``) and its narrative lines are joined in ``new_line``
    order. Previously the order was chosen with ``value_counts().index[0]``,
    which selects the order with the *most* narrative lines rather than the
    first one, and the lines were concatenated with no separator, fusing the
    last word of one line with the first word of the next.

    Parameters
    ----------
    dfEcho : pandas.DataFrame
        One row per narrative line. Must contain ``HSP_ENC``,
        ``ORDER_PROC_ID``, ``ORDER_INST_DIFFSEC``, ``new_line`` and
        ``NARRATIVE``.
    nMaxOrderSec : int or float, optional
        Only rows whose ``ORDER_INST_DIFFSEC`` is strictly below this value
        are kept. Defaults to 12 hours expressed in seconds.
    strSep : str, optional
        Separator placed between consecutive narrative lines.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``HSP_ENC``. Holds every input column except ``new_line``,
        ``NARRATIVE`` and ``HSP_ENC`` (values taken from the first line of
        the chosen order) plus ``NARRATIVE_compiled``. Rows are ordered by
        order time; row order carries no meaning.
    """
    # Rows with a missing order time compare False and are therefore dropped too.
    dfEcho = dfEcho[dfEcho['ORDER_INST_DIFFSEC'] < nMaxOrderSec]

    lsEchoCols = [
        strCol for strCol in dfEcho.columns
        if strCol not in ('new_line', 'NARRATIVE', 'HSP_ENC')
    ]

    if dfEcho.empty:
        # Nothing inside the window: return an empty frame with the usual layout.
        return pd.DataFrame(
            columns=lsEchoCols + ['NARRATIVE_compiled'],
            index=pd.Index([], name='HSP_ENC'),
        )

    # Earliest order first; within an order, lines in their recorded sequence.
    dfSorted = dfEcho.sort_values(['ORDER_INST_DIFFSEC', 'ORDER_PROC_ID', 'new_line'])

    # Keep only the rows belonging to each encounter's earliest order.
    sFirstOrderId = dfSorted.groupby('HSP_ENC')['ORDER_PROC_ID'].transform('first')
    dfStudy = dfSorted[dfSorted['ORDER_PROC_ID'] == sFirstOrderId]

    # Order-level metadata comes from the first line of the chosen order.
    dfCompiledEcho = (
        dfStudy
        .drop_duplicates(subset='HSP_ENC', keep='first')
        .set_index('HSP_ENC')[lsEchoCols]
    )

    # Missing narrative lines are skipped instead of breaking the concatenation.
    sNarrative = dfStudy.groupby('HSP_ENC')['NARRATIVE'].agg(
        lambda sLines: strSep.join(sLines.dropna().astype(str))
    )

    # assign() aligns the narratives to the metadata rows on the HSP_ENC index.
    return dfCompiledEcho.assign(NARRATIVE_compiled=sNarrative)

In [27]:
def get_merge_dfs(dfOutcome, dfEcho, dfEncounter):
    """Join per-encounter outcome and echo summaries onto the encounter table.

    The outcome table is expanded with ``expand_outcome`` and the echo table
    is compiled with ``compile_echo``; both results are then left-joined on
    ``HSP_ENC`` to the encounter table, so every encounter row is kept whether
    or not it has an outcome or an echo. Column names shared between the
    joined frames receive pandas' default merge suffixes.

    Parameters
    ----------
    dfOutcome, dfEcho, dfEncounter : pandas.DataFrame
        Raw outcome, echo and encounter tables; ``dfEncounter`` must have an
        ``HSP_ENC`` column.

    Returns
    -------
    pandas.DataFrame
        The encounter table with the outcome and echo columns appended.
    """
    # Derive both per-encounter tables up front (outcomes first, then echo).
    lsPerEncounterFrames = [expand_outcome(dfOutcome), compile_echo(dfEcho)]

    # Key the encounters by HSP_ENC, then attach each derived table in turn.
    dfMerged = dfEncounter.set_index('HSP_ENC')
    for dfRight in lsPerEncounterFrames:
        dfMerged = dfMerged.merge(dfRight, how='left', on='HSP_ENC')

    return dfMerged

In [28]:
# Build the merged encounter / outcome / echo table from the three raw frames.
dfMerged = get_merge_dfs(dfOutcome, dfEcho, dfEncounter)

In [16]:
# Write the merged table to the current working directory.
# NOTE(review): this cell's execution count (16) is lower than that of the cell that
# builds dfMerged (28), so the file on disk may predate the current dfMerged — re-run
# this cell after rebuilding dfMerged.
dfMerged.to_csv('./enc_outcome_echo.csv')

In [30]:
# Inspect one encounter that has several outcome rows, including repeated rows
# for the same name_gen value with different order times.
dfOutcome[dfOutcome['HSP_ENC']==306050512]

Unnamed: 0,PATIENT_ID,HSP_ENC,NAME,ORDER_TIME_DIFFSEC,PROC_START_TIME_DIFFSEC,PROC_ENDING_TIME_DIFFSEC,name_gen
111,504,306050512,IR THROMBOLYSIS,41940.0,42000.0,,THROMBOLYSIS_PROC
113,504,306050512,IR THROMBOLYSIS,71220.0,97860.0,155400.0,THROMBOLYSIS_PROC
119,504,306050512,IR THROMBOLYSIS,56160.0,97860.0,,THROMBOLYSIS_PROC
123,504,306050512,IR THROMBOLYSIS,42000.0,42000.0,69000.0,THROMBOLYSIS_PROC
974,504,306050512,ALTEPLASE INFUSION (RADIOLOGY),48840.0,51060.0,,THROMBOLYSIS
