In [3]:
# Needed Modules
import os
import logging
import codecs
import inspect
import glob

from xml.dom.minidom import parse
from subprocess import call

In [4]:
# parse_output function is based on code provided by Pascal Pfiffner.
# Link to code: https://github.com/p2/ClinicalTrialsNLP/blob/master/ctakes.py

In [5]:
def parse_output(filename):
    """Extract SNOMED, RxNorm and UMLS concept data from one cTAKES XMI file.

    Parameters
    ----------
    filename : str
        Path of the XMI file to read (anything accepted by
        ``xml.dom.minidom.parse``).

    Returns
    -------
    dict
        ``{'snomed': [...], 'cui': [...], 'rxnorm': [...]}``. Every list is
        de-duplicated and sorted, and may be empty.

        * ``'snomed'`` / ``'rxnorm'`` hold the ``preferredText`` of concept
          nodes whose ``codingScheme`` is ``SNOMEDCT_US`` / ``RXNORM``
          (the text, not the numeric code).
        * ``'cui'`` holds the ``cui`` attribute of every concept node; a
          concept referenced by a negated mention additionally contributes
          the same CUI prefixed with ``"-"``.
    """
    root = parse(filename).documentElement

    # Collect the ids of concepts referenced by negated mentions.
    # Both mention types are scanned: the previous `a or b` expression only
    # looked at the second tag when the first one had no elements at all.
    # NOTE(review): the cTAKES type system appears to name the drug mention
    # type "MedicationMention"; confirm that "MedicineMention" ever occurs.
    mention_tags = ('textsem:DiseaseDisorderMention', 'textsem:MedicineMention')
    neg_ids = set()
    for tag in mention_tags:
        for mention in root.getElementsByTagName(tag):
            polarity = mention.attributes.get('polarity')
            # NOTE(review): polarity == 0 is treated as "negated" here;
            # confirm this against the cTAKES version that produced the files.
            if polarity is None or int(polarity.value) != 0:
                continue
            ids = mention.attributes.get('ontologyConceptArr')
            if ids is not None and ids.value:
                neg_ids.update(int(i) for i in ids.value.split())

    snomeds = set()
    cuis = set()
    rxnorms = set()

    # Every element in the "refsem" namespace carries codified data.
    code_nodes = root.getElementsByTagNameNS(
        'http:///org/apache/ctakes/typesystem/type/refsem.ecore', '*')
    for node in code_nodes:
        attrs = node.attributes
        node_id_attr = attrs.get('xmi:id')
        if node_id_attr is None:
            # Nodes without an id are skipped entirely.
            continue
        is_neg = int(node_id_attr.value) in neg_ids
        names = attrs.keys()

        # SNOMED / RxNorm preferred text (requires scheme, code and text).
        if 'codingScheme' in names and 'code' in names \
                and 'preferredText' in names:
            scheme = attrs['codingScheme'].value
            if scheme == 'SNOMEDCT_US':
                snomeds.add(attrs['preferredText'].value)
            elif scheme == 'RXNORM':
                rxnorms.add(attrs['preferredText'].value)

        # UMLS CUI: always recorded; a negated concept also gets "-CUI".
        if 'cui' in names:
            cui = attrs['cui'].value
            cuis.add(cui)
            if is_neg:
                cuis.add("-%s" % cui)

    # Return a dictionary (empty lists are kept); sorted for a stable order.
    return {
        'snomed': sorted(snomeds),
        'cui': sorted(cuis),
        'rxnorm': sorted(rxnorms),
    }

In [6]:
# Example: parse two cTAKES output files whose names share the "17" prefix.
# Each call returns a dict with 'snomed', 'cui' and 'rxnorm' lists.
file1 = parse_output('D:\\Project2\\cTakes_out\\17_161087.txt.xmi')
file2 = parse_output('D:\\Project2\\cTakes_out\\17_194023.txt.xmi')

# Note: the file below is missing the preferredText key for one SNOMED instance.
#parse_output('D:\\Project2\\cTakes_out\\1606_198612.txt.xmi')
#print(file1)

In [7]:
def intersect_icus(dic1, dic2):
    """Intersect the values of two dictionaries key by key.

    For every key of ``dic1`` that also exists in ``dic2`` (visited in
    ``dic1``'s iteration order) the two values are intersected as sets.

    Returns a list of sets, one per shared key; keys found only in one of
    the dictionaries contribute nothing.
    """
    return [
        set(values).intersection(dic2[name])
        for name, values in dic1.items()
        if name in dic2
    ]

In [8]:
# Example
#intersect_icus(file1,file2)

In [9]:
def id_match(str1, str2):
    """Return the ID shared by two file paths, or ``None`` if they differ.

    The ID is the part of the file name that precedes the first ``"_"``,
    e.g. ``"17"`` for ``D:\\Project2\\cTakes_out\\17_161087.txt.xmi``.
    Both ``"\\"`` and ``"/"`` are accepted as path separators, and the
    directory part may contain any number of underscores.

    Parameters
    ----------
    str1, str2 : str
        Paths of the two files to compare.

    Returns
    -------
    str or None
        The common ID, or ``None`` when the IDs differ or a file name has
        no ``"_"`` to delimit an ID.
    """
    def _leading_id(path):
        # File name = everything after the last "/" or "\".
        name = path.replace("\\", "/").rsplit("/", 1)[-1]
        prefix, sep, _rest = name.partition("_")
        return prefix if sep else None

    id1 = _leading_id(str1)
    id2 = _leading_id(str2)

    if id1 is None or id1 != id2:
        return None
    return id1

In [10]:
def save_dict_to_file(dic, path='D:\\Project2\\id_dict.txt'):
    """Write ``str(dic)`` to a text file, replacing any existing content.

    Parameters
    ----------
    dic : dict
        Dictionary to persist; it is stored as its ``str()`` representation.
    path : str, optional
        Destination file. Defaults to the original hard-coded location.
    """
    # `with` closes the file even if the write fails.
    with open(path, 'w') as f:
        f.write(str(dic))
    
def load_dict_from_file(path='D:\\Project2\\id_dict.txt'):
    """Read a file containing a Python literal and return the evaluated object.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the original hard-coded location.

    Returns
    -------
    object
        Whatever the file content evaluates to.
    """
    # `with` closes the file even if reading fails.
    with open(path, 'r') as f:
        data = f.read()
    # SECURITY: eval() executes arbitrary code found in the file. Only load
    # files this notebook wrote itself. NOTE(review): consider a safer format
    # (e.g. json, or ast.literal_eval where it supports the stored values).
    return eval(data)

In [18]:
def main():
    """Intersect the cTAKES results of consecutive files that share an ID.

    All ``*.txt.xmi`` files in the cTAKES output folder are parsed in sorted
    order. Whenever a file and its predecessor yield the same ID from
    ``id_match``, their parsed results are intersected and stored under that
    ID (a later pair with the same ID overwrites an earlier one). The
    resulting dictionary is written out with ``save_dict_to_file``.
    """
    id_dict = {}

    note_ctakes_output_folder = os.path.join("D:\\", 'Project2', 'cTakes_out')
    xmi_paths = sorted(glob.glob(os.path.join(note_ctakes_output_folder, '*.txt.xmi')))

    # Path and parse result of the file handled in the previous iteration.
    previous_path = None
    previous_parsed = None

    for current_path in xmi_paths:
        current_parsed = parse_output(current_path)

        # The first file has no predecessor to compare against.
        if previous_path is not None:
            shared_id = id_match(previous_path, current_path)
            if shared_id is not None:
                # One set per result key (snomed, cui, rxnorm).
                id_dict[shared_id] = intersect_icus(previous_parsed, current_parsed)

        # The current file becomes the predecessor for the next iteration.
        previous_path = current_path
        previous_parsed = current_parsed

    # Save to text file.
    save_dict_to_file(id_dict)
        
# Run the full pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()
    # Optional profiling; cProfile is not among the imports at the top and
    # must be imported before enabling the next line.
    #cProfile.run('main()') # if you want to do some profiling
    


In [11]:
# Reload the dictionary that main() wrote to disk via save_dict_to_file().
id_dict = load_dict_from_file()

In [None]:
# Summarize results from new dictionary.

In [38]:
# Entry for ID '10004', element 2. intersect_icus emits one set per key of
# parse_output's result ('snomed', 'cui', 'rxnorm'), so index 2 is the
# 'rxnorm' intersection.
id_dict['10004'][2]

10004
1004
10059
10071
10094
10117
10119
10124
10125
10134
10149
10152
10167
1018
10197
10206
10224
10226
10236
10248
10251
10257
10262
10272
1027
1029
10301
10302
10303
10304
10305
10310
10315
10328
10369
10390
10399
10414
10416
10417
10422
10423
10428
1044
10471
10478
10512
10525
10532
10560
10569
10594
10595
105
10604
10623
1062
10637
10653
10655
10668
10676
10677
10686
10687
10689
10694
10699
1069
10704
10725
10736
10742
10753
10806
10833
10859
1086
10886
10916
10924
10928
10932
10939
1093
10947
10948
10972
10976
10992
11007
11021
11024
11049
11050
11061
1106
11077
11085
11090
11099
11109
11123
11135
11143
11146
11147
11165
11171
11195
111
11202
1121
11229
11234
1124
11255
11285
11287
11288
11295
112
11317
11321
11323
11328
11335
11338
11343
11348
1134
11356
1135
11362
11369
1137
11382
1141
11427
11432
11448
11460
11464
11473
11474
11477
11479
11505
11512
11526
11554
11563
11567
11585
11587
11588
11604
11608
1160
11610
11618
11623
11627
1162
11634
11643
11646
11656
11667
11674
1167

In [None]:
# Find most mentioned words/ids in id_dict.
snomed_counts = {}
cui_counts = {}
rxnorm_counts = {}

# Each id_dict entry is a list of three collections; position i feeds
# count_tables[i] (0 -> snomed, 1 -> cui, 2 -> rxnorm).
count_tables = (snomed_counts, cui_counts, rxnorm_counts)

# `patient_id` instead of `id`, so the builtin id() is not shadowed for the
# rest of the kernel session.
for patient_id, shared_terms in id_dict.items():
    for counts, phrases in zip(count_tables, shared_terms):
        for phrase in phrases:
            # Number of entries in which this phrase occurs.
            counts[phrase] = counts.get(phrase, 0) + 1
                

In [46]:
# Sort the dictionaries by value
def sort_count(dict, n_out):
    """Print the ``n_out`` entries of a count dictionary with the highest counts.

    Parameters
    ----------
    dict : dict
        Maps a key (e.g. a phrase) to its count. The parameter name shadows
        the builtin inside this function; it is kept so existing callers
        continue to work.
    n_out : int
        Number of entries to print.

    Each printed line is ``count key``. Entries are ordered by count,
    highest first; ties are ordered by key, descending.
    """
    # (count, key) pairs so that sorting is driven by the count.
    lst = [(val, key) for key, val in dict.items()]
    lst.sort(reverse=True)

    # Iterate the local list; the previous code referenced the undefined
    # name `lst_snomed` here.
    for count, key in lst[:n_out]:
        print(count, key)

In [None]:
# Print the 10 highest counts from snomed_counts.
sort_count(snomed_counts,10)

In [None]:
# Print the 10 highest counts from cui_counts.
sort_count(cui_counts,10)

In [None]:
# Print the 10 highest counts from rxnorm_counts.
sort_count(rxnorm_counts,10)

In [None]:
# Notes for future research:

# Patient Disease and Treatment extraction:

# 1. Find patients who visited the ICU two times and were treated for the same illness both times.
        # Q: Should we care about time in between visits?
    
# 2. Extract the medicine that the patient received while in the ICU for both visits. Available through cTAKES using RxNorm concepts.

In [None]:
# Phenotype: Surrogate markers combine with treatment decision 
# Marker that it is probable they have disease and incorporate into decision function.

# Anchor event.

In [24]:
# Maximize time to return to icu. Delay return to icu visit as response variable. 
