
Selecting MGH Cath reports


In [1]:
##septal

import os
import re
import numpy as np
import matplotlib
from pathlib import Path
import pandas as pd
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)


In [2]:
from IPython.display import display, HTML         #allows printing of notes with line breaks (*not helpful for cath reports?)

def pretty_print(df):
    """Display a DataFrame as an HTML table with escaped newlines shown as line breaks.

    The HTML produced by ``df.to_html()`` is searched for the two-character
    sequence backslash + ``n``; every occurrence is replaced by ``<br>`` before
    the markup is handed to IPython's ``HTML`` and ``display``.

    Returns whatever ``display`` returns.
    """
    table_markup = df.to_html()
    table_markup = table_markup.replace("\\n", "<br>")
    rendered = HTML(table_markup)
    return display(rendered)

In [2]:
file=('/mnt/obi0/phi/ehr/pre-sysmex_Hx_labeling/data/jgt-make_CADHx_labelling_questionnaire_20191217_JJ_out__jj.csv')  
df = pd.read_csv(file)
df.shape

(2, 3)

In [4]:
print(df)

  Unnamed: 0  score_CAD                                            methods
0  Z11211310          1  Found a pre-Sysmex note that specified that pa...
1  Z11014558          1  Note immediately before Sysmex specifies that ...


Creating parquet file

In [None]:
path_notes = "/mnt/obi0/phi/ehr/note_pull/parquet"      #folder holding the pulled note parquet files

# Collect the bare file names found directly inside path_notes.
# Only the top-level directory is listed: the names are later joined back onto
# this same folder, so a name picked up from a subfolder would point at a
# file that does not exist there.
files = []
for _root, _subdirs, names in os.walk(path_notes):
    files.extend(names)
    break                                               #first iteration of os.walk is path_notes itself

#for f in files:
#    print(f)

In [None]:
files.sort()                         #sorts the names in place so the files are handled in a stable order

#keeps only the file names that start with MGH_cath
cathfiles = [name for name in files if name.startswith('MGH_cath')]



In [None]:
# Reads every MGH_cath parquet file and stacks the notes into cathnotes_df.
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and calling it in a loop copied
# the whole accumulated frame on every iteration.
cathnote_frames = []

for cathfile in cathfiles:
    file= "/mnt/obi0/phi/ehr/note_pull/parquet/" + cathfile
    dfnotes = pd.read_parquet(file)
    dfnotes["Stenosis"] = 0                                      #placeholder column
    dfnotes=dfnotes.sort_values(by=['PatientID'], ascending=True)
    cathnote_frames.append(dfnotes)

#pd.concat rejects an empty list, so an empty file list still yields an empty frame
cathnotes_df = pd.concat(cathnote_frames) if cathnote_frames else pd.DataFrame()
    

In [None]:
cathnotes_df.shape                           #596,900 notes

In [None]:
cathnotes_df = cathnotes_df[cathnotes_df['NoteTXT'].str.contains('CARDIAC CATHETERIZATION LABORATORY FINAL REPORT', case=False)]

In [None]:
cathnotes_df.shape                           #19968 labeled "Cardiac cath lab final report"

In [None]:
cathnotes_nosummary = cathnotes_df[~cathnotes_df['NoteTXT'].str.contains('CORONARY ANATOMY FINDINGS', case=True)]

In [None]:
cathnotes_nosummary.shape                    #8141 without 'coronary anatomy findings' section

In [None]:
cathnotes_nosummary2 = cathnotes_nosummary[~cathnotes_nosummary['NoteTXT'].str.contains('Right heart catheterization | RHC | RHC, | peripheral | biopsy | pericardial | pericardiocentesis | aborted | thrombolysis | pulmonary embolism | valve | MV | occluder | device | PFO | pacemaker | IABP | transseptal', case=False)]

In [None]:
cathnotes_nosummary2.shape                    #120, of which approx 1/3 do have a diagnostic cath summary, but not in typical template format

In [None]:
cathnotes_df = cathnotes_df[cathnotes_df['NoteTXT'].str.contains('CORONARY ANATOMY FINDINGS', case=True)]

In [None]:
cathnotes_df.shape                           #11,827 with "coronary anatomy findings" section

In [None]:
# A note is "incomplete" when at least one of the four vessel headers is missing.
# Each header is tested with a plain substring search. The previous single
# lookahead regex needed a space before every header and, because '.' does not
# match a newline, could not see headers that sit on different lines, so it
# disagreed with the four sequential header filters applied to cathnotes_df.
notetext = cathnotes_df['NoteTXT']
has_all_vessels = (
    notetext.str.contains('Left Main:', case=True, regex=False)
    & notetext.str.contains('LAD:', case=True, regex=False)
    & notetext.str.contains('Left Circumflex:', case=True, regex=False)
    & notetext.str.contains('RCA:', case=True, regex=False)
)
cathnotes_incomplete = cathnotes_df[~has_all_vessels]

In [None]:
cathnotes_incomplete.shape                   #598 without all 4 vessel sections

In [None]:
cathnotes_df = cathnotes_df[cathnotes_df['NoteTXT'].str.contains('Left Main:', case=True)]
cathnotes_df = cathnotes_df[cathnotes_df['NoteTXT'].str.contains('LAD:', case=True)]
cathnotes_df = cathnotes_df[cathnotes_df['NoteTXT'].str.contains('Left Circumflex:', case=True)]
cathnotes_df = cathnotes_df[cathnotes_df['NoteTXT'].str.contains('RCA:', case=True)]

In [None]:
cathnotes_df.shape                          #11,575 with all 4 vessel sections

In [None]:
cathnotes_df = cathnotes_df[~cathnotes_df['NoteTXT'].str.contains('transplant', case=False)]

In [None]:
cathnotes_df.shape                          #10,581 after excluding transplant cases

In [None]:
cathnotes_df = cathnotes_df[~cathnotes_df['NoteTXT'].str.contains('SVG|LIMA', case=False)]

In [None]:
cathnotes_df.shape                          #8879 after excluding CABG

In [None]:
cathnotes_df = cathnotes_df[~cathnotes_df['NoteTXT'].str.contains('Coronary Anomaly:', case=False)]

In [None]:
cathnotes_df.shape                          #8724 after excluding anomalous coronary cases

In [None]:
cathnotes_df["Stenosis"] = cathnotes_df['NoteTXT'].str.contains('stenosis|stenoses', case=False)

In [None]:
dfdatefiles = []                            #concatenates date files

for datefile in Path('/mnt/obi0/phi/ehr/cath_reports/').glob('MGH_cath_stenosis_*'):
    with open(datefile) as infile:
        dfdatefiles.append(pd.read_csv(infile, sep='\t', header=None))
        

In [None]:
dfdates = pd.concat(dfdatefiles)

In [None]:
dfdates.columns = ['PatientID', 'MRN', 'date', 'NoteID']
dfdates.iloc[0:1]

In [None]:
cathnotes_df['NoteID'] = pd.to_numeric(cathnotes_df['NoteID'])

In [None]:
df = pd.merge(cathnotes_df, dfdates, on='NoteID')

In [None]:
df.head()

In [None]:
df.drop('PatientID_y', axis=1, inplace=True)  #also maybe drop 'PatientEncounterID', 'InpatientNoteTypeDSC', 'LastFiledDTS', 'CurrentAuthorID'

In [None]:
def cathresults(cathnote):
    """Split the coronary anatomy summary of one cath report into per-vessel text.

    Parameters
    ----------
    cathnote : row-like object
        Must expose the full report text as ``cathnote.NoteTXT``.

    Returns
    -------
    tuple
        ``(dominance, leftmain, lad, circumflex, rca, remove)``: five
        whitespace-stripped strings followed by a bool. ``remove`` is True when
        the 'CORONARY ANATOMY FINDINGS' header or any of the four vessel
        headers is missing; the strings are not meaningful in that case.
        ``dominance`` is '' when no 'Dominance:' header is found.
    """
    text = cathnote.NoteTXT
    notestart = text.find('CORONARY ANATOMY FINDINGS')
    domstart = text.find('Dominance:', notestart)
    leftmainstart = text.find('Left Main:', notestart)
    ladstart = text.find('LAD:', notestart)
    circumflexstart = text.find('Left Circumflex:', notestart)
    rcastart = text.find('RCA:', notestart)

    # The RCA section ends at the first run of four capital letters that starts
    # after the 'RCA:' header (presumably the next section heading).
    # When no such run exists the end stays None, so the slice below runs to the
    # end of the note instead of failing on a non-integer slice bound.
    endcaps = re.compile(r'[A-Z][A-Z][A-Z][A-Z]')
    position = next(
        (capsmatch.start() for capsmatch in endcaps.finditer(text) if capsmatch.start() > rcastart),
        None,
    )

    # Each offset skips the header text plus one following character.
    dominance = text[domstart+11 : leftmainstart]
    if domstart == -1:
        dominance = ''
    leftmain = text[leftmainstart+11 : ladstart]
    # NOTE(review): the LAD text stops 5 characters before 'Left Circumflex:';
    # presumably that gap is padding in the report template - confirm.
    lad = text[ladstart+5 : circumflexstart-5]
    circumflex = text[circumflexstart + 17 : rcastart]
    rca = text[rcastart + 5 : position]

    dominance = dominance.strip()
    leftmain = leftmain.strip()
    lad = lad.strip()
    circumflex = circumflex.strip()
    rca = rca.strip()

    # str.find returns -1 for a missing header; any missing header marks the row for removal.
    remove = -1 in (notestart, leftmainstart, ladstart, circumflexstart, rcastart)

    return dominance, leftmain, lad, circumflex, rca, remove

In [None]:
df[["dominance", "leftmain", "lad", "circumflex", "rca", "remove"]] = df.apply(cathresults, axis=1, result_type="expand")

In [None]:
df.shape                       #8724

In [None]:
df_temp = df[df['dominance']=='']

In [None]:
df_temp.shape                  #231 with missing dominance information

In [None]:
df['remove'].value_counts()                          #5 additional patients with incomplete cath summary, all with RCA not injected

In [None]:
df = df[~df.remove.apply(lambda x: x)]   #remove rows with "remove==True"
df = df.drop("remove", axis=1)

In [None]:
df.shape                        #8719

In [None]:
df = df[df.leftmain.apply(lambda x: len(str(x)) > 5)] 
df = df[df.lad.apply(lambda x: len(str(x)) > 5)] 
df = df[df.circumflex.apply(lambda x: len(str(x)) > 5)] 
df = df[df.rca.apply(lambda x: len(str(x)) > 5)] 


In [None]:
df.shape                       #8716

In [None]:
df['Stenosis'].value_counts()        #7570 rows with stenosis

In [None]:
dftemp = df[df.dominance.apply(lambda x: len(str(x)) >15)]           #no vessel descriptions are excessively long
dftemp.head()

In [None]:
dftemp = df[df.leftmain.apply(lambda x: len(str(x)) >700)]
dftemp.head()

In [None]:
dftemp = df[df.lad.apply(lambda x: len(str(x)) >1000)]
dftemp.head()

In [None]:
dftemp = df[df.circumflex.apply(lambda x: len(str(x)) >1000)]
dftemp.head()

In [None]:
dftemp = df[df.rca.apply(lambda x: len(str(x)) >1000)]
dftemp.head()

In [None]:
len(df.iloc[0].NoteTXT)                    #duplicates (i.e. index 0, 1, and 2) have same length

In [None]:
df.drop_duplicates(subset = 'NoteTXT', inplace=True)

In [None]:
df.shape                                   #went from 8716 to 8629 after dropping duplicates

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
pretty_print(df.iloc[0:1])                 #prints selected rows with full note text

In [None]:
df.to_parquet('/mnt/obi0/phi/ehr/cath_reports/mgh_cathreports.parquet')  


Stenosis descriptions


In [3]:
file=('/mnt/obi0/phi/ehr/cath_reports/mgh_cathreports.parquet')  
df = pd.read_parquet(file)
df.shape

(8629, 16)

In [4]:
df = df[['NoteID', 'NoteCSNID', 'PatientID_x','dominance','leftmain', 'lad', 'circumflex', 'rca']]   #keeps only ids and vessel descriptions in df 

In [None]:
pretty_print(df.iloc[0:1]) 

In [5]:
df['lad'] = df['lad'].str.replace(' {2,}', ' ', regex=True)    #collapses runs of two or more spaces in the LAD description into one space; regex=True is required because pandas >= 2.0 treats the pattern as a literal string by default

In [6]:
df['lad'] = df['lad'].str.rstrip('.')              #removes periods at end

In [7]:
df['lad_sentence'] = df.lad.str.split(r'\.(?=[^0-9])', expand=False)  #splits at periods except if the . is followed by a number (decimal); creates list of sentences in lad_sentences

In [8]:
df['lad_sentence'] = df.lad_sentence.apply(lambda x: [y.strip() for y in x])  #removes space in front of sentences

In [9]:
def remove_empty(dframe):                     #removes empty sentences from lad_sentence
    """Return the row's ``lad_sentence`` list without its empty strings.

    Parameters
    ----------
    dframe : row-like object
        Must expose ``dframe.lad_sentence``, an iterable of sentence strings.

    Returns
    -------
    list
        A new list holding every entry that is not ``''``, in the original order.

    A new list is built rather than calling ``list.remove`` inside a loop over
    the same list: removing while iterating skips the element that follows each
    removal, which left every second empty string of a consecutive run in place.
    """
    return [sent for sent in dframe.lad_sentence if sent != '']
    

In [10]:
df['lad_sentence']= df.apply(remove_empty, axis=1)

In [None]:
print(df['lad_sentence'][0:5])
#or- df.iloc[0:5].lad_sentence

In [11]:
#sentences is a list of lists used to create Dictionary later; each inner list is one patient's LAD description, with the sentences as separate strings
sentences = df['lad_sentence'].tolist()

In [13]:
ladvessels = ['ramus','ostial','proximal','mid', 'distal','diag','d1','d2','d3','septal',' sp ','apical']

In [14]:
def separatevessels(dframe):       #groups one patient's LAD sentences by the vessel they describe -> list of lists for the new column vessel_cat
    """Group a row's LAD sentences into one list per vessel.

    A sentence starts a new group when the lower-cased first 20 characters
    contain any keyword from the module-level list ``ladvessels`` (septal is
    sometimes written "sp"); otherwise it joins the current group.

    The first group always begins with the placeholder 'The entire LAD ',
    because the opening sentence sometimes says "The vessel" rather than
    "The LAD".
    """
    groups = [['The entire LAD ']]

    for sentence in dframe.lad_sentence:
        opening = sentence[0:20].lower()
        names_vessel = False
        for keyword in ladvessels:
            if keyword in opening:
                names_vessel = True
                break
        if names_vessel:
            groups.append([sentence])       #sentence names a vessel: it opens a new group
        else:
            groups[-1].append(sentence)     #otherwise it continues the latest group

    return groups

In [15]:
df['vessel_cat'] = df.apply(separatevessels, axis=1)

In [16]:
df = df.explode('vessel_cat')      #separates into a separate row for each vessel

In [17]:
df = df[df.vessel_cat.apply(lambda x: not(x[0]=='The entire LAD ' and len(x)==1))]     #removes rows that only have 'The entire LAD ' in vessel_cat

In [None]:
pretty_print(df.iloc[0:5])

In [None]:
#a = [chr(i) for i in range(ord('a'),ord('m')+1)]
#print(a)

In [32]:
def ladprocess(dframe):               #extracts info from sentences describing each vessel
    """Parse the sentences describing one vessel into structured fields.

    Parameters
    ----------
    dframe : row-like object
        Must expose ``dframe.vessel_cat``, a non-empty list of sentence strings.
        The first 15 characters of the first sentence select the vessel label.

    Returns
    -------
    tuple of 20 items
        ``(vessel, sten_type, sten_percent, sten_descriptor, vessel_size,
        lad_length, lad_givesoff, prior_type, prior_status, collat_from_extent,
        collat_from, collat_to_extent, collat_to, occlusion_point, timi_flow,
        lesion_continues, remainder, simplifiedsents, matched, manual)``.

        Text fields default to ``''``; a field is set to ``'manual'`` when a
        second sentence tries to fill a value that is already set.
        ``lad_length`` and ``lad_givesoff`` can be ``None`` when the optional
        regex group did not take part in the match.
        ``remainder`` lists sentences that matched no pattern or matched a
        pattern whose field was already filled.
        ``matched[k] == 1`` means pattern k supplied a value.
        ``manual`` flags: [0] unknown sentence start, [1]-[9], [11], [12]
        pattern k matched again after its field was set, [13] a sentence
        matched no pattern, [14] the simplified sentences still mention a
        keyword from ``ladvessels``.

    Reads the module-level list ``ladvessels``.
    """
    lad_text = dframe.vessel_cat


    matched = [None] * 13             #denotes entries that match one of the patterns below (for checking)
    manual = [None] * 15              #denotes entries that need to be assessed manually

    remainder = []                    #list of sentences that do not fit one of the patterns below

    simplifiedsents = []              #contains sentences without the name of the vessel being described (to check if more than 1 vessel is being described)

    sent1start = lad_text[0][0:15]    #uses first 15 characters of first sentence to determine which vessel the sentences are about

    arterydict = {
        'The entire LAD ' : 'LAD',
        'The left anteri' : 'LAD',
        'The Mid LAD has' : 'mid LAD',
        'The Proximal LA' : 'prox LAD',
        'The 1st Diagona' : 'D1',
        'The Distal LAD ' : 'distal LAD',
        'The 2nd Diagona' : 'D2',
        'Left anterior d' : 'LAD',
        'Mid LAD at the ' : 'mid LAD',
        'Proximal LAD at' : 'prox LAD',
        'The Mid LAD is ' : 'mid LAD',
        'Distal LAD coro' : 'distal LAD',
        'The ramus inter' : 'ramus',
        'he 1st Diagonal' : 'D1',
        'The artery cont' : 'LAD',
        'he Mid LAD has ': 'mid LAD',
        'The 3rd Diagona' : 'D3',
        'The Ramus has a' : 'ramus',
        '1st Diagonal at' : 'D1',
        'The distal LAD ' : 'distal LAD',
        'Not imaged' : 'not imaged',
        'The proximal LA' : 'prox LAD',
        'he Proximal LAD': 'prox LAD',
        'The first diago' : 'D1',
        'The mid LAD has' : 'mid LAD',
        'Distal LAD at t' : 'distal LAD',
        'The second diag' : 'D2',
        'he 2nd Diagonal' : 'D2',
        '1st Diagonal co' : 'D1',
        '2nd Diagonal co' : 'D2',
    }
    try:
        vessel = arterydict[sent1start]
        vesselsentstart = sent1start
    except KeyError:
        vessel = 'manual'             #if the sentence does not have one of the standard sentence beginnings, labels the vessel as needing to be assessed manually
        vesselsentstart = None        #no known prefix to strip; sentences of this row are kept unsimplified below
        manual[0] = 1


    sten_type = ''
    sten_percent = ''
    sten_descriptor = ''
    vessel_size = ''
    lad_length = ''
    lad_givesoff = ''
    prior_type = ''
    prior_status = ''
    collat_from_extent = ''
    collat_from = ''
    collat_to_extent = ''
    collat_to = ''
    occlusion_point = ''
    timi_flow = ''
    lesion_continues = ''


    for sent in lad_text:                     #assesses each sentence for the vessel

        # Pattern 1: "has a <type> <percent> % stenosis <descriptor>"
        pattern_1 = re.search(r'has an? (?P<sten_type>[a-zA-Z \,]*)? ?(?P<sten_percent>[0-9\-]+) ?% stenosis? ?(?P<sten_descriptor>.*)?', sent)

        if pattern_1 and sten_percent=='':
            sten_type = pattern_1.group(1)
            sten_percent = pattern_1.group(2)
            sten_descriptor = pattern_1.group(3)
            matched[1] = 1
        elif pattern_1:                               #if a value is already present in sten_percent (another sentence also gives a sten_percent value), then marks as having to be manual
            sten_type = 'manual'
            sten_percent = 'manual'
            sten_descriptor = 'manual'
            manual[1] = 1
            remainder.append(sent)


        # Pattern 2: size / length / branches of the left anterior descending artery
        pattern_2 = re.search(r'descending artery is (.+) vessel(?:, which)?(.+ of the heart)?(?: and gives origin to )?(.+)?(\.)?', sent)

        if pattern_2 and lad_length=='':
            vessel_size = pattern_2.group(1)
            lad_length = pattern_2.group(2)
            lad_givesoff = pattern_2.group(3)
            matched[2] = 1
        elif pattern_2:
            vessel_size = 'manual'
            lad_length = 'manual'
            lad_givesoff = 'manual'
            manual[2] = 1
            remainder.append(sent)


        # Pattern 3: site of a previous intervention and its status
        pattern_3 = re.search(r'at the site of previous intervention \((.+)\) (.+)', sent)

        if pattern_3 and prior_status=='':
            prior_type = pattern_3.group(1)
            prior_status = pattern_3.group(2)
            matched[3] = 1
        elif pattern_3:
            prior_status = 'manual'
            manual[3] = 1
            remainder.append(sent)


        # Pattern 4: collateral filling received from another vessel
        pattern_4 = re.search(r'receives (.+) filling (?:through collaterals )?from (.+)', sent)

        if pattern_4 and collat_from=='':
            collat_from_extent = pattern_4.group(1)
            collat_from = pattern_4.group(2)
            matched[4] = 1
        elif pattern_4:
            collat_from = 'manual'
            manual[4] = 1
            remainder.append(sent)


        # Pattern 5: luminal irregularities only
        # NOTE(review): an existing 'manual' value is overwritten with 'minimal' here - confirm this is intended.
        pattern_5 = re.search('(?:contains|has|are) (?:only )?(minimal|luminal|mild|mild luminal|minimal luminal|multiple luminal) irregulariti', sent)

        if pattern_5 and (sten_percent == '' or sten_percent == 'manual'):
            sten_percent = 'minimal'
            matched[5] = 1
        elif pattern_5:
            sten_percent = 'manual'
            manual[5] = 1
            remainder.append(sent)


        # Pattern 6: vessel described as normal
        pattern_6 = re.search('is norm', sent)

        if pattern_6 and (sten_percent == '' or sten_percent == 'manual'):
            sten_percent = 'zero'
            matched[6] = 1
        elif pattern_6:
            sten_percent = 'manual'
            manual[6] = 1
            remainder.append(sent)


        # Pattern 7: collateral supply given to another vessel
        pattern_7 = re.search('gives (.+) blood supply through collaterals to (.+)', sent)

        if pattern_7 and collat_to=='':
            collat_to_extent = pattern_7.group(1)
            collat_to = pattern_7.group(2)
            matched[7] = 1
        elif pattern_7:
            collat_to = 'manual'
            manual[7] = 1
            remainder.append(sent)


        # Pattern 8: vessel not injected / engaged / imaged / selected
        pattern_8 = re.search(r'(?i)(not inj|not engag|not imag|not selec)', sent)

        if pattern_8 and sten_percent == '':
            sten_percent = 'not injected'
            matched[8] = 1
        elif pattern_8:
            sten_percent = 'manual'
            manual[8] = 1
            remainder.append(sent)


        # Pattern 9: total occlusion, optionally followed by its location
        pattern_9 = re.search(r'is totally occluded ?(.*)?', sent)

        if pattern_9 and sten_percent == '':
            sten_type = 'occluded'
            if pattern_9.group(1) is not None:
                occlusion_point = pattern_9.group(1)
            matched[9] = 1
        elif pattern_9:
            sten_type = 'manual'
            manual[9] = 1
            remainder.append(sent)


        # Pattern 10: TIMI flow (a later sentence simply overwrites an earlier value)
        pattern_10= re.search(r'TIMI flow through the lesion is (.+)', sent)

        if pattern_10:
            timi_flow = pattern_10.group(1)
            matched[10] = 1


        # Pattern 11: sentence ending in a vessel size
        pattern_11 = re.search('is (small|moderate sized|large)$', sent)

        if pattern_11 and vessel_size=='':
            vessel_size = pattern_11.group(1)
            matched[11] = 1
        elif pattern_11:
            vessel_size = 'manual'
            manual[11] = 1
            remainder.append(sent)


        # Pattern 12: lesion continuing into another segment
        pattern_12 = re.search('This lesions? continues into (.+)', sent)

        if pattern_12 and lesion_continues == '':
            lesion_continues = pattern_12.group(1)
            matched[12] = 1
        elif pattern_12:
            lesion_continues = 'manual'
            manual[12] = 1


        # A sentence that matched none of the patterns is kept for manual review.
        all_patterns = (pattern_1, pattern_2, pattern_3, pattern_4, pattern_5, pattern_6,
                        pattern_7, pattern_8, pattern_9, pattern_10, pattern_11, pattern_12)
        if not any(all_patterns) and sent!='The entire LAD ':
            remainder.append(sent)
            manual[13] = 1


        # Strip the vessel prefix and the free-text fragments already captured, so
        # that any vessel keyword left over points at a second vessel.
        # If a fragment is unavailable (unknown sentence start, or an optional regex
        # group that did not match) the sentence is kept as it is. This is the same
        # result the former bare `except:` produced, without hiding other errors.
        fragments = (vesselsentstart, sten_descriptor, lad_givesoff, collat_from, collat_to, occlusion_point, lesion_continues)
        simplifiedsent = sent
        if not any(fragment is None for fragment in fragments):
            for fragment in fragments:
                simplifiedsent = simplifiedsent.replace(fragment, '')

        simplifiedsents.append(simplifiedsent)


    totalsents = ''.join(simplifiedsents)
    if any(x in totalsents.lower() for x in ladvessels):
        manual[14] = 1                        #manual[14] = 1 if more than 1 vessel is being described in the row


    return vessel, sten_type, sten_percent, sten_descriptor, vessel_size, lad_length, lad_givesoff, prior_type, prior_status, collat_from_extent, collat_from, collat_to_extent, collat_to, occlusion_point, timi_flow, lesion_continues, remainder, simplifiedsents, matched, manual
            

In [33]:
df[['vessel', 'sten_type', 'sten_percent', 'sten_descriptor', 'vessel_size', 'lad_length', 'lad_givesoff', 'prior_type', 'prior_status', 'collat_from_extent', 'collat_from', 'collat_to_extent', 'collat_to', 'occlusion_point', 'timi_flow', 'lesion_continues', 'remainder', 'simplifiedsents', 'matched', 'manual']] = df.apply(ladprocess, axis=1, result_type = "expand")

#df[['stent_type', 'sten_percent', 'sten_descriptor']] = df.apply(ladprocess, axis=1, result_type="expand")

In [34]:
df.shape

(16394, 31)

In [73]:
vesseldesc = df['sten_type'].value_counts()

In [None]:
#print(df['sten_type'].value_counts().to_string())

In [100]:
# Keeps the stenosis-type descriptors that occur more than once, in value_counts order.
# A boolean mask replaces vesseldesc[i]: integer-position access with [] on a
# label-indexed Series is deprecated in pandas 2.x.
vesseldescs = vesseldesc[vesseldesc > 1].index.tolist()

In [127]:
print(vesseldescs)

['', 'focal ', 'diffuse ', 'occluded', 'long ', 'tubular ', 'eccentric ', 'irregular ', 'hazy ', 'tandem ', 'calcified ', 'long and diffuse ', 'thrombotic ', 'eccentric and calcified ', 'mild ', 'manual', 'severely calcified ', 'focal and calcified ', 'diffuse and calcified ', 'long and tubular ', 'eccentric and focal ', 'focal and severely calcified ', 'long and eccentric ', 'focal and eccentric ', 'long and irregular ', 'tapering ', 'hazy and eccentric ', 'hazy and focal ', 'diffuse and long ', 'focal and hazy ', 'eccentric and diffuse ', 'irregular and long ', 'long irregular ', 'long and calcified ', 'tubular and long ', 'eccentric and tubular ', 'eccentric and irregular ', 'long up to ', 'irregular and diffuse ', 'diffuse and severely calcified ', 'eccentric and long ', 'focal, calcified ', 'long and tandem ', 'diffuse and irregular ', 'focal and thrombotic ', 'tubular and irregular ', 'irregularities ', 'diffuse up to ', 'focal, eccentric ', 'eccentric and hazy ', 'tubular and ca

In [133]:
hello = ["h i", "hel lo", "ho la"]

In [135]:
for i in hello:
    words = i.split(' ')
    print(words)

['h', 'i']
['hel', 'lo']
['ho', 'la']


In [169]:
#words = [i.split(' ') for i in hello]
#words = [i.split(', ') for i in vesseldescs]
vesselde = []

for i in vesseldescs:
    i = i.strip()
    vesselde.extend(re.split(' ',i))     #(', | and ',i))
#for vesseld in vesseldescs:
    #vesseld.split(',')
    #vesseld.strip(',')
    #vesseld.strip(' ')
    #for vesselds in vesseld:


In [178]:
vesselde = [i.strip(',') for i in vesselde]

In [180]:
vesscounts = dict()
for i in vesselde:
    vesscounts[i] = vesscounts.get(i, 0) + 1

In [182]:
print(vesscounts)

{'': 1, 'focal': 23, 'diffuse': 22, 'occluded': 1, 'long': 27, 'tubular': 17, 'eccentric': 32, 'irregular': 17, 'hazy': 17, 'tandem': 12, 'calcified': 31, 'and': 73, 'thrombotic': 7, 'mild': 3, 'manual': 1, 'severely': 9, 'tapering': 3, 'up': 2, 'to': 2, 'irregularities': 3, 'proximal': 1, 'ulcerated': 2, 'concentric': 1, 'hazey': 1, 'heavily': 1, 'ostial': 1, 'complex': 3, 'moderately': 1, 'approximately': 1, 'subtotal': 1, 'indeterminate': 1, 'severe': 2, 'irrgular': 1, 'calcific': 1, 'plaque': 1, 'associated': 1, 'with': 1, 'a': 1}


In [183]:
for key in sorted(vesscounts):          #prints out dictionary
    print("%s: %s" % (vesscounts[key], key))

1: 
1: a
73: and
1: approximately
1: associated
1: calcific
31: calcified
3: complex
1: concentric
22: diffuse
32: eccentric
23: focal
1: hazey
17: hazy
1: heavily
1: indeterminate
17: irregular
3: irregularities
1: irrgular
27: long
1: manual
3: mild
1: moderately
1: occluded
1: ostial
1: plaque
1: proximal
2: severe
9: severely
1: subtotal
12: tandem
3: tapering
7: thrombotic
2: to
17: tubular
2: ulcerated
2: up
1: with


In [None]:
print(vesselde)

In [None]:
vesseldescs.split(',') 

In [None]:
vesseldescs.strip()

In [None]:
vesseldescs.strip(',')

In [99]:
print(vesseldescs)

['', 'focal ', 'diffuse ', 'occluded', 'long ', 'tubular ', 'eccentric ', 'irregular ', 'hazy ', 'tandem ', 'calcified ', 'long and diffuse ', 'thrombotic ', 'eccentric and calcified ', 'mild ', 'manual', 'severely calcified ', 'focal and calcified ', 'diffuse and calcified ', 'long and tubular ', 'eccentric and focal ', 'focal and severely calcified ', 'long and eccentric ', 'focal and eccentric ', 'long and irregular ', 'tapering ', 'hazy and eccentric ', 'hazy and focal ', 'diffuse and long ', 'focal and hazy ', 'eccentric and diffuse ', 'irregular and long ', 'long irregular ', 'long and calcified ', 'tubular and long ', 'eccentric and tubular ', 'eccentric and irregular ', 'long up to ', 'irregular and diffuse ', 'diffuse and severely calcified ', 'eccentric and long ', 'focal, calcified ', 'long and tandem ', 'diffuse and irregular ', 'focal and thrombotic ', 'tubular and irregular ', 'irregularities ', 'diffuse up to ', 'focal, eccentric ', 'eccentric and hazy ', 'tubular and ca

In [None]:
print(df['sten_type'].value_counts().to_string())

In [22]:
dftemp = df[df.manual.apply(lambda x: 1 in x)]          #shows rows where manual contains 1 in any position
dftemp.shape

(1833, 31)

In [None]:
pretty_print(dftemp.iloc[0:5])

In [62]:
dftemp = df[df.manual.apply(lambda x: x[14]==1)]        #shows rows where manual contains 1 in a certain position
dftemp.shape


(1099, 31)

In [None]:
pretty_print(dftemp.iloc[20:30])

In [30]:
dftemp = df[df.matched.apply(lambda x: x[12]==1)]        #shows rows where matched contains 1 in a certain position
dftemp.shape


(118, 31)

In [None]:
pretty_print(dftemp.iloc[0:8])

In [None]:
remainder_sentences = []                            #flat list of sentence strings gathered from every row's remainder list (extend merges the per-row lists into one)

for index, rows in df.iterrows():
    remainder_sentences.extend(rows.remainder)

In [None]:
df['manual_review'].value_counts()

In [None]:
df.shape

In [None]:
print(len(remainder_sentences))

In [None]:
remainder_sentences[0:5]

In [None]:
remainder_sentcounts = dict()
for i in remainder_sentences:
    remainder_sentcounts[i] = remainder_sentcounts.get(i, 0) + 1

In [None]:
len(remainder_sentcounts) 

In [None]:
for key in sorted(remainder_sentcounts):          #prints out dictionary
    if remainder_sentcounts[key] >5:
        print("%s: %s" % (remainder_sentcounts[key], key)) 

In [None]:
sentences = []                            #creates a list with a list for each cath description, which contains sentences as separate strings

for index, rows in df.iterrows():
    sentences.append(rows.lad_sentence)

In [None]:
vessel_cat = [[]]                        #separates each list into separate lists depending on which vessel the group of sentences describes

for sent in sentences:
    for i in sent:
        #if re.search('mid', 'distal', re.IGNORECASE):
        if any(x in i[0:20].lower() for x in ('ramus','ostial','proximal','mid', 'distal','diag','d1','d2','d3','septal')):
            vessel_cat.append([])
        vessel_cat[-1].append(i)
    vessel_cat.append([])

In [None]:
vessel_cat = [x for x in vessel_cat if x]         #removes empty lists within vessel_cat

In [None]:
for ves_sents in vessel_cat:                  #check
    if (any(x in ves_sents[0][0:20].lower() for x in ('the artery', 'imaged', 'injected', 'anterior', 'lad','ramus','ostial','proximal','mid', 'distal','diag','d1','d2','d3','septal')))==False:
        print(ves_sents[0])
#no unusual beginnings to territory sentences    

In [None]:
vessel_cat[0:5]
    

In [None]:
for ves_sents in vessel_cat:                    #check  
    for ves_sent in ves_sents:
        if any(x in ves_sent[0:20].lower() for x in ('anterior', 'lad', 'ramus','ostial','proximal','mid', 'distal','diag','d1','d2','d3','septal')):
            print(ves_sents)

In [None]:
for ves_sents in vessel_cat:                    #check
    for ves_sent in ves_sents:
        if ves_sent.startswith('The artery'):         
            if len(ves_sents)>2:
                print(ves_sents)

In [None]:
#vessel_cat = ['. '.join(map(str,x)) for x in vessel_cat]                    #joins sentences for each vessel together

In [None]:
vessel_cat[0:10]

In [None]:
df3=pd.DataFrame(vessel_cat)

In [None]:
df3.head()

In [None]:
df3.columns = ['sent%d' % (position + 1) for position in range(df3.shape[1])]   #sent1, sent2, ... one name per column, so the assignment also works when the longest group is not exactly 5 sentences

In [None]:
pretty_print(df3.iloc[0:5])

In [None]:
df3['sent1start'] = df3['sent1'].str[0:15]

In [None]:
sentstarts = df3['sent1start'].value_counts()

In [None]:
print(sentstarts)

In [None]:
print(sentstarts.to_string())            #prints full series

In [None]:
# Keeps the sentence beginnings that occur more than 10 times, in value_counts order.
# A boolean mask replaces sentstarts[i]: integer-position access with [] on a
# label-indexed Series is deprecated in pandas 2.x.
standsentstarts = sentstarts[sentstarts > 10].index.tolist()

In [None]:
print(standsentstarts)

In [None]:
df2 = df[df.sent1start.apply(lambda x: any(standstart in x for standstart in standsentstarts))]       #check

In [None]:
df2 = df[df.sent1start.apply(lambda x: 'Moderate calibe' in x)]             #check

In [None]:
pretty_print(df2)

In [None]:
arterydict = {'The left anteri' : 'LAD', 'The Mid LAD has' : 'mid LAD', 'The Proximal LA' : 'prox LAD', 'The 1st Diagona' : 'D1', 'The Distal LAD ' : 'distal LAD', 'The 2nd Diagona' : 'D2', 'Left anterior d' : 'LAD', 'Mid LAD at the ' : 'mid LAD', 'Proximal LAD at' : 'prox LAD', 'The Mid LAD is ' : 'mid LAD', 'Distal LAD coro' : 'distal LAD', 'The ramus inter' : 'ramus', 'he 1st Diagonal' : 'D1', 'The artery cont' : 'LAD', 'he Mid LAD has ': 'mid LAD', 'The 3rd Diagona' : 'D3', 'The Ramus has a' : 'ramus', '1st Diagonal at' : 'D1', 'The distal LAD ' : 'distal LAD', 'Not imaged' : 'not imaged', 'The proximal LA' : 'prox LAD', 'he Proximal LAD': 'prox LAD', 'The first diago' : 'D1', 'The mid LAD has' : 'mid LAD', 'Distal LAD at t' : 'distal LAD', 'The second diag' : 'D2', 'he 2nd Diagonal' : 'D2'}

df3['vessel'] = df3['sent1start'].map(arterydict)

In [None]:
#extracts stenosis type, percent and descriptor from the first sentence; the first column is named sten_type to match the regex group (it was misspelled 'stent_type')
df3[['sten_type','sten_percent','sten_descriptor']] = df3['sent1'].str.extract(r'has a (?P<sten_type>[a-zA-Z]*) (?P<sten_percent>[0-9]+) % stenosis (?P<sten_descriptor>.*)', expand=True)

In [None]:
pretty_print(df3.iloc[5:10])

In [None]:
stendescriptors = df3['sten_descriptor'].value_counts()

In [None]:
print(stendescriptors.to_string())

In [None]:
df[['lad_size','lad_length','lad_givesoff']] = df['sent1'].str.extract(r'descending artery is (.+) vessel(?:, which)?(.+ of the heart)?(?: and gives origin to )?(.+)?')

In [None]:
pretty_print(df.iloc[0:30])

In [None]:
ladsize = df['lad_length'].value_counts()
print(ladsize.to_string())

In [None]:
#example phrase still to be parsed: "at the site of previous intervention (Balloon Only) is"

In [None]:
#df['stenosis'] = df['sentence'].str.contains(r'(.+has a .+stenosis)')

Dictionary

In [None]:
sentences_all = [item for sublist in sentences for item in sublist]               #flattens list of lists into a list

In [None]:
print(sentences_all[0:5])

In [None]:
#str.strip returns a new string and leaves the original untouched, so the stripped sentences have to be stored
sentences_all = [sentence.strip() for sentence in sentences_all]

In [None]:
print(len(sentences_all))               #total 23,612 left main sentences, 20,020 LAD sentences

In [None]:
sentcounts = dict()
for i in sentences_all:
    sentcounts[i] = sentcounts.get(i, 0) + 1

In [None]:
len(sentcounts)                         #1,368 unique sentences describing left main; 5,247 unique sentences describing LAD

In [None]:
for key in sorted(sentcounts):          #prints out dictionary
    print("%s: %s" % (sentcounts[key], key))

In [None]:
dfs = pd.DataFrame.from_dict(sentcounts, orient = 'index')

In [None]:
dfs['sentence'] = dfs.index
dfs.reset_index(drop=True, inplace=True)
dfs.columns = ['count', 'sentence']
dfs.head()

In [None]:
dfs['sentence'] = dfs['sentence'].astype(str)

In [None]:
dfs.head()

In [None]:
def sentparse(dfs, terr):                              #incomplete
    """Derive vessel flags from one sentence (work in progress).

    Parameters
    ----------
    dfs : row-like object
        Must expose the sentence text as ``dfs.sentence``.
    terr : object
        Value returned in the ``ost`` slot when the sentence contains
        'selectively'.

    Returns
    -------
    tuple
        ``(vessel, ost, proximal, mid, not_inj, focal, diffuse, percent)``.
        All eight are 0 unless 'selectively' occurs in the sentence, in which
        case ``vessel`` is 'all', ``ost`` is ``terr`` and ``not_inj`` is 1.
    """
    if 'selectively' in dfs.sentence:
        return 'all', terr, 0, 0, 1, 0, 0, 0

    return 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
#goes with above cell
# Run sentparse on every row and spread the returned 8-tuple over eight new columns.
parsed_columns = ["vessel", "ost", "proximal", "mid", "not_inj", "focal", "diffuse", "percent"]
dfs[parsed_columns] = dfs.apply(sentparse, axis=1, result_type="expand", terr='lad')

Data Exploration (Preliminary steps)

In [None]:
# Load a single pulled-notes parquet file for exploration.
file="/mnt/obi0/phi/ehr/note_pull/parquet/MGH_cath_2018-09%_notes_pulled_10-12-2019.parquet"
df_orig= pd.read_parquet(file)
df = df_orig.copy()     # work on a copy so df_orig keeps the data as loaded; other cells add columns to df
df.shape

In [None]:
df.head()

In [None]:
df.iloc[50:100]

In [None]:
df=df.sort_values(by=['PatientID'], ascending=True)

In [None]:
unique_patients = df['PatientID'].nunique()
print(unique_patients)

In [None]:
df2 = df[df['NoteTXT'].str.contains('CARDIAC CATHETERIZATION LABORATORY FINAL REPORT', case=False)]

In [None]:
pretty_print(df2.iloc[0:2])

In [None]:
unique_patients = df2['PatientID'].nunique()
print(unique_patients)

In [None]:
array1 = df['PatientID'].unique()
array2 = df2['PatientID'].unique()


In [None]:
len(array1)

In [None]:
len(array2)


In [None]:
# True for each patient ID in array1 that does NOT appear in array2.
# np.isin replaces the deprecated np.in1d and gives the same result for 1-D input.
mask = np.isin(array1, array2, invert=True)
mask


In [None]:
array3 = array1[mask]             #patient IDs present in df but absent from df2 (patients with no final cath report)
array3

In [None]:
df3 = df[df['PatientID'].isin(array3)]

In [None]:
pretty_print(df3.iloc[0:2])      #appears to mostly have EP procedure notes, and some with prelim cath reports

In [None]:
df4 = df2[df2['NoteTXT'].str.contains('discharge summary', case=False)]
df4.shape

In [None]:
df5 = df2[df2['NoteTXT'].str.contains('transplant|SVG|LIMA', case=False)]
df5.shape

In [None]:
pretty_print(df5.iloc[0:2]) 

In [None]:
df2.shape

In [None]:
df6 = df2[~df2['NoteTXT'].str.contains('transplant|SVG|LIMA', case=False)]

In [None]:
df6.shape

In [None]:
pretty_print(df6.iloc[0:2]) 

In [None]:
# Notes that never mention the LAD. The pattern is a regex, so double quotes
# inside it are literal characters; the long form is therefore matched unquoted.
df7 = df6[~df6['NoteTXT'].str.contains('LAD|left anterior descending', case=False)]
df7.shape                   #RHC and pericardiocenteses contain "Diagnostic Attending" so "diagnostic" not used as a criterion

In [None]:
pretty_print(df7.iloc[0:2])                     

In [None]:
# Notes that mention the LAD. The pattern is a regex, so double quotes inside it
# are literal characters; the long form is therefore matched unquoted.
df9 = df6[df6['NoteTXT'].str.contains('LAD|left anterior descending', case=False)]
df9.shape

In [None]:
pretty_print(df9.iloc[0:2]) 

In [None]:
df10 = df9[df9['NoteTXT'].str.contains('stenosis|stenoses', case=False)]
df10.shape

In [None]:
# Notes stating "no stenosis"/"no stenoses". Double quotes inside a regex are
# literal characters, so the phrases are matched unquoted.
df11 = df10[df10['NoteTXT'].str.contains('no stenosis|no stenoses', case=False)]
df11.shape

Discarded cells


In [None]:
#not necessary
dftemp = df.NoteTXT.str.extract(r'(RCA:)')
dftemp.head()
dftemp.isnull().values.any()
#df["Dominance"] = df["NoteTXT"].str.extract(r'(Dominance:.+Left Main:)')

In [None]:
df.iloc[2]['lad']

In [None]:
df.iloc[1188:1189]
#len(str(dftemp.iloc[0]['circumflex']))

In [None]:
#df["index2"] = df["NoteTXT"].str.find('LAD', start = df["NoteTXT"]str.find('CORONARY ANATOMY FINDINGS'))
df["Dominance"] = df["NoteTXT"].str.extract(r'(Dominance:.+Left Main:)')
#df["Dominance"] = df["Dominance"].str.replace('Dominance: ', '')
#df["Dominance"] = df["Dominance"].str.replace(' Left Main:', '')
#df["Dominance"]

In [None]:
# Capture the text from "Left Main:" through "LAD:", then drop both delimiters.
df["Left Main"] = df["NoteTXT"].str.extract(r'(Left Main:.+LAD:)')
for delimiter in ('Left Main: ', ' LAD:'):
    df["Left Main"] = df["Left Main"].str.replace(delimiter, '')
df["Left Main"]

In [None]:
# Capture the text from "LAD:" up to the circumflex heading. The alternation is
# grouped so that "LCx" is an alternative end marker; ungrouped, the pattern
# matched a bare "LCx" anywhere in the note.
df["LAD"] = df["NoteTXT"].str.extract(r'(LAD:.+(?:Circumflex:|LCx))', expand=False)
df["LAD"] = df["LAD"].str.replace('LAD: ', '')
# The capture ends exactly at the heading (no trailing space), so strip it with an
# end-anchored regex; a literal 'Left Circumflex: ' never matched the tail.
df["LAD"] = df["LAD"].str.replace(r'\s*(?:Left )?(?:Circumflex:|LCx)$', '', regex=True)
df["LAD"]

In [None]:
# Capture the text from "Circumflex:" through "RCA:", then drop both delimiters.
df["Circumflex"] = df["NoteTXT"].str.extract(r'(Circumflex:.+RCA:)', expand=False)
df["Circumflex"] = df["Circumflex"].str.replace('Circumflex: ', '')
# The capture ends exactly at "RCA:" (no trailing space), so a literal 'RCA: '
# never matched the tail; strip it with an end-anchored regex instead.
df["Circumflex"] = df["Circumflex"].str.replace(r'\s*RCA:$', '', regex=True)
df["Circumflex"]

In [None]:
# Capture the text from "RCA:" up to the next heading (DECISION or EQUIPMENT).
# The inner group is non-capturing: with two capture groups str.extract returns
# two columns, which cannot be assigned to the single "RCA" column.
df["RCA"] = df["NoteTXT"].str.extract(r'(RCA:.+(?:DECISION|EQUIPMENT))', expand=False)
df["RCA"] = df["RCA"].str.replace('RCA: ', '')
# The capture ends exactly at the heading word (no trailing space), so strip it
# with an end-anchored regex rather than the literal 'DECISION ' / 'EQUIPMENT '.
df["RCA"] = df["RCA"].str.replace(r'\s*(?:DECISION|EQUIPMENT)$', '', regex=True)
df["RCA"]

In [None]:
# Per NoteID group, tries to locate "LAD" starting from the position of
# 'CORONARY ANATOMY FINDINGS'.
# NOTE(review): Series.str.find documents `start` as an int; here it receives a
# Series, which likely raises a TypeError — confirm before reusing this cell.
# NOTE(review): this rebinds df2, which the exploration cells use for the
# final-report subset of df.
df2 = df.groupby("NoteID").apply(lambda x: x.NoteTXT.str.find("LAD", start = x.NoteTXT.str.find('CORONARY ANATOMY FINDINGS')))

In [None]:
# Concatenate every 'MGH_cath_stenosis_*' file into one combined text file.
# The output name ('..._stenoses.txt') does not match the glob pattern, so the
# combined file is never read back into itself.
cath_dir = Path('/mnt/obi0/phi/ehr/cath_reports/')

# open(..., 'w') creates (or truncates) the output itself, so no separate touch() is needed.
with open(cath_dir / 'MGH_cath_stenoses.txt', 'w') as outfile:
    # glob() yields paths in arbitrary order; sort so the combined file is reproducible.
    for datefile in sorted(cath_dir.glob('MGH_cath_stenosis_*')):
        with open(datefile) as infile:
            for line in infile:
                outfile.write(line)

In [None]:
dfdates = pd.read_csv('/mnt/obi0/phi/ehr/cath_reports/MGH_cath_stenoses.txt', sep='\t', header=None)

In [None]:
# Column layout: one column for every (vessel segment, description) pair.
vessel_segments = ['ostial', 'proximal', 'mid', 'distal', 'd1', 'd2', 'd3']
descriptions = ['not_inj', 'normal', 'focal', 'diffuse', 'calcified',
                'eccentric', 'irregular', 'percent', 'prior_pci']
iterables = [vessel_segments, descriptions]

cols = pd.MultiIndex.from_product(iterables, names=['vessel', 'description'])

In [None]:
# Zero-filled frame with one column per entry of `cols`. The width is taken from
# `cols` (instead of a hard-coded 63) so it cannot drift from the MultiIndex;
# 8629 is the hard-coded row count.
colsdf = pd.DataFrame(np.zeros(shape=(8629, len(cols))), columns=cols)

In [None]:
colsdf = colsdf.astype(int)
colsdf.head()

In [None]:
dfs = pd.concat([dfs, colsdf], axis=1)


In [None]:
# Print each entry of `sentences` on its own line (print's default line ending).
for position in range(len(sentences)):
    print(sentences[position])

In [None]:
# NOTE(review): `d` is empty, so the bare lookup d[i] raises KeyError on the
# first sentence; the intended use of this dict is unclear — confirm or remove.
d = {}
for i in dfs['sentence']:
    d[i]

In [None]:

#sentdf.xs('not_inj', level = 'description', axis=1) = sentdf['sentence'].str.contains('Not selectively imaged')

# Write the same "Not selectively imaged" flag to the 'not_inj' column of every
# vessel segment (one loop instead of seven copy-pasted assignments).
not_imaged = dfs['sentence'].str.contains('Not selectively imaged')
for segment in ('ostial', 'proximal', 'mid', 'distal', 'd1', 'd2', 'd3'):
    dfs.loc[:, (segment, 'not_inj')] = not_imaged

# `if <Series>:` raises "The truth value of a Series is ambiguous", so the
# condition is applied as a row mask: only matching rows are set to 1.
has_stenosis = dfs['sentence'].str.contains('.+has a.+stenosis', na=False)
dfs.loc[has_stenosis, ('ostial', 'not_inj')] = 1
# NOTE(review): the target column is kept from the original cell; flagging
# 'not_inj' for a stenosis sentence looks unintended — confirm the column.

#sentdf.loc[:, 'not_inj'] =1

#sentdf.loc(axis=1)[:, ['not_inj','focal']] = 1 #sentdf['sentence'].str.contains('Not selectively imaged')

In [None]:
# Left-align both the cell text and the header cells of the rendered table.
left_aligned = {'text-align': 'left'}
dfstyler = sentdf.style.set_properties(**left_aligned)
dfstyler.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])

In [None]:
sentdf.head()

In [None]:
def _vessel_label(sentence):
    """Return 'lad' when the sentence contains the substring 'LAD', else 0."""
    return 'lad' if 'LAD' in sentence else 0

df['vessel'] = df['sentence'].apply(_vessel_label)