In [42]:
import re
import spacy
import pandas as pd
import numpy as np
import pickle
import nltk
import time
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
pd.set_option('display.max_columns', None)

In [2]:
nlp = spacy.load('en_core_web_lg')

#### SAVE UPDATES TO THE DICTIONARIES

In [3]:
#Diseases, Anatomy, Devices, Procedures
def save_dicts():
    pd.DataFrame(vis_dis_list, columns = ['name']).to_csv("data/csvs/radlex_vis_dis.csv")
    pd.DataFrame(anatomy_list, columns = ['name']).to_csv("data/csvs/radlex_anatomy.csv")
    pd.DataFrame(device_list, columns = ['name']).to_csv("data/csvs/radlex_devices.csv")
    pd.DataFrame(procedure_list, columns = ['name']).to_csv("data/csvs/radlex_procedures.csv")

    pd.DataFrame(locations, columns = ['name']).to_csv("data/csvs/radlex_location_list.csv")
    pd.DataFrame(descriptors, columns = ['name']).to_csv("data/csvs/radlex_descriptor_list.csv")

    pd.DataFrame(change_list, columns = ['name']).to_csv('data/csvs/change_list.csv')
    pd.DataFrame(normal_list, columns = ['name']).to_csv('data/csvs/normal_list.csv')
    pd.DataFrame(degree_list, columns = ['name']).to_csv('data/csvs/degree_list.csv')

    pd.DataFrame(high_hedges, columns = ['name']).to_csv("data/csvs/high_hedges.csv")
    pd.DataFrame(med_hedges, columns = ['name']).to_csv("data/csvs/med_hedges.csv")
    pd.DataFrame(low_hedges, columns = ['name']).to_csv("data/csvs/low_hedges.csv")
    pd.DataFrame(post_hedge_list, columns = ['name']).to_csv("data/csvs/post_hedge_list.csv")

#### Load Data

In [17]:
nlp_data_dir = "/mnt/nfs/projects/cxr/nlp/data"
cxr_rpt_dir = "/mnt/nfs/projects/cxr/reports/processed/"

In [5]:
# TODO need to get a list of accnums that contain PA views

In [6]:
with open ("{}/pickles/term_rules.p".format(nlp_data_dir), 'rb') as f:
    term_rules = pickle.load(f)
with open("{}/pickles/neg_rules.p".format(nlp_data_dir), "rb") as f:
    neg_rules = pickle.load(f)
with open("{}/pickles/prev_rules.p".format(nlp_data_dir), "rb") as f:
    prev_rules = pickle.load(f)

In [7]:
vis_dis_list = pd.read_csv("{}/csvs/radlex_vis_dis.csv".format(nlp_data_dir)).name.tolist()
anatomy_list = pd.read_csv("{}/csvs/radlex_anatomy.csv".format(nlp_data_dir)).name.tolist()
device_list = pd.read_csv("{}/csvs/radlex_devices.csv".format(nlp_data_dir)).name.tolist()
procedure_list = pd.read_csv('{}/csvs/radlex_procedures.csv'.format(nlp_data_dir)).name.tolist()

change_list = pd.read_csv('{}/csvs/change_list.csv'.format(nlp_data_dir)).name.tolist()
normal_list = pd.read_csv('{}/csvs/normal_list.csv'.format(nlp_data_dir)).name.tolist()
degree_list = pd.read_csv('{}/csvs/degree_list.csv'.format(nlp_data_dir)).name.tolist()

locations = pd.read_csv("{}/csvs/radlex_location_list.csv".format(nlp_data_dir)).name.tolist()
descriptors = pd.read_csv("{}/csvs/radlex_descriptor_list.csv".format(nlp_data_dir)).name.tolist()
post_hedge_list = pd.read_csv("{}/csvs/post_hedge_list.csv".format(nlp_data_dir)).name.tolist()
high_hedges = pd.read_csv("{}/csvs/high_hedges.csv".format(nlp_data_dir)).name.tolist()
med_hedges = pd.read_csv("{}/csvs/med_hedges.csv".format(nlp_data_dir)).name.tolist()
low_hedges = pd.read_csv("{}/csvs/low_hedges.csv".format(nlp_data_dir)).name.tolist()

In [8]:
def sort_list(lst):
    lengths = [len(item) for item in lst]
    lst_df = pd.Series(lengths, index = lst) #, columns = ['len'])
    lst_df = lst_df.sort_values(ascending = False)
    return lst_df.index.tolist()

In [9]:
#Set Up Hedge Scoring System & Check That No Hedges Are Placed in Multiple Categories
for h in high_hedges:
    for m in med_hedges:
        for l in low_hedges:
            if h == m or h == l or l == m:
                print("ERROR: redundancy found", h, m, l)

hedge_list = list(set(high_hedges + low_hedges + med_hedges ))         

hedge_dict = {}
for word in low_hedges:
    hedge_dict[word] = 'low'
for word in med_hedges:
    hedge_dict[word] = 'medium'
for word in high_hedges:
    hedge_dict[word] = 'high'
    
hedge_scores = { 'low': 3,'medium': 2,'high': 1 }

In [18]:
# radnet = pd.read_excel("data/csvs/radnet_norm_parsed.xlsx")
# df = pd.read_csv("{}/radnet_cxr_100K_reports_parsed.csv".format(cxr_rpt_dir), delimiter="|", dtype=str)
df = pd.read_csv("{}/radnet_cxr_784K_reports_parsed.csv".format(cxr_rpt_dir), delimiter="|", dtype=str)

In [20]:
df.shape

(784248, 10)

In [19]:
df.columns

Index(['mrn', 'accnum', 'EXAM', 'HISTORY', 'TECHNIQUE', 'COMPARISON',
       'FINDINGS', 'IMPRESSION', 'Conclusion', 'report'],
      dtype='object')

### Sentence Processing

In [21]:
def pre_process_sents(findings):
    if type(findings) == None or type(findings) == float:
        return []
    else:
        sentences = nltk.tokenize.sent_tokenize(findings)
        sentences = [sent.lower() for sent in sentences]
        sentences = [sent.split("   ") for sent in sentences]
        sentences = [sent for sents in sentences for sent in sents]
        sentences = [re.sub('\d+?/\d+?/\d{2,}', '', sent) for sent in sentences]
        sentences = [sent.replace("/", " ").replace("\n", " ") for sent in sentences]
        sentences = [sent.replace("chronic obstructive pulmonary disease", "copd") for sent in sentences]
        sentences = [sent.replace("coronary artery bypass graft", "cabg") for sent in sentences]
        sentences = [sent.replace("coronary bypass surgery", "cabg") for sent in sentences]
        sentences = [sent.replace("tb", "tuberculosis") for sent in sentences]
        sentences = [sent.replace("cp", "costophrenic") for sent in sentences]
        sentences = [sent.replace(".", " ") for sent in sentences]
        return sentences

def get_sents(df):
    imps, finds = [], []
    for i in range(len(df)):
        pt = df.iloc[i]
        imps.append(pre_process_sents(pt.IMPRESSION))
        finds.append(pre_process_sents(pt.FINDINGS))
    imps = list(set([sent for sents in imps for sent in sents]))
    finds = list(set([sent for sents in finds for sent in sents]))
    return imps, finds

### ClauseSplitter

In [22]:
'''Split initial list of sentences into a list of sentences by clause'''
def split_by_clause(sentence, term_rules):
    
    '''Subfunction to split up sentence if the word AND is present'''
    def split_ands(phrases):
        new_phrases = []
        for phrase in phrases:
            if phrase.count('and') == 1 and "," not in phrase:
                parts = phrase.split('and')
                pos1, pos2 = [token.pos_ for token in nlp(parts[0])], [token.pos_ for token in nlp(parts[1])]
                if 'NOUN' in pos1 and 'VERB' in pos1 and 'NOUN' in pos2 and 'VERB' in pos2:  #maybe also 'ADV'
                    new_phrases.append(parts[0])
                    new_phrases.append(parts[1])
                else:
                    new_phrases.append(phrase)
            else:
                new_phrases.append(phrase)
        return new_phrases
    
    '''Subfunction to split up sentence into comma-separated phrases'''
    def split_sent_by_comma(sent):
        list_start, list_end = 0, 0
        comma_indices = [c.start() for c in re.finditer(',', sent)]

        lst_indices = []

        #if oxford comma
        if re.findall(', (((\w+\s?){1,2},)\s)+?(and|or)', sent):
            lst_indices = [(c.start(), c.end()) for c in re.finditer(', (((\w+\s?){1,2},)\s)+?(and|or)', sent)]
        #if no oxford comma
        #elif re.findall(', ((\w+\s?){1,2},)+?(\s\w+)\s(and|or)', sent):
        elif re.findall('((\w+\s?){1,2},\s?)+?(\s\w+)+?\s(and|or)', sent):
            #lst_indices = [(c.start(), c.end()) for c in re.finditer(', ((\w+\s?){1,2},)+?(\s\w+)\s(and|or)', sent)]
            lst_indices = [(c.start(), c.end()) for c in re.finditer('((\w+\s?){1,2},\s?)+?(\s\w+)+?\s(and|or)', sent)]

        split_zones = [0]
        for j in range(len(lst_indices)):
            split_zones = split_zones + [i for i in range(split_zones[-1], lst_indices[j][0])] + [lst_indices[j][1]]
        split_zones = split_zones + [i for i in range(split_zones[-1], len(sent))]

        to_split = []

        for idx in comma_indices:
            if idx in split_zones:
                to_split.append(idx)

        sxns = [sent[i:j] for i, j in zip([0] + to_split, to_split + [len(sent)])]

        return sxns

    term_pat, clauses = "\[TERM\]", []

    for rule in term_rules:  #check every rule for a clause termination word 
        reformatRule = re.sub(r'\s+', '_', rule[0].strip())
        sentence = rule[3].sub(' ' + rule[2].strip()   #add in Negation tag to 
                                  + reformatRule + rule[2].strip() + ' ', sentence)

    if re.findall(term_pat, sentence, flags = re.IGNORECASE):   #if termination words exist, split up the phrases by them
        phrases = re.split(term_pat, sentence, flags = re.IGNORECASE)
        phrases = [" ".join([word.strip() for word in phrase.split()]) for phrase in phrases if len(phrase.split()) > 1]
        phrases = [split_sent_by_comma(phrase) for phrase in phrases]  #Split phrases by comma, except in list case
        phrases = [phrase for sub_phrase in phrases for phrase in sub_phrase]
        phrases = [re.split(';|:', phrase) for phrase in phrases]
        phrases = [phrase for sub_phrase in phrases for phrase in sub_phrase]
        phrases = [phrase.split("  ") for phrase in phrases]
        phrases = [phrase for sub_phrase in phrases for phrase in sub_phrase]
        for phrase in phrases:
            clauses.append(phrase.lower())
    else:  #if no termination words exist, return listicized sentence following other split rules
        phrases = split_sent_by_comma(sentence) 
        phrases = [re.split(';|:', phrase) for phrase in phrases]
        phrases = [phrase for sub_phrase in phrases for phrase in sub_phrase]
        phrases = [phrase.split("  ") for phrase in phrases]
        phrases = [phrase for sub_phrase in phrases for phrase in sub_phrase]
        phrases = split_ands(phrases)
        for phrase in phrases:
            if len(phrase) != 0:
                clauses.append(phrase.lower())      
    return clauses

### Supporting Classes (Descriptor, ChangeEntity, ExtractedEntity)

In [23]:
class Descriptor(object):
    def __init__(self, name, qualifiers = [], hedges = []):
        self.name = name
        self.hedge = self.set_hedging(hedges)
        self.qualifiers = qualifiers
        
    def set_hedging(self, hedges):
        self.hedge = ", ".join(hedges)
    
    def describe(self):
        if len(self.qualifiers) > 0:
            return self.name + " (" + ", ".join(self.qualifiers) + ")"
        else:
            return self.name
        
class ChangeEntity(object):
    def __init__(self, name, prior_exam_exists = False, location = '', description = []):
        self.name = name
        self.prior_exam_exists = prior_exam_exists
        self.location = location
        self.description = description  

        
class ExtractedEntity(object):
    def __init__(self, name, ent_type = '', location = '', hedges = [],
                 description = [], 
                 is_previous = False, is_normal = False, is_negated = False):
        
        self.name = name
        
        self.ent_type = ent_type  #Vis-Disease, Anatomy, Procedure, Device, Change
        self.location = location
        self.hedges = hedges
        self.description = description  #this should hold Descriptor objects
        
        self.previous = is_previous  #should be 'current' or 'previous'
        self.normality = is_normal  #should be 'abnormal' or 'normal'
        self.negated = is_negated
        
        self.hedging = self.set_hedging(self.hedges)
        
    def describe(self):
        return_string = ""
        for descriptor in self.description:
            doc = nlp(descriptor.describe())
            tok_pos = [token.pos_ for token in doc]
            if 'NOUN' not in tok_pos:
            
                if descriptor.hedge is None:
                    return_string = descriptor.describe() + ", " + return_string
                else:
                    #modify this later to match test set format
                     return_string = descriptor.describe() + " (" + descriptor.hedge + "), " + return_string
        return return_string.strip(", ")
               
    def is_changed(self, change_rules):
        change_present = False
        
        if self.name in change_rules:
            change_present = True
        for descriptor in self.description:
            if descriptor.name in change_rules:
                change_present = True
        
        if not change_present:
            return False
        elif self.negated:
            return False
        else:
            return True
    
    def set_hedging(self, hedges):
        self.hedging = ", ".join(hedges)
        self.hedges = hedges
    
    def set_hedge_level(self, hedge_dict, hedge_scores):
        score = np.sum([hedge_scores[hedge_dict[word]] for word in self.hedges])
        if score <= 1:
            self.hedge_level = 'high'
        elif score >= 3:
            self.hedge_level = 'low'
        else:
            self.hedge_level = 'medium'
        return self.hedge_level
    
    def get_description(self, certainties = True):
        if certainties:
            return str([str(mod.certainty) + " " + str(mod.name) for mod in self.description]).replace("[", "").replace("]", "")
        else:
            return str([mod.name for mod in self.description]).replace("[", "").replace("]", "")
        
    def output(self):
        output = str(self.name)
        values = [self.category, self.location, 
                      self.get_description(), self.certainty]
        value_names = ['category', 'location', 'description', 'certainty']
        
        for i in range(len(values)):
            value = values[i]
            if value is not '':
                output = output + "     " + value_names[i].upper() + ": " + value
        return output 

In [125]:
class ReportExtractor(object):
    def __init__(self, 
                 clause = None, neg_rules = None, prev_rules = None, 
                 
                 vis_dis_list = [], anatomy_list = [], 
                 procedure_list = [], device_list = [], change_list = [],
                 
                 locations_list = [], descriptor_list = [], normal_list = [],
                 
                 hedge_list = [], post_hedge_list = [], 
                 hedge_dict = {}, hedge_scores = {}, grab = False):
        
        self.__filler = '_'
        self.__clause = clause
        self.__clause_doc = nlp(clause)
        self.__keys = ['visual_disease', 'anatomy', 'procedure', 'device', 'change']
        self.__neg_rules = neg_rules
        self.__prev_rules = prev_rules
        self.__vis_dis_list = vis_dis_list
        self.__anatomy_list = anatomy_list
        self.__procedure_list = procedure_list
        self.__device_list = device_list
        
        
        self.__joined_rules = []
        
        for rule_set in [neg_rules, prev_rules]:
            if rule_set is not None:
                self.__joined_rules = self.__joined_rules + rule_set
        
        self.__dicts = self.bind_dicts([vis_dis_list, anatomy_list, procedure_list, device_list, change_list])
        
        self.__neg_list = [rule[0] for rule in neg_rules if rule[3] is not '[PSEU]']
        self.__location_list = locations_list
        self.__descriptor_list = descriptor_list
        self.__normal_list = normal_list
        self.__change_list = change_list
        
        
        self.__hedge_list = hedge_list
        self.__post_hedges = post_hedge_list
        self.__hedge_dict = hedge_dict
        self.__hedge_scores = hedge_scores
        
        self.__grab = grab
        return None
    
    '''MAIN CALL'''
    def run_extractor(self):
        self.__chunks, self.__tagged_chunks, self.__raw_mapping_dicts, self.__full_chunks_dicts = self.run_chunker() 
        self.__tagged_clause = self.tag(self.__clause)
        self.__chunk_idx_dicts = self.get_indices()
        self.__all_entities = self.parse_indices()   #should be a list of ExtractedEntity objects
        self.apply_hedges()
        return self.clean_output()

    
    '''INITIALIZING FUNCTIONS'''
    def bind_dicts(self, lists):
        new_dict, key_idx = {}, 0
        for lst in lists:
            new_dict[self.__keys[key_idx]] = lst
            key_idx += 1
        return new_dict
    
    def tag(self, phrase):
        for rule in self.__joined_rules:
                reformatRule = re.sub(r'\s+', self.__filler, rule[0].strip())
                phrase = rule[3].sub(' ' + rule[2].strip() + reformatRule + rule[2].strip() + ' ', phrase)
        return phrase 
    
    '''MAPPING TO RADLEX FUNCTIONS'''
    def map_text_to_radlex(self, text, dictionary):
        text = text.strip()
        if text in dictionary:
            return text, text
        
        #SUFFIX SUBSTITUTIONS
        #plurals
        elif text.replace("'s", "") in dictionary:
            return text, text.replace("'s", "")
        elif text.replace("ies", "y") in dictionary:
            return text, text.replace("ies", "ty")
        elif text.replace("es", "is") in dictionary:
            return text, text.replace("es", "is")
        elif text[:-1] in dictionary:
            return text, text[:-1]
        #-ing/-ed/-ion words all map to -ed (ie, "hyperinflation of the lungs")
        elif text.replace("ing", "ed") in dictionary:
            return text, text.replace("ing", "ion")
        elif text.replace("ing", "") in dictionary:  #should catch also spurring/spur
            return text, text.replace("ing", "")
        elif text.replace("ring", "") in dictionary:
            return text, text.replace("ring", "")
        elif text.replace("ion", "ed") in dictionary:
            return text, text.replace("ion", "ed")
        elif text.replace("ative", "ed") in dictionary:
            return text, text.replace("ative", "ed") 
        #-atous/-a matches (ie emphysema/emphysematous, atheroma/atheromatous)
        elif text.replace("tous", "") in dictionary:
            return text, text.replace("tous", "")
        #WHY THIS?
        elif text.replace("ed", "ement") in dictionary:
            return text, text.replace("ed", "ement")
        #haziness/hazy
        elif text.replace("iness", "y") in dictionary:
            return text, text.replace("iness", "y")
        #mildly/mild
        elif text.replace("ly", "") in dictionary:
            return text, text.replace("ly", "") 
        #tortuousity/tortuous
        elif text.replace("ity", "") in dictionary:
            return text, text.replace("ity", "")
        #infiltrate/infiltration
        elif text.replace("e", "ion") in dictionary:
            return text, text.replace("e", "ion")
        
        #AFFIX SUBSTITUTIONS
        elif text.replace("para", "") in dictionary:
            return text, text.replace("para", "")
        elif text.replace("peri", "") in dictionary:
            return text, text.replace("peri", "")
        elif text.replace("bi", "") in dictionary:
            return text, text.replace("bi", "")
        
        else:
            return None, None
    
    def map_to_radlex(self, chunk, dictionary):
        if type(chunk) != str:
            text = chunk.text
        else:
            text = chunk
        text_words, word_idx = text.split(), 0

        found_mapping = False
        
        while not found_mapping and word_idx < len(text_words):
            raw_text, mapped = self.map_text_to_radlex(text, dictionary) 
            if raw_text is not None:
                 found_mapping = True
            else:
                word_idx += 1
                text = " ".join(text_words[word_idx:])  
        
        if word_idx == len(text_words) and not found_mapping:
            for word in text_words[:-1]:
                if not found_mapping:
                    raw_text, mapped = self.map_text_to_radlex(word, dictionary) 
                    if raw_text is not None:
                        found_mapping = True
                        
        return [raw_text, mapped] 
    
    def remove_submatches(self, matches_to_search):
        unique_matches = []
        
        while len(matches_to_search) > 0:
            match1 = max(matches_to_search, key = len)
            related_matches = [match1]
            matches_to_search.remove(match1)     
            for match2 in matches_to_search:
                if match2 in match1:
                    related_matches.append(match2)
            unique_matches.append(max(related_matches, key = len))
            for match in related_matches:
                if match in matches_to_search:
                    matches_to_search.remove(match)              
        return unique_matches
    
    def map_modifiers(self, possible_modifiers):
        mod_descriptors = []
        is_normal = False

        for modifier in possible_modifiers:
            #print("IN MAP MODIFIERS, mapping", modifier)
            descripts = self.map_to_radlex(modifier, self.__descriptor_list)
            normals = self.map_to_radlex(modifier, self.__normal_list)
            
            if descripts[0] is not None:
                #print("found a descriptor, ", descripts[0])
                descr_name, adverbs = descripts[0], []
                descr_token = [token for token in self.__clause_doc if token.text == descripts[0]][0]
                if descr_token.pos_ == 'VERB' or descr_token.pos_ == 'ADJ':
                    adverbs = [child.text for child in descr_token.children if child.pos_ == 'ADV']
                mod_descriptors.append(Descriptor(name = descr_name, qualifiers = adverbs))
            if normals[0] is not None:
                is_normal = True 
        return is_normal, mod_descriptors
    
    def map_locations(self, possible_locations): 
        #print("in MAP LOCATIONS, considering ", possible_locations)
        locations = []
        for location in possible_locations:
            locs = self.map_to_radlex(location, self.__location_list)
            
            if locs[0] is not None:
                locations.append(locs[0])
        return locations

    
    def run_chunker(self):
        #KEYS ORDER: 'visual-disease', 'anatomy', 'procedure', 'device', 'change'
        num_keys = len(self.__keys)
        chunk_lists, tagged_chunk_lists = [[] for i in range(num_keys)], [[] for i in range(num_keys)]
        mapping_dicts, full_chunk_dicts =  [{} for i in range(num_keys)], [{} for i in range(num_keys)]

        chunks = list(set([chunk for chunk in self.__clause_doc.noun_chunks] + [token for token in self.__clause_doc if token.pos_ == 'VERB' or token.pos_ == 'NOUN' or token.pos_ == 'ADJ']))
        narrowed_chunk_texts = self.remove_submatches([chunk.text for chunk in chunks])
        chunks = [chunk for chunk in chunks if chunk.text in narrowed_chunk_texts]
        self.__spacy_chunks = chunks
        
        #Sort chunks and store mappings
        for chunk_list_idx in range(num_keys):
            for chunk in chunks:
                raw, mapped = self.map_to_radlex(chunk, self.__dicts[self.__keys[chunk_list_idx]])
                
                if raw is not None and raw not in chunk_lists[chunk_list_idx]:
                    full_chunk_dicts[chunk_list_idx][raw] = chunk 
                    chunk_lists[chunk_list_idx].append(raw)
                    mapping_dicts[chunk_list_idx][raw] = mapped
            
            chunk_lists[chunk_list_idx] = self.remove_submatches(chunk_lists[chunk_list_idx])
        
        #caveat
        if "lymph node" in " ".join(chunk_lists[1]):
            replace_vis_dis_list = []
            for chunk in chunk_lists[0]:  #vis_dis 
                if "node" not in chunk:
                    replace_vis_dis_list.append(chunk)
            chunk_lists[0] = replace_vis_dis_list
        
        #Tag the raw_text of the chunks with rules (allow for proper indexing later)
        tagged_chunk_list_idx = 0
        for chunk_list in chunk_lists:
            for raw in chunk_list:
                tagged_chunk_lists[tagged_chunk_list_idx].append(self.tag(raw))
            tagged_chunk_list_idx += 1
        
        return chunk_lists, tagged_chunk_lists, mapping_dicts, full_chunk_dicts
    
    
    def get_indices(self):
        chunk_idx_dicts = []
        for lst in self.__tagged_chunks:
            chunk_idx_dict = {}
            for tagged_chunk in lst:
                chunk_idx_dict[tagged_chunk] = self.__tagged_clause.index(tagged_chunk)
            chunk_idx_dicts.append(chunk_idx_dict)
        return chunk_idx_dicts
    

    '''CHECK NEGATIONS AND PREVIOUS STATUS'''
    def check_change_negation(self, raw_chunk):
        is_negated = False
        
        for token in self.__clause_doc:
            
            if token.text in raw_chunk:
                if token.pos_ == 'NOUN':
                    to_check = [child.text for child in token.children]   
                else:
                    to_check = [child.text for anc in [anc for anc in token.ancestors] for child in anc.children]
                    
                for word in self.__neg_list:
                        if word in to_check:
                            is_negated = True
        return is_negated
        
    def check_negation(self, chunk_idx):
        #Find the indices of the pre-negation, post-negation flags
        clause_words, preneg_idxs, postneg_idxs = self.__tagged_clause.split(), [], []
        
        for word in clause_words:
            if re.findall('\[PREN\]|\[PREP\]', word):
                preneg_idxs.append(self.__tagged_clause.index(word))
            if re.findall('\[POST\]|\[POSP\]', word):
                postneg_idxs.append(self.__tagged_clause.index(word))
              
        #return false if no negation tags present
        if len(preneg_idxs) == 0 and len(postneg_idxs) == 0:
            return False
        
        #extract only those tags that are before or after the chunk idx
        preneg_idxs = [neg_idx for neg_idx in preneg_idxs if neg_idx < chunk_idx]
        postneg_idxs = [neg_idx for neg_idx in postneg_idxs if neg_idx > chunk_idx]
        
        #Set pre-neg/post-neg/previous indices, handling multiple negations
        if len(preneg_idxs) % 2 == 0:
            is_pre_negated = False
        else:
            is_pre_negated = True
        
        if len(postneg_idxs) % 2 == 0:
            is_post_negated = False
        else:
            is_post_negated = True
            
        neg_sum = is_pre_negated + is_post_negated
        
        if neg_sum % 2 == 0: #if not pre or post negated, or if both pre and post negated
            return False
        if neg_sum == 1:
            return True 
    
    def check_previous(self, chunk_idx):
        clause_words, prev_idxs = self.__tagged_clause.split(), []
        
        for word in clause_words:
            if re.findall('\[PREV\]', word):
                prev_idxs.append(self.__tagged_clause.index(word))
        if len(prev_idxs) == 0:
            return False
        else:
            prev_idx = prev_idxs[0]
            if prev_idx < chunk_idx:
                return True
        return False
        
###DEBUG LOCATIONS
    '''PARSING ENTITIES FUNCTIONS'''
    def get_modifiers(self, token, location = False):
        
        children = [child.text for child in token.children if child.pos_ in ['ADJ', 'ADV', 'NOUN', 'VERB']]
        ancestors = [anc.text for anc in token.ancestors if anc.pos_ in ['VERB','ADJ', 'ADV', 'NOUN']]
        additionals = [tok.text for tok in self.__clause_doc if (tok.dep_ in ['conj', 'acomp', 'xcomp']) and (token in [child for child in tok.children] or token in [anc for anc in tok.ancestors])]
        
        dets = [tok.text for tok in self.__clause_doc if tok.dep_ is 'det' and (token in [anc for anc in tok.ancestors])]

        if location:
            lost_adjs = [tok.text for tok in self.__clause_doc if tok.pos_ == 'ADJ' and token in [anc for anc in tok.ancestors]]
        else:
            lost_adjs = []
        possible_modifiers = list(set(children + ancestors + additionals + dets + lost_adjs))
        
       # print("IN GET MODIIFERS, possible mods are ", possible_modifiers)
        
        if not location:
            narrowed_modifiers = []
            for modifier in possible_modifiers:
                mod_token = [tok for tok in self.__clause_doc if tok.text == modifier][0]
                mod_dependents = [child for child in mod_token.children] + [anc for anc in mod_token.ancestors]
                if token in mod_dependents:
                    narrowed_modifiers.append(modifier)
            #print("not location, returning a narrowed version")  #DEBUG
            return narrowed_modifiers
        else:
            return possible_modifiers
    
    def get_location(self, ent):
        
        
        possible_modifiers = []
        for token in self.__clause_doc:
            if token.text in ent:
                possible_modifiers = possible_modifiers + self.get_modifiers(token, location = True)          
        location = self.map_locations(possible_modifiers)
        
        #Check anatomy chunks for mapping
        anatomy_chunks = self.__chunks[1]
       # print("IN GET LOCATION,   FULL LIST OF ANATOMY CHUNKS ARE: ", anatomy_chunks)
        for anatomy in anatomy_chunks:
            anatomy_chunk = self.__full_chunks_dicts[1][anatomy]
           # print("IN GET LOCATION, considering chunk", anatomy, "with full chunk", anatomy_chunk, "\n")
        
            if type(anatomy_chunk) == spacy.tokens.span.Span:
                ancestors = [anc.text for anc in anatomy_chunk.root.ancestors]
                children = [child.text for child in anatomy_chunk.root.children]
                
            else:
                ancestors = [anc.text for anc in anatomy_chunk.ancestors]
                children = [child.text for child in anatomy_chunk.children]
            
            #print("considering ancestors and children", ancestors, children)
            retain_anatomy = []
            for word in ent.split(" "):
                #print("looking at word", word)
                #print("it should match ", ancestors + children)
                if word in ancestors + children:
                    words = anatomy_chunk.text.split()
                    refined_words = []
                    for word in words:
                       # print("LOOKING AT WORD", word)
                        if word not in ["the", "a"]:
                            if self.map_to_radlex(word, self.__location_list)[0] is not None:
                                refined_words.append(word)
                            elif self.map_to_radlex(word, self.__anatomy_list)[0] is not None:
                                refined_words.append(word)
                    location.append(" ".join(refined_words))
        location = list(set(location))
        location = self.remove_submatches(location)
        return location

    
    def parse_entity(self, idx, ent_type):
        entity_chunks, raw_chunks = self.__tagged_chunks[idx], self.__chunks[idx]
        chunk_idx_dict = self.__chunk_idx_dicts[idx]
        entity_ents, chunk_list_idx  = [], 0
        
        if ent_type == 'anatomy':
            location = True
        else: 
            location = False
        
        while chunk_list_idx < len(entity_chunks):
            tagged_chunk, raw_chunk = entity_chunks[chunk_list_idx], raw_chunks[chunk_list_idx]
            chunk_idx = chunk_idx_dict[tagged_chunk]
            
            is_previous = self.check_previous(chunk_idx)
            is_negated = self.check_negation(chunk_idx)
            
            possible_modifiers = []
            for token in self.__clause_doc:
                if token.text in raw_chunk:
                    possible_modifiers = possible_modifiers + self.get_modifiers(token, location)
                
            is_normal, mod_descriptors = self.map_modifiers(list(set(possible_modifiers)))
            #print("IN PARSE_ENTITY: Possible modifiers are", possible_modifiers,
                  #"selected descriptors are", [descr.name for descr in mod_descriptors])
            
            if not (ent_type == 'anatomy' and len(mod_descriptors) == 0):
                if ent_type == 'anatomy':
                    
                    potential_location = self.__full_chunks_dicts[1][raw_chunk].text
                    refined_words = []
                    for word in potential_location.split():
                        if word not in ["the", "a"]:
                            if self.map_to_radlex(word, self.__location_list)[0] is not None:
                                refined_words.append(word)
                            elif self.map_to_radlex(word, self.__anatomy_list)[0] is not None:
                                refined_words.append(word)
                    anat_loc = " ".join(refined_words)
                    
                    
                    ent_name = raw_chunk + " (" + ", ".join([descriptor.describe() for descriptor in mod_descriptors]) + ")"
                    location = [anat_loc] #self.get_location(raw_chunk)
                else: 
                    ent_name = raw_chunk
                    location = self.get_location(raw_chunk)
                if ent_type == 'change':
                    is_negated = self.check_change_negation(raw_chunk)
                    
                entity_ents.append(ExtractedEntity(name = ent_name, ent_type = ent_type,
                                                    location = ", ".join(location),
                                                    description = mod_descriptors,
                                                    is_previous = is_previous,
                                                    is_negated = is_negated, is_normal = is_normal))
            chunk_list_idx += 1
        
        return entity_ents
    
    def parse_indices(self):
        entity_lsts = [self.parse_entity(i, self.__keys[i]) for i in range(len(self.__keys))]
        return [ent for lst in entity_lsts for ent in lst]

    '''HEDGING CODE''' 
    def apply_hedges(self):
        hedges = [phrase for phrase in self.__hedge_list if phrase in self.__clause]
        
        doc = nlp(self.__clause) 
        if len(hedges) > 0:
            
            for entity in self.__all_entities:
                ent_hedges = []
                
                #Assume the last word in the entity name is the fundamental one
                ent_tokens = [token for token in doc if token.text == entity.name.split()[-1]]
                
                #If you format naming correctly (NO parentheses, should always be a match to the text), you shouldn't need this caveat
                if len(ent_tokens) > 0:
                    ent_token = ent_tokens[0]
                
                    for hedge in hedges:
                        anc_hedges = list(set([hedge for anc in ent_token.ancestors if anc.text in hedge] + [hedge for child in ent_token.children if child.text in hedge]))
                        for anc_hedge in anc_hedges:
                            if anc_hedge not in self.__post_hedges:
                                if self.__clause.index(anc_hedge) < self.__clause.index(ent_token.text):
                                    ent_hedges.append(anc_hedge)
                            else:
                                ent_hedges.append(anc_hedge)  #because post hedges often can also be prior


                    #2nd Layer Search for hedge among children
                    if len(ent_hedges) == 0:
                        #print("2nd layer search")
                        for hedge in hedges:
                            child_hedges = [hedge for child in ent_token.children if child.text in hedge]
                            #print("Child hedges:", child_hedges)
                            for child_hedge in child_hedges:
                                if self.__clause.index(child_hedge) < self.__clause.index(ent_token.text):
                                    ent_hedges.append(child_hedge)
                    
                    ent_hedges = list(set(ent_hedges))
                    if len(ent_hedges) > 0:
                        entity.set_hedging(self.remove_submatches(ent_hedges))

                    for descriptor in entity.description:
                        descr_token = [token for token in doc if token.text == descriptor.name][0]
                        descr_hedges = [anc.text for anc in descr_token.ancestors if anc.text in hedges] + [child.text for child in descr_token.children if child.text in hedges]
                        if len(descr_hedges) > 0:
                            descriptor.set_hedging(self.remove_submatches(descr_hedges))
        return None
   
    def remove_redundant_anatomy(self, df):
        findings_to_remove, idxs_to_check, findings_to_check = [], [], []
        
        for i in range(len(df)):
            if df.finding_types.loc[i] == 'anatomy':# or df.finding_types.loc[i] == 'visual_disease':
                idxs_to_check.append(i)
                findings_to_check.append(df.findings[i])  
                
                
        for k in range(len(findings_to_check)): 
            finding = findings_to_check[k]
            for j in range(len(df)):
                if j not in idxs_to_check:
                    ent = df.iloc[j]
                    slim_finding = finding.split("(")[0].strip()  #this counts on the parentheses being at the end
                    if ent.locations in slim_finding or slim_finding in ent.locations:
                        findings_to_remove.append(finding)     
         
        for finding in findings_to_remove:
            df = df[df.findings != finding]
            
        return df
    
    
    def check_lost(self):
        num_chunks = np.sum([len(chunk_list) for chunk_list in self.__chunks])
        if num_chunks > len(self.__all_entities):
            ent_adjs = [adj for entity in self.__all_entities for adj in entity.describe().split(",")]
            ent_adjs = ent_adjs + [adj for entity in self.__all_entities for adj in entity.location.split(",")]
            ent_adjs = list(set([word.strip("\'") for ent in ent_adjs for word in ent.split(" ")]))
            
            adjs = [token for token in nlp(self.__clause) if token.pos_ == 'ADJ']
            missing_adjs = []
            for adj in adjs:
                if adj.text not in ent_adjs:
                    missing_adjs.append(adj)
            
            #print(ent_adjs, missing_adjs, adjs)
            if len(missing_adjs) > 0:
                return True, True
            else:
                return True, False
        else:
            return False, False
        
    def grab_bag(self):
        #print("IN GRAB BAG")
        doc = nlp(self.__clause)
        root_verb = [token for token in doc if token.dep_ == 'ROOT']
        if len(root_verb) == 0:
            return None
        else:
            root_verb = root_verb[0]
            root_adjs = [child.text for child in root_verb.children if child.pos_ == 'ADJ']
            relations = [token.text for token in doc if root_verb.text in [anc.text for anc in token.ancestors] and token.pos_ == 'NOUN']
            
            adj_string = ", ".join(root_adjs)
            new_outputs = []
            
            for relation in relations:
                new_clause = "the " + adj_string + " " + relation
                sub_extractor = ReportExtractor(clause = new_clause, neg_rules = self.__neg_rules, 
                                               prev_rules = self.__prev_rules,
                                               
                                               vis_dis_list = self.__vis_dis_list, anatomy_list = self.__anatomy_list,
                                               procedure_list = self.__procedure_list, device_list = self.__device_list, 
                                               
                                                locations_list = self.__location_list, descriptor_list = self.__descriptor_list,
                                                normal_list = self.__normal_list,
                                                
                                                hedge_list = self.__hedge_list, post_hedge_list = self.__post_hedges, 
                                                hedge_dict = self.__hedge_dict, hedge_scores = self.__hedge_scores,
                                               grab = False)
                new_outputs.append(sub_extractor.run_extractor())
            
            if len(new_outputs) is not 0:
                return pd.concat(new_outputs)
            else:
                return None
            
    def remove_nonsense_locations(self, locations):
        new_locs = []
        for location in locations:
            doc = nlp(location)
            pos = [token.pos_ for token in doc]
            if 'NOUN' in pos:
                new_locs.append(location)
            elif 'ADJ' in pos:
                new_locs.append(location)
            else:
                new_locs.append('')
        return new_locs
            
    
    def clean_output(self):
        chunks_lost, adjs_lost = self.check_lost()
        addl_df = None
        if chunks_lost and adjs_lost and self.__grab:
            addl_df = self.grab_bag()
        
        findings, finding_types, certainties, statuses, descriptors, locations, changes = [], [], [], [], [], [], []
        
        for entity in self.__all_entities:
            #finding, finding_type, ceratinty, status, descriptors, locations, is_changed
            findings.append(entity.name)
            finding_types.append(entity.ent_type)
            certainties.append(entity.set_hedge_level(self.__hedge_dict, self.__hedge_scores))
            descriptors.append(entity.describe())
            locations.append(entity.location)
            changes.append(entity.is_changed(self.__change_list))
            
            if entity.negated and entity.previous:
                statuses.append("negated, previous")
            elif entity.negated:
                statuses.append("negated")
            elif entity.previous:
                statuses.append("previous")
            else:
                statuses.append("current")
                
        locations = self.remove_nonsense_locations(locations)
        
        print(len(findings), len(finding_types), len(certainties), len(statuses), len(descriptors), len(locations), len(changes))
        output_df = pd.DataFrame([findings, finding_types, certainties, statuses, descriptors, locations, changes],
                           index = ['findings', 'finding_types', 'certainties', 'statuses', 'descriptors',
                                   'locations', 'changed']).T
        if addl_df is not None:
            addl_df.index = range(len(addl_df))
            addl_copy = addl_df.copy()
            for i in range(len(addl_df)):
                finding = addl_df.findings[i]
                if finding in output_df.findings.tolist():
                    addl_copy = addl_copy.drop(i)
            addl_df = addl_copy
            if addl_copy is not None:
                output_df = pd.concat([output_df, addl_copy])
            
        output_df.index = range(len(output_df))
        
        #print("CONCAT DF: ", output_df)
        if self.__grab:
            output_df = self.remove_redundant_anatomy(output_df)
            
        return output_df

# Take Patient Input

In [59]:
def remove_double_errors(df):
    if 'effusion' in df.findings.values and 'pleural effusion' in df.findings.values:
        df = df[df.findings != 'effusion']
    if 'effusions' in df.findings.values and 'pleural effusions' in df.findings.values:
        df = df[df.findings != 'effusions']
    if 'process' in df.findings.values and re.findall('\S+\s+process', ", ".join(df.findings.values)):
        df = df[df.findings != 'process']
    if 'processes' in df.findings.values and re.findall('\S+\s+processes', ", ".join(df.findings.values)):
        df = df[df.findings != 'processes']
    if 'disease' in df.findings.values and (re.findall('\S+\s+disease', ", ".join(df.findings.values))):
        df = df[df.findings != 'disease']
    return df

In [73]:
def remove_submatches(matches_to_search):
    unique_matches = []

    while len(matches_to_search) > 0:
        match1 = max(matches_to_search, key = len)
        related_matches = [match1]
        matches_to_search.remove(match1)     
        for match2 in matches_to_search:
            if match2 in match1:
                related_matches.append(match2)
        unique_matches.append(max(related_matches, key = len))
        for match in related_matches:
            if match in matches_to_search:
                matches_to_search.remove(match)              
    return unique_matches

def get_abnormals(df):
    abnormals = []
    for finding in df[df.statuses == 'current'].findings:
        is_normal = False
        for word in normal_list + ["stable", "aerated", "aerated (well)"]:
            if word in finding.replace("(","").replace(")",""):
                is_normal = True
        if not is_normal:
            abnormals.append(finding)
    
    #include previous procedures
    procedure_df = df[df.finding_types == 'procedure']
    procedure_df = procedure_df[procedure_df.statuses == 'previous']
    for finding in procedure_df.findings:
        abnormals.append(finding)
    return list(set(remove_submatches(abnormals)))

def get_abnormals_and_locs(df):
    abnormal_locs = []
    abnormal_descrs = []
    
    currents = df[df.statuses == 'current']
    for idx in range(len(currents)):
        finding = currents.iloc[idx].findings
        is_normal = False
        for word in normal_list + ["stable", "aerated", "aerated (well)"]:
            if word in finding.replace("(","").replace(")",""):
                is_normal = True
        if not is_normal:
            abnormal_locs.append((finding, currents.iloc[idx].locations))
            abnormal_descrs.append((finding, currents.iloc[idx].descriptors))
    
    #include previous procedures
    procedure_df = df[df.finding_types == 'procedure']
    procedure_df = procedure_df[procedure_df.statuses == 'previous']
    for idx in range(len(procedure_df)):
        finding = procedure_df.iloc[idx].findings
        abnormal_locs.append((finding, procedure_df.iloc[idx].locations))
        abnormal_descrs.append((finding, procedure_df.iloc[idx].descriptors))
    return list(set(remove_submatches(abnormal_locs))), list(set(remove_submatches(abnormal_descrs)))

def get_findings(df):
    devices = []
    df = df[df.statuses != 'negated']
    df = df[df.statuses != 'negated, previous']
    df = df[df.statuses != 'previous']
    for finding in df.findings:
        devices.append(finding)
    return list(set(remove_submatches(devices)))

def get_changes(df):
    changes = []
    df = df[df.statuses != 'negated']
    df = df[df.statuses != 'negated, previous']
    for i in range(len(df)):
        change_description = df.iloc[i].findings + ", " + df.iloc[i].descriptors
        if len(change_description.split()) > 1:
            changes.append(change_description)
    return list(set(remove_submatches(changes)))

def get_changes_and_locs(df):
    change_locs = []
    change_descrs = []
    df = df[df.statuses != 'negated']
    df = df[df.statuses != 'negated, previous']
    for i in range(len(df)):
        change_description = df.iloc[i].findings + ", " + df.iloc[i].descriptors
        if len(change_description.split()) > 1:
            change_locs.append((change_description, df.iloc[i].locations))
            change_descrs.append((df.iloc[i].findings, df.iloc[i].descriptors))
    return list(set(remove_submatches(change_descrs))), list(set(remove_submatches(change_locs)))

def convert_list_to_string(list_items):
#     print(list_items)
    if len(list_items)==0 or list_items is None:
        final_str = ""
    else:
#         for idx, item in enumerate(list_items):
#             if "," in str(item):
#                 words = item.split(',')
#                 item = "__".join([word.strip() for word in words])
#                 list_items[idx] = item
        final_str = "--".join([str(item) for item in list_items])
    print(list_items, final_str)
    return final_str

# Basic Function Call

In [74]:
# #skipped 20737
# def extract_from_reports(radnet):
#     all_abnorms, all_changes = [], []
#     for i in range(len(radnet)):
#         abnormalities, changes = [], []
#         if i % 1000 == 0:
#             print(i)    
#         if type(radnet.iloc[i].FINDINGS) != float and type(radnet.iloc[i].IMPRESSION) != float:
#             patient_sents = pre_process_sents(radnet.iloc[i].FINDINGS + radnet.iloc[i].IMPRESSION)
#             results = get_patient_results(patient_sents)
#             if results is not None and len(results) > 0:
#                 abnormalities = get_abnormals(results[results.finding_types != 'change'])
#                 changes = get_changes(results[results.finding_types == 'change'])
                
#         elif type(radnet.iloc[i].FINDINGS) != float:
#             patient_sents = pre_process_sents(radnet.iloc[i].FINDINGS)
#             results = get_patient_results(patient_sents)
#             if results is not None and len(results) > 0:
#                 abnormalities = get_abnormals(results[results.finding_types != 'change'])
#                 changes = get_changes(results[results.finding_types == 'change'])
                
#         elif type(radnet.iloc[i].IMPRESSION) != float:
#             patient_sents = pre_process_sents(radnet.iloc[i].IMPRESSION)
#             results = get_patient_results(patient_sents)
#             if results is not None and len(results) > 0:
#                 abnormalities = get_abnormals(results[results.finding_types != 'change'])
#                 changes = get_changes(results[results.finding_types == 'change'])
                
#         all_abnorms.append(abnormalities)
#         all_changes.append(changes)
#     return pd.DataFrame([all_abnorms, all_changes], index = ['abnorms', 'changes']).T

In [81]:
#skipped 20737
def extract_from_reports(radnet):
    all_abnorms, all_devices, all_vis_diseases, all_changes, all_procedures, all_anatomies = [], [], [], [], [], []
    for i in tqdm(range(len(radnet))):
        abnormalities, devices, vis_diseases, changes, procedures, anatomies = [], [], [], [], [], []
#         if i % 1000 == 0:
#             print(i)
        findings = radnet.iloc[i].FINDINGS
        impression = radnet.iloc[i].IMPRESSION
        conclusion = radnet.iloc[i].Conclusion
        
        report_interest = ""        
        if type(findings) != float:
            report_interest += findings        
        if type(impression) != float:
            report_interest += impression
        if type(conclusion) != float:
            report_interest += conclusion

        patient_sents = pre_process_sents(report_interest)
        results = get_patient_results(patient_sents)        
        if results is not None and len(results) > 0:
            abnormalities = get_abnormals(results[results.finding_types != 'change'])
            
#             devices = get_findings(results[results.finding_types == 'device'])
#             vis_diseases = get_findings(results[results.finding_types == 'visual_disease'])
            changes = get_changes(results[results.finding_types == 'change'])
#             anatomies = get_findings(results[results.finding_types == 'anatomy'])
            procedures = get_findings(results[results.finding_types == 'procedure'])

        all_abnorms.append(abnormalities)
        all_devices.append(devices)
        all_anatomies.append(anatomies)
        all_procedures.append(procedures)
        all_vis_diseases.append(vis_diseases)
        all_changes.append(changes)

    return pd.DataFrame([all_abnorms, all_devices, all_vis_diseases, all_anatomies, all_procedures, all_changes], \
                        index = ['abnorms', 'devices', 'vis_diseases', 'anatomies', 'procedures', 'changes']).T

In [120]:
def get_patient_results(sents):
    patient_sent_dfs = []
    for sent in sents:
        pt_paths, pt_neg_paths, pt_changes, pt_devices, pt_procedures, pt_norm_anatomy = [], [], [], [], [], []
        clauses, clause_outputs = split_by_clause(sent, term_rules), []
        for clause in clauses:
            start = time.time()
            extractor = ReportExtractor(clause = clause, neg_rules = neg_rules, prev_rules = prev_rules, 

                                        vis_dis_list = vis_dis_list, anatomy_list = anatomy_list, 
                                        procedure_list = procedure_list, 
                                        device_list = device_list, change_list = change_list,

                                        locations_list = locations, descriptor_list = sort_list(descriptors), 
                                        normal_list = normal_list,

                                       hedge_list = hedge_list, post_hedge_list = post_hedge_list, 
                                       hedge_dict = hedge_dict, hedge_scores = hedge_scores, grab = True)

#             temp_results = extractor.run_extractor()
#             print(temp_results.shape, time.time() - start)
#             print(extractor.run_extractor())
            clause_output = extractor.run_extractor()
            if clause_output.shape[0]
            clause_outputs.append(extractor.run_extractor())
            print("run_extractor: {:.2f}sec".format(time.time() - start), clause_outputs[-1].shape)
        print(len(clause_outputs))
        if len(clause_outputs) != 0:
            print(pd.concat(clause_outputs).shape)
            patient_sent_dfs.append(pd.concat(clause_outputs))
        
#     if len(patient_sent_dfs) != 0:
#         pt_df = remove_double_errors(pd.concat(patient_sent_dfs).drop_duplicates())
#         return pt_df
#     else:
    return None

In [121]:
def extract_findings_from_reports(df):
    all_results = pd.DataFrame()
    for i in tqdm(range(len(df))):
        accnum = df.iloc[i].accnum
        findings = df.iloc[i].FINDINGS
        impression = df.iloc[i].IMPRESSION
        conclusion = df.iloc[i].Conclusion
        
        report_interest = ""        
        if type(findings) != float:
            report_interest += findings        
        if type(impression) != float:
            report_interest += impression
        if type(conclusion) != float:
            report_interest += conclusion

        patient_sents = pre_process_sents(report_interest)
        results = get_patient_results(patient_sents)
#         results['accnum'] = accnum
#         all_results = all_results.append(results, ignore_index=True)
    return all_results

In [126]:
extracted_findings = extract_findings_from_reports(df[:10])
# print(extracted_findings.columns)
list_columns = ['accnum','findings','finding_types','certainties','statuses','descriptors','locations','changed']
extracted_findings = extracted_findings[list_columns]



  0%|          | 0/10 [00:00<?, ?it/s][A[A

0 0 0 0 0 0 0
run_extractor: 0.11sec (0, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.16sec (1, 7)
2
(1, 7)
0 0 0 0 0 0 0
run_extractor: 0.03sec (0, 7)
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.19sec (0, 7)
2
(0, 7)
0 0 0 0 0 0 0
run_extractor: 0.03sec (0, 7)
2 2 2 2 2 2 2
run_extractor: 0.09sec (2, 7)
2
(2, 7)
0 0 0 0 0 0 0
run_extractor: 0.03sec (0, 7)
1 1 1 1 1 1 1
1 1 1 1 1 1 1
2 2 2 2 2 2 2
run_extractor: 0.18sec (2, 7)
2
(2, 7)
0 0 0 0 0 0 0
run_extractor: 0.05sec (0, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.22sec (1, 7)
2
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.13sec



 10%|█         | 1/10 [00:01<00:11,  1.25s/it][A[A

 (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.11sec (1, 7)
1
(1, 7)
2 2 2 2 2 2 2
run_extractor: 0.09sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.10sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
1 1 1 1 1 1 1
run_extractor: 0.13sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.17sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
1 1 1 1 1 1 1
1 1 1 1 1 1 1
run_extractor: 0.40sec (1, 7)
1
(1, 7)




 20%|██        | 2/10 [00:02<00:09,  1.25s/it][A[A

1 1 1 1 1 1 1
run_extractor: 0.09sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.12sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.06sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.07sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.09sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.12sec (0, 7)
0 0 0 0 0 0 0
run_extractor: 0.06sec (0, 7)
2
(0, 7)




 30%|███       | 3/10 [00:03<00:07,  1.04s/it][A[A

2 2 2 2 2 2 2
run_extractor: 0.15sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.15sec (1, 7)
1
(1, 7)
2 2 2 2 2 2 2
run_extractor: 0.07sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.06sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.14sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.12sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.11sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.08sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
run_extractor: 0.02sec (0, 7)
1
(0, 7)
2 2 2 2 2 2 2
run_extractor: 0.07sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
run_extractor: 0.02sec (0, 7)
1
(0, 7)




 40%|████      | 4/10 [00:04<00:06,  1.07s/it][A[A

1 1 1 1 1 1 1
run_extractor: 0.11sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
run_extractor: 0.02sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
run_extractor: 0.11sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.11sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.16sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.14sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
1 1 1 1 1 1 1
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.20sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
run_extractor: 0.06sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
1 1 1 1 1 1 1




 50%|█████     | 5/10 [00:05<00:05,  1.08s/it][A[A

1 1 1 1 1 1 1
run_extractor: 0.40sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.10sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.14sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.09sec (0, 7)
1
(0, 7)
2 2 2 2 2 2 2
run_extractor: 0.22sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
1 1 1 1 1 1 1
1 1 1 1 1 1 1
2 2 2 2 2 2 2
run_extractor: 0.40sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
run_extractor: 0.07sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
run_extractor: 0.04sec (0, 7)
2 2 2 2 2 2 2
run_extractor: 0.11sec (1, 7)
2
(1, 7)




 60%|██████    | 6/10 [00:06<00:04,  1.17s/it][A[A

2 2 2 2 2 2 2
run_extractor: 0.16sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
run_extractor: 0.11sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
run_extractor: 0.03sec (0, 7)
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.17sec (0, 7)
2
(0, 7)
2 2 2 2 2 2 2
run_extractor: 0.13sec (1, 7)
1
(1, 7)
2 2 2 2 2 2 2
run_extractor: 0.11sec (2, 7)
1
(2, 7)
0 0 0 0 0 0 0
run_extractor: 0.05sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.14sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.15sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.14sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
run_extractor: 0.08sec (1, 7)
1
(1, 7)
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.14sec (0, 7)
1 1 1 1 1 1 1
run_extractor: 0.06sec (1, 7)
2
(1, 7)




 70%|███████   | 7/10 [00:08<00:03,  1.26s/it][A[A

0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.12sec (0, 7)
1
(0, 7)
2 2 2 2 2 2 2
run_extractor: 0.10sec (2, 7)
1
(2, 7)
0 0 0 0 0 0 0
run_extractor: 0.05sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.17sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.12sec (1, 7)
1
(1, 7)




 80%|████████  | 8/10 [00:08<00:02,  1.07s/it][A[A

0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.10sec (0, 7)
1
(0, 7)
2 2 2 2 2 2 2
run_extractor: 0.10sec (2, 7)
1
(2, 7)
0 0 0 0 0 0 0
run_extractor: 0.08sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
run_extractor: 0.30sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
1 1 1 1 1 1 1
0 0 0 0 0 0 0
1 1 1 1 1 1 1
run_extractor: 0.40sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
run_extractor: 0.10sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.10sec (1, 7)
1
(1, 7)
2 2 2 2 2 2 2
run_extractor: 0.09sec (2, 7)
1
(2, 7)




 90%|█████████ | 9/10 [00:10<00:01,  1.17s/it][A[A

1 1 1 1 1 1 1
run_extractor: 0.12sec (1, 7)
0 0 0 0 0 0 0
run_extractor: 0.02sec (0, 7)
2 2 2 2 2 2 2
run_extractor: 0.07sec (1, 7)
3
(2, 7)
1 1 1 1 1 1 1
run_extractor: 0.06sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.04sec (1, 7)
1
(1, 7)
2 2 2 2 2 2 2
run_extractor: 0.15sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
run_extractor: 0.09sec (1, 7)
0 0 0 0 0 0 0
run_extractor: 0.04sec (0, 7)
2
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.05sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.07sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
1 1 1 1 1 1 1
0 0 0 0 0 0 0
run_extractor: 0.16sec (2, 7)
1
(2, 7)
1 1 1 1 1 1 1
run_extractor: 0.06sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.04sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.14sec (1, 7)
1 1 1 1 1 1 1
run_extractor: 0.05sec (1, 7)
2
(2, 7)
3 3 3 3 3 3 3
run_extractor: 0.15sec (2, 7)
1
(2, 7)
0 0 0 0 0 0 0
run_extractor: 0.02sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
run_extractor: 0.10sec (1, 7)
1
(1, 7)
1 1 1 1 1 1 1
run_extractor: 0.07sec (1, 7)
1
(1, 7)




100%|██████████| 10/10 [00:11<00:00,  1.26s/it][A[A

[A[A

0 0 0 0 0 0 0
run_extractor: 0.02sec (0, 7)
1
(0, 7)
2 2 2 2 2 2 2
run_extractor: 0.09sec (2, 7)
1
(2, 7)
0 0 0 0 0 0 0
run_extractor: 0.02sec (0, 7)
1
(0, 7)
1 1 1 1 1 1 1
run_extractor: 0.07sec (1, 7)
1
(1, 7)


KeyError: "['accnum' 'findings' 'finding_types' 'certainties' 'statuses'\n 'descriptors' 'locations' 'changed'] not in index"

In [31]:
extracted_findings.columns

Index(['accnum', 'findings', 'finding_types', 'certainties', 'statuses',
       'descriptors', 'locations', 'changed'],
      dtype='object')

In [52]:
df.iloc[0].FINDINGS

' Lungs: The lungs are well aerated. There are no infiltrates or nodules. Heart and aorta: The heart is in normal in position, size, contour, and x-ray attenuation. Bones: The bones around the margins of the chest are unremarkable. Hila: The hila are normal. There are no abnormal masses in the hila. Trachea and main bronchi: The trachea and main bronchi are normal in appearance. Soft tissues: The soft tissues around the chest margin are normal in appearance. Other:'

In [33]:
work_df = df[10000:20000]
# work_df = df.sample(10, random_state=123)

In [34]:
list_columns = ['accnum', 'abnorms', 'devices', 'vis_diseases', 'anatomies', 'procedures', 'changes']
extracted_findings = extract_from_reports(work_df)
extracted_findings['accnum'] = pd.Series(list(work_df['accnum']))
extracted_findings = extracted_findings[list_columns]

extracted_findings['FINDINGS'] = pd.Series(list(work_df['FINDINGS']))
extracted_findings['IMPRESSION'] = pd.Series(list(work_df['IMPRESSION']))
extracted_findings['CONCLUSION'] = pd.Series(list(work_df['Conclusion']))
extracted_findings['REPORT'] = pd.Series(list(work_df['report']))

100%|██████████| 10000/10000 [2:14:09<00:00,  1.50it/s] 


In [35]:
#Save Extracts
extracted_findings.to_csv("outputs/extracts_all_1.csv", index=None)

In [36]:
extracts = pd.read_csv("outputs/extracts_all_1.csv")#.drop("Unnamed: 0", axis = 1)

In [37]:
extracts.shape

(10000, 11)