In [1]:
import re
import spacy
import pandas as pd
import numpy as np
import pickle
import nltk
from sklearn.model_selection import train_test_split
import random
import time

## The main function call is towards the bottom but you need to run all cells first

In [2]:
# Shared spaCy pipeline. ReportExtractor.__init__ and split_by_clause call this module-level `nlp`
# directly, so this cell must be run before either of them is used.
nlp = spacy.load('en_core_web_lg')

#### Load Data

Right now, this is set up to run on a very large list. If you have a shorter list of targets,
you can make the code run faster by giving it shorter dictionaries. 

If you just want to look for a short list of findings, the easiest thing to do is fill in the vis_dis and 
anatomy lists and set the other lists to []. (If an anatomical pathology is part of your finding list, do NOT 
put it in vis_dis, because anatomy needs to be handled differently.)

In [284]:
# NOTE(review): this call sits above the cell that defines save_dicts() and above the cell that loads the
# term lists, so on a fresh kernel it raises NameError when the notebook is run top to bottom.
# Run the definition and load cells first, and only run this after editing the dictionaries.
save_dicts()

In [283]:
'''Run save_dicts() every time you make a change to your dictionaries and want to update them.  '''
def save_dicts():
    """Write the in-memory term lists to disk as single-column ('name') CSV files.

    Each list is looked up by name in the notebook's global namespace. A list that has not
    been defined in the current session (for example `change_list_narrow`, whose load line is
    commented out in the load cell) is skipped with a message instead of aborting the whole
    save with a NameError.

    Returns:
        None. The only effects are the CSV files written and the skip messages printed.
    """
    # NOTE(review): the load cell reads these dictionaries from "rules/", while this function
    # writes them to "data/csvs/" — confirm which directory is the source of truth.
    targets = [
        ('vis_dis_list', "data/csvs/radlex_vis_dis.csv"),
        ('anatomy_list', "data/csvs/radlex_anatomy.csv"),
        ('device_list', "data/csvs/radlex_devices.csv"),
        ('procedure_list', "data/csvs/radlex_procedures.csv"),
        ('descriptors', "data/csvs/radlex_descriptor_list.csv"),
        ('change_list_narrow', 'data/csvs/narrow_change_list.csv'),
        ('normal_list', 'data/csvs/normal_list.csv'),
    ]
    notebook_scope = globals()
    for list_name, csv_path in targets:
        if list_name not in notebook_scope:
            print("save_dicts: skipping '%s' (not defined in this session)" % list_name)
            continue
        pd.DataFrame(notebook_scope[list_name], columns = ['name']).to_csv(csv_path)


In [5]:
# Term dictionaries: every CSV is read through its `name` column and turned into a plain Python list.
vis_dis_list = pd.read_csv("rules/radlex_vis_dis.csv").name.tolist()
anatomy_list = pd.read_csv("rules/radlex_anatomy.csv").name.tolist()
device_list = pd.read_csv("rules/radlex_devices.csv").name.tolist()
procedure_list = pd.read_csv('rules/radlex_procedures.csv').name.tolist()
# NOTE(review): change_list_narrow is not loaded here, but save_dicts() still references it —
# confirm whether this line should be re-enabled.
# change_list_narrow = pd.read_csv('rules/narrow_change_list.csv').name.tolist()
normal_list = pd.read_csv('rules/normal_list.csv').name.tolist()
descriptors = pd.read_csv("rules/radlex_descriptor_list.csv").name.tolist()

# Pickled rule sets (term, negation and previous rules).
# NOTE(security): pickle.load can execute arbitrary code stored in the file — only load rule files you trust.
with open ("rules/term_rules.p", 'rb') as f:
    term_rules = pickle.load(f)
with open("rules/neg_rules.p", "rb") as f:
    neg_rules = pickle.load(f)
with open("rules/prev_rules.p", "rb") as f:
    prev_rules = pickle.load(f)

In [5]:
# NOTE(review): `radnet` is not referenced by any code in this part of the notebook; presumably it is
# the report table consumed further down — confirm.
radnet = pd.read_excel("data/csvs/radnet_norm_parsed.xlsx")

### Supporting Classes (Descriptor, ChangeEntity, ExtractedEntity)

In [231]:
'''ExtractedEntity class:
An "entity" refers to a finding of one of the 5 categories: 'Visual_Disease', 'Anatomy', 'Procedure', 
'Device', and 'Change' (see detailed comments at top of ReportExtractor class).

An ExtractedEntity object allows us to store details about the finding. Here it's implemented to store the
finding category and its previous & negation statuses. In a more advanced version of the code, it could be modified
to include hedging, location, descriptive terms etc. 
'''

class ExtractedEntity(object):
    """One extracted finding together with its category and status flags.

    Args:
        name: Text of the finding.
        ent_type: 'visual_disease', 'anatomy', 'procedure', 'device', or 'change'.
        descriptors: List of objects exposing `describe()` (Descriptor instances).
            Defaults to a fresh empty list for every entity.
        is_previous: True when the finding is flagged as a previous condition.
        is_normal: True when the finding is flagged as normal.
        is_negated: True when the finding is negated.
        is_changed: Change flag, stored as given.
    """
    def __init__(self, name, ent_type = '', descriptors = None, is_previous = False, is_normal = False, 
                 is_negated = False, is_changed = False):
        self.name = name
        self.ent_type = ent_type  #'visual_disease', 'anatomy', 'procedure', 'device', or 'change'
        # A literal [] default would be one list shared by every entity built without
        # descriptors, so appending to one entity would leak into all the others.
        self.descriptors = descriptors if descriptors is not None else []  #List of Descriptor objects
        self.previous = is_previous  #Boolean
        self.negated = is_negated  #Boolean
        self.is_normal = is_normal  #Boolean (the argument was previously accepted but discarded)
        self.is_changed = is_changed
    
    def describe(self):
        """Return the descriptors as one comma-separated string ("" when there are none)."""
        if len(self.descriptors) > 0:
            return ", ".join([descr.describe() for descr in self.descriptors])
        return ""
    
        
'''Descriptor class: an object holding one or several descriptive terms attributed to a given entity.
Qualifiers are additional terms that describe the Descriptor 
(ie, in the phrase "mild degenerative changes", "change" is the basic entity, 
"degenerative" is the descriptor, and "mild" is a qualifier)
'''
class Descriptor(object):
    """A descriptive term attached to an entity, plus the qualifiers that modify it.

    Args:
        name: The descriptive term itself.
        qualifiers: List of qualifier strings. Defaults to a fresh empty list per instance.
    """
    def __init__(self, name, qualifiers = None):
        self.name = name
        # A literal [] default would be shared by every Descriptor created without qualifiers.
        self.qualifiers = qualifiers if qualifiers is not None else []
    
    def describe(self):
        """Return "name (q1, q2)" when qualifiers exist, otherwise just the name."""
        if len(self.qualifiers) > 0:
            return self.name + " (" + ", ".join(self.qualifiers) + ")"
        else:
            return self.name

In [396]:
'''
CATEGORIES (INFORMATION STRUCTURING)
The 5 Categories are called 'Visual_Disease', 'Anatomy', 'Procedure', 'Device', and 'Change'. 

An "entity" refers to a finding of one of these categories.

Of these categories, 'Anatomy', 'Procedure', and 'Device' are self-explanatory. Note that the terms in the'Anatomy'
category are ONLY the mentions of anatomy that are modified by at least one descriptive word (ie, "tortuous aorta"). 
This is meant to avoid extracting anatomy that is mentioned only to locate other findings (ie, "infiltrate 
in left lung"). 

The 'Change' category is meant to catch pathologic changes, such as "degenerative changes of the spine" or 
"kyphotic changes." The output of a 'Change' entity will be a word referencing the sort of change (ie, "change",
"increasing", etc.) and the details can be found in the Descriptor terms (ie, "kyphotic", "degenerative", etc.).

The 'Visual_Disease' category can be thought of as the "everything else" category. It should include anything 
that can be visually observed on the chest X-ray that does not fit into the other 4 categories. 

USE OF TOKEN VS TERM IN THE VARIABLE NAMES
Throughout this code, "Token" and "Term" are used for similar concepts. A "term" is the str() (plain text) version
of a phrase or word. A "token" is the Spacy-ified version of the phrase/word, which means that it carries with it
the grammatical information stored from Spacy's original parse of the clause.

RULES FORMAT
neg_rules (negation rules) and prev_rules (previous rules) are lists of lists. These are the basis for this algorithm's
detection of previous and negated features. 

Each entry of these outer lists should be formatted as follows, similar to the original NegEx:
 ['phrase',
  '',
  '[TAG]',
  re.compile(r'-regex-phrase\b', re.IGNORECASE|re.UNICODE)],
 
An example:
['previous', '',
  '[PREV]',
  re.compile(r'\b(previous)\b', re.IGNORECASE|re.UNICODE)],

There are 4 types of tags for the negation rules. [PREN] and [POST] are for pre- and post-negations respectively
(these are the literal tags that check_negation searches for). 
[PREP] and [POSP] are for *possibly* pre- or post-negations respectively.
In this code, the [PREN]/[PREP] and [POST]/[POSP] pairs are treated identically although the original NegEx code
handled them differently, as uncertainty about a diagnosis will be accounted for by the hedging portion of the code
(still to be implemented Nov 2018). 

There is only 1 tag for the previous rules, [PREV]. 
'''

class ReportExtractor(object):
    def __init__(self, 
                 clause = None, neg_rules = None, prev_rules = None, 
                 vis_dis_list = [], anatomy_list = [], procedure_list = [], device_list = [], 
                 change_list = [], descriptor_list = [], normal_list = []):
        
        self.__filler = '_'
        self.__clause = clause
        self.__clause_doc = nlp(clause)
        self.__keys = ['visual_disease', 'anatomy', 'procedure', 'device', 'change']
        self.__neg_rules = neg_rules  #ALLI SHOULD YOU BE SORTING THESE?
        self.__prev_rules = prev_rules
        self.__vis_dis_list = self.sort_list(vis_dis_list)
        self.__anatomy_list = self.sort_list(anatomy_list)
        self.__change_list = self.sort_list(change_list)
        self.__procedure_list = self.sort_list(procedure_list)
        self.__device_list = self.sort_list(device_list)
        self.__normal_list = self.sort_list(normal_list)
        self.__descriptor_list = self.sort_list(descriptor_list)
        
        self.__joined_rules = []
        
        for rule_set in [neg_rules, prev_rules]:
            if rule_set is not None:
                self.__joined_rules = self.__joined_rules + rule_set
        
        #Bind the lists of terms into one dictionary of the term lists 
        self.__term_dicts = self.bind_term_lists([vis_dis_list, anatomy_list, procedure_list, device_list, change_list])
        #List the negation phrases for the change negation detector
        self.__neg_list = [rule[0] for rule in neg_rules if rule[3] is not '[PSEU]']
        
        return None
    
    
    
    '''MAIN CALL'''
    def run_extractor(self):
        #Get words/phrases from the clause that map to the term lists
        self.__term_lists, self.__tagged_term_lists = self.run_term_finder()
        #Tag the clause with the negation & previous rules
        self.__tagged_clause = self.tag(self.__clause)
        #Get the indices of the mapped chunks 
        self.__term_idx_dicts = self.get_indices()
        #Compare the indices to the negation/previous tags, find modifying terms, 
        # return a list of ExtractedEntity objects (nice packaging)
        self.__all_entities = self.make_structured_entities()
        #
        return self.clean_output()

    
    '''INITIALIZING FUNCTIONS'''
    '''Sort lists by descending length (most specific first) and exclude repeated elements'''
    def sort_list(self, lst):
        lengths = [len(item) for item in lst]
        lst_df = pd.Series(lengths, index = lst) #, columns = ['len'])
        lst_df = lst_df.sort_values(ascending = False)
        return list(set(lst_df.index.tolist()))
    
    '''Create a dictionary of the category lists for access via indices'''
    def bind_term_lists(self, lists):
        term_list_dict, key_idx = {}, 0
        for lst in lists:
            term_list_dict[self.__keys[key_idx]] = lst
            key_idx += 1
        return term_list_dict
    
    '''Tag the given phrase according to all the rule sets'''
    def tag(self, phrase):
        for rule in self.__joined_rules:
                reformatRule = re.sub(r'\s+', self.__filler, rule[0].strip())
                phrase = rule[3].sub(' ' + rule[2].strip() + reformatRule + rule[2].strip() + ' ', phrase)
        return phrase 
    
    
    '''MAPPING TO RADLEX FUNCTIONS'''
    '''Map text (one or many words) to the given term list by attempting common substitutions to handle
    grammatical variations.  '''
    def map_text_to_term_list(self, text, term_list):
        text = text.strip()
        if text in term_list:
            return text
        elif text[:-1] in term_list:
            return text
        else:  #Check a list of substituted phrases against the term_list
            #Suffix substitutions
            subs = [("'s", ""), ("ies", "y"), ("es", "is"), ("ing", "ed"), ("ing", ""), ("ring", ""), 
                ("ion", "ed"), ("ative", "ed"), ("tous", ""), ("ed", "ement"), ("iness", "y"), ("ly", ""), 
                ("ity", ""), ("e", "ion"), 
                #Affix substitutions
                ("para", ""), ("peri", ""), ("bi", "")]

            for sub_tuple in subs:
                if text.replace(sub_tuple[0], sub_tuple[1]) in term_list:
                    return text
        return 
        
    
    '''Map a potential term (potentially multiple words) to a given term list.'''
    def map_term_to_term_list(self, text, term_list): 
        text_words, word_idx, found_mapping = text.split(), 0, False
        
        #Attempt to map as many words as possible to the term list
        while not found_mapping and word_idx < len(text_words):
            mapped_term = self.map_text_to_term_list(text, term_list) 
            if mapped_term is not None:
                 found_mapping = True
            else:  #If no match found, repeatedly remove the first word from the phrase and try again
                word_idx += 1
                text = " ".join(text_words[word_idx:])  
        
        #If still no match found, try each individual word in the term
        if word_idx == len(text_words) and not found_mapping:
            for word in text_words[:-1]:
                if not found_mapping:
                    mapped_term = self.map_text_to_term_list(word, term_list) 
                    if mapped_term is not None:
                        found_mapping = True           
        return mapped_term
    
    '''Select the longest (therefore the most specific) versions of every term for which a positive match was found.'''
    def remove_submatches(self, matches_to_search):
        unique_matches = []
        while len(matches_to_search) > 0:
            match1 = max(matches_to_search, key = len)
            related_matches = [match1]
            matches_to_search.remove(match1)     
            for match2 in matches_to_search:
                if match2 in match1:
                    related_matches.append(match2)
            unique_matches.append(max(related_matches, key = len))
            for match in related_matches:
                if match in matches_to_search:
                    matches_to_search.remove(match)              
        return unique_matches
    
    '''Map possible anatomical modifiers to the descriptor list'''
    def map_modifiers(self, possible_modifiers):
        found_descriptors = []
        for modifier in possible_modifiers:
            description = self.map_term_to_term_list(modifier, self.__descriptor_list + self.__normal_list)
            if description is not None:
                descr_name, adverbs = description, []
                descr_token = [token for token in self.__clause_doc if token.text == description][0]
                #Find and save words that modify core descriptor words (for instance, "mildly overinflated")
                if descr_token.pos_ == 'VERB' or descr_token.pos_ == 'ADJ':
                    adverbs = [child.text for child in descr_token.children if child.pos_ == 'ADV']
                found_descriptors.append(Descriptor(name = descr_name, qualifiers = adverbs))
        return found_descriptors

    
    '''Find the "chunks" (any noun, adjective, verb, or noun phrase that maps to a term list) and stores both 
    their original and rule-tagged versions. The point of tagging the chunks with the negation/previous rules 
    is to allow for accurate index comparison later on. This is because comparison of the term indices with the 
    indices of the negated/previous phrases will determine whether or not a given phrase 
    applies to the term at hand) '''
    def run_term_finder(self):
        num_keys = len(self.__keys)  #KEYS ORDER: 'visual-disease', 'anatomy', 'procedure', 'device', 'change'
        
        #Create storage for the list of tokens & tagged 
        term_lists, tagged_term_lists = [[] for i in range(num_keys)], [[] for i in range(num_keys)]

        #Pull out every noun chunk, verb, adjective, and noun from the clause
        #"Long token" flags that they might be +1 words long
        long_tokens = list(set([chunk for chunk in self.__clause_doc.noun_chunks] + [token for token in self.__clause_doc if token.pos_ == 'VERB' or token.pos_ == 'NOUN' or token.pos_ == 'ADJ']))
        
        #Narrow the list to the most specific versions of a term possible. 
        narrowed_terms = self.remove_submatches([token.text for token in long_tokens])
        #Also store the Spacy-ified version (to preserve the grammar relations)
        long_tokens = [long_token for long_token in long_tokens if long_token.text in narrowed_terms]
        self.__spacy_tokens = long_tokens
        
        #Map tokens to every term list & store any successfully mapped_term chunsk. 
        for term_list_idx in range(num_keys):
            for token in long_tokens:
                mapped_term = self.map_term_to_term_list(token.text, self.__term_dicts[self.__keys[term_list_idx]])
                if mapped_term is not None and mapped_term not in term_lists[term_list_idx]:
                    term_lists[term_list_idx].append(mapped_term)
            term_lists[term_list_idx] = self.remove_submatches(term_lists[term_list_idx])
        
        # For cases in which "lymph node" is in the list of potential "anatomy" terms, 
        # remove "node" from the vis_disease list (no other category cross-checking exists in this code)
        # because it would likely increase the False Negative rate by incorrectly discarding terms
        if "lymph node" in " ".join(term_lists[1]):
            replace_vis_dis_list = []
            for term in term_lists[0]:  #vis_dis 
                if "node" not in term:
                    replace_vis_dis_list.append(term)
            term_lists[0] = replace_vis_dis_list
        
        #Tag the mapped_term_text of the tokens with rules (allow for proper indexing later) & store the 
        tagged_term_list_idx = 0
        for term_list in term_lists:
            for mapped_term in term_list:
                tagged_term_lists[tagged_term_list_idx].append(self.tag(mapped_term))
            tagged_term_list_idx += 1
        
        return term_lists, tagged_term_lists
    
    '''Get indices in the rule-tagged clause of every term in the list of tagged terms.
    Return as a list of dictionaries with key-value pairs like [term: index in rule-tagged clause]'''
    def get_indices(self):
        term_idx_dicts = []
        for lst in self.__tagged_term_lists:
            term_idx_dict = {}
            #Check to make sure doubled tags don't break the code
            #(ie, "no abnormal" => [PREN]no abnormal[PREN] and [PREN]no[PREN])
            for tagged_term in lst:
                if tagged_term in self.__tagged_clause:  
                    term_idx_dict[tagged_term] = self.__tagged_clause.index(tagged_term)
                else:
                    lst.remove(tagged_term)
            term_idx_dicts.append(term_idx_dict)
        return term_idx_dicts
    

    '''CHECK NEGATIONS AND PREVIOUS STATUS'''
    
    '''Check the negation status of a Change finding. This is separate from the other negation checker because
    the occurring negation term needs to be connected directly to the change itself (raw_chunk) rather than
    other nearby concepts'''
    def check_change_negation(self, raw_chunk):
        is_negated = False
        
        for token in self.__clause_doc:
            if token.text in raw_chunk:
                if token.pos_ == 'NOUN':
                    to_check = [child.text for child in token.children]   
                else:
                    to_check = [child.text for anc in [anc for anc in token.ancestors] for child in anc.children]     
                for word in self.__neg_list:
                    if word in to_check:
                        return True
        return False
        
    '''Checks whether or not a term beginning at term_idx is negated. This is a modified form of the NegEx algorithm
    (Chapman). '''
    def check_negation(self, term_idx):
        #Find the indices of the pre-negation, post-negation flags
        clause_words, preneg_idxs, postneg_idxs = self.__tagged_clause.split(), [], []
        
        #Look for the negation flags in the tagged clause
        #Assumes PREP and POSP are actually negations. 
        for word in clause_words:
            if re.findall('\[PREN\]|\[PREP\]', word):
                preneg_idxs.append(self.__tagged_clause.index(word))
            if re.findall('\[POST\]|\[POSP\]', word):
                postneg_idxs.append(self.__tagged_clause.index(word))
              
        #Return false if no negation tags present
        if len(preneg_idxs) == 0 and len(postneg_idxs) == 0:
            return False
        
        #Filter out the right pre/post negation tags with respect to the given term index
        preneg_idxs = [neg_idx for neg_idx in preneg_idxs if neg_idx < term_idx]
        postneg_idxs = [neg_idx for neg_idx in postneg_idxs if neg_idx > term_idx]
        
        #Set pre-neg/post-neg/previous indices, handling multiple negations
        if len(preneg_idxs) % 2 == 0:
            is_pre_negated = False
        else:
            is_pre_negated = True
        
        if len(postneg_idxs) % 2 == 0:
            is_post_negated = False
        else:
            is_post_negated = True
        neg_sum = is_pre_negated + is_post_negated
        
        if neg_sum % 2 == 0: #if not pre or post negated, or if both pre and post negated
            return False
        if neg_sum == 1:
            return True 
    
    '''Determine whether not a term beginning at the given term index is a previous condition or a current one.
    This is an adaptation of the NegEx (Chapman) idea, similar to that proposed by PrevEx ##ALLI ADD CITATION). 
    It assumes that the earliest previous-phrase index (if multiple are found) will determine the 
    previous/current status of the term'''
    def check_previous(self, term_idx):
        clause_words, prev_idxs = self.__tagged_clause.split(), []
        for word in clause_words:
            if re.findall('\[PREV\]', word):
                prev_idxs.append(self.__tagged_clause.index(word))
        if len(prev_idxs) == 0:
            return False
        else:
            #Assume the earliest prev_idx determines the state of the term
            prev_idx = prev_idxs[0]   
            if prev_idx < term_idx:
                return True
        return False
        
    '''PARSING ENTITIES FUNCTIONS'''
    
    '''This function finds the modifiers of a given anatomical or change "token" (it's called token because
    the term is still in the Spacy-ified form with the grammatical information intact )'''
    def get_modifiers(self, token):
        #Find all possible modifiers of a given token
        children = [child.text for child in token.children if child.pos_ in ['ADJ', 'ADV', 'NOUN', 'VERB']]
        ancestors = [anc.text for anc in token.ancestors if anc.pos_ in ['VERB','ADJ', 'ADV', 'NOUN']]
        additionals = [tok.text for tok in self.__clause_doc if (tok.dep_ in ['conj', 'acomp', 'xcomp']) and (token in [child for child in tok.children] or token in [anc for anc in tok.ancestors])]
        dets = [tok.text for tok in self.__clause_doc if tok.dep_ is 'det' and (token in [anc for anc in tok.ancestors])]
        possible_modifiers = list(set(children + ancestors + additionals + dets))
       
        #Narrow the list of modifiers by checking that dependencies match from both directions
        narrowed_modifiers = []
        for modifier in possible_modifiers:
            mod_token = [tok for tok in self.__clause_doc if tok.text == modifier][0]
            mod_dependents = [child for child in mod_token.children] + [anc for anc in mod_token.ancestors]
            if token in mod_dependents:
                narrowed_modifiers.append(modifier)
        return narrowed_modifiers
    
    
    ''' Structure the entity at a given index into an ExtractedEntity object for easy handling.'''
    def structure_entity(self, category_idx, ent_type):        
        #A "raw term" is an untagged term
        tagged_terms, raw_terms = self.__tagged_term_lists[category_idx], self.__term_lists[category_idx]
        term_idx_dict = self.__term_idx_dicts[category_idx]
        entity_list, term_list_idx  = [], 0
        
        while term_list_idx < len(tagged_terms):
            tagged_term, raw_term = tagged_terms[term_list_idx], raw_terms[term_list_idx]
            term_idx = term_idx_dict[tagged_term]
            
            #Get previous and negation statuses at this index in the clause
            is_previous = self.check_previous(term_idx)
            is_negated = self.check_negation(term_idx)
            
            #Find the modifiers of the term
            possible_modifiers = []
            for token in self.__clause_doc:   #Get all possible modifiers
                if token.text in raw_term:
                    possible_modifiers = possible_modifiers + self.get_modifiers(token)
            #Map the modifiers to the descriptor term list. Return as a list of Descriptor objects
            mod_descriptors = self.map_modifiers(list(set(possible_modifiers)))  

            if not (ent_type == 'anatomy' and len(mod_descriptors) == 0): #Ignore anatomy words without descriptors
                if ent_type == 'anatomy':  #Ensure that the Anatomy finding's name includes the descriptors, in parentheses
                    ent_name = raw_term + " (" + ", ".join([descriptor.describe() for descriptor in mod_descriptors]) + ")"
                else: 
                    ent_name = raw_term
                #Check the negation status of a change. ##ALLI WHY DOES THIS HAVE TO BE SEPARATE?
                if ent_type == 'change':
                    is_negated = self.check_change_negation(raw_term)
                    
                #Add the entity with its properties as an ExtractedEntity object to the list of all entities
                entity_list.append(ExtractedEntity(name = ent_name, ent_type = ent_type,
                                                    descriptors = mod_descriptors,
                                                    is_previous = is_previous,
                                                    is_negated = is_negated))
            term_list_idx += 1
        return entity_list
    
    '''Structure every entity from the term indices stored previously'''
    def make_structured_entities(self):
        return [entity for lst in [self.structure_entity(i, self.__keys[i]) for i in range(len(self.__keys))] for entity in lst]

    
    '''Remove repeats of pathologies that were mentioned multiple times with alternate names
    (ie, avoid double counting in cases where a radiologist mentions "pleural effusion" in the Findings 
    and refers back to it as an "effusion" in the "Impressions" section)'''
    def remove_double_mentions(self, df):
        if 'effusion' in df.findings.values and 'pleural effusion' in df.findings.values:
            df = df[df.findings != 'effusion']
        if 'effusions' in df.findings.values and 'pleural effusions' in df.findings.values:
            df = df[df.findings != 'effusions']
        if 'process' in df.findings.values and re.findall('\S+\s+process', ", ".join(df.findings.values)):
            df = df[df.findings != 'process']
        if 'processes' in df.findings.values and re.findall('\S+\s+processes', ", ".join(df.findings.values)):
            df = df[df.findings != 'processes']
        if 'disease' in df.findings.values and (re.findall('\S+\s+disease', ", ".join(df.findings.values))):
            df = df[df.findings != 'disease']
        return df
    
    
    '''Handling exceptions that derive from overlapping dictionaries/lists'''
    def clean_entity_exceptions(self):
        for entity in self.__all_entities:
            if entity.ent_type == 'anatomy' and 'pleural' in entity.name:
                self.__all_entities.remove(entity)
        return 
    
    '''Format the Entities into a dataframe.'''
    def clean_output(self):  
        findings, finding_types, statuses, descriptors, changes, normal_flags = [], [], [], [], [], []
        
        self.clean_entity_exceptions()
        for entity in self.__all_entities:
            #finding, finding_type, descriptors, changes
            findings.append(entity.name)
            finding_types.append(entity.ent_type)
            if entity.ent_type == 'anatomy' or entity.ent_type == 'change':
                descriptors.append(entity.describe())
            else:
                descriptors.append("")
            changes.append(entity.is_changed)
            
            #Set status (ie, negated/previous/current condition)
            if entity.negated and entity.previous:
                statuses.append("negated, previous")
            elif entity.negated:
                statuses.append("negated")
            elif entity.previous:
                statuses.append("previous")
            else:
                statuses.append("current")
                
            #Potentially flag anatomy entities as "normal" (aka unproblematic) 
            #based on what terms are included in the "Descriptors" (ie, "well-aerated lungs")
            #to allow for convenient exclusion later on
            is_normal = False
            if entity.ent_type == 'anatomy':
                is_normal = False
                norm_idx, end_norm_idx = 0, len(self.__normal_list)
                while not is_normal and norm_idx < end_norm_idx:
                    for ent in entity.describe().split(","):
                        if self.__normal_list[norm_idx] == ent:  #remove parentheses on descriptor terms
                            is_normal = True
                    norm_idx += 1
            normal_flags.append(is_normal)
                
        output_df = pd.DataFrame([findings, finding_types, statuses, descriptors, changes, normal_flags],
                           index = ['findings', 'finding_types', 'statuses', 'descriptors','changed', 'normal_flag']).T       
        output_df = self.remove_double_mentions(output_df)
            
        output_df.index = range(len(output_df))
        return output_df

#### Sentence Pre-Processing and Clause Splitting

In [418]:
def pre_process_sents(findings):
    """Split a report's findings text into cleaned, lower-cased sentences.

    Args:
        findings: Raw findings text. None or a float (a missing cell read as NaN) yields [].

    Returns:
        List of sentence strings: lower-cased, split further on triple spaces, dates of the
        form d/d/dd removed, "/" and newlines turned into spaces, a few long phrases collapsed
        to abbreviations, the stand-alone words "tb" and "cp" expanded, and "." replaced by a space.

    "tb" and "cp" are only expanded as whole words. The previous plain substring replacement
    also rewrote words that merely contain those letters (ie, "heartbeat").
    `type(findings) == None` was never true, so None used to reach the tokenizer.
    """
    if findings is None or isinstance(findings, float):
        return []

    #Modify for abbreviations particular to your dataset as needed 
    phrase_to_abbreviation = [
        ("chronic obstructive pulmonary disease", "copd"),
        ("coronary artery bypass graft", "cabg"),
        ("coronary bypass surgery", "cabg"),
    ]
    word_expansions = [(r'\btb\b', "tuberculosis"), (r'\bcp\b', "costophrenic")]

    cleaned = []
    for sentence in nltk.tokenize.sent_tokenize(findings):
        for piece in sentence.lower().split("   "):
            piece = re.sub(r'\d+?/\d+?/\d{2,}', '', piece)
            piece = piece.replace("/", " ").replace("\n", " ")
            for phrase, abbreviation in phrase_to_abbreviation:
                piece = piece.replace(phrase, abbreviation)
            for pattern, expansion in word_expansions:
                piece = re.sub(pattern, expansion, piece)
            cleaned.append(piece.replace(".", " "))
    return cleaned

In [None]:
def split_by_clause(sentence, term_rules):
    '''Split one pre-processed sentence into a list of lower-cased clauses.

    sentence:   the sentence text (str).
    term_rules: iterable of NegEx-style rules. Each rule is indexed as
                rule[0] = trigger phrase, rule[2] = tag text, rule[3] = compiled regex
                (anything with a .sub method) that finds the trigger in the sentence.
                The sentence is only split at a trigger when the tag text is "[TERM]"
                (matched case-insensitively).

    Returns the non-empty clauses, lower-cased. Clauses are NOT stripped, and every
    comma-split piece after the first keeps its leading comma.
    '''
    #Whole-word "and" only. A plain substring test also fires on "gland", "hand", "expanded"...
    and_pat = re.compile(r'\band\b')
    #", item, item, and|or" (oxford comma) and "item, item and|or" (no oxford comma).
    #The trailing \b keeps words that merely start with and/or ("organomegaly") from
    #being mistaken for the conjunction that closes a list.
    oxford_list_pat = re.compile(r', (((\w+\s?){1,2},)\s)+?(and|or)\b')
    plain_list_pat = re.compile(r'((\w+\s?){1,2},\s?)+?(\s\w+)+?\s(and|or)\b')
    term_pat = re.compile(r'\[TERM\]', flags = re.IGNORECASE)

    def split_ands(phrases):
        '''Split a comma-free phrase at its single "and" when both halves look like clauses.'''
        new_phrases = []
        for phrase in phrases:
            if "," not in phrase and len(and_pat.findall(phrase)) == 1:
                parts = and_pat.split(phrase)
                pos1 = {token.pos_ for token in nlp(parts[0])}
                pos2 = {token.pos_ for token in nlp(parts[1])}
                #Only split when each side has its own noun and verb  #maybe also 'ADV'
                if {'NOUN', 'VERB'} <= pos1 and {'NOUN', 'VERB'} <= pos2:
                    new_phrases.extend(parts)
                    continue
            new_phrases.append(phrase)
        return new_phrases

    def split_sent_by_comma(sent):
        '''Split a sentence at its commas, except commas that sit inside a detected list.'''
        comma_indices = [c.start() for c in re.finditer(',', sent)]

        #Prefer the oxford-comma list pattern; fall back to the no-oxford-comma pattern
        lst_indices = [c.span() for c in oxford_list_pat.finditer(sent)]
        if not lst_indices:
            lst_indices = [c.span() for c in plain_list_pat.finditer(sent)]

        #Character positions where splitting is allowed: everything outside the list spans
        split_zones = [0]
        for lst_start, lst_end in lst_indices:
            split_zones = split_zones + list(range(split_zones[-1], lst_start)) + [lst_end]
        split_zones = set(split_zones + list(range(split_zones[-1], len(sent))))

        to_split = [idx for idx in comma_indices if idx in split_zones]
        #Each slice starts AT the comma, so later pieces begin with ","
        return [sent[i:j] for i, j in zip([0] + to_split, to_split + [len(sent)])]

    def apply_split_rules(phrases):
        '''Split further on ";" / ":" and on double spaces, then try the "and" split.'''
        phrases = [piece for phrase in phrases for piece in re.split(';|:', phrase)]
        phrases = [piece for phrase in phrases for piece in phrase.split("  ")]
        return split_ands(phrases)

    #NegEx-style detection of clause termination words:
    #wrap every trigger found in the sentence as " <tag>trigger_words<tag> "
    for rule in term_rules:
        reformatRule = re.sub(r'\s+', '_', rule[0].strip())
        tag = rule[2].strip()
        sentence = rule[3].sub(' ' + tag + reformatRule + tag + ' ', sentence)

    #If a clause termination word is present, split the phrases there & apply other splitting rules
    if term_pat.search(sentence):
        phrases = term_pat.split(sentence)
        #The trigger itself is a single underscore-joined token, so the >1 word filter drops it
        phrases = [" ".join(phrase.split()) for phrase in phrases if len(phrase.split()) > 1]
        phrases = [piece for phrase in phrases for piece in split_sent_by_comma(phrase)]
        phrases = apply_split_rules(phrases)

    #If no clause termination words are present, apply general splitting rules
    else:
        phrases = apply_split_rules(split_sent_by_comma(sentence))

    #Return a list of clauses
    return [phrase.lower() for phrase in phrases if len(phrase) != 0]



### Sub-Wrapper For the Report Extractor (input = patient sentences)

In [None]:
def get_patient_results(sents):
    '''Run the ReportExtractor over every clause of every sentence and stack the outputs.

    sents: list of pre-processed sentences for one patient.

    Returns one de-duplicated dataframe holding the output of run_extractor() for all
    clauses, or None when no clause produced any output.

    Note that the ReportExtractor inputs (the term lists and rule sets) are not arguments:
    term_rules, neg_rules, prev_rules, vis_dis_list, anatomy_list, procedure_list,
    device_list, change_list_narrow, descriptors and normal_list are read from the
    notebook's global namespace and must already be defined under exactly those names.
    To change this, add them as parameters after sents.
    '''
    per_sentence_frames = []
    for sent in sents:
        #One extractor per clause; each yields one dataframe
        clause_frames = [
            ReportExtractor(clause = clause, neg_rules = neg_rules, prev_rules = prev_rules,
                            vis_dis_list = vis_dis_list, anatomy_list = anatomy_list,
                            procedure_list = procedure_list,
                            device_list = device_list, change_list = change_list_narrow,
                            descriptor_list = sort_list(descriptors),
                            normal_list = normal_list).run_extractor()
            for clause in split_by_clause(sent, term_rules)
        ]
        if clause_frames:
            per_sentence_frames.append(pd.concat(clause_frames))

    if not per_sentence_frames:
        return None
    return pd.concat(per_sentence_frames).drop_duplicates()

# ReportExtractor Wrapper (CALL THIS)


By default, this looks at the combined FINDINGS and IMPRESSION sections of each report. It can easily be modified to use just one of these sections, or a section with a different name. Be careful, however, not to pass in text written in the past or future tense, because the code will fail on it.



In [None]:

def extract_from_reports(radnet):
    '''Extract the current, non-normal findings from every report in radnet.

    radnet: dataframe with FINDINGS and IMPRESSION columns. A cell that is not a str
            (e.g. the float NaN pandas uses for a missing value) counts as missing.

    Returns a dataframe with a single column 'abnorms' and exactly one row per input
    report, in input order. Each cell is the list of findings for that report; it is []
    when the report has no text or nothing was extracted.
    '''
    all_abnorms = []
    for findings, impression in zip(radnet.FINDINGS, radnet.IMPRESSION):
        #Find the relevant section(s) of the report (modifiable)
        sections = [section for section in (findings, impression) if isinstance(section, str)]

        #Every report gets an entry, even with no text, so output rows stay aligned with radnet
        abnorms = []
        if sections:
            #Join with a space: gluing the sections together directly fuses the last
            #sentence of FINDINGS with the first sentence of IMPRESSION
            patient_sents = pre_process_sents(" ".join(sections))
            results = get_patient_results(patient_sents)

            #Pick whatever sub-batch of results you would like
            #Right now, this picks up the 'current' findings without the normal flag
            #ie, it excludes things like "normal heart"
            if results is not None and len(results) > 0:
                current = results[results.statuses == 'current']
                current = current[current.normal_flag != True]
                abnorms = current.findings.tolist()
        all_abnorms.append(abnorms)
    return pd.DataFrame([all_abnorms], index = ['abnorms']).T



In [416]:
#Time the extractor on the radnet[:10] slice; the printed number is the total elapsed seconds for the slice.
#Recorded timing: 0.53 seconds per patient (old version: 0.96 seconds per patient)
#NOTE(review): radnet is not defined in the cells shown here -- presumably the reports dataframe
#(with FINDINGS and IMPRESSION columns) loaded in another cell; confirm it exists on a fresh kernel.
start = time.time()
abnorm_df = extract_from_reports(radnet[:10])
print(time.time() - start)



5.325988054275513


In [397]:
save_dicts()  #Write the current term lists back to their CSV files; re-run whenever you change a dictionary and want it saved