In [1]:
import penemuu
import glob
import json
import spacy
import json

import en_core_web_sm
from spacy.pipeline import EntityRuler
from spacy import displacy
from penemuu import TestFunctions

from collections import defaultdict
from collections import Counter

from itertools import product

Generate a corpus of biomedical abstracts annotated with named entities (bacterial species and habitats),
then extract the sentences that contain at least one entity of each type.

In [2]:
"""
Pubmed files were downloaded using easy_pubmed_batch_downloads.R (easyPubMed R library) with a query
for "foodborne bacterial infections".

Each file contains 100 entries in MedLine format.

Available titles and/or abstracts are extracted from each entry, combined, and used to populate a 
PassageCorpus object.
"""
pubmed_files = glob.glob('penemuu/resources/pubmed/bacteria_foodborne_disease_200115/*txt')
corpus = penemuu.PassageCorpus()

for file in pubmed_files:
    corpus.add_medline_records_from_file(file)

In [3]:
"""
jsonl file for entity ruler was generated from NCBI taxonomy dump file (2/21) (bacteria) and BioNLP 
bacterial biotope .obo reference file (habitats).

PassageAnnotator contains a spaCy pipeline with an EntityRuler derived from the jsonl file.
"""
jsonl = 'penemuu/resources/entity_ruler_bacteria_habitat_200130.jsonl'

annotator = penemuu.PassageAnnotator()
annotator.add_entity_ruler_from_jsonl(jsonl)

Entity ruler generated from jsonl at penemuu/resources/entity_ruler_bacteria_habitat_200130.jsonl
Entity ruler added to spacy pipeline.
PySBDFactory added to spacy pipeline.


In [4]:
"""
Text passages are annotated for named entities and sentences containing at least 1 entity of each type are
extracted and written to a file.
"""    
outfile = 'penemuu/resources/test_sentences_with_bac_and_hab_ents_200211.txt'
sentences = TestFunctions.get_single_sentences_with_entities_by_labels(corpus, annotator, ['BACTERIA', 'HABITAT'], outfile)

"""
Filtering sentences to those with a root that is a verb or auxiliary verb eliminates titles (which are 
often not full sentences) and sentence fragments due to sentence boundary detection errors.
"""
full_sentences = []
for sent in sentences:
    if sent.root.pos_ in ['VERB', 'AUX']:
        full_sentences.append(sent)

In [5]:
"""
Named entity recognition examples
"""

colors = {"BACTERIA": "LIGHTGREEN", "HABITAT": "BEIGE"}
options = {"ents": ["BACTERIA", 'HABITAT'], "colors": colors}

for sent in full_sentences[0:5]:

    displacy.render(sent, style='ent', options=options)

In [6]:
"""
Dependency tree examples
"""

for sent in full_sentences[0:5]:
    displacy.render(sent)

In [7]:
"""

Dependency Parsing

Dependencies are the contextual links between words in a sentence.  Nuances of dependency can
discriminate between:

-Entities that are positively linked:

    "Bacillus subtilis was isolated from fermented cheese."

-Entities that are negatively linked:

    "Bacillus subtilis was not isolated from fermented cheese."

-Entities that are in the same sentence, but unlinked:

    "Bacilli with resistance markers similar to those in Bacillus subtilis were isolated
    from fermented cheese."

spaCy generates a dependency tree for each sentence:
    -Every word (token) in the sentence belongs to the tree.
    -Every word has a single head.  The nature of that link is the "dependency" of the child.
    -Common dependencies include subject, direct object, passive object, preposition, etc.
    -Every sentence has a single root node, identified as sentence.root or if token.head == token.


"""

test_sent = full_sentences[0]

displacy.render(test_sent)

print("Entities in test sentence:")
print(test_sent.ents)
entity_pairs = penemuu.TestFunctions.get_entity_pairs(test_sent, 'BACTERIA', 'HABITAT')

print("Entity pairs in test sentence:")
print(entity_pairs)

lca_node = penemuu.TestFunctions.get_lca_idx_of_ent_pair(test_sent, entity_pairs[0])
print(test_sent[lca_node])

Entities in test sentence:
[salmonella, turkey]
Entity pairs in test sentence:
[(salmonella, turkey)]
resistance


In [8]:
def get_dep_path_to_lca(sent, ent_token, lca_node):
    """
    Walk the dependency tree upward from an entity token toward the lowest common
    ancestor (LCA) of an entity pair, tracking negation along the way.

    Args:
        sent: The sentence being analysed.  Only ``sent.root`` is read, plus
            ``sent[lca_node]`` when ``lca_node`` is given as an index.
        ent_token: Token to start from; must expose ``head``, ``children`` and,
            on its children, ``dep_``.
        lca_node: The LCA, either as a token or as an index into ``sent``
            (elsewhere in this notebook the value returned by
            ``get_lca_idx_of_ent_pair`` is used as a subscript on the sentence).

    Returns:
        When the path polarity is positive: a list of the tokens visited, starting
        at ``ent_token`` and ending just before the LCA (or the sentence root,
        whichever is reached first).  The list is empty when ``ent_token`` is
        itself the LCA or the root.
        When an odd number of negation modifiers hang off the visited tokens: the
        integer ``0`` (callers test for this with ``!= 0``).
    """
    # Accept the index form as well as a token: anything without a ``head``
    # attribute is treated as a position in ``sent``.
    if not hasattr(lca_node, 'head'):
        lca_node = sent[lca_node]

    stop_nodes = [lca_node, sent.root]
    polarity = 1
    nodes_in_path = []
    current_node = ent_token

    while current_node not in stop_nodes:
        for child in current_node.children:
            # spaCy's English label for a negation modifier is lowercase 'neg';
            # compare case-insensitively so either spelling is recognised.
            if child.dep_.lower() == 'neg':
                polarity = -polarity

        nodes_in_path.append(current_node)

        parent = current_node.head
        if parent == current_node:
            # A root is its own head.  Reaching one that is not sent.root means
            # ent_token does not belong to sent; stop rather than loop forever.
            break
        current_node = parent

    if polarity == 1:
        return nodes_in_path
    else:
        return 0
        

def get_dep_path_between_entities(sent, ent_1, ent_2):
    """
    Describe the dependency path that links two entities through their lowest
    common ancestor (LCA), together with the polarity of that link.

    Args:
        sent: The sentence containing both entities.
        ent_1: First entity of the pair.
        ent_2: Second entity of the pair.

    Returns:
        dict with the keys
            'ent_1_path_to_lca': value returned by get_dep_path_to_lca for ent_1
                (a list of tokens, or 0 when that path was negated),
            'ent_2_path_to_lca': the same for ent_2,
            'lca_node_path_to_root': tokens from the LCA up to and including the
                sentence root; left empty when either entity path is 0,
            'polarity': 1 or -1, flipped once for every negation modifier
                attached to a token on the LCA-to-root path.

    NOTE(review): when either entity path comes back as 0 the 'polarity' entry
    stays at 1, as in the original code -- confirm whether callers should treat
    that case as negative.
    """
    lca_node = penemuu.TestFunctions.get_lca_idx_of_ent_pair(sent, (ent_1, ent_2))
    # Elsewhere in this notebook the helper's return value is used as a subscript
    # on the sentence, i.e. it is an index.  Resolve it to a token before walking
    # the tree; a value that already looks like a token is used as-is.
    if not hasattr(lca_node, 'head'):
        lca_node = sent[lca_node]

    ent_1_path_to_lca = get_dep_path_to_lca(sent, ent_1, lca_node)
    ent_2_path_to_lca = get_dep_path_to_lca(sent, ent_2, lca_node)

    result = {
        'ent_1_path_to_lca' : ent_1_path_to_lca,
        'ent_2_path_to_lca' : ent_2_path_to_lca,
        'lca_node_path_to_root' : [],
        'polarity' : 1,
    }

    if ent_1_path_to_lca != 0 and ent_2_path_to_lca != 0:
        current_node = lca_node
        while True:
            result['lca_node_path_to_root'].append(current_node)
            for child in current_node.children:
                # spaCy labels negation modifiers with lowercase 'neg'.
                if child.dep_.lower() == 'neg':
                    result['polarity'] = result['polarity'] * -1

            # The root is processed too: in "X was not isolated from Y" the
            # negation hangs off the root verb itself.
            parent = current_node.head
            if current_node == sent.root or parent == current_node:
                break
            current_node = parent

    return result


def get_dep_path_to_root(sent, token):
    """
    Print the chain of tokens from `token` up towards the root of `sent`.

    One line is printed per visited token, holding its text, dependency label and
    part-of-speech tag.  The walk stops as soon as the sentence root is reached, so
    the root itself is not printed.  Nothing is returned.

    Design notes for the planned path analysis (not implemented here):
        - Traverse the path from an entity to the root of the sentence:
            - Nouns on the path are linked by prepositions and other connectors.
            - Check each noun and verb on the path for negation:
                - child with dep = NEG?
                - Reversed by some prepositions ("but", "however")
        - Directionality?
    """
    node = token
    while node != sent.root:
        print(node.text, node.dep_, node.pos_)
        node = node.head
    
# Print the path to the root for each member of the first entity pair, in pair order.
for member_idx in (0, 1):
    get_dep_path_to_root(test_sent, entity_pairs[0][member_idx])

salmonella pobj NOUN
of prep ADP
resistance pobj NOUN
of prep ADP
prevalence dobj NOUN
investigate xcomp VERB
turkey compound PROPN
farms pobj NOUN
from prep ADP
isolated acl VERB
resistance pobj NOUN
of prep ADP
prevalence dobj NOUN
investigate xcomp VERB


In [10]:
"""
Basic sentence constructions

"""

test_sent = annotator.annotate_passage_with_named_entities('Escherichia coli was isolated from cheese.')

test_sent_neg = annotator.annotate_passage_with_named_entities('Escherichia coli was not isolated from cheese.')

test_sent_miss = annotator.annotate_passage_with_named_entities('Escherichia coli was not isolated from cheese, but staphylococcus aureus was.')

In [11]:
"""
Components


-Named entities
-Non named entity nouns
-Verbs (positive by default)
-Conjuncts connect verbs
-Prepositional phrases
-Modifiers:
    -if child node has dep_ = neg, negate parent ()
    -cc:
        -'but' reverse polarity of parent

"""

print(test_sent_neg)

displacy.render(test_sent_miss)

escherichia coli was not isolated from cheese.
