In [57]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Slice selecting which part of the tagged dataset (and of the annotation
# rows) the parsing and evaluation cells operate on.
# Alternative splits, kept for reference:
# USING_PORTION = slice(None, 80) # When training
# USING_PORTION = slice(80, None) # When testing
USING_PORTION = slice(0, 100)

In [58]:
# Load the corpus: every line of the file is treated as one sentence.
dataset = []
with open('./dataset/dataset.txt', encoding='utf-8') as data_file:
    dataset = list(data_file)

# Strip the trailing newline and any surrounding whitespace from each line.
dataset = list(map(str.strip, dataset))

print("================ LOADING DATASET ================")
print(*dataset[:5], sep='\n')

Poly- and perfluorinated compounds activate human pregnane X receptor.
Neurotransmitters activate T-cells and elicit crucial functions via neurotransmitter receptors
Pericytes activate complement in fibrosis
These molecules activate different signaling pathways and produce different physiological responses which can be taken advantage of for sepsis modeling.
Monoamines activate neuropeptide signaling cascades to modulate nociception in C. elegans


In [59]:
import re

# pos_tag sometimes mislabels the relation verbs this notebook extracts.
# Any token that exactly matches one of these words is forced to the tag 'V'.
verbs = [
    'activate', 'inhibit', 'bind',
    'accelerate', 'augment', 'induce', 'stimulate', 'require', 'up-regulate',
    'abolish', 'block', 'down-regulate', 'prevent'
]

# Non-greedy, so each "(...)" group is removed on its own.
remove_paren = re.compile(r"\(.*?\)")
dataset_preprocessed = list(map(lambda sentence: remove_paren.sub('', sentence), dataset))


def _force_verb_tags(tagged_tokens):
    """Return the (word, tag) pairs with every word listed in `verbs` re-tagged as 'V'."""
    return [(word, 'V' if word in verbs else tag) for word, tag in tagged_tokens]


dataset_tagged = [
    _force_verb_tags(pos_tag(word_tokenize(sentence)))
    for sentence in dataset_preprocessed
]

print("================ TAGGING DATASET ================")
print('\n'.join(map(str, dataset_tagged[:5])))

[('Poly-', 'NNP'), ('and', 'CC'), ('perfluorinated', 'VBD'), ('compounds', 'NNS'), ('activate', 'V'), ('human', 'JJ'), ('pregnane', 'NN'), ('X', 'NNP'), ('receptor', 'NN'), ('.', '.')]
[('Neurotransmitters', 'NNS'), ('activate', 'V'), ('T-cells', 'NNS'), ('and', 'CC'), ('elicit', 'JJ'), ('crucial', 'JJ'), ('functions', 'NNS'), ('via', 'IN'), ('neurotransmitter', 'NN'), ('receptors', 'NNS')]
[('Pericytes', 'NNS'), ('activate', 'V'), ('complement', 'NN'), ('in', 'IN'), ('fibrosis', 'NN')]
[('These', 'DT'), ('molecules', 'NNS'), ('activate', 'V'), ('different', 'JJ'), ('signaling', 'NN'), ('pathways', 'NNS'), ('and', 'CC'), ('produce', 'VB'), ('different', 'JJ'), ('physiological', 'JJ'), ('responses', 'NNS'), ('which', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('taken', 'VBN'), ('advantage', 'NN'), ('of', 'IN'), ('for', 'IN'), ('sepsis', 'NN'), ('modeling', 'NN'), ('.', '.')]
[('Monoamines', 'NNS'), ('activate', 'V'), ('neuropeptide', 'IN'), ('signaling', 'VBG'), ('cascades', 'NNS'), ('to', 'T

In [60]:
from nltk import CFG
from nltk.parse import ChartParser
from nltk.grammar import Nonterminal, Production

# Base phrase-structure rules, kept as a list of productions (not a CFG object)
# so that generate_sent_grammar can copy it and append one `tag -> word`
# production per token. Part-of-speech symbols such as NN, DT, IN or POS have
# no rule here; they only become productive through those per-sentence
# additions. WRONG_NP covers tag sequences that are treated as noun phrases to
# compensate for tagging mistakes.
grammar = CFG.fromstring("""
    S -> NP VP | NP VP NP | S C S | S "." | VP NP | WDT S | EX S | S RB | S PART | IN S
    NP -> N | DT NP | J NP | NP NP | NP MOD | NP C NP | NP R | R NP | PART | NP POS | WRONG_NP
    VP -> V | R VP | MD VP | VP PART | VP C VP | VP MOD
    MOD -> IN NP | IN IN NP | TO VP | TO NP
    PART -> VBN | VBG
    C -> CC | ","
    N -> NN | NNP | NNS | FW | CD | PRP | WRONG_N
    R -> RB | RBR | RBS | RP | R C R
    V -> VB | VBD | VBP | VBZ
    J -> JJ | JJR | JJS | J C J
    WRONG_NP -> JJ | R V | NNP VBZ
""").productions()


def generate_sent_grammar(sent):
    """Build the sentence-specific production list for one tagged sentence.

    Args:
        sent: sequence of (word, tag) pairs.

    Returns:
        A pair ``(productions, words)``: a fresh copy of the module-level
        ``grammar`` list followed by one ``tag -> word`` production per token
        (in sentence order), and the list of words in sentence order.
        The shared ``grammar`` list itself is not modified.
    """
    lexical_rules = [
        Production(Nonterminal(tag), (word, ))
        for word, tag in sent
    ]
    words = [word for word, _ in sent]

    return grammar[:] + lexical_rules, words

    
# Try to parse every selected sentence as an S.
#   parsed_sents: sentence index -> first parse tree found
#   non_sents:    sentence index -> tagged sentence that could not be parsed
parsed_sents = {}
non_sents = {}
for idx, sent in enumerate(dataset_tagged[USING_PORTION]):
    sent_grammar, sent_words = generate_sent_grammar(sent)
    tree = ChartParser(CFG(Nonterminal('S'), sent_grammar)).parse_one(sent_words)

    if tree is not None:
        parsed_sents[idx] = tree
        continue

    non_sents[idx] = sent

print("Parsed %d sents, Unable to parse %d sents" % (len(parsed_sents), len(non_sents)))

Parsed 73 sents, Unable to parse 27 sents


In [61]:
# Extracted triples, keyed by sentence index; each value is a list of
# {'X': ..., 'V': ..., 'Y': ...} dicts.
result = {}

def summarize_np(tree):
    """Summarize an NP subtree into a string.

    Args:
        tree: a parse (sub)tree exposing ``label()``, ``len()`` and indexing.

    Returns:
        ''                     if the node is not an NP or the rule is not handled;
        the head word          for NP -> N and NP -> PART;
        the inner NP's summary for NP -> DT NP | J NP | R NP | NP R | NP MOD
                               (the modifier is dropped);
        'a,b'                  for NP -> NP C NP (the comma marks a coordination
                               that the caller later splits into separate triples);
        'a b'                  for NP -> NP NP and NP -> NP POS;
        the recovered word(s)  for NP -> WRONG_NP.
    """
    if tree.label() != 'NP':
        return ''

    head = tree[0]
    head_label = head.label()

    # NP -> N | PART: both wrap a single part-of-speech node holding the word.
    if head_label in ('N', 'PART'):
        return head[0][0]

    # NP -> DT NP | J NP | R NP: drop the leading modifier.
    if head_label in ('J', 'DT', 'R'):
        return summarize_np(tree[1])

    # NP -> WRONG_NP
    # WRONG_NP: sequences that are not really noun phrases but are accepted as
    #           NP to compensate for sentences that were tagged incorrectly.
    if head_label == 'WRONG_NP':
        first = head[0]
        first_label = first.label()

        # WRONG_NP -> JJ
        if first_label == 'JJ':
            return first[0]

        # WRONG_NP -> R V
        if first_label == 'R':
            verb_leaf = head[1][0]
            # A verb that was force-tagged 'V' hangs directly under V as a
            # plain string; otherwise V wraps a VB/VBD/VBP/VBZ node. Indexing
            # the string again would return only its first character.
            if isinstance(verb_leaf, str):
                return verb_leaf
            return verb_leaf[0]

        # WRONG_NP -> NNP VBZ
        if first_label == 'NNP':
            return first[0] + ' ' + head[1][0]

    if len(tree) < 2:
        return ''

    second_label = tree[1].label()

    # NP -> NP R | NP MOD: drop the trailing modifier.
    if second_label in ('MOD', 'R'):
        return summarize_np(head)

    # NP -> NP C NP
    if second_label == 'C':
        return summarize_np(head) + ',' + summarize_np(tree[2])

    # NP -> NP POS
    if second_label == 'POS':
        return summarize_np(head) + ' ' + tree[1][0]

    # NP -> NP NP
    if second_label == 'NP' and head_label == 'NP':
        return summarize_np(head) + ' ' + summarize_np(tree[1])

    return ''

def summarize_vp(tree):
    """Summarize a VP subtree into a string.

    Returns the verb for VP -> V, the inner VP's summary for
    VP -> R VP | MD VP | VP PART | VP MOD, 'a,b' for VP -> VP C VP,
    and '' for any other shape.
    """
    first = tree[0]

    # VP -> V
    if first.label() == 'V':
        token = first[0]
        # A force-tagged verb sits directly under V as a string;
        # otherwise V -> VB | VBD | VBP | VBZ wraps the word one level deeper.
        return token if isinstance(token, str) else token[0]

    second_label = tree[1].label()

    # VP -> R VP | MD VP
    if second_label == 'VP':
        return summarize_vp(tree[1])

    # VP -> VP C VP
    if second_label == 'C':
        return ','.join((summarize_vp(first), summarize_vp(tree[2])))

    # VP -> VP PART | VP MOD
    return summarize_vp(first) if first.label() == 'VP' else ''

def summarize_s(tree):
    """Summarize an S subtree into a list of {'X': str, 'V': str, 'Y': str} dicts.

    X is the subject NP, V the verb and Y the object NP. Clauses for which no
    object can be recovered contribute nothing, so the result may be empty.

    Args:
        tree: a parse tree rooted at S.

    Returns:
        A list with one dict per extracted clause (coordinated sentences
        contribute one entry per side).
    """
    first = tree[0]

    # S -> NP VP | NP VP NP
    if first.label() == 'NP':
        np = summarize_np(first)
        vp = summarize_vp(tree[1])
        np_2 = ''

        # S -> NP VP NP
        if len(tree) > 2:
            np_2 = summarize_np(tree[2])

        # S -> NP VP
        else:
            vp_tree = tree[1]
            # S -> NP (VP -> VP MOD)
            # The length guard covers VP -> V, which has a single child.
            if len(vp_tree) > 1 and vp_tree[1].label() == 'MOD':
                # The object is MOD's last child for both
                # MOD -> IN NP and MOD -> IN IN NP (also TO NP);
                # for MOD -> TO VP summarize_np yields ''.
                np_2 = summarize_np(vp_tree[1][-1])

        if np_2 == '':
            return []

        return [{ 'X': np, 'V': vp, 'Y': np_2 }]

    # S -> S "."
    if isinstance(tree[1], str):
        return summarize_s(first)

    # S -> S C S
    if tree[1].label() == 'C':
        return summarize_s(first) + summarize_s(tree[2])

    # S -> VP NP  (no subject, nothing to extract)
    if first.label() == 'VP':
        return []

    # S -> S RB | S PART
    if first.label() == 'S':
        return summarize_s(first)

    # S -> WDT S | EX S | IN S
    if tree[1].label() == 'S':
        return summarize_s(tree[1])

    return []


# Expand coordinated subjects/objects into one triple per combination, e.g.
#     { X: 'a,b', V: 'bind', Y: 'c' }
# becomes
#     { X: 'a', V: 'bind', Y: 'c' }, { X: 'b', V: 'bind', Y: 'c' }
# (To inspect sentences that yield no triple, call tree.pretty_print().)
for idx, tree in parsed_sents.items():
    result[idx] = [
        {'X': subject, 'V': triple['V'], 'Y': obj}
        for triple in summarize_s(tree)
        for subject in triple['X'].split(',')
        for obj in triple['Y'].split(',')
    ]


def parse_np(subsent):
    """Turn a tagged token sequence into a noun-phrase string.

    The tokens are parsed with NP as the start symbol. If a parse is found its
    summary is returned; otherwise the words are joined with single spaces.

    Args:
        subsent: sequence of (word, tag) pairs.
    """
    productions, words = generate_sent_grammar(subsent)
    np_tree = ChartParser(CFG(Nonterminal('NP'), productions)).parse_one(words)

    return summarize_np(np_tree) if np_tree is not None else ' '.join(words)

# Fallback for sentences that could not be parsed as S:
# split the sentence at its verb and try to parse each side as an NP.
#     If that also fails, parse_np returns the raw words of that side.
# A token force-tagged 'V' is preferred; otherwise the first token whose tag
# starts with 'V' is used. Without any verb an empty triple is recorded.
for idx, non_sent in non_sents.items():
    verb_positions = [pos for pos, (_, tag) in enumerate(non_sent) if tag == 'V']
    if not verb_positions:
        verb_positions = [
            pos for pos, (_, tag) in enumerate(non_sent) if tag.startswith('V')
        ]

    if not verb_positions:
        result[idx] = [{'X': '', 'V': '', 'Y': ''}]
        continue

    split_at = verb_positions[0]

    # Subject side is evaluated before the object side.
    result[idx] = [{
        'X': parse_np(non_sent[:split_at]),
        'V': non_sent[split_at][0],
        'Y': parse_np(non_sent[split_at + 1:])
    }]

# Write one JSON-encoded list of triples per line, ordered by sentence index,
# and echo each line together with its index.
import json
with open('./output.txt', 'w', encoding='utf-8') as f:
    for index in sorted(result):
        value_serialized = json.dumps(result[index])

        print(value_serialized, file=f)
        print(index, value_serialized)


0 [{"X": "Poly- and perfluorinated compounds", "V": "activate", "Y": "human pregnane X receptor ."}]
1 [{"X": "Neurotransmitters", "V": "activate", "Y": "T-cells"}, {"X": "Neurotransmitters", "V": "activate", "Y": "functions receptors"}]
2 [{"X": "Pericytes", "V": "activate", "Y": "complement"}]
3 [{"X": "molecules", "V": "activate", "Y": "different signaling pathways and produce different physiological responses which can be taken advantage of for sepsis modeling ."}]
4 [{"X": "Monoamines", "V": "activate", "Y": "elegans"}]
5 [{"X": "bacteria", "V": "activate", "Y": "interferon production"}]
6 [{"X": "Salient sounds", "V": "activate", "Y": "human visual cortex automatically ."}]
7 [{"X": "Bisphenol A and its analogues", "V": "activate", "Y": "human pregnane X receptor ."}]
8 [{"X": "cancer vaccination", "V": "activate", "Y": "T-cell responses"}, {"X": "cells", "V": "activate", "Y": "T-cell responses"}]
9 [{"X": "Because the subthreshold sequences", "V": "activate", "Y": "the retina at

In [62]:
import csv
import json  # imported here as well so this cell does not rely on an earlier cell

# newline='' is the documented way to open a file for the csv module; it keeps
# newlines embedded in quoted fields intact.
with open('dataset/dataset_annotations.csv', encoding='utf-8', newline='') as csvfile:
    rows = list(csv.reader(csvfile))

true_positive = 0
false_positive = 0
false_negative = 0

# idx restarts at 0 for the selected slice, which matches the keys of `result`
# (built from enumerate(dataset_tagged[USING_PORTION])).
# Each row is expected to have exactly two columns; the first holds a JSON list
# of {'X', 'V', 'Y'} annotations.
# NOTE(review): json.loads raises on a malformed annotation cell; the recorded
# output of this cell suggests such rows may exist -- confirm how they should
# be handled.
for idx, (annotations, _) in enumerate(rows[USING_PORTION]):
    groundtruth = {
        (triplet['X'], triplet['V'], triplet['Y'])
        for triplet in json.loads(annotations)
    }
    predicted = {
        (triplet['X'], triplet['V'], triplet['Y'])
        for triplet in result[idx]
    }

    true_positive += len(groundtruth & predicted)
    false_positive += len(predicted - groundtruth)
    false_negative += len(groundtruth - predicted)

# Guard the denominators: with no predictions (or no ground truth) the ratios
# are undefined and are reported as 0 instead of raising ZeroDivisionError.
predicted_total = true_positive + false_positive
groundtruth_total = true_positive + false_negative

precision = true_positive / predicted_total if predicted_total else 0.0
recall = true_positive / groundtruth_total if groundtruth_total else 0.0

if precision + recall == 0:
    f_score = 0

else:
    f_score = 2 * (precision * recall) / (precision + recall)


print("TP       : %d"   % true_positive)
print("FP       : %d"   % false_positive)
print("FN       : %d"   % false_negative)
print("Precision: %.4f" % precision)
print("Recall   : %.4f" % recall)
print("F Score  : %.4f" % f_score)

ERROR Poly- and perfluorinated compounds activate human pregnane X receptor.
[{"X": "Neurotransmitters", "V": "activate", "Y": "T-cells"}]
[{"X": "Pericytes", "V": "activate", "Y": "complement"}]
[{"X": "Molecules", "V": "activate", "Y": "different signaling pathways"}]
[{"X": "Monoamines", "V": "activate", "Y": "neuropeptide signaling cascades"}]
[{"X": "bacteria", "V": "activate", "Y": "type-I interferon production"}]
[{"X": "Salient sounds", "V": "activate", "Y": "human visual cortex"}]
[{"X": "Bisphenol A", "V": "activate", "Y": "human pregnane X receptor"}, {"X": "analogues", "V": "activate", "Y": "human pregnane X receptor"}]
[{"X": "cells", "V": "activate", "Y": "T-cell responses"}]
[{"X": "substhreshold sequences", "V": "activate", "Y": "retina"}]
[{"X": "drugs", "V": "activate", "Y": "reward pathways"}]
[{"X": "Cannabinoids", "V": "activate", "Y": "signaling"}]
[{"X": "Integrins", "V": "activates", "Y": "nuclear transcription factor-kappaB"}, {"X": "cytokines", "V": "activates