# TreeDLib

In [67]:
# Load IPython's autoreload extension and select mode 2, so that edits to imported
# modules are picked up before each cell runs without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
#from feature_templates import *
#from basic_features import *
from treedlib_util import PTSVParser
import lxml.etree as et
from treedlib_structs import corenlp_to_xmltree, XMLTree

We define three classes of operators:
* _NodeSets:_ $S : 2^T \to 2^T$
* _Indicators:_ $I : 2^T \to \{0,1\}^F$
* _Combinators:_ $C : \{0,1\}^F \times \{0,1\}^F \to \{0,1\}^F$

where $T$ is a given input tree, and $F$ is the dimension of the feature space.

In [87]:
# Basic IO
# Column schema handed to PTSVParser: one (column name, type) pair per field.
parser = PTSVParser([
        ('doc_id', 'text'),
        ('sent_id', 'int'),
        ('text', 'text'),
        ('words', 'text[]'),
        ('lemmas', 'text[]'),
        ('poses', 'text[]'),
        ('ners', 'text[]'),
        ('char_idxs', 'int[]'),
        ('dep_paths', 'text[]'),
        ('dep_parents', 'int[]')
    ])
# Read inside a `with` block so the file handle is closed as soon as parsing is done,
# and build an explicit list so `rows` can be indexed and iterated more than once.
with open('test/test1.parsed.tsv', 'rb') as tsv_file:
    rows = [parser.parse_line(line) for line in tsv_file]

In [88]:
# Display the first parsed row to sanity-check the parser output.
rows[0]

<Row(dep_paths=['nn', 'nsubj', 'cop', 'det', 'amod', 'amod', 'amod', 'nn', None, 'vmod', None, 'agent', None, 'det', 'nn', 'nn', 'prep_of', None], char_idxs=[0, 11, 20, 23, 25, 30, 38, 46, 63, 71, 78, 81, 91, 94, 98, 103, 110, 114], text=Hyper-IgM1 syndrome is a rare genetic primary immunodeficiency disease caused by mutations of the CD40 ligand gene., ners=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], sent_id=1, words=['Hyper-IgM1', 'syndrome', 'is', 'a', 'rare', 'genetic', 'primary', 'immunodeficiency', 'disease', 'caused', 'by', 'mutations', 'of', 'the', 'CD40', 'ligand', 'gene', '.'], lemmas=['hyper-igm1', 'syndrome', 'be', 'a', 'rare', 'genetic', 'primary', 'immunodeficiency', 'disease', 'cause', 'by', 'mutation', 'of', 'the', 'cd40', 'ligand', 'gene', '.'], dep_parents=[2, 9, 9, 9, 9, 9, 9, 9, 0, 9, 0, 10, 0, 17, 17, 17, 12, 0], poses=['NN', 'NN', 'VBZ', 'DT', 'JJ', 'JJ', 'JJ', 'NN', 'NN', 'VBN', 'IN', 'NNS', 'IN', 'DT', 'NN', 'NN', '

### Testing XML speeds

How do the following two approaches compare in speed?
* parse to XML via this Python code ahead of time, store the result as a string, then parse that string at runtime
* parse to XML at runtime directly via this Python code

In [89]:
# Map sentence to xmltree
# `%time` reports CPU and wall-clock time for the single statement that follows it.
# NOTE(review): this relies on `map` being eager (Python 2); on Python 3 `map` is lazy,
# so the timing would cover only iterator creation -- confirm the target interpreter.
%time xts = map(corenlp_to_xmltree, rows)

CPU times: user 9.16 ms, sys: 2.84 ms, total: 12 ms
Wall time: 9.58 ms


In [90]:
# Pre-process to xml string
# The trees are rebuilt from `rows` here (rather than reusing `xts`), and this
# serialization step is deliberately outside the timed statement below.
xmls = [xt.to_str() for xt in map(corenlp_to_xmltree, rows)]

# Parse @ runtime using lxml
# Only the string -> element parsing with lxml's `fromstring` is timed here.
%time roots = map(et.fromstring, xmls)

CPU times: user 383 µs, sys: 101 µs, total: 484 µs
Wall time: 489 µs


## Simple demo: Generating DDLib features

As a first simple demo, let's reproduce the features generated by [ddlib](http://deepdive.stanford.edu/doc/basics/gen_feats.html).

**_Note: there are some noticeable differences, which stem from using a dependency-tree representation as the base; a simple linear representation of the sentence could be used too, as in DDLib._**

First, let's load a few sample sentences and convert one of them to XML format for testing; we'll also load the feature templates library and tag some candidates (crudely for now) to play around with:

In [92]:
# Render the tree of the second sentence (index 1) and keep its root element
# as `root` for the feature examples that follow.
xts[1].render_tree()
root = xts[1].root

In [93]:
# NOTE(review): Indicator, Between and Mention are not imported by any cell visible
# here (the star-imports in the import cell are commented out) -- confirm their source.
# Show the indicator's repr, then apply it to `root` with two one-element index
# lists, one per mention (presumably word indices -- confirm).
# print() with a single argument behaves identically on Python 2 and Python 3.
print(Indicator(Between(Mention(0), Mention(1)), 'word'))
Indicator(Between(Mention(0), Mention(1)), 'word').print_apply(root, [[19], [21]])

<Indicator:word:BETWEEN-MENTION-and-MENTION, xpath="//*[{0}][1]/ancestor-or-self::*[count(. | //*[{1}][1]/ancestor-or-self::*) = count(//*[{1}][1]/ancestor-or-self::*)][1]/descendant-or-self::*[(count(.//*[{0}]) = count(//*[{0}])) or (count(.//*[{1}]) = count(//*[{1}]))]">
WORD:BETWEEN-MENTION-and-MENTION[PKD1]


In [53]:
# Same 'word' indicator, but the first mention now covers indices 0-4 (range(5))
# and the second mention is the single index 19.
Indicator(Between(Mention(0), Mention(1)), 'word').print_apply(root, [range(5), [19]])

WORD:BETWEEN-MENTION-and-MENTION[disorder_caused_mutations]


In [45]:
# 'word' indicator over the right siblings of a single mention covering indices 0-4.
# The third argument presumably names the attribute the indices refer to -- confirm.
Indicator(RightSiblings(Mention()), 'word').print_apply(root, [range(5)], 'word_idx')

WORD:RIGHT-OF-MENTION[is_the_common]


In [40]:
# 'lemma' indicator over the nodes between the two mentions, filtered on pos 'NN'.
Indicator(Filter(Between(Mention(0), Mention(1)), 'pos', 'NN'), 'lemma').print_apply(root, [range(5), [19]], 'word_idx')

LEMMA:FILTER-BY(pos=NN):BETWEEN-MENTION-and-MENTION[disorder_disease_mutation]


In [26]:
# Raw XPath check: collect the `pos` attribute of every node whose `pos` starts with 'VB'.
root.xpath("//*[starts-with(@pos, 'VB')]/@pos")

['VBZ', 'VBN', 'VBZ']

In [55]:
# Print every relation feature produced for the mention pair (indices 0-4 and index 21).
# NOTE(review): get_relation_features is not imported in any visible cell -- confirm
# where it comes from.
# print() with a single argument behaves identically on Python 2 and Python 3.
for feat in get_relation_features(root, range(5), [21]):
    print(feat)

WORD:BETWEEN-MENTION-and-MENTION[disorder_caused_mutations_PKD1]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_cause_mutation_pkd1]
POS:BETWEEN-MENTION-and-MENTION[NN_VBN_NNS_NN]
NER:BETWEEN-MENTION-and-MENTION[O_O_O_O]
DEP_LABEL:BETWEEN-MENTION-and-MENTION[conj_and_agent_prep_in]
WORD:BETWEEN-MENTION-and-MENTION[disorder_caused_mutations]
WORD:BETWEEN-MENTION-and-MENTION[caused_mutations_PKD1]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_cause_mutation]
LEMMA:BETWEEN-MENTION-and-MENTION[cause_mutation_pkd1]
POS:BETWEEN-MENTION-and-MENTION[NN_VBN_NNS]
POS:BETWEEN-MENTION-and-MENTION[VBN_NNS_NN]
NER:BETWEEN-MENTION-and-MENTION[O_O_O]
NER:BETWEEN-MENTION-and-MENTION[O_O_O]
DEP_LABEL:BETWEEN-MENTION-and-MENTION[conj_and_agent_prep_in]
LEMMA:FILTER-BY(pos=VB):BETWEEN-MENTION-and-MENTION[cause]
WORD:RIGHT-OF-MENTION[is]
WORD:RIGHT-OF-MENTION[is_the]
WORD:RIGHT-OF-MENTION[is_the_common]
LEMMA:RIGHT-OF-MENTION[be]
LEMMA:RIGHT-OF-MENTION[be_the]
LEMMA:RIGHT-OF-MENTION[be_the_common]
POS:RIGHT-OF-ME

In [19]:
# NOTE(review): `dts` is only assigned in a later, unexecuted cell of this notebook,
# so this cell raises NameError on a fresh top-to-bottom run -- confirm whether it
# should use the trees built earlier instead.
root = dts[1].root
# Show the indicator's repr, a blank separator, then apply it to two one-element
# index lists; the third argument presumably names the attribute the indices refer to.
# print() with a single argument behaves identically on Python 2 and Python 3.
print(Indicator(Between(Mention(0), Mention(1)), 'lemma'))
print('\n')
Indicator(Between(Mention(0), Mention(1)), 'lemma').print_apply(root, [[19], [21]], 'word_idx')

<Indicator:lemma:BETWEEN-MENTION-and-MENTION, xpath="//*[{0}]/ancestor-or-self::*[count(. | //*[{1}]/ancestor-or-self::*) = count(//*[{1}]/ancestor-or-self::*)][1]/descendant-or-self::*[ .//*[{0}] | .//*[{1}]]">


LEMMA:BETWEEN-MENTION-and-MENTION[pkd1]


In [33]:
# Reminder of the index list used for the multi-word mention in the examples: 0 through 4.
range(5)

[0, 1, 2, 3, 4]

In [36]:

# Look up the `word` attribute of the nodes whose word_idx is 0 or 1, directly via XPath.
root.xpath("//*[@word_idx='0' or @word_idx='1']/@word")

['Autosomal', 'dominant']

In [19]:
# Serialize the current root element to an XML string to inspect its nodes and attributes.
et.tostring(root)

'<node char_idx="80" dep_parent="0" lemma="disorder" ner="O" pos="NN" word="disorder" word_idx="11"><node char_idx="37" dep_label="nsubj" dep_parent="12" lemma="disease" ner="O" pos="NN" word="disease" word_idx="4"><node char_idx="0" dep_label="amod" dep_parent="5" lemma="autosomal" ner="O" pos="JJ" word="Autosomal" word_idx="0"/><node char_idx="10" dep_label="amod" dep_parent="5" lemma="dominant" ner="O" pos="JJ" word="dominant" word_idx="1"/><node char_idx="19" dep_label="amod" dep_parent="5" lemma="polycystic" ner="O" pos="JJ" word="polycystic" word_idx="2"/><node char_idx="30" dep_label="nn" dep_parent="5" lemma="kidney" ner="O" pos="NN" word="kidney" word_idx="3"/></node><node char_idx="45" dep_label="cop" dep_parent="12" lemma="be" ner="O" pos="VBZ" word="is" word_idx="5"/><node char_idx="48" dep_label="det" dep_parent="12" lemma="the" ner="O" pos="DT" word="the" word_idx="6"/><node char_idx="57" dep_label="amod" dep_parent="12" lemma="common" ner="O" pos="JJ" word="common" word_

In [None]:
import lxml.etree as et
from util import load_sentences, tag_candidate
from tree_structs import sentence_to_xmltree, XMLTree
# Convert every sentence loaded from the test file into an XML tree and pick the second one.
dts = map(sentence_to_xmltree, load_sentences('test/test1.parsed.tsv'))
dt = dts[1]
# Tag three candidates on the tree by their word sequences, with ids 'P1', 'G1' and 'G2'
# (presumably one phenotype and two genes -- confirm).
tag_candidate(dt.root, ['Autosomal', 'dominant', 'polycystic', 'kidney', 'disease'], 'P1')
tag_candidate(dt.root, ['PKD1'], 'G1')
tag_candidate(dt.root, ['PKD2'], 'G2')
# NOTE(review): the result of to_str() is discarded here; only a cell's last
# expression is displayed.
dt.to_str()
dt.render_tree()
# Rebinds `root`, replacing whatever earlier cells assigned to it.
root = dt.root

In [None]:
# Render the tree of the fourth sentence (index 3).
dts[3].render_tree()

In [None]:
# Grab a node tagged with cid 'P1'. In XPath the trailing [1] applies per parent
# (first matching child), so the query may return several nodes; [0] keeps the first.
pheno = root.xpath("//*[@cid='P1'][1]")[0]

In [None]:
# Wrap the selected node in an XMLTree and render it.
p = XMLTree(pheno)
p.render_tree()

In [None]:
# N-gram features over lemmas between the two mentions; the mentions are identified
# by candidate id ('P1', 'G1') here. The 3 is presumably the n-gram length -- confirm.
NGrams(Between(Mention(0), Mention(1)), 'lemma', 3).print_apply(root, ['P1', 'G1'])

In [None]:
# Show the raw text of the second sentence; note this reloads the test file.
load_sentences('test/test1.parsed.tsv')[1].text

In [None]:
# Print every generic mention feature produced for candidate 'G2'; the last
# argument is a one-word list whose role is not visible here -- confirm.
# print() with a single argument behaves identically on Python 2 and Python 3.
for feat in get_generic_mention_features(dt.root, 'G2', ['monogenic']):
    print(feat)

### Table example

In [None]:
# Some wishful thinking...
# Sample HTML table used as input for the table-tree experiments in the next cells.
table_xml = """
<div class="table-wrapper">
    <h3>Causal genomic relationships</h3>
    <table>
        <tr><th>Gene</th><th>Variant</th><th>Phenotype</th></tr>
        <tr><td>ABC</td><td><i>AG34</i></td><td>Headaches during defecation</td></tr>
        <tr><td>BDF</td><td><i>CT2</i></td><td>Defecation during headaches</td></tr>
        <tr><td>XYG</td><td><i>AT456</i></td><td>Defecasomnia</td></tr>
    </table>
</div>
"""
# Import the display helpers from the public `IPython.display` API rather than the
# internal `IPython.core.display` module, then render the markup inline.
from IPython.display import display_html, HTML
display_html(HTML(table_xml))

In [None]:
# NOTE(review): TableTree is not defined or imported in any visible cell; the
# preceding cell labels this example as wishful thinking -- confirm it exists.
tt = TableTree(table_xml)

In [None]:
# Display the table tree's XML string form.
tt.to_xml_str()

In [None]:
# Display the table tree's root.
tt.root