# TreeDLib

In [102]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
from treedlib import *
import lxml.etree as et

We define three classes of operators:
* _NodeSets:_ $S : 2^T \to 2^T$
* _Indicators:_ $I : 2^T \to \{0,1\}^F$
* _Combinators:_ $C : \{0,1\}^F \times \{0,1\}^F \to \{0,1\}^F$

where $T$ is a given input tree, and $F$ is the dimension of the feature space.

### DB connection...

## Debugging

In [8]:
# A single sample row pasted in verbatim for debugging the parser.
# NOTE(review): the column separators look like runs of spaces rather than tabs, and some
# tokens appear split mid-word (e.g. "pho    tos") -- presumably a terminal copy/paste
# artifact. That would be consistent with the "2 attributes for 10 fields" ValueError
# captured when this row is parsed -- TODO confirm and re-export the row with real tabs.
raw_in = """
79205745-b593-4b98-8a94-da6b8238fefc    32      “I know where you can get those framed,” aprivate eye named Visser (M. Emmet Walsh) tells a bar owner, Marty (Dan Hedaya), when he presents him with pho    tos of his wife, Abby (Frances McDormand) in bed with his bartender, Ray (John Getz), and from that staring point nocturnal acts of cruelty take shape not through nefarious schemes but crude assumptio    ns and mistaken identity.       {``,I,know,where,you,can,get,those,framed,",",'',aprivate,eye,named,Visser,-LRB-,M.,Emmet,Walsh,-RRB-,tells,a,bar,owner,",",Marty,-LRB-,Dan,Hedaya,-RRB-,",",when,he,pre    sents,him,with,photos,of,his,wife,",",Abby,-LRB-,Frances,McDormand,-RRB-,in,bed,with,his,bartender,",",Ray,-LRB-,John,Getz,-RRB-,",",and,from,that,staring,point,nocturnal,acts,of,cruelty,take,shape,no    t,through,nefarious,schemes,but,crude,assumptions,and,mistaken,identity,.}      {``,I,know,where,you,can,get,those,frame,",",'',aprivate,eye,name,Visser,-lrb-,M.,Emmet,Walsh,-rrb-,tell,a,bar,owner,","    ,Marty,-lrb-,Dan,Hedaya,-rrb-,",",when,he,present,he,with,photo,of,he,wife,",",Abby,-lrb-,Frances,McDormand,-rrb-,in,bed,with,he,bartender,",",Ray,-lrb-,John,Getz,-rrb-,",",and,from,that,stare,point,n    octurnal,act,of,cruelty,take,shape,not,through,nefarious,scheme,but,crude,assumption,and,mistaken,identity,.}   {``,PRP,VBP,WRB,PRP,MD,VB,DT,VBN,",",'',JJ,NN,VBN,NNP,-LRB-,NNP,NNP,NNP,-RRB-,VBZ,DT,NN,    NN,",",NNP,-LRB-,NNP,NNP,-RRB-,",",WRB,PRP,VBZ,PRP,IN,NNS,IN,PRP$,NN,",",NNP,-LRB-,NNP,NNP,-RRB-,IN,NN,IN,PRP$,NN,",",NNP,-LRB-,NNP,NNP,-RRB-,",",CC,IN,DT,VBG,NN,JJ,NNS,IN,NN,VBP,NN,RB,IN,JJ,NNS,CC,JJ    ,NNS,CC,JJ,NN,.}        {O,O,O,O,O,O,O,O,O,O,O,O,O,O,PERSON,O,PERSON,PERSON,PERSON,O,O,O,O,O,O,PERSON,O,PERSON,PERSON,O,O,O,O,O,O,O,O,O,O,O,O,PERSON,O,PERSON,PERSON,O,O,O,O,O,O,O,PERSON,O,PERSON,PERSO    N,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O}      
{4952,4953,4955,4960,4966,4970,4974,4978,4984,4990,4991,4993,5002,5006,5012,5019,5020,5023,5029,5034,5036,5042,5044,5048,5053,5055,5061,5062,506    6,5072,5073,5075,5080,5083,5092,5096,5101,5108,5111,5115,5119,5121,5126,5127,5135,5144,5146,5149,5153,5158,5162,5171,5173,5177,5178,5183,5187,5188,5190,5194,5199,5204,5212,5218,5228,5233,5236,5244,524    9,5255,5259,5267,5277,5285,5289,5295,5307,5311,5320,5328}       {"",nsubj,dep,advmod,nsubj,aux,ccomp,dobj,vmod,"","",amod,nsubj,"",nsubj,"",nn,nn,appos,"",ccomp,det,nn,dobj,"",dobj,"",nn,appos,"","",a    dvmod,nsubj,dep,dobj,"",prep_with,"",poss,prep_of,"",appos,"",nn,appos,"","",prep_in,"",poss,prep_with,"",conj_and,"",nn,appos,"","","",conj_and,pobj,dep,nn,amod,nsubj,"",prep_of,ccomp,dobj,neg,"",amo    d,prep_through,"",amod,conj_but,"",amod,conj_and,""}    {0,3,13,7,7,7,3,7,8,0,0,13,14,0,21,0,19,19,15,0,14,24,24,21,0,14,0,29,26,0,0,34,34,26,34,0,34,0,40,37,0,40,0,45,42,0,0,42,0,51,42,0,51,0,56,53,0    ,0,0,51,60,61,65,65,68,0,65,62,68,68,0,73,68,0,76,73,0,79,73,0}
"""

In [9]:
# Basic IO: declare the ten (name, type) columns expected in a row, then try to parse
# the sample debugging row held in `raw_in`.
parser = PTSVParser([
        ('doc_id', 'text'),
        ('sent_id', 'int'),
        ('text', 'text'),
        ('words', 'text[]'),
        ('lemmas', 'text[]'),
        ('poses', 'text[]'),
        ('ners', 'text[]'),
        ('char_idxs', 'int[]'),
        ('dep_paths', 'text[]'),
        ('dep_parents', 'int[]')
    ])
# parse_line raised ValueError on this row when the notebook was last run. Catch and show
# the message instead of letting the exception halt a full "Run All"; `row` is set to None
# so that later code can tell the parse failed.
try:
    row = parser.parse_line(raw_in.strip())
except ValueError as err:
    row = None
    print(err)

ValueError: 2 attributes for 10 fields:
79205745-b593-4b98-8a94-da6b8238fefc	32      “I know where you can get those framed,” aprivate eye named Visser (M. Emmet Walsh) tells a bar owner, Marty (Dan Hedaya), when he presents him with pho    tos of his wife, Abby (Frances McDormand) in bed with his bartender, Ray (John Getz), and from that staring point nocturnal acts of cruelty take shape not through nefarious schemes but crude assumptio    ns and mistaken identity.       {``,I,know,where,you,can,get,those,framed,",",'',aprivate,eye,named,Visser,-LRB-,M.,Emmet,Walsh,-RRB-,tells,a,bar,owner,",",Marty,-LRB-,Dan,Hedaya,-RRB-,",",when,he,pre    sents,him,with,photos,of,his,wife,",",Abby,-LRB-,Frances,McDormand,-RRB-,in,bed,with,his,bartender,",",Ray,-LRB-,John,Getz,-RRB-,",",and,from,that,staring,point,nocturnal,acts,of,cruelty,take,shape,no    t,through,nefarious,schemes,but,crude,assumptions,and,mistaken,identity,.}      {``,I,know,where,you,can,get,those,frame,",",'',aprivate,eye,name,Visser,-lrb-,M.,Emmet,Walsh,-rrb-,tell,a,bar,owner,","    ,Marty,-lrb-,Dan,Hedaya,-rrb-,",",when,he,present,he,with,photo,of,he,wife,",",Abby,-lrb-,Frances,McDormand,-rrb-,in,bed,with,he,bartender,",",Ray,-lrb-,John,Getz,-rrb-,",",and,from,that,stare,point,n    octurnal,act,of,cruelty,take,shape,not,through,nefarious,scheme,but,crude,assumption,and,mistaken,identity,.}   {``,PRP,VBP,WRB,PRP,MD,VB,DT,VBN,",",'',JJ,NN,VBN,NNP,-LRB-,NNP,NNP,NNP,-RRB-,VBZ,DT,NN,    NN,",",NNP,-LRB-,NNP,NNP,-RRB-,",",WRB,PRP,VBZ,PRP,IN,NNS,IN,PRP$,NN,",",NNP,-LRB-,NNP,NNP,-RRB-,IN,NN,IN,PRP$,NN,",",NNP,-LRB-,NNP,NNP,-RRB-,",",CC,IN,DT,VBG,NN,JJ,NNS,IN,NN,VBP,NN,RB,IN,JJ,NNS,CC,JJ    ,NNS,CC,JJ,NN,.}        {O,O,O,O,O,O,O,O,O,O,O,O,O,O,PERSON,O,PERSON,PERSON,PERSON,O,O,O,O,O,O,PERSON,O,PERSON,PERSON,O,O,O,O,O,O,O,O,O,O,O,O,PERSON,O,PERSON,PERSON,O,O,O,O,O,O,O,PERSON,O,PERSON,PERSO    N,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O}      
{4952,4953,4955,4960,4966,4970,4974,4978,4984,4990,4991,4993,5002,5006,5012,5019,5020,5023,5029,5034,5036,5042,5044,5048,5053,5055,5061,5062,506    6,5072,5073,5075,5080,5083,5092,5096,5101,5108,5111,5115,5119,5121,5126,5127,5135,5144,5146,5149,5153,5158,5162,5171,5173,5177,5178,5183,5187,5188,5190,5194,5199,5204,5212,5218,5228,5233,5236,5244,524    9,5255,5259,5267,5277,5285,5289,5295,5307,5311,5320,5328}       {"",nsubj,dep,advmod,nsubj,aux,ccomp,dobj,vmod,"","",amod,nsubj,"",nsubj,"",nn,nn,appos,"",ccomp,det,nn,dobj,"",dobj,"",nn,appos,"","",a    dvmod,nsubj,dep,dobj,"",prep_with,"",poss,prep_of,"",appos,"",nn,appos,"","",prep_in,"",poss,prep_with,"",conj_and,"",nn,appos,"","","",conj_and,pobj,dep,nn,amod,nsubj,"",prep_of,ccomp,dobj,neg,"",amo    d,prep_through,"",amod,conj_but,"",amod,conj_and,""}    {0,3,13,7,7,7,3,7,8,0,0,13,14,0,21,0,19,19,15,0,14,24,24,21,0,14,0,29,26,0,0,34,34,26,34,0,34,0,40,37,0,40,0,45,42,0,0,42,0,51,42,0,51,0,56,53,0    ,0,0,51,60,61,65,65,68,0,65,62,68,68,0,73,68,0,76,73,0,79,73,0}

### Feature focus: Preceding statements which nullify or negate meaning

Example:
> _Ex1:_ To investigate whether mutations in the SURF1 gene are a cause of Charcot-Marie-Tooth -LRB- CMT -RRB- disease

> _Ex2:_ To investigate the genetic effect of a new mutation found in exon 17 of the myophosphorylase -LRB- PYGM -RRB- gene as a cause of McArdle disease -LRB- also known as type 5 glycogenosis -RRB-.

Notes:
* These seem to mostly be **_modifiers of the primary verb_**?
    * We are only sampling from a limited set of patterns of sentences (due to narrow DSR set) currently...
* Modifiers in general...?
* _I know how RNNs claim to / do handle this phenomenon..._

In [133]:
# Two example sentences for the "preceding statements which nullify or negate meaning" notes.
# Each example has an id tuple (presumably document id, section, sentence index -- TODO confirm)
# and its tree serialized as nested <node> XML elements carrying dep_parent, dep_path,
# lemma, ner, pos, word and word_idx attributes.
ex1_id = ('24027061', 'Abstract.0', 1)
ex1_raw="""
<node dep_parent="0" lemma="investigate" ner="O" pos="VB" word="investigate" word_idx="1"><node dep_parent="2" dep_path="aux" lemma="to" ner="O" pos="TO" word="To" word_idx="0"/><node dep_parent="2" dep_path="ccomp" lemma="cause" ner="O" pos="NN" word="cause" word_idx="10"><node dep_parent="11" dep_path="mark" lemma="whether" ner="O" pos="IN" word="whether" word_idx="2"/><node dep_parent="11" dep_path="nsubj" lemma="mutation" ner="O" pos="NNS" word="mutations" word_idx="3"><node dep_parent="4" dep_path="prep_in" lemma="gene" ner="O" pos="NN" word="gene" word_idx="7"><node dep_parent="8" dep_path="det" lemma="the" ner="O" pos="DT" word="the" word_idx="5"/><node dep_parent="8" dep_path="nn" lemma="surf1" ner="O" pos="NN" word="SURF1" word_idx="6"/></node></node><node dep_parent="11" dep_path="cop" lemma="be" ner="O" pos="VBP" word="are" word_idx="8"/><node dep_parent="11" dep_path="det" lemma="a" ner="O" pos="DT" word="a" word_idx="9"/><node dep_parent="11" dep_path="prep_of" lemma="Charcot-Marie-Tooth" ner="O" pos="NNP" word="Charcot-Marie-Tooth" word_idx="12"/><node dep_parent="11" dep_path="dep" lemma="disease" ner="O" pos="NN" word="disease" word_idx="16"><node dep_parent="17" dep_path="appos" lemma="CMT" ner="O" pos="NNP" word="CMT" word_idx="14"/></node></node></node>
"""
# Parse the XML string with lxml and wrap the resulting root element in an XMLTree.
xt1 = XMLTree(et.fromstring(ex1_raw))
ex2_id = ('15262743', 'Abstract.0', 1)
ex2_raw="""
<node dep_parent="0" lemma="investigate" ner="O" pos="VB" word="investigate" word_idx="1"><node dep_parent="2" dep_path="aux" lemma="to" ner="O" pos="TO" word="To" word_idx="0"/><node dep_parent="2" dep_path="dobj" lemma="effect" ner="O" pos="NN" word="effect" word_idx="4"><node dep_parent="5" dep_path="det" lemma="the" ner="O" pos="DT" word="the" word_idx="2"/><node dep_parent="5" dep_path="amod" lemma="genetic" ner="O" pos="JJ" word="genetic" word_idx="3"/><node dep_parent="5" dep_path="prep_of" lemma="mutation" ner="O" pos="NN" word="mutation" word_idx="8"><node dep_parent="9" dep_path="det" lemma="a" ner="O" pos="DT" word="a" word_idx="6"/><node dep_parent="9" dep_path="amod" lemma="new" ner="O" pos="JJ" word="new" word_idx="7"/><node dep_parent="9" dep_path="vmod" lemma="find" ner="O" pos="VBN" word="found" word_idx="9"><node dep_parent="10" dep_path="prep_in" lemma="exon" ner="O" pos="NN" word="exon" word_idx="11"><node dep_parent="12" dep_path="num" lemma="17" ner="NUMBER" pos="CD" word="17" word_idx="12"/><node dep_parent="12" dep_path="prep_of" lemma="gene" ner="O" pos="NN" word="gene" word_idx="19"><node dep_parent="20" dep_path="det" lemma="the" ner="O" pos="DT" word="the" word_idx="14"/><node dep_parent="20" dep_path="nn" lemma="myophosphorylase" ner="O" pos="NN" word="myophosphorylase" word_idx="15"/><node dep_parent="20" dep_path="nn" lemma="pygm" ner="O" pos="NN" word="PYGM" word_idx="17"/></node></node><node dep_parent="10" dep_path="prep_as" lemma="cause" ner="O" pos="NN" word="cause" word_idx="22"><node dep_parent="23" dep_path="det" lemma="a" ner="O" pos="DT" word="a" word_idx="21"/><node dep_parent="23" dep_path="prep_of" lemma="disease" ner="O" pos="NN" word="disease" word_idx="25"><node dep_parent="26" dep_path="nn" lemma="McArdle" ner="PERSON" pos="NNP" word="McArdle" word_idx="24"/><node dep_parent="26" dep_path="vmod" lemma="know" ner="O" pos="VBN" word="known" word_idx="28"><node dep_parent="29" dep_path="advmod" lemma="also" ner="O" 
pos="RB" word="also" word_idx="27"/><node dep_parent="29" dep_path="prep_as" lemma="glycogenosis" ner="O" pos="NN" word="glycogenosis" word_idx="32"><node dep_parent="33" dep_path="nn" lemma="type" ner="O" pos="NN" word="type" word_idx="30"/><node dep_parent="33" dep_path="num" lemma="5" ner="NUMBER" pos="CD" word="5" word_idx="31"/></node></node></node></node></node></node></node></node>
"""
# Same treatment for the second example.
xt2 = XMLTree(et.fromstring(ex2_raw))

In [135]:
# Call render_tree() on each of the two example trees (xt1, xt2) built from the XML strings.
# NOTE(review): no output is captured for this cell in the saved notebook.
xt1.render_tree()
xt2.render_tree()

In [104]:
# Basic IO: declare the ten (name, type) columns expected in each row of the parsed TSV.
parser = PTSVParser([
        ('doc_id', 'text'),
        ('sent_id', 'int'),
        ('text', 'text'),
        ('words', 'text[]'),
        ('lemmas', 'text[]'),
        ('poses', 'text[]'),
        ('ners', 'text[]'),
        ('char_idxs', 'int[]'),
        ('dep_paths', 'text[]'),
        ('dep_parents', 'int[]')
    ])
# Read the file inside a `with` block so the handle is closed as soon as parsing is done
# (the bare open() previously leaked it), and build an explicit list: later cells index
# `rows` and iterate over it more than once.
with open('test/test1.parsed.tsv', 'rb') as tsv_file:
    rows = [parser.parse_line(tsv_line) for tsv_line in tsv_file]

In [105]:
# Display the first parsed row as a quick sanity check of the parsed fields.
rows[0]

<Row(dep_paths=['nn', 'nsubj', 'cop', 'det', 'amod', 'amod', 'amod', 'nn', None, 'vmod', None, 'agent', None, 'det', 'nn', 'nn', 'prep_of', None], char_idxs=[0, 11, 20, 23, 25, 30, 38, 46, 63, 71, 78, 81, 91, 94, 98, 103, 110, 114], text=Hyper-IgM1 syndrome is a rare genetic primary immunodeficiency disease caused by mutations of the CD40 ligand gene., ners=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], sent_id=1, words=['Hyper-IgM1', 'syndrome', 'is', 'a', 'rare', 'genetic', 'primary', 'immunodeficiency', 'disease', 'caused', 'by', 'mutations', 'of', 'the', 'CD40', 'ligand', 'gene', '.'], lemmas=['hyper-igm1', 'syndrome', 'be', 'a', 'rare', 'genetic', 'primary', 'immunodeficiency', 'disease', 'cause', 'by', 'mutation', 'of', 'the', 'cd40', 'ligand', 'gene', '.'], dep_parents=[2, 9, 9, 9, 9, 9, 9, 9, 0, 9, 0, 10, 0, 17, 17, 17, 12, 0], poses=['NN', 'NN', 'VBZ', 'DT', 'JJ', 'JJ', 'JJ', 'NN', 'NN', 'VBN', 'IN', 'NNS', 'IN', 'DT', 'NN', 'NN', '

### Testing XML speeds

Which of the following is faster?
* Converting to XML with this Python code ahead of time, storing the result as a string, and then parsing that string at runtime
* Converting to XML at runtime with this Python code

In [106]:
# Map each parsed sentence row to an xmltree via corenlp_to_xmltree, timing the whole pass
# with the %time line magic.
# NOTE(review): this timing is only meaningful where `map` is eager (Python 2); on Python 3
# `map` is lazy and %time would measure iterator creation only -- confirm the kernel version.
%time xts = map(corenlp_to_xmltree, rows)

CPU times: user 8.92 ms, sys: 2.56 ms, total: 11.5 ms
Wall time: 9.56 ms


In [107]:
# Pre-process to xml string: convert every row to an xmltree and serialize it with to_str().
# This step is deliberately outside the timed statement below.
xmls = [xt.to_str() for xt in map(corenlp_to_xmltree, rows)]

# Parse @ runtime using lxml: time only the string -> element parsing step.
# NOTE(review): as with any %time on `map`, this assumes an eager (Python 2) `map`.
%time roots = map(et.fromstring, xmls)

CPU times: user 433 µs, sys: 326 µs, total: 759 µs
Wall time: 1.21 ms


## Simple demo: Generating DDLib features

As a first simple demo, let's reproduce the features generated by [ddlib](http://deepdive.stanford.edu/doc/basics/gen_feats.html).

**_Note there are some noticeable differences stemming from using a dep tree representation at base; but a simple linear representation of the sentence could be used too, as in DDLib_**

First, let's load a few sample sentences and convert one of them to XML format for testing; we'll also load the feature templates library, and also tag some candidates (crudely for now) to play around with:

In [110]:
# Convert the second parsed row (index 1) to an xmltree, passing prune_root=True
# (presumably drops an artificial root node -- TODO confirm against corenlp_to_xmltree).
xt = corenlp_to_xmltree(rows[1], prune_root=True)
xt.render_tree()
# Keep a handle on the tree's root; the feature cells below operate on `root`.
root = xt.root

In [124]:
# Hand-tagged candidate mentions, given as lists of indices that are matched against the
# 'word_idx' attribute when the feature is applied below.
# presumably p1 is a phenotype span and g1 / g2 are gene mentions -- TODO confirm
p1 = [0,1,2,3,4]
g1 = [19]
g2 = [21]
# Feature template: 1-grams over the 'lemma' attribute of the nodes between the two
# mentions, filtered on 'pos' = 'VB'.
ft = Ngrams(Filter(Between(Mention(0), Mention(1)), 'pos', 'VB'), 'lemma', 1)
# print(...) with a single argument prints the same text on Python 2 and also parses on
# Python 3, unlike the bare print statement.
print(ft)
ft.print_apply(root, [p1, g2], 'word_idx')

<Ngrams:lemma:FILTER-BY(pos=VB):BETWEEN-MENTION-and-MENTION, xpath="//*[{0}][1]/ancestor-or-self::*[count(. | //*[{1}][1]/ancestor-or-self::*) = count(//*[{1}][1]/ancestor-or-self::*)][1]/descendant-or-self::*[(count(.//*[{0}]) = count(//*[{0}])) or (count(.//*[{1}]) = count(//*[{1}]))][starts-with(@pos, 'VB')]">
LEMMA:FILTER-BY(pos=VB):BETWEEN-MENTION-and-MENTION[cause]


In [125]:
# Print every relation feature produced for the (p1, g2) candidate pair on this tree.
# print(...) with a single argument prints the same text on Python 2 and also parses on
# Python 3, unlike the bare print statement.
for feat in get_relation_features(root, p1, g2):
    print(feat)

WORD:BETWEEN-MENTION-and-MENTION[disorder_caused_mutations_PKD1]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_cause_mutation_pkd1]
POS:BETWEEN-MENTION-and-MENTION[NN_VBN_NNS_NN]
NER:BETWEEN-MENTION-and-MENTION[O_O_O_O]
WORD:BETWEEN-MENTION-and-MENTION[disorder_caused_mutations]
WORD:BETWEEN-MENTION-and-MENTION[caused_mutations_PKD1]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_cause_mutation]
LEMMA:BETWEEN-MENTION-and-MENTION[cause_mutation_pkd1]
POS:BETWEEN-MENTION-and-MENTION[NN_VBN_NNS]
POS:BETWEEN-MENTION-and-MENTION[VBN_NNS_NN]
NER:BETWEEN-MENTION-and-MENTION[O_O_O]
NER:BETWEEN-MENTION-and-MENTION[O_O_O]
LEMMA:FILTER-BY(pos=VB):BETWEEN-MENTION-and-MENTION[cause]
WORD:RIGHT-OF-MENTION[is]
WORD:RIGHT-OF-MENTION[is_the]
WORD:RIGHT-OF-MENTION[is_the_common]
LEMMA:RIGHT-OF-MENTION[be]
LEMMA:RIGHT-OF-MENTION[be_the]
LEMMA:RIGHT-OF-MENTION[be_the_common]
POS:RIGHT-OF-MENTION[VBZ]
POS:RIGHT-OF-MENTION[VBZ_DT]
POS:RIGHT-OF-MENTION[VBZ_DT_JJ]
NER:RIGHT-OF-MENTION[O]
NER:RIGHT-OF-MENTION[O_O]
NE

### Table example

In [126]:
# Some wishful thinking...
# A small hand-written HTML table (Gene / Variant / Phenotype columns) wrapped in a div,
# used as toy input for the TableTree cells that follow.
table_xml = """
<div class="table-wrapper">
    <h3>Causal genomic relationships</h3>
    <table>
        <tr><th>Gene</th><th>Variant</th><th>Phenotype</th></tr>
        <tr><td>ABC</td><td><i>AG34</i></td><td>Headaches during defecation</td></tr>
        <tr><td>BDF</td><td><i>CT2</i></td><td>Defecation during headaches</td></tr>
        <tr><td>XYG</td><td><i>AT456</i></td><td>Defecasomnia</td></tr>
    </table>
</div>
"""
# Import from the public IPython.display module rather than the internal
# IPython.core.display path; the two names imported are the same objects.
from IPython.display import display_html, HTML
# Render the markup as rich HTML output in the notebook.
display_html(HTML(table_xml))

Gene,Variant,Phenotype
ABC,AG34,Headaches during defecation
BDF,CT2,Defecation during headaches
XYG,AT456,Defecasomnia


In [None]:
# Build a TableTree from the HTML table markup above.
# NOTE(review): this cell has never been executed (In [None]); TableTree is presumably
# expected to come from treedlib -- confirm it exists before relying on this.
tt = TableTree(table_xml)

In [None]:
# Show the XML string form of the table tree (cell not yet executed).
tt.to_xml_str()

In [None]:
# Inspect the table tree's root attribute (cell not yet executed).
tt.root