# TreeDLib

In [102]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# project modules are picked up without restarting the kernel.
# Re-running this cell when the extension is already loaded only prints a notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [103]:
from feature_templates import *
from basic_features import *
from treedlib_util import PTSVParser
import lxml.etree as et
from treedlib_structs import corenlp_to_xmltree, XMLTree

We define three classes of operators:
* _NodeSets:_ $S : 2^T \to 2^T$
* _Indicators:_ $I : 2^T \to \{0,1\}^F$
* _Combinators:_ $C : \{0,1\}^F \times \{0,1\}^F \to \{0,1\}^F$

where $T$ is a given input tree (so $2^T$ denotes the set of all subsets of its nodes), and $F$ is the dimension of the feature space.

In [104]:
# Basic IO
# Each pair is (column name, type string) handed to PTSVParser.
# NOTE(review): presumably the order mirrors the column order of the TSV file -- confirm.
parser = PTSVParser([
        ('doc_id', 'text'),
        ('sent_id', 'int'),
        ('text', 'text'),
        ('words', 'text[]'),
        ('lemmas', 'text[]'),
        ('poses', 'text[]'),
        ('ners', 'text[]'),
        ('char_idxs', 'int[]'),
        ('dep_paths', 'text[]'),
        ('dep_parents', 'int[]')
    ])
# Read inside a context manager so the file handle is closed as soon as parsing
# finishes, and build an explicit list so `rows[i]` indexing does not depend on
# map() returning a list.
with open('test/test1.parsed.tsv', 'rb') as tsv_file:
    rows = [parser.parse_line(line) for line in tsv_file]

In [105]:
# Inspect the first parsed row to sanity-check the column parsing
rows[0]

<Row(dep_paths=['nn', 'nsubj', 'cop', 'det', 'amod', 'amod', 'amod', 'nn', None, 'vmod', None, 'agent', None, 'det', 'nn', 'nn', 'prep_of', None], char_idxs=[0, 11, 20, 23, 25, 30, 38, 46, 63, 71, 78, 81, 91, 94, 98, 103, 110, 114], text=Hyper-IgM1 syndrome is a rare genetic primary immunodeficiency disease caused by mutations of the CD40 ligand gene., ners=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], sent_id=1, words=['Hyper-IgM1', 'syndrome', 'is', 'a', 'rare', 'genetic', 'primary', 'immunodeficiency', 'disease', 'caused', 'by', 'mutations', 'of', 'the', 'CD40', 'ligand', 'gene', '.'], lemmas=['hyper-igm1', 'syndrome', 'be', 'a', 'rare', 'genetic', 'primary', 'immunodeficiency', 'disease', 'cause', 'by', 'mutation', 'of', 'the', 'cd40', 'ligand', 'gene', '.'], dep_parents=[2, 9, 9, 9, 9, 9, 9, 9, 0, 9, 0, 10, 0, 17, 17, 17, 12, 0], poses=['NN', 'NN', 'VBZ', 'DT', 'JJ', 'JJ', 'JJ', 'NN', 'NN', 'VBN', 'IN', 'NNS', 'IN', 'DT', 'NN', 'NN', '

### Testing XML speeds

Which of the following is faster?
* Convert to XML ahead of time with this Python code, store the result as a string, and parse that string at runtime
* Convert to XML at runtime directly with this Python code

In [106]:
# Map sentence to xmltree
# Times the conversion of every parsed row with corenlp_to_xmltree.
# NOTE(review): the measurement relies on map() being eager (this notebook uses
# Python 2 print statements); with a lazy map() nothing would be timed.
%time xts = map(corenlp_to_xmltree, rows)

CPU times: user 8.92 ms, sys: 2.56 ms, total: 11.5 ms
Wall time: 9.56 ms


In [107]:
# Pre-process to xml string
# Converts every row again with corenlp_to_xmltree and serializes each tree via to_str();
# this step is deliberately outside the timed statement below.
xmls = [xt.to_str() for xt in map(corenlp_to_xmltree, rows)]

# Parse @ runtime using lxml
# Only the string -> lxml parsing (et.fromstring) is timed here.
%time roots = map(et.fromstring, xmls)

CPU times: user 433 µs, sys: 326 µs, total: 759 µs
Wall time: 1.21 ms


## Simple demo: Generating DDLib features

As a first simple demo, let's reproduce the features generated by [ddlib](http://deepdive.stanford.edu/doc/basics/gen_feats.html).

**_Note that there are some noticeable differences, which stem from using a dependency-tree representation as the base; a simple linear representation of the sentence could be used instead, as in DDLib._**

First, let's load a few sample sentences and convert one of them to XML format for testing; we'll also load the feature templates library and tag some candidates (crudely for now) to play around with:

In [110]:
# Convert the second parsed row (rows[1]) to an XML tree with prune_root=True,
# render it, and keep its `root` attribute for the feature demos in later cells.
xt = corenlp_to_xmltree(rows[1], prune_root=True)
xt.render_tree()
root = xt.root

In [124]:
# Candidate mentions, each given as a list of indices; they are matched against
# the 'word_idx' attribute in the print_apply call below.
p1 = [0, 1, 2, 3, 4]
g1 = [19]
g2 = [21]

# Build the feature template one operator at a time:
#   1. the node set between the first and second mention,
#   2. restricted to nodes whose 'pos' matches 'VB',
#   3. turned into 'lemma' n-gram indicators (the trailing 1 is presumably the n-gram length -- confirm).
between_mentions = Between(Mention(0), Mention(1))
verbs_between = Filter(between_mentions, 'pos', 'VB')
ft = Ngrams(verbs_between, 'lemma', 1)

# Show the template's repr, then the features it yields for the (p1, g2) pair
print(ft)
ft.print_apply(root, [p1, g2], 'word_idx')

<Ngrams:lemma:FILTER-BY(pos=VB):BETWEEN-MENTION-and-MENTION, xpath="//*[{0}][1]/ancestor-or-self::*[count(. | //*[{1}][1]/ancestor-or-self::*) = count(//*[{1}][1]/ancestor-or-self::*)][1]/descendant-or-self::*[(count(.//*[{0}]) = count(//*[{0}])) or (count(.//*[{1}]) = count(//*[{1}]))][starts-with(@pos, 'VB')]">
LEMMA:FILTER-BY(pos=VB):BETWEEN-MENTION-and-MENTION[cause]


In [125]:
# Print every relation feature produced for the candidate pair (p1, g2), one per line
relation_features = get_relation_features(root, p1, g2)
for feature in relation_features:
    print(feature)

WORD:BETWEEN-MENTION-and-MENTION[disorder_caused_mutations_PKD1]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_cause_mutation_pkd1]
POS:BETWEEN-MENTION-and-MENTION[NN_VBN_NNS_NN]
NER:BETWEEN-MENTION-and-MENTION[O_O_O_O]
WORD:BETWEEN-MENTION-and-MENTION[disorder_caused_mutations]
WORD:BETWEEN-MENTION-and-MENTION[caused_mutations_PKD1]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_cause_mutation]
LEMMA:BETWEEN-MENTION-and-MENTION[cause_mutation_pkd1]
POS:BETWEEN-MENTION-and-MENTION[NN_VBN_NNS]
POS:BETWEEN-MENTION-and-MENTION[VBN_NNS_NN]
NER:BETWEEN-MENTION-and-MENTION[O_O_O]
NER:BETWEEN-MENTION-and-MENTION[O_O_O]
LEMMA:FILTER-BY(pos=VB):BETWEEN-MENTION-and-MENTION[cause]
WORD:RIGHT-OF-MENTION[is]
WORD:RIGHT-OF-MENTION[is_the]
WORD:RIGHT-OF-MENTION[is_the_common]
LEMMA:RIGHT-OF-MENTION[be]
LEMMA:RIGHT-OF-MENTION[be_the]
LEMMA:RIGHT-OF-MENTION[be_the_common]
POS:RIGHT-OF-MENTION[VBZ]
POS:RIGHT-OF-MENTION[VBZ_DT]
POS:RIGHT-OF-MENTION[VBZ_DT_JJ]
NER:RIGHT-OF-MENTION[O]
NER:RIGHT-OF-MENTION[O_O]
NE

### Table example

In [126]:
# Some wishful thinking...
# Use the public IPython.display API rather than the internal IPython.core.display module.
from IPython.display import display_html, HTML

# Hand-written HTML table fragment; it is rendered below and handed to TableTree
# in the following cell.
table_xml = """
<div class="table-wrapper">
    <h3>Causal genomic relationships</h3>
    <table>
        <tr><th>Gene</th><th>Variant</th><th>Phenotype</th></tr>
        <tr><td>ABC</td><td><i>AG34</i></td><td>Headaches during defecation</td></tr>
        <tr><td>BDF</td><td><i>CT2</i></td><td>Defecation during headaches</td></tr>
        <tr><td>XYG</td><td><i>AT456</i></td><td>Defecasomnia</td></tr>
    </table>
</div>
"""
# Render the markup inline so the table can be checked visually
display_html(HTML(table_xml))

Gene,Variant,Phenotype
ABC,AG34,Headaches during defecation
BDF,CT2,Defecation during headaches
XYG,AT456,Defecasomnia


In [None]:
# NOTE(review): TableTree is not imported by name in the visible cells -- presumably it is
# expected to come from one of the star imports; this cell has no recorded execution. Confirm.
tt = TableTree(table_xml)

In [None]:
# Display the result of to_xml_str() (presumably the tree serialized as an XML string -- confirm)
tt.to_xml_str()

In [None]:
# Display the table tree's `root` attribute
tt.root