# TreeDLib

In [203]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We define three classes of operators:
* _NodeSets:_ $S : 2^T \to 2^T$
* _Indicators:_ $I : 2^T \to \{0,1\}^F$
* _Combinators:_ $C : \{0,1\}^F \times \{0,1\}^F \to \{0,1\}^F$

where $T$ is a given input tree, and $F$ is the dimension of the feature space.

## Simple demo: Generating DDLib features

As a first simple demo, let's reproduce the features generated by [ddlib](http://deepdive.stanford.edu/doc/basics/gen_feats.html).

**_Note that there are some noticeable differences, which stem from using a dependency-tree representation as the base; a simple linear representation of the sentence could be used instead, as in DDLib._**

First, let's load a few sample sentences and convert one of them to XML format for testing; we'll also load the feature templates library and (crudely, for now) tag some candidates to play around with:

In [275]:
from util import load_sentences, tag_candidate
from tree_structs import DepTree, TableTree
from feature_templates import *
from basic_features import *
# Parse every sample sentence into a dependency tree and pick the second one
# as the running example for the rest of the notebook.
dts = [DepTree(sentence) for sentence in load_sentences('test/test1.parsed.tsv')]
dt = dts[1]

# Tag the candidate mentions on the example tree: (mention words, candidate id).
for words, cid in [
    (['Autosomal', 'dominant', 'polycystic', 'kidney', 'disease'], 'P1'),
    (['PKD1'], 'G1'),
    (['PKD2'], 'G2'),
]:
    tag_candidate(dt.root, words, cid)

dt.render_tree()
#for i in range(len(dts)):
#    dts[i].render_tree()

In [None]:
# TODO:
# - XML -> JSON / visualization; be able to init Tree from XML!
# - Clean up this notebook / code!

def new_root_cid(root, cid):
    """Re-root the tree, in place, at the first node tagged with candidate id `cid`.

    The first element (in document order) whose `cid` attribute equals `cid`
    becomes the new root. It keeps all of its existing descendants; each former
    ancestor is then re-attached as a child of the node that used to be its
    child on the path to the candidate, so every ancestor brings along all of
    its other children, minus the one that is now its parent.

    Args:
        root: an lxml element of the tree to search (the XPath query is
            absolute, so the whole document containing `root` is searched).
        cid: candidate id to look for, e.g. 'P1'.

    Returns:
        The candidate element, now the root of the restructured tree.

    Raises:
        IndexError: if no element carries the given `cid`.

    NOTE(review): attributes such as `dep_label` / `dep_parent` are left
    untouched, so on the inverted path they still describe the original edge
    direction -- confirm whether downstream features need them adjusted.
    """
    # Pass cid as an XPath variable so quotes in the id cannot break the query.
    matches = root.xpath("//*[@cid=$cid]", cid=cid)
    if not matches:
        raise IndexError("no node with cid=%r found in tree" % (cid,))
    new_root = matches[0]

    # Path from the candidate up to the old root: [candidate, parent, ..., old root]
    chain = [new_root]
    chain.extend(new_root.iterancestors())
    links = list(zip(chain, chain[1:]))

    # First sever every child->parent link along the path; appending an
    # ancestor to one of its own descendants would otherwise create a cycle.
    for child, parent in links:
        parent.remove(child)

    # Then re-attach each former parent underneath its former child.
    for child, parent in links:
        child.append(parent)

    return new_root

In [None]:
root.xpath("//*[@cid='P1'][1]/..")

In [None]:
et.tostring(new_root_cid(root, 'P1')[0])

In [287]:
Mention(0)

<MENTION, xpath="//*[@cid='{0}']">

In [286]:
NGrams(Between(Mention(0), Mention(1)), 'lemma', 3).print_apply(root, ['P1', 'G1'])

LEMMA:BETWEEN-MENTION-and-MENTION[disorder]
LEMMA:BETWEEN-MENTION-and-MENTION[disease]
LEMMA:BETWEEN-MENTION-and-MENTION[cause]
LEMMA:BETWEEN-MENTION-and-MENTION[mutation]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_disease]
LEMMA:BETWEEN-MENTION-and-MENTION[disease_cause]
LEMMA:BETWEEN-MENTION-and-MENTION[cause_mutation]
LEMMA:BETWEEN-MENTION-and-MENTION[disorder_disease_cause]
LEMMA:BETWEEN-MENTION-and-MENTION[disease_cause_mutation]


In [268]:
load_sentences('test/test1.parsed.tsv')[1].text

'Autosomal dominant polycystic kidney disease is the most common human monogenic disorder and is caused by mutations in the PKD1 or PKD2 genes.'

In [271]:
def flat_tree(root):
    """Serialize a tree of elements into a compact one-line string.

    Each node is rendered as its `word` attribute, prefixed with
    `[<dep_label>]> ` when a `dep_label` attribute is present. If the node has
    children, their renderings follow in parentheses, separated by commas.
    """
    label = root.get('dep_label')
    word = root.get('word')
    text = word if label is None else '[%s]> %s' % (label, word)

    children = list(root)
    if children:
        text += ' ( %s )' % ', '.join(map(flat_tree, children))
    return text

In [272]:
ft = flat_tree(dt.root)
ft

'disorder ( [nsubj]> disease ( [amod]> Autosomal, [amod]> dominant, [amod]> polycystic, [nn]> kidney ), [cop]> is, [det]> the, [amod]> common ( [advmod]> most ), [amod]> human, [amod]> monogenic, [conj_and]> caused ( [auxpass]> is, [agent]> mutations ( [prep_in]> PKD1 ( [dep]> the, [conj_or]> PKD2, [dep]> genes ) ) ) )'

In [255]:
re.findall(r'disorder.*?caused.*?mutations', ft)

['disorder (  -> disease (  -> Autosomal -> dominant -> polycystic -> kidney )  -> is -> the -> common (  -> most )  -> human -> monogenic -> caused (  -> is -> mutations']

In [234]:
xml = dt.to_xml_str()

In [235]:
xml

'<node dep_parent="0" id="12" lemma="disorder" ner="O" pos="NN" word="disorder" word_idx="80"><node dep_label="nsubj" dep_parent="12" id="5" lemma="disease" ner="O" pos="NN" word="disease" word_idx="37" cid="P1"><node dep_label="amod" dep_parent="5" id="1" lemma="autosomal" ner="O" pos="JJ" word="Autosomal" word_idx="0" cid="P1"/><node dep_label="amod" dep_parent="5" id="2" lemma="dominant" ner="O" pos="JJ" word="dominant" word_idx="10" cid="P1"/><node dep_label="amod" dep_parent="5" id="3" lemma="polycystic" ner="O" pos="JJ" word="polycystic" word_idx="19" cid="P1"/><node dep_label="nn" dep_parent="5" id="4" lemma="kidney" ner="O" pos="NN" word="kidney" word_idx="30" cid="P1"/></node><node dep_label="cop" dep_parent="12" id="6" lemma="be" ner="O" pos="VBZ" word="is" word_idx="45"/><node dep_label="det" dep_parent="12" id="7" lemma="the" ner="O" pos="DT" word="the" word_idx="48"/><node dep_label="amod" dep_parent="12" id="9" lemma="common" ner="O" pos="JJ" word="common" word_idx="57"><

In [237]:
re.findall(r'<node[^>]*cid="G2"[^>]*/>', xml)

['<node dep_label="conj_or" dep_parent="20" id="22" lemma="pkd2" ner="O" pos="NN" word="PKD2" word_idx="131" cid="G2"/>']

In [238]:
def node(attribs):
    """Build a regex that matches a single XML tag carrying all given attributes.

    Args:
        attribs: dict mapping attribute name -> attribute value. Names and
            values are interpolated into the pattern verbatim (not escaped),
            so they may themselves contain regex syntax.

    Returns:
        A regex pattern string matching one `<...>` tag in which every
        `name="value"` pair occurs, in any order and with any other
        attributes in between.
    """
    # One lookahead per attribute keeps the match independent of both the dict
    # iteration order and the attribute order in the XML; `[^>]*` confines each
    # lookahead to the current tag. (The previous `\s[^>]*\s` separator needed
    # two whitespace characters, so adjacent attributes could never match.)
    lookaheads = ''.join(r'(?=[^>]*\s%s="%s")' % (k, v) for k, v in attribs.items())
    return r'<' + lookaheads + r'[^>]+>'

def child_of(attribs):
    """Placeholder: not implemented yet.

    The definition was left without a body, which is a syntax error and
    prevents the whole cell from running. Presumably this is meant to build a
    pattern for a tag that is a child of a tag with the given attributes --
    TODO confirm the intended behaviour and implement it.
    """
    raise NotImplementedError("child_of is not implemented yet")

In [241]:
re.search(node({'cid':'G2'}), xml)

<_sre.SRE_Match at 0x10ba0b718>

In [205]:
dt.root

<Element node at 0x10ba224b0>

In [206]:
Indicator(Between(Mention(0), Mention(1)), 'lemma').print_apply(dt.root, ['P1', 'G1'])

LEMMA:BETWEEN-MENTION-and-MENTION[disorder_disease_cause_mutation]


In [207]:
# Print each generic feature for mention 'G2', using 'monogenic' as the keyword list.
mention_feats = get_generic_mention_features(dt.root, 'G2', ['monogenic'])
for feat in mention_feats:
    print(feat)

WORD:MENTION[PKD2]
LEMMA:MENTION[pkd2]
POS:MENTION[NN]
NER:MENTION[O]
WORD:MENTION[RGX:STARTS_W_CAPITAL=True]
WORD:RIGHT-OF-MENTION[genes]
WORD:LEFT-OF-MENTION[the]
WORD:RIGHT-OF-MENTION[genes]+WORD:LEFT-OF-MENTION[the]
LEMMA:RIGHT-OF-MENTION[gene]
LEMMA:LEFT-OF-MENTION[the]
LEMMA:RIGHT-OF-MENTION[gene]+LEMMA:LEFT-OF-MENTION[the]
POS:RIGHT-OF-MENTION[NNS]
POS:LEFT-OF-MENTION[DT]
POS:RIGHT-OF-MENTION[NNS]+POS:LEFT-OF-MENTION[DT]
NER:RIGHT-OF-MENTION[O]
NER:LEFT-OF-MENTION[O]
NER:RIGHT-OF-MENTION[O]+NER:LEFT-OF-MENTION[O]
WORD:KEYWORD[monogenic]
DEP_LABEL:BETWEEN-MENTION-and-KEYWORD[conj_and_agent_prep_in]
LEMMA:BETWEEN-MENTION-and-KEYWORD[disorder_cause_mutation_pkd1]
DEP_LABEL|LEMMA:BETWEEN-MENTION-and-KEYWORD[None|disorder_conj_and|cause_agent|mutation_prep_in|pkd1]


### Table example

In [231]:
# Some wishful thinking...
# Use the public IPython.display API rather than the internal IPython.core.display path.
from IPython.display import display_html, HTML

# Small hand-written HTML table used as input for the TableTree example.
table_xml = """
<div class="table-wrapper">
    <h3>Causal genomic relationships</h3>
    <table>
        <tr><th>Gene</th><th>Variant</th><th>Phenotype</th></tr>
        <tr><td>ABC</td><td><i>AG34</i></td><td>Headaches during defecation</td></tr>
        <tr><td>BDF</td><td><i>CT2</i></td><td>Defecation during headaches</td></tr>
        <tr><td>XYG</td><td><i>AT456</i></td><td>Defecasomnia</td></tr>
    </table>
</div>
"""

# Render the table inline so the raw markup can be checked visually.
display_html(HTML(table_xml))

Gene,Variant,Phenotype
ABC,AG34,Headaches during defecation
BDF,CT2,Defecation during headaches
XYG,AT456,Defecasomnia


In [232]:
tt = TableTree(table_xml)

Causal genomic relationships


KeyboardInterrupt: 

In [224]:
tt.to_xml_str()

'<div class="table-wrapper"><h3>Causal genomic relationships</h3><table><tr><th>Gene</th><th>Variant</th><th>Phenotype</th></tr><tr><td>ABC</td><td><i>AG34</i></td><td>Headaches during defecation</td></tr><tr><td>BDF</td><td><i>CT2</i></td><td>Defecation during headaches</td></tr><tr><td>XYG</td><td><i>AT456</i></td><td>Defecasomnia</td></tr></table></div>'

In [233]:
tt.root

<Element div at 0x10ba22fa0>