In [1]:
import sys
import collections
import subprocess

from lxml import etree

import laf
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
from etcbc.mql import MQL
# Create the LAF-Fabric processor object; the following cells load data
# through it (fabric.load) and pull the API names from it (fabric.localnames).
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.4.6
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: http://shebanq-doc.readthedocs.org/en/latest/texts/welcome.html



In [2]:
# Features requested from the data set, as two whitespace-separated name lists.
# NOTE(review): per the LAF-Fabric API the first list holds node features and
# the second edge features -- confirm against the API reference.
node_feature_names = '''
        oid otype monads
        g_word g_word_utf8 g_cons lex 
        typ code function rela det
        txt prs
        book chapter verse label
    '''
edge_feature_names = '''
        functional_parent
    '''

# Load specification handed to fabric.load: no XML ids, the features above,
# and the `prepare` data imported from etcbc.preprocess.
load_spec = {
    "xmlids": {"node": False, "edge": False},
    "features": (node_feature_names, edge_feature_names),
    "prepare": prepare,
}

# Load source 'etcbc4' for the task named 'mql', reporting progress at DETAIL level.
API = fabric.load('etcbc4', '--', 'mql', load_spec, verbose='DETAIL')

# fabric.localnames is a code template; executing it defines the API shorthand
# names in this namespace (the later cells use F and outfile, presumably from here).
exec(fabric.localnames.format(var='fabric'))

# MQL query front end built on the loaded API; used below as Q.mql(query).
Q = MQL(API)

  0.00s LOADING API: please wait ... 
  0.00s DETAIL: COMPILING m: UP TO DATE
  0.01s INFO: USING DATA COMPILED AT: 2014-07-23T09-31-37
  0.01s DETAIL: COMPILING a: UP TO DATE
  0.04s DETAIL: load main: G.node_anchor_min
  0.16s DETAIL: load main: G.node_anchor_max
  0.30s DETAIL: load main: G.node_sort
  0.44s DETAIL: load main: G.node_sort_inv
  1.33s DETAIL: load main: G.edges_from
  1.47s DETAIL: load main: G.edges_to
  1.62s DETAIL: load main: F.etcbc4_db_monads [node] 
  3.19s DETAIL: load main: F.etcbc4_db_oid [node] 
  5.01s DETAIL: load main: F.etcbc4_db_otype [node] 
  6.73s DETAIL: load main: F.etcbc4_ft_code [node] 
  6.82s DETAIL: load main: F.etcbc4_ft_det [node] 
  7.30s DETAIL: load main: F.etcbc4_ft_function [node] 
  7.67s DETAIL: load main: F.etcbc4_ft_g_cons [node] 
  8.08s DETAIL: load main: F.etcbc4_ft_g_word [node] 
  8.66s DETAIL: load main: F.etcbc4_ft_g_word_utf8 [node] 
  9.33s DETAIL: load main: F.etcbc4_ft_lex [node] 
  9.85s DETAIL: load main: F.etcbc4_ft_

### Do the queries

`yesh_query1` searches for all clause atoms that contain a word with lexeme JC/ or >JN/, together with the book, phrase and phrase atom in which that word occurs.
`yesh_query2` retrieves every phrase of every clause atom.

In [90]:
# MQL query 1: every word whose lexeme is JC/ or >JN/, matched inside its
# phrase_atom, phrase, clause_atom and book, so each result carries all five objects.
yesh_query1 = '''
select all objects where
[book
    [clause_atom
        [phrase
            [phrase_atom
                [word lex="JC/" or lex=">JN/"]
            ]
         ]   
    ]
]
'''
# MQL query 2: every phrase inside every clause_atom (no lexeme restriction);
# used below to build the clause atom -> phrases index.
yesh_query2 = '''
select all objects where
    [clause_atom
        [phrase]
    ]
'''

In [91]:
# Run both MQL queries (target words first, then all phrases per clause atom)
# and keep the returned result objects for the cells below.
sheaf1, sheaf2 = Q.mql(yesh_query1), Q.mql(yesh_query2)

### Create a dictionary
The dictionary contains the clause atoms (keys) and their phrases (values).

In [92]:
# Index: clause atom node -> list of its phrase nodes, in the order in which
# the query results are delivered.
ca_index = collections.defaultdict(list)
n_ph = 0  # total number of (clause atom, phrase) results seen
for result in sheaf2.results():
    # Each result nests exactly one clause atom holding exactly one phrase.
    ((ca, ((ph,),)),) = result
    ca_index[ca].append(ph)
    n_ph += 1
print('Clause atoms: {}'.format(len(ca_index)))
print('Phrases: {}'.format(n_ph))

Clause atoms: 89848
Phrases: 254405


### Create two dictionaries with phrase functions
These dictionaries contain, for every clause atom containing JC/ or >JN/, the phrase function (and phrase type) of each of its other phrases. They are kept separately for the phrases 'before JC/ or >JN/' and the phrases 'after JC/ or >JN/'.

In [93]:
# Per clause atom (keyed by clause atom node):
#   context_before / context_after -- for every phrase before / after the
#       target phrase: its function followed by its typ, as a flat list
#   target_phrase -- function of the phrase that holds the JC/ or >JN/ word
#   extra_info    -- (book feature of the book node, code feature of the clause atom)
# A clause atom that occurs in several results keeps the values of the last one.
context_before = {}
context_after = {}
extra_info = {}
target_phrase = collections.OrderedDict()

# Longest before/after lists seen; they fix the column counts of the CSV export.
max_before = 0
max_after = 0

for ((bk, ((ca, ((ph, ((pha, ((w,),)),)),)),)),) in sheaf1.results():
    siblings = ca_index[ca]
    preceding, following = [], []
    context_before[ca], context_after[ca] = preceding, following
    target_phrase[ca] = F.function.v(ph)
    extra_info[ca] = (F.book.v(bk), F.code.v(ca))

    target_seen = False
    for contextph in siblings:
        if contextph == ph:
            # The target phrase itself is not part of either context.
            target_seen = True
            continue
        bucket = following if target_seen else preceding
        # Two entries per context phrase: first its function, then its typ.
        bucket.append(F.function.v(contextph))
        bucket.append(F.typ.v(contextph))

    max_before = max(max_before, len(preceding))
    max_after = max(max_after, len(following))

print('MaxBefore={}\nMaxAfter={}'.format(max_before, max_after))    
    

[605116, 605117, 605118]
[606146, 606147]
[606523, 606524, 606525]
[607629, 607630, 607631]
[609322, 609323, 609324, 609325]
[609905, 609906, 609907, 609908]
[610104, 610105, 610106, 610107]
[610168, 610169, 610170, 610171]
[611032, 611033, 611034]
[611477, 611478, 611479, 611480, 611481]
[611694, 611695, 611696, 611697, 611698]
[611810, 611811, 611812, 611813, 611814]
[613649, 613650, 613651, 613652]
[613664, 613665]
[614218, 614219, 614220]
[614613, 614614, 614615]
[614753, 614754, 614755, 614756, 614757]
[614781, 614782, 614783, 614784]
[615073, 615074]
[615375, 615376, 615377]
[615924, 615925, 615926]
[615963, 615964, 615965, 615966, 615967]
[617386, 617387, 617388]
[617460, 617461, 617462, 617463, 617464]
[617474]
[618028, 618029]
[618038, 618039, 618040]
[618050, 618051, 618052, 618053]
[618100, 618101, 618102]
[618105, 618106, 618107, 618108]
[618147, 618148, 618149, 618150]
[618302, 618303, 618304, 618305]
[618386, 618387, 618388]
[618649, 618650]
[618728, 618729, 618730]
[6188

[724510, 724511, 724512]
[724513, 724514, 724515, 724516]
[725138, 725139, 725140, 725141]
[727568, 727569, 727570, 727571]
[727665, 727666]
[727811, 727812, 727813, 727814]
[727826, 727827, 727828]
[727914, 727915, 727916, 727917]
[727922, 727923, 727924, 727925]
[728075, 728076, 728077, 728078]
[728079, 728080, 728081]
[728426, 728427]
[728622, 728623]
[728624, 728625, 728626, 728627]
[728665, 728666, 728667]
[728831]
[728834]
[729338, 729339, 729340, 729341]
[729437, 729438, 729439, 729440]
[730323, 730324, 730325]
[730736, 730737, 730738, 730739]
[731023, 731024, 731025]
[731152]
[731314, 731315]
[731954, 731955, 731956]
[731959, 731960, 731961]
[732072, 732073, 732074]
[732739, 732740, 732741]
[734404, 734405]
[734575, 734576, 734577, 734578]
[734591, 734592, 734593, 734594]
[735103, 735104, 735105, 735106]
[736061, 736062, 736063, 736064]
[736066, 736067, 736068, 736069]
[736070, 736071, 736072]
[736138, 736139, 736140, 736141]
[736220, 736221, 736222]
[736226, 736227, 736228, 73

[795118, 795119]
[795124, 795125]
[795131, 795132, 795133]
[795140, 795141]
[795142, 795143]
[795383, 795384, 795385, 795386]
[795712, 795713, 795714]
[795818, 795819]
[796819, 796820, 796821]
[796973, 796974]
[797216, 797217, 797218]
[797394, 797395, 797396, 797397]
[797471, 797472, 797473]
[797479, 797480, 797481, 797482]
[797486, 797487]
[797530, 797531, 797532, 797533]
[797707, 797708, 797709]
[798688, 798689, 798690]
[799401, 799402, 799403]
[799405, 799406, 799407]
[799568, 799569]
[801333, 801334]
[801542, 801543, 801544]
[801614, 801615]
[801819, 801820, 801821]
[801835, 801836, 801837, 801838]
[802275, 802276, 802277]
[804418, 804419, 804420, 804421]
[805276, 805277, 805278, 805279, 805280]
[805276, 805277, 805278, 805279, 805280]
[805609, 805610, 805611, 805612]
[805983, 805984, 805985, 805986]
[805990, 805991, 805992]
[806237, 806238]
[806239, 806240, 806241]
[806242, 806243, 806244, 806245]
[806272, 806273, 806274, 806275]
[806398, 806399, 806400, 806401]
[806488, 806489, 8

In [94]:
# Export one CSV row per clause atom that contains a target phrase:
#   book, clause atom code, function of the target phrase,
#   then the before-context padded with '' to max_before columns,
#   then the after-context padded with '' to max_after columns.
# Fields are joined with plain commas and no quoting.
# NOTE(review): this assumes no feature value contains a comma or is None -- confirm.
# `outfile` is presumably one of the names injected by the
# exec(fabric.localnames...) cell; it is used here as a factory for a writable handle.
csvh = outfile('jsphrases.csv')
try:
    for ca in target_phrase:
        cbefore = context_before[ca]
        cafter = context_after[ca]
        book_name, ca_code = extra_info[ca]
        row = [book_name, ca_code, target_phrase[ca]]
        # Pad to a fixed width so every row has the same number of columns.
        row.extend(cbefore[i] if i < len(cbefore) else '' for i in range(max_before))
        row.extend(cafter[i] if i < len(cafter) else '' for i in range(max_after))
        csvh.write('{}\n'.format(','.join(row)))
finally:
    # Close the handle even when building or writing a row raises, so a
    # failed run does not leave the file open.
    csvh.close()