In [1]:
from collections import defaultdict, Counter
from tqdm import tqdm
import gzip

In [2]:
class DotDict(dict):
    def __getattr__(self, name):
        return self[name]
    
def parse(block):
    lines = [line for line in block.split('\n') if line]
    parsed = [ DotDict() for _ in range(len(lines)) ]

    for i, line in enumerate(lines):
        index, token, lemma, dep, pos, tag, head, children = line.split('\t')

        parsed[i].update({
            'i': int(index),
            'text': token,
            'lemma': lemma,
            'dep': dep,
            'pos': pos,
            'tag': tag,
            'head': parsed[int(head)],
            'children': [parsed[int(ch)] for ch in children.split(',') if ch],
            'doc': parsed
        })
        
    return parsed

In [3]:
block = '''0	through	through	ROOT	IN	ADP	0	5,6
1	infected	infected	amod	JJ	ADJ	5	
2	blood	blood	nmod	NN	NOUN	5	3,4
3	or	or	cc	CC	CCONJ	2	
4	blood	blood	conj	NN	NOUN	2	
5	products	product	pobj	NNS	NOUN	0	1,2
6	.	.	punct	.	PUNCT	0	'''

entry = parse(block)

In [4]:
PREPOSITIONS = ['about', 'across', 'against', 'along', 'among', 'around', 'as', 'at',
                'beside', 'besides', 'between', 'by', 'down', 'during', 
                'except', 'for', 'from', 'in', 'inside', 'into', 'of', 'off', 
                'on', 'onto', 'outside', 'over', 'through', 'to', 'toward', 'towards', 
                'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']
# not inlcuded: above / behind / beneath /beyond / below ... 

WH_WORDS = ['how', 'who', 'what', 'when', 'why', 'where', 'which', 'whether', 'whichever', 'whoever', 'whomever', 'whatever', 'wherever', 'whenever']
RESERVED_WORDS = ['someway, together', 'that']

POS = {
    'VERB': ['VB', 'VBD', 'VBG', 'VBP', 'VBZ'], # 'VBN'
    'NOUN': ['NN', 'NNP', 'NNS', 'NNPS', 'DT', 'PRP', 'CD'],
    'ADJ':   ['JJ', 'JJR', 'JJS'],
    'ADV':   ['RB', 'RBR', 'RBS'],
    'PREP':  ['IN'],
    'WH':    ['WDT', 'WP', 'WP$', 'WRB']
}

DEP = {
    'SUB':  ['nsubj', 'nsubjpass', 'oprd'],
    'OBJ':  ['dobj', 'pobj'],
    'CL':   ['ccomp', 'xcomp', 'acomp', 'pcomp', 'csubj', 'csubjpass'],
    'PREP': ['prep', 'prt']
}

In [5]:
def classify_cl(token):
    if token.children:
        first_child = token.children[0]
        if first_child.tag in POS['WH']: return 'wh-cl'
        if first_child.tag == 'TO':      return 'to-v'
    return 'cl'
    
    
def head_mapping(token):
    if token.lemma == 'be':       return 'be'
    if token.tag == 'VBN':        return 'V-ed'
    if token.tag == 'VBG':        return 'V-ing'
    if token.tag in POS['VERB']:  return 'V'
    
    return None
    
    
def dep_mapping(token):
    # 順序 matters
    if token.dep in DEP['CL']:         return classify_cl(token)    
    
    if token.dep   == 'aux' and token['lemma'] == 'have': return 'have'
    if token.lemma == 'be':            return 'be'
    
    if token.tag == 'VBN':             return 'v-ed'
    if token.tag == 'VBG':             return 'v-ing'
    if token.tag in POS['VERB']:       return 'v'

    if token.dep in DEP['SUB']:        return 'S'
    if token.dep in DEP['OBJ']:        return 'O'
    if token.dep in DEP['PREP']:       return token.text
    if token.tag == 'TO':              return 'to'
    
    return None

In [6]:
####### Dependency pattern #######
# 第一層 dependency: dobj, prep, nsubj, nsubjpass, ccomp, xcomp, csubj, csubjpass, prt, acomp, oprd
# 第二層 dependency: prep -> pobj, pcomp

FIRST_REMAINS = ['aux', 'auxpass', 'dobj', 'prep', 'nsubj', 'nsubjpass', 'ccomp', 'xcomp', 'csubj', 'csubjpass', 'prt' 'acomp', 'oprd']
SECOND_REMAINS = ['pobj', 'pcomp']
go_deeper = ['prep']


def keep_children(children, rules):
    return [child for child in children if child.dep in rules]


def flattern(list_2d):
    return [el for li in list_2d for el in li]


# 先抓 Verb
def get_verb_dep(parsed, index):
    target = parsed[index]
    
    first_layer  = keep_children(target.children, FIRST_REMAINS)
    second_layer = flattern([keep_children(tk.children, SECOND_REMAINS) 
                             for tk in first_layer if tk.dep in go_deeper])
   
    tokens = [target] + first_layer + second_layer
    tokens.sort(key=lambda tk: tk.i)
    
    ptns = [head_mapping(tk) if tk.i == index else dep_mapping(tk) for tk in tokens]

    ptn = ' '.join([p for p in ptns if p]) # ignore None
    ngram = ' '.join([tk.text for tk in tokens])

    return ptn, ngram

In [10]:
%%time
patterns = defaultdict(lambda: defaultdict(Counter))
ngrams = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: [])))
sents = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: [])))
    
def retrieve_dep(entry):
    sent = ' '.join([tk.text for tk in entry])
    for token in entry:
        if token.pos in POS['VERB']: # or tag == VERB
            ptn, ngram = get_verb_dep(entry, token.i)
            
            patterns[token.lemma][token.dep][ptn] += 1
            ngrams[token.lemma][token.dep][ptn].append(ngram)
            sents[token.lemma][token.dep][ptn].append(sent)
            
        elif token.pos in POS['ADJ']:
            pass


if __name__ == '__main__':
    fs = gzip.open('/force/corpus/COCA/coca.deps.spacy.tagged.gz', 'rt', encoding='utf8')
    contents = fs.read().split('\n\n')
    
    for entry in tqdm(contents):
        retrieve_dep(parse(entry))

100%|██████████| 35136058/35136058 [1:23:35<00:00, 7005.61it/s]

CPU times: user 1h 21min 52s, sys: 1min 8s, total: 1h 23min 1s
Wall time: 1h 25min 56s





In [9]:
import json
with open('static/data/bnc.json', 'w', encoding='utf8') as ws:
    json.dump({ 'patterns': patterns, 'ngrams': ngrams, 'sents': sents }, ws)

In [16]:
patterns

defaultdict(<function __main__.<lambda>()>,
            {'inevitable': defaultdict(collections.Counter,
                         {'ROOT': Counter({'S O': 1})}),
             'fixit': defaultdict(collections.Counter,
                         {'acomp': Counter({'': 1})}),
             '-thromboglobulin': defaultdict(collections.Counter,
                         {'conj': Counter({'': 1}), 'nmod': Counter({'': 6})}),
             'dance': defaultdict(collections.Counter,
                         {'ROOT': Counter({'': 13,
                                   'Above O S O': 2,
                                   'After O S O on O for O': 1,
                                   'After cl S O': 1,
                                   'At O O': 2,
                                   'At O O cl': 1,
                                   'At O S be O by O': 1,
                                   'Before O S in O': 1,
                                   'By cl S be': 1,
                                   'For 