In [1]:
from utils.before_after import *
from collections import defaultdict
import numpy as np
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg') # ('en')

## 用 dependency 抓 pattern
#### 第一層 dependency
dobj, prep, nsubj, nsubjpass, ccomp, xcomp, csubj, csubjpass, prt, acomp, oprd

#### 第二層 dependency
prep -> pobj, pcomp


In [12]:
from utils.syntax import *

SUB  = ['nsubj', 'nsubjpass', 'oprd']
OBJ  = ['dobj', 'pobj']
CL   = ['ccomp', 'xcomp', 'acomp', 'pcomp', 'csubj', 'csubjpass']
PREP = ['prep', 'prt']

def classify_cl(token):
    children = list(token.children)
    if children:
        if children[0].tag_ in WH: return 'wh-cl'
        if children[0].tag_ == 'TO': return 'to-v'
    return 'cl'
    
def head_mapping(token):
    if token.tag_ == 'VBN':      return 'V-ed'
    if token.tag_ == 'VBG':      return 'V-ing'
    if token.tag_ in VERBS:      return 'V'
    
def dep_mapping(token):
    # 順序 matters
    if token.dep_ in CL:         return classify_cl(token)    
    
    if token.dep_ == 'aux' and token.lemma_ == 'have': return 'have'
    if token.lemma_ == 'be':     return 'be'
    
    if token.dep_ in SUB:        return 'S'
    
    if token.tag_ == 'VBN':      return 'v-ed'
    if token.tag_ == 'VBG':      return 'v-ing'
    if token.tag_ in VERBS:      return 'v'
    
    if token.dep_ in OBJ:        return 'O'
    if token.dep_ in PREP:       return token.text
    if token.tag_ == 'TO':       return 'to'
    
    return None

In [13]:
FIRST_REMAINS = ['aux', 'auxpass', 'dobj', 'prep', 'nsubj', 'nsubjpass', 'ccomp', 'xcomp', 'csubj', 'csubjpass', 'prt' 'acomp', 'oprd']
SECOND_REMAINS = ['pobj', 'pcomp']
go_deeper = ['prep']

def keep_children(tk, rules):
    return [child for child in tk.children if child.dep_ in rules]

def flattern(list_2d):
    return [el for li in list_2d for el in li]

def dep_to_pattern(head_word):
    tokens = [head_word] + keep_children(head_word, FIRST_REMAINS)
    tokens += flattern([keep_children(tk, SECOND_REMAINS) for tk in tokens if tk.dep_ in go_deeper])
    
    tokens.sort(key=lambda tk: tk.i)
    
    ptns = [head_mapping(tk) if tk.i == head_word.i else dep_mapping(tk) for tk in tokens]
    
    ptn = ' '.join([p for p in ptns if p])
    ngram = ' '.join([tk.text for tk in tokens])

    return ptn, ngram

In [15]:
# headword, patterns, ngrams
patterns = defaultdict(lambda: defaultdict(lambda: []))
sents = defaultdict(lambda: defaultdict(lambda: [])) # for debug

def clean_data(file):
    lower_lines = map(lambda line: line.strip().lower(), open(file, 'r', encoding='utf8'))
    lines = filter(lambda line: len(line.split(' ')) <= 25, lower_lines)
    return lines

for line in clean_data('../dataset/test.txt'):
    line = nlp(line, disable=['ner'])
    for tk in line:
        # 先抓 headword 是 verb 的
        if tk.tag_ in VERBS: 
            ptn, ngram = dep_to_pattern(tk)
            patterns[tk.lemma_][ptn].append(ngram)
            sents[tk.lemma_][ptn].append(tk.doc.text)

In [18]:
patterns.keys()

dict_keys(['attend', 'keep', 'fund', 'sell', 'do', 'provide', 'have', 'ensure', 'organise', 'feel', 've', 'include', 'say', 'know', 'hesitate', 'help', 'support', 'involve', 'put', 'identify', 'find', 'write', 'represent', 'realise', 'complete', 'visit', 'get', 'hold', 'need', 'collect', 'like', 'report', 'receive', 'ask', 'pluck', 'make', 'respond', 'fight', 'come', 'be', 'call', 'plan', 'face', 'work', 'raise', 'affect', 'exist', 'discuss', 'become', 'serve', 'leave', 'encourage', 'happen', 'open', 'bake', 'go', 'believe', 'contact', 'give'])

In [22]:
patterns['be']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
            {'S S V': ['richard burrows are'],
             'S V': ['what is',
              'aids is',
              'aspects can be',
              'people are',
              'situation is',
              'money can be',
              'total is',
              'it is',
              'none is',
              'they are',
              'i was',
              'it can be',
              "it 's",
              'answer is',
              'this is',
              'leader will be',
              'i am',
              'team were'],
             'S V as': ['women are as'],
             'S V at O': ['we are at beginning'],
             'S V cl': ['it is are', 'you are be', 'it is represents'],
             'S V in O': ["it 's in cause"],
             'S V to-v': ['impact is come'],
             'V': ['is',
              'are',
              'is',
              'are',
              'be',
              'are',
              'are',
       

In [24]:
sents['be']['V']

['it is not transmitted from :',
 'often people are rejected by family and friends , leaving them to face this chronic condition .',
 'there is no vaccine or currently available .',
 '10 million people worldwide are infected with hiv .',
 'you can be infected for between 10-15 years without realising it .',
 '16 , 000 infections ( it is probable that there are between 40-60 , 000 people actually ) .',
 'there are nearly 5 , 000 cases of aids , of which nearly 3 , 000 have already died .',
 '1 in 500 londoners are believed to be infected .',
 '1 in 500 londoners are believed to be infected .',
 'there is no limit to the number of ways to raise money .',
 'many of our existing volunteers have families and jobs and are often very busy .',
 'you will be asked to complete an application and subsequently to attend an acet training course one evening a week for six weeks .',
 'you are also asked to keep your church leaders of your involvement so that they can ensure you are adequately support