In [14]:
from utils.before_after import *
from collections import Counter, defaultdict
from operator import itemgetter
import numpy as np
import spacy
# import utils.explacy # https://github.com/tylerneylon/explacy
# explacy.print_parse_info(nlp, line)

In [15]:
# Shared spaCy pipeline used to parse the sentences in the cells below
# (the trailing ('en') note is presumably an alternative model name -- confirm).
nlp = spacy.load('en_core_web_lg') # ('en')

## Parse BNC 所有句子，再取出有 discuss
* read all lines
* strip() and lower()
* keep only the sentences that contain the target word
* **Use list() to materialize the lazy filter result so it can be reused**

In [16]:
# Lemma of the verb under study: used to pre-filter the raw corpus lines and to
# select the head tokens whose dependency patterns are extracted.
TARGET_WORD = 'discuss'

In [17]:
# bnc_remains = filter(lambda line: any([tk.lemma_ == TARGET_WORD for tk in line]), bnc_all)

In [18]:
def clean_data(file, reserved=None):
    """Read a UTF-8 text file and return its lines stripped and lower-cased.

    Args:
        file: Path of the text file to read, processed line by line.
        reserved: Optional substring. When truthy, only lines that contain it
            are returned. The check runs on the already lower-cased line, so
            a value with upper-case characters can never match.

    Returns:
        A list of cleaned lines in file order (all lines when ``reserved`` is
        falsy).
    """
    # The context manager guarantees the handle is closed, even on error.
    # The pipeline stays lazy so the whole file is never held in memory, but
    # it must be materialized before the file is closed.
    with open(file, 'r', encoding='utf8') as handle:
        lines = (raw.strip().lower() for raw in handle)
        if reserved:
            lines = (line for line in lines if reserved in line)
        return list(lines)

# Pre-filter the BNC by plain substring match on the target word, then parse
# every surviving line with the spaCy pipeline.
bnc_temp = clean_data('../dataset/bnc.txt', TARGET_WORD)
bnc_remains = [nlp(sentence) for sentence in bnc_temp]

## 從 EF 中取得有 discuss 的句子，並且 parse

In [19]:
# ef_temp = clean_data('../dataset/efcamp/ef.diff.simplize.despace.txt', 'discuss')
# ef_remains = []
# for index, line in enumerate(ef_temp):
#     tokens = line.split(' ')
    
#     aft_sent = nlp(' '.join(to_after(tokens)))
#     bef_sent = nlp(' '.join(to_before(tokens)))
#     has_edit = any([tk[:2] in ['[-', '{+'] and TARGET_WORD in tokens[i-1] 
#                     for i, tk in enumerate(tokens)]) 
    
#     ef_remains.append({
#         'origin': line,
#         'bef_sent': bef_sent,
#         'aft_sent': aft_sent, 
#         'has_edit': has_edit
#     })

## 用 dependency 抓 pattern
#### 第一層 dependency
dobj, prep, nsubj, nsubjpass, ccomp, xcomp, csubj, csubjpass, prt, acomp, oprd

#### 第二層 dependency
prep -> pobj, pcomp


In [433]:
from utils.syntax import *

# Dependency-label groups that dep_mapping() collapses into pattern symbols.
SUB  = ['nsubj', 'nsubjpass', 'oprd']  # rendered as 'S'
OBJ  = ['dobj', 'pobj']  # rendered as 'O'
CL   = ['ccomp', 'xcomp', 'acomp', 'pcomp', 'csubj', 'csubjpass']  # handed to classify_cl()
PREP = ['prep', 'prt']  # rendered as the token's own text

def classify_cl(token):
    """Label a clausal dependent by the tag of its first child.

    Returns 'wh-cl' when the first child's tag is in WH, 'to-v' when it is
    'TO', and 'cl' otherwise (including when the token has no children).
    """
    missing = object()
    first_child = next(iter(token.children), missing)
    if first_child is missing:
        return 'cl'

    if first_child.tag_ in WH:
        return 'wh-cl'
    if first_child.tag_ == 'TO':
        return 'to-v'
    return 'cl'
    
def dep_mapping(token):
    """Map a token to its pattern symbol, or None when it contributes nothing.

    Symbols: 'V' / 'V-ed' / 'V-ing' for verb forms of TARGET_WORD, the result
    of classify_cl() for clausal dependents, 'have' / 'be' for those
    auxiliaries, 'S' for subjects, 'v' / 'v-ed' / 'v-ing' for other verbs,
    'O' for objects, the token's own text for prepositions/particles, and
    'to' for a TO-tagged token.
    """
    tag, dep, lemma = token.tag_, token.dep_, token.lemma_

    def verb_symbol(base):
        # Participle tags are checked before the generic verb-tag test.
        if tag == 'VBN':
            return base + '-ed'
        if tag == 'VBG':
            return base + '-ing'
        if tag in VERBS:
            return base
        return None

    # The target verb itself gets the upper-case symbol.
    if lemma == TARGET_WORD:
        symbol = verb_symbol('V')
        if symbol is not None:
            return symbol

    # The order of the checks below matters: earlier rules win.
    if dep in CL:
        return classify_cl(token)

    if dep == 'aux' and lemma == 'have':
        return 'have'
    if lemma == 'be':
        return 'be'

    if dep in SUB:
        return 'S'

    symbol = verb_symbol('v')
    if symbol is not None:
        return symbol

    if dep in OBJ:
        return 'O'
    if dep in PREP:
        return token.text
    if tag == 'TO':
        return 'to'

    return None
#     return token.text


In [434]:
# Dependency labels kept among the head verb's direct children.
# Every entry needs its own comma: adjacent string literals are silently
# concatenated, which would fuse 'prt' and 'acomp' into 'prtacomp' and drop
# both labels from the filter.
FIRST_REMAINS = [
    'aux', 'auxpass', 'dobj', 'prep', 'nsubj', 'nsubjpass', 'ccomp', 'xcomp',
    'csubj', 'csubjpass', 'prt', 'acomp', 'oprd',
]
# Dependency labels kept among the children of a kept `go_deeper` token.
SECOND_REMAINS = ['pobj', 'pcomp']
# Labels of kept tokens whose own children are searched for SECOND_REMAINS.
go_deeper = ['prep']

def keep_children(tk, remains):
    """Return the children of ``tk`` whose ``dep_`` label is in ``remains``.

    The children keep the order in which ``tk.children`` yields them.
    """
    kept = []
    for child in tk.children:
        if child.dep_ in remains:
            kept.append(child)
    return kept

def flattern(list_2d):
    """Concatenate the inner iterables of ``list_2d`` into one flat list.

    Only one level of nesting is removed.
    """
    flat = []
    for inner in list_2d:
        flat.extend(inner)
    return flat

def dep_to_pattern(head_word):
    """Build the pattern string and matching n-gram around ``head_word``.

    Collects the head, its children whose label is in FIRST_REMAINS, and, for
    collected tokens whose label is in ``go_deeper``, their children whose
    label is in SECOND_REMAINS. Tokens are ordered by their index ``i``.

    Returns:
        A ``(ptn, ngram)`` tuple: ``ptn`` joins the truthy dep_mapping()
        symbols with spaces, ``ngram`` joins the token texts with spaces.
    """
    first_level = [head_word] + keep_children(head_word, FIRST_REMAINS)
    second_level = flattern([
        keep_children(tk, SECOND_REMAINS)
        for tk in first_level
        if tk.dep_ in go_deeper
    ])
    ordered = sorted(first_level + second_level, key=lambda tk: tk.i)

    symbols = [dep_mapping(tk) for tk in ordered]
    ptn = ' '.join(symbol for symbol in symbols if symbol)
    ngram = ' '.join(tk.text for tk in ordered)

    return ptn, ngram

In [436]:
# headword, patterns, ngrams
# Both tables are indexed as table[lemma][pattern] -> list of strings.
patterns = defaultdict(lambda: defaultdict(list))  # matched n-grams
sents = defaultdict(lambda: defaultdict(list))     # full sentence text, for debugging

for line in bnc_remains:
    # Skip parsed sentences whose len() exceeds 25.
    if len(line) > 25:
        continue

    for tk in line:
        # Only verb occurrences of the target lemma act as pattern heads.
        if tk.lemma_ != TARGET_WORD or tk.tag_ not in VERBS:
            continue

        ptn, ngram = dep_to_pattern(tk)
        patterns[tk.lemma_][ptn].append(ngram)
        sents[tk.lemma_][ptn].append(tk.doc.text)

In [437]:
# list(filter(lambda pair: pair[1] >= 20, sort_dict(patterns)))
# Index by TARGET_WORD instead of a hard-coded lemma so this cell keeps showing
# the collected patterns when the target word is changed.
sort_dict(patterns[TARGET_WORD])
# sort_dict(get_high_freq(patterns['conj']))

[('S S v V O', ['you you did discuss libraries']),
 ('S S be V-ing O of cl', ["you you 're discussing this of putting"]),
 ('S V O with O',
  ['you should discuss problem with us',
   'who will discuss needs with you',
   'i discussed matter with wife',
   'watkins discusses points with local',
   'watkins discusses points with local',
   'who discussed matter with ministers',
   'i will discuss future with party',
   'i discussed this with you',
   'i discussed beliefs with professor',
   "i 'll discuss it with you",
   'you discuss them with her',
   'souness discusses theory with linesman',
   'therapist discusses plans with practitioner',
   'he discussed this with practitioner',
   'you should discuss this with doctor',
   'you should discuss pros with doctor',
   'we discussed it with mick',
   'husband would discuss things with wife',
   'you should discuss situation with someone',
   'she discussed her with anyone',
   'i discussed it with baron',
   'i could discuss patients w

In [438]:
# sents['conj']['V-ed']

In [None]:
# # answers = dict(list(filter(lambda pair: pair[1] >= 20, sort_dict(patterns))))
# def is_exist(token):
#     ptn = dep_to_pattern(token)
#     return ptn in get_high_freq(patterns[token.dep_])

# for line, answer in test_data:
#     for token in line:
#         if token.lemma_ == TARGET_WORD and token.tag_.startswith('V'):
#             if is_exist(token) == answer:
#                 pass
#             else:
#                 print(line)
#                 print(token.dep_, dep_to_pattern(token))
#                 print()
# # y_test = [is_exist(token) == answer for line, answer in test_data for token in line 
# #           if token.lemma_ == TARGET_WORD and token.tag_.startswith('V')]

# print(sum(y_test) / len(y_test))

In [439]:
# explacy.print_parse_info(nlp, '')