In [2]:
from get_data import *
from pgrule import mapRest, mapRW
from collections import Counter, defaultdict
from pprint import pprint
from operator import itemgetter
from spacy.tokens import Doc
import nltk
import numpy as np
import pickle
import spacy
import explacy # https://github.com/tylerneylon/explacy
# explacy.print_parse_info(nlp, line)

In [3]:
nlp = spacy.load('en_core_web_lg') # ('en')

## Parse BNC 所有句子，再取出有 discuss
* read all lines
* strip() and lower()
* filter out target sentences
* ** Use list() to keep filter generator **

In [4]:
# 測試 load pickle
# # Load the data
# with open("../dataset/bnc_processed.pickle", "rb") as handle:
#     doc_bytes, vocab_bytes = pickle.load(handle)
    
# nlp.vocab.from_bytes(vocab_bytes)
# bnc_docs = [Doc(nlp.vocab).from_bytes(b) for b in doc_bytes]

In [5]:
TARGET_WORD = 'discuss'

In [6]:
# bnc_remains = filter(lambda line: any([tk.lemma_ == TARGET_WORD for tk in line]), bnc_all)

In [7]:
def clean_data(file, reserved=None):
    lines = map(lambda line: line.strip().lower(), open(file, 'r', encoding='utf8'))
    remains = filter(lambda line: reserved in line, lines) if reserved else lines
    return list(remains)

bnc_temp = clean_data('../dataset/bnc.txt', TARGET_WORD)
bnc_remains = list(map(lambda line: nlp(line), bnc_temp))

## 從 EF 中取得有 discuss 的句子，並且 parse

In [8]:
# fs = open('../dataset/efcamp/ef.diff.simplize.despace.txt', 'r', encoding='utf8')
# ef_all = []

ef_temp = clean_data('../dataset/efcamp/ef.diff.simplize.despace.txt', 'discuss')
ef_remains = []
for index, line in enumerate(ef_temp):
    tokens = line.split(' ')
    
    aft_sent = nlp(' '.join(to_after(tokens)))
    bef_sent = nlp(' '.join(to_before(tokens)))
    has_edit = any([tk[:2] in ['[-', '{+'] and TARGET_WORD in tokens[i-1] 
                    for i, tk in enumerate(tokens)]) 
    
    ef_remains.append({
        'origin': line,
        'bef_sent': bef_sent,
        'aft_sent': aft_sent, 
        'has_edit': has_edit
    })

## 產生測資

In [9]:
test_data = list(filter(lambda x: x['has_edit'], ef_remains))
test_data = [(el['bef_sent'], False) if i < 250 else (el['aft_sent'], True) for i, el in enumerate(test_data[:500])]

## Count bigram and trigram 
* **NO** stemming or other processsing
* split only by **space**

In [64]:
def is_past_passive(main_verb):
    if main_verb.tag_ != 'VBN':
        return False
    return any([child.lemma_ == 'be' for child in main_verb.children])

def pos_mapping(token):
    # temp 
    if token.lemma_ == TARGET_WORD:  
        if token.tag_ == 'VBN':          return 'VBN'
        if token.tag_.startswith('VB'):  return 'V'

    if token.tag_ == 'IN':           return token.text
    
    if token.tag_ in mapRest:        return mapRest[token.tag_]
    if token.tag_[:2] in mapRest:    return mapRest[token.tag_[:2]]
    if token.tag_[0] == 'W' and token.lemma_ in mapRW: return mapRW[token.lemma_] # 多加 why, which, where
    
    # if token.tag_ == 'VBN':          return 'VBN'
    # if token.tag_.startswith('VB'):  return 'V'
    if token.tag_.startswith('NN') or token.tag_ == 'DT': return 'n'
    return token.tag_

def is_noun_chunk(token):
    for nc in token.doc.noun_chunks:
        if token.i in range(nc.start, nc.end):
            return True, nc.end
    return False, token.i + 1


def get_ngram(token):
    line = token.doc
    return ' '.join([tk.text for tk in line[token.i : token.i + WINDOW_SIZE]]) # discuss _ _
    

def get_pattern(token):
    line = token.doc
    
    ptns, start_loc = [], token.i
    for i in range(WINDOW_SIZE):
        if start_loc < len(line):
            is_NC, next_start = is_noun_chunk(line[start_loc])
            
            ### 遇到標點符號先終止
            if line[start_loc].is_punct:
                break
            if line[start_loc].is_quote:
                start_loc += 1
                
            ptns.append('n' if is_NC else pos_mapping(line[start_loc]))
            start_loc = next_start
            
    return ' '.join(ptns)

def all_info(parsed_sents):
    # TODO: refactor to class
    info = {
        'ngrams': Counter(),
        'patterns': defaultdict(Counter),
        'sents': defaultdict(lambda: [])
    }
    for line in parsed_sents:
        for token in line: 
            if token.lemma_ == TARGET_WORD:
                ngram = get_ngram(token)
                ptn = get_pattern(token)

                info['ngrams'][ngram] += 1
                info['patterns'][is_past_passive(token)][ptn] += 1
                info['sents'][ptn].append(line.text)
    return info

In [65]:
def get_high_freq(counts):
    values = list(counts.values())
    total, avg, std = np.sum(values), np.mean(values), np.std(values)
    print("Total: {}, Avg: {}, Std: {}".format(total, avg, std))

    return dict([(ngram, count) for ngram, count in counts.items() 
                 if count > avg])

def high(info):
    return {
        'ngrams': get_high_freq(info['ngrams']),
        'patterns': {
            True: get_high_freq(info['patterns'][True]),
            False: get_high_freq(info['patterns'][False])
        }
    }

def sort_dict(counts):
    return sorted(counts.items(), key=itemgetter(1), reverse=True)

## 以 BNC 資料統計出正確 patterns

In [66]:
WINDOW_SIZE = 3
bnc = all_info(bnc_remains)
high_bnc = high(bnc)

Total: 10754, Avg: 1.8332765086941698, Std: 7.7271152219422286
Total: 2606, Avg: 15.604790419161677, Std: 71.52655033064005
Total: 8148, Avg: 21.442105263157895, Std: 104.3580340931085


In [67]:
sort_dict(high_bnc['patterns'][False]) # 這裡的 VBN 為完成式

[('V n', 1298),
 ('V n of', 1210),
 ('V n with', 615),
 ('V n CC', 422),
 ('V n in', 336),
 ('V', 318),
 ('V n adv', 271),
 ('V n n', 253),
 ('V with n', 222),
 ('V n for', 189),
 ('VBN in n', 152),
 ('VBN', 148),
 ('VBN adv', 143),
 ('V wh n', 104),
 ('VBN n of', 84),
 ('V n at', 83),
 ('V n on', 83),
 ('V n v-ed', 82),
 ('VBN n with', 75),
 ('V n TO', 71),
 ('V n that', 68),
 ('V n wh', 67),
 ('V adv', 67),
 ('V n to', 67),
 ('V in n', 65),
 ('V CC v', 60),
 ('V n adj', 56),
 ('VBN by n', 50),
 ('V n v', 49),
 ('VBN n', 47),
 ('V n ing', 44),
 ('VBN with n', 42),
 ('V n between', 39),
 ('V n from', 35),
 ('V n as', 32),
 ('V ing n', 32),
 ('V n MD', 30),
 ('V adv n', 28),
 ('V whether n', 27),
 ('VBN n CC', 26),
 ('V n CD', 25),
 ('VBN above', 22),
 ('V wh TO', 22)]

In [68]:
bnc['sents']['V']

['the artist critic in this case deliberately the historical context of the pictures she was discussing .',
 'in discussing , he treated him the most versatile artist of a , whose other members were marcantonio raimondi and lucas van leyden .',
 'reality does not discuss , it simply is .',
 'however , i believe that the vast majority of woodturners in the uk are amateurs like myself , and it is the contrast between our approach to and that of the which i wish to discuss .',
 "it has long been common practice to give students passages of poetry or prose , often but not always anonymous , to analyse , discuss , and respond to , in the manner of richards 's original undertaking .",
 'it is here papers are generated and read and discussed , and where the academic superstars go their paces and inspire their dedicated students .',
 'on saturday ec foreign ministers assemble a chateau near chartres for an informal weekend during which they will discuss , among other things , the co-ordination

## 以 EF 資料統計正確和錯誤的 patterns

### Before EF

In [69]:
ef_bef_sents = map(lambda obj: obj['bef_sent'] , ef_remains)
ef_bef = all_info(ef_bef_sents)
high_ef_bef = high(ef_bef)

Total: 2239, Avg: 1.6426999266324285, Std: 3.5240495265383576
Total: 145, Avg: 3.5365853658536586, Std: 6.184550227058177
Total: 2094, Avg: 10.214634146341464, Std: 36.84340998078887


In [70]:
sort_dict(high_ef_bef['patterns'][False])

[('V about n', 382),
 ('V n', 271),
 ('V with n', 157),
 ('V n of', 118),
 ('V n CC', 110),
 ('V n with', 95),
 ('V', 81),
 ('V n in', 45),
 ('V n for', 37),
 ('V n n', 37),
 ('V n adv', 35),
 ('V n about', 31),
 ('V in n', 30),
 ('VBN about n', 23),
 ('n about n', 22),
 ('V CC v', 21),
 ('V n TO', 20),
 ('V wh TO', 19),
 ('V n adj', 17),
 ('V adj about', 16),
 ('V for n', 16),
 ('V n that', 15),
 ('V wh n', 14),
 ('n', 14),
 ('VBN', 13),
 ('V on n', 13),
 ('VBN with n', 12),
 ('VBN in n', 12),
 ('V at n', 11),
 ('V adv', 11)]

### After EF

In [71]:
ef_aft_sents = map(lambda obj: obj['aft_sent'] , ef_remains)
ef_aft = all_info(ef_aft_sents)
high_ef_aft = high(ef_aft)

Total: 2326, Avg: 1.6891793754538853, Std: 3.1370266061802132
Total: 172, Avg: 3.5833333333333335, Std: 7.088233599110257
Total: 2154, Avg: 14.07843137254902, Std: 44.753285841576364


In [72]:
sort_dict(high_ef_aft['patterns'][False])

[('V n', 396),
 ('V about n', 274),
 ('V n of', 165),
 ('V n with', 138),
 ('V n CC', 134),
 ('V with n', 126),
 ('V', 76),
 ('V n for', 56),
 ('V n in', 54),
 ('V n n', 45),
 ('V n adv', 44),
 ('V n about', 41),
 ('V n TO', 38),
 ('V in n', 33),
 ('V wh TO', 22),
 ('V n at', 19),
 ('V n that', 19),
 ('V CC v', 18),
 ('VBN in n', 18),
 ('V n adj', 16),
 ('V wh n', 16)]

### Edit EF (Temp)

In [73]:
ef_edit = {
    'ngrams': defaultdict(Counter), 
    'patterns': defaultdict(lambda: defaultdict(lambda: Counter())), 
    'sents': defaultdict(lambda: defaultdict(lambda: []))
}
ef_right = {
    'ngrams': Counter(), 
    'patterns': defaultdict(Counter), 
    'sents': defaultdict(lambda: [])
}

ef_edit_sents = filter(lambda obj: obj['has_edit'], ef_remains)
for obj in ef_edit_sents:
    origin, bef_sent, aft_sent = obj['origin'], obj['bef_sent'], obj['aft_sent']
    
    for token in bef_sent: 
        if token.lemma_ == TARGET_WORD:
            is_pp = is_past_passive(token)
            bef_ngram = get_ngram(token)
            bef_ptn = get_pattern(token)

    for token in aft_sent:
        if token.lemma_ == TARGET_WORD:
            aft_ngram = get_ngram(token)
            aft_ptn = get_pattern(token)
            
    if bef_ptn != aft_ptn:  
        ef_edit['ngrams'][bef_ngram][aft_ngram] += 1
        ef_edit['patterns'][is_pp][bef_ptn][aft_ptn] += 1
        ef_edit['sents'][bef_ptn][aft_ptn].append(origin)
        
        ef_right['ngrams'][aft_ngram] += 1
        ef_right['patterns'][is_pp][aft_ptn] += 1
        ef_right['sents'][aft_ptn].append(origin)
    else:
        ef_right['ngrams'][aft_ngram] += 1
        ef_right['patterns'][is_pp][aft_ptn] += 1
        ef_right['sents'][aft_ptn].append(origin)

In [74]:
high_ef_edit = {
    'patterns': {
        True: get_high_freq(dict([(ptn, sum(ctn.values())) for ptn, ctn in ef_edit['patterns'][True].items()])),
        False: get_high_freq(dict([(ptn, sum(ctn.values())) for ptn, ctn in ef_edit['patterns'][False].items()]))    
    }
}
high_ef_edit['patterns'][False]

Total: 19, Avg: 2.111111111111111, Std: 1.8525924445036743
Total: 429, Avg: 4.515789473684211, Std: 17.889640141062966


{'V': 10,
 'V about n': 172,
 'V for n': 7,
 'V in n': 5,
 'V n': 14,
 'V n CC': 6,
 'V n in': 5,
 'V n with': 8,
 'V on n': 12,
 'V with n': 40,
 'VBN about n': 14,
 'n about n': 9}

## Rank

In [105]:
def transform(table):
    return dict([(ptn, (i, ctn)) for i, (ptn, ctn) in enumerate(sort_dict(table))])
        
def ranking(bnc_table, ef_table):
    bnc_rank = transform(bnc_table)
    # ef_rank = transform(ef_table)
    
    print("Pattern\tRank(EF->BNC)\tRatio(EF/BNC)")
    for i, (ptn, ctn) in enumerate(sort_dict(ef_table)):
        if ptn in bnc_rank:
            print("{}\t{}->{}\t{}".format(ptn, i, bnc_rank[ptn][0], bnc_rank[ptn][1]/ctn))
        else:
            not_exist.append(ptn)
            # print("{} NOT EXIST in BNC".format(ptn))

In [106]:
is_PP = False
not_exist = []
ranking(high_bnc['patterns'][is_PP], high_ef_bef['patterns'][is_PP])

Pattern	Rank(EF->BNC)	Ratio(EF/BNC)
V n	1->0	4.789667896678967
V with n	2->8	1.414012738853503
V n of	3->1	10.254237288135593
V n CC	4->3	3.8363636363636364
V n with	5->2	6.473684210526316
V	6->5	3.925925925925926
V n in	7->4	7.466666666666667
V n for	8->9	5.108108108108108
V n n	9->7	6.837837837837838
V n adv	10->6	7.742857142857143
V in n	12->24	2.1666666666666665
V CC v	15->25	2.857142857142857
V n TO	16->19	3.55
V wh TO	17->42	1.1578947368421053
V n adj	18->26	3.2941176470588234
V n that	21->20	4.533333333333333
V wh n	22->13	7.428571428571429
VBN	24->11	11.384615384615385
VBN with n	26->31	3.5
VBN in n	27->10	12.666666666666666
V adv	29->22	6.090909090909091


In [102]:
# bnc['patterns'][False]
# temp = nlp(bnc['sents']['V V V'][0])[6]
# temp.is_punct

## 觀察區

In [31]:
is_PP = False

In [23]:
sort_dict(high_bnc['patterns'][is_PP]) # 這裡的 VBN 為完成式

NameError: name 'is_PP' is not defined

In [22]:
bnc['sents']['V N of']

[]

In [21]:
sort_dict(high_ef_bef['patterns'][is_PP]) # 這裡的 VBN 為完成式

[('V about N', 383),
 ('V N .', 201),
 ('V with N', 157),
 ('V N of', 118),
 ('V N CC', 110),
 ('V N with', 95),
 ('V N ,', 55),
 ('V N in', 45),
 ('V .', 45)]

In [32]:
sort_dict(high_ef_aft['patterns'][is_PP]) # 這裡的 VBN 為完成式

[('V N .', 290),
 ('V about N', 275),
 ('V N of', 165),
 ('V N with', 138),
 ('V N CC', 134),
 ('V with N', 126),
 ('V N ,', 81),
 ('V N for', 56),
 ('V N in', 54)]

In [29]:
def is_exist(token):
    ptn = get_pattern(token)
    is_pp = is_past_passive(token)
    return ptn in high_bnc['patterns'][is_pp]

y_test = [is_exist(token) == answer for line, answer in test_data for token in line if token.lemma_ == TARGET_WORD]
print(sum(y_test) / len(y_test))

0.6824324324324325


In [162]:
explacy.print_parse_info(nlp, 'last week , my wife and i have discussed about our future .')

Dep tree   Token     Dep type Lemma   Tag  Part of Sp
────────── ───────── ──────── ─────── ──── ──────────
       ┌─> last      amod     last    JJ   ADJ       
 ┌────>└── week      npadvmod week    NN   NOUN      
 │┌──────> ,         punct    ,       ,    PUNCT     
 ││    ┌─> my        poss     -PRON-  PRP$ ADJ       
 ││┌─>┌┼── wife      nsubj    wife    NN   NOUN      
 │││  │└─> and       cc       and     CC   CCONJ     
 │││  └──> i         conj     i       PRP  PRON      
 │││   ┌─> have      aux      have    VBP  VERB      
┌┼┴┴───┴── discussed ROOT     discuss VBN  VERB      
│└─>┌───── about     prep     about   IN   ADP       
│   │  ┌─> our       poss     -PRON-  PRP$ ADJ       
│   └─>└── future    pobj     future  NN   NOUN      
└────────> .         punct    .       .    PUNCT     


In [63]:
test = nlp('well , . ( we were discussing my comments today on the on the erm relationship between the two countries , and i think you have just heard what they were .')
for tk in test:
    print(tk.is_stop)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
