In [None]:
from utils.before_after import *

# from pgrule import mapRW
from collections import Counter, defaultdict
from pprint import pprint
from operator import itemgetter
from spacy.tokens import Doc
import nltk
import numpy as np
import pickle
import spacy
import utils.explacy # https://github.com/tylerneylon/explacy
# explacy.print_parse_info(nlp, line)

In [None]:
# Load the spaCy English model used to parse every sentence in this notebook.
nlp = spacy.load('en_core_web_lg') # ('en')

## Parse BNC 所有句子，再取出有 discuss
* read all lines
* strip() and lower()
* filter out target sentences
* **Use list() to materialize the filter generator** (a bare generator is exhausted after one pass)

In [None]:
# Test: load the previously pickled parse results
# # Load the data
# with open("../dataset/bnc_processed.pickle", "rb") as handle:
#     doc_bytes, vocab_bytes = pickle.load(handle)
    
# nlp.vocab.from_bytes(vocab_bytes)
# bnc_docs = [Doc(nlp.vocab).from_bytes(b) for b in doc_bytes]

In [None]:
# Word under study: used as a substring filter on raw lines and as the lemma
# that tokens are matched against.
TARGET_WORD = 'discuss'

In [None]:
# bnc_remains = filter(lambda line: any([tk.lemma_ == TARGET_WORD for tk in line]), bnc_all)

In [None]:
def clean_data(file, reserved=None):
    """Read a UTF-8 text file and return its lines stripped and lower-cased.

    Args:
        file: Path of the text file to read.
        reserved: Optional substring. When truthy, only lines containing it
            (after stripping and lower-casing) are kept; when falsy (``None``
            or ``''``), every line is returned.

    Returns:
        A list of cleaned lines, in file order.
    """
    # The context manager guarantees the handle is closed; the original left
    # the file open for the garbage collector to deal with.
    with open(file, 'r', encoding='utf8') as handle:
        # Stay lazy while filtering so only the kept lines are held in memory.
        cleaned = (raw.strip().lower() for raw in handle)
        if reserved:
            return [line for line in cleaned if reserved in line]
        return list(cleaned)

# Keep only the BNC lines that contain the target word, then parse each of
# them with the spaCy pipeline.
bnc_temp = clean_data('../dataset/bnc.txt', TARGET_WORD)
bnc_remains = [nlp(sentence) for sentence in bnc_temp]

## 從 EF 中取得有 discuss 的句子，並且 parse

In [None]:
# fs = open('../dataset/efcamp/ef.diff.simplize.despace.txt', 'r', encoding='utf8')
# ef_all = []

# Keep only the EF lines that mention the target word, then parse both the
# "before" and the "after" reading of each line.
ef_temp = clean_data('../dataset/efcamp/ef.diff.simplize.despace.txt', TARGET_WORD)
ef_remains = []
for line in ef_temp:
    tokens = line.split(' ')

    aft_sent = nlp(' '.join(to_after(tokens)))
    bef_sent = nlp(' '.join(to_before(tokens)))
    # A line counts as edited when a token starting with '[-' or '{+'
    # (presumably diff-style deletion/insertion markers -- verify against the
    # EF file format) directly follows a token containing the target word.
    # Pairing each token with its real predecessor avoids the original
    # tokens[i-1] lookup, which wrapped around to the LAST token when i == 0.
    has_edit = any(
        tk[:2] in ('[-', '{+') and TARGET_WORD in previous
        for previous, tk in zip(tokens, tokens[1:])
    )

    ef_remains.append({
        'origin': line,
        'bef_sent': bef_sent,
        'aft_sent': aft_sent,
        'has_edit': has_edit
    })

## 產生測資

In [None]:
# Build a labelled evaluation set from the edited EF entries: the first 250
# contribute their "before" sentence labelled False, the next (up to) 250
# contribute their "after" sentence labelled True.
test_data = [el for el in ef_remains if el['has_edit']]
test_data = [
    (el['aft_sent'], True) if i >= 250 else (el['bef_sent'], False)
    for i, el in enumerate(test_data[:500])
]

## Count bigram and trigram 
* **NO** stemming or other processing
* split only by **space**

In [None]:
from util.syntax import pos_mapping


def get_ngram(token):
    """Return the space-joined text of the tokens starting at ``token``.

    The window covers at most ``WINDOW_SIZE`` tokens (module-level constant)
    and is cut short at the end of the document, e.g. "discuss _ _".
    """
    doc = token.doc
    begin = token.i
    window = doc[begin:begin + WINDOW_SIZE]
    return ' '.join(tk.text for tk in window)
    

def get_pattern(token):
    """Build the space-joined pattern of up to ``WINDOW_SIZE`` slots from ``token``.

    Each slot is 'n' when ``is_noun_chunk`` reports a noun chunk at the current
    position, otherwise the label returned by ``pos_mapping``. After each slot
    the cursor moves to the position ``is_noun_chunk`` returned. The walk stops
    early at a punctuation token or at the end of the document.
    """
    doc = token.doc
    labels = []
    cursor = token.i

    for _ in range(WINDOW_SIZE):
        if cursor >= len(doc):
            # Past the last token: no remaining slot can contribute anything.
            break

        in_chunk, after_chunk = is_noun_chunk(doc[cursor])

        # Stop as soon as punctuation is reached.
        if doc[cursor].is_punct:
            break
        # A quote token that is not punctuation is skipped for the POS lookup.
        # NOTE(review): in_chunk / after_chunk were computed for the quote
        # token itself -- confirm this is intended.
        if doc[cursor].is_quote:
            cursor += 1

        labels.append('n' if in_chunk else pos_mapping(doc[cursor]))
        cursor = after_chunk

    return ' '.join(labels)

def all_info(parsed_sents):
    """Collect statistics for every token whose lemma equals ``TARGET_WORD``.

    Args:
        parsed_sents: Iterable of parsed sentences (iterated once).

    Returns:
        A dict with three tables:
          'ngrams'   -- Counter of get_ngram() strings,
          'patterns' -- is_past_passive() flag -> Counter of get_pattern() strings,
          'sents'    -- pattern string -> list of sentence texts showing it.
    """
    # TODO: refactor to class
    ngram_counts = Counter()
    pattern_counts = defaultdict(Counter)
    example_sents = defaultdict(list)

    for sent in parsed_sents:
        for tk in (t for t in sent if t.lemma_ == TARGET_WORD):
            ngram = get_ngram(tk)
            ptn = get_pattern(tk)

            ngram_counts[ngram] += 1
            pattern_counts[is_past_passive(tk)][ptn] += 1
            example_sents[ptn].append(sent.text)

    return {
        'ngrams': ngram_counts,
        'patterns': pattern_counts,
        'sents': example_sents,
    }

In [None]:
def get_high_freq(counts):
    """Return the entries whose count exceeds mean + one standard deviation.

    Prints the total, mean and (population) standard deviation of the counts
    as a side effect.

    Args:
        counts: Mapping from key (n-gram or pattern) to numeric count.

    Returns:
        A dict with the entries of ``counts`` whose count is strictly greater
        than ``mean + std``. An empty mapping yields an empty dict.
    """
    values = list(counts.values())
    if not values:
        # np.mean / np.std on an empty list return nan and emit
        # RuntimeWarnings; an empty table simply has no frequent entries.
        print("Total: {}, Avg: {}, Std: {}".format(0, 0.0, 0.0))
        return {}

    total, avg, std = np.sum(values), np.mean(values), np.std(values)
    print("Total: {}, Avg: {}, Std: {}".format(total, avg, std))

    threshold = avg + std
    return {key: count for key, count in counts.items() if count > threshold}

def high(info):
    """Reduce an ``all_info``-style table to its high-frequency entries.

    ``get_high_freq`` is applied to the n-gram counter and to the pattern
    counters stored under the keys ``True`` and ``False`` (the flag under
    which ``all_info`` files each pattern).

    Returns:
        ``{'ngrams': {...}, 'patterns': {True: {...}, False: {...}}}``
    """
    frequent_ngrams = get_high_freq(info['ngrams'])

    frequent_patterns = {}
    for flag in (True, False):
        frequent_patterns[flag] = get_high_freq(info['patterns'][flag])

    return {'ngrams': frequent_ngrams, 'patterns': frequent_patterns}

def sort_dict(counts):
    """Return the ``(key, count)`` pairs of ``counts``, largest count first.

    Pairs with equal counts keep their original relative order.
    """
    pairs = list(counts.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

## 以 BNC 資料統計出正確 patterns

In [None]:
# Number of token slots examined from the target word onward; read as a
# module-level global by get_ngram() and get_pattern().
WINDOW_SIZE = 3
# Collect n-gram / pattern statistics over the parsed BNC sentences, then keep
# only the high-frequency entries.
bnc = all_info(bnc_remains)
high_bnc = high(bnc)

In [None]:
sort_dict(high_bnc['patterns'][False]) # here VBN means the perfect tense

In [None]:
# bnc['sents']['V']

## 以 EF 資料統計正確和錯誤的 patterns

### Before EF

In [None]:
# Statistics over the "before" version of every EF sentence.
ef_bef_sents = (entry['bef_sent'] for entry in ef_remains)
ef_bef = all_info(ef_bef_sents)
high_ef_bef = high(ef_bef)

In [None]:
# Frequent "before" patterns filed under is_past_passive == False, most frequent first.
sort_dict(high_ef_bef['patterns'][False])

### After EF

In [None]:
# Statistics over the "after" version of every EF sentence.
ef_aft_sents = (entry['aft_sent'] for entry in ef_remains)
ef_aft = all_info(ef_aft_sents)
high_ef_aft = high(ef_aft)

In [None]:
# Frequent "after" patterns filed under is_past_passive == False, most frequent first.
sort_dict(high_ef_aft['patterns'][False])

### Edit EF (Temp)

In [None]:
# ef_edit: statistics for edited sentences whose pattern around the target
#          word differs between the before and after versions; each table is
#          keyed before -> after.
# ef_right: statistics for edited sentences whose pattern is identical in both
#           versions.
ef_edit = {
    'ngrams': defaultdict(Counter), 
    'patterns': defaultdict(lambda: defaultdict(lambda: Counter())), 
    'sents': defaultdict(lambda: defaultdict(lambda: []))
}
ef_right = {
    'ngrams': Counter(), 
    'patterns': defaultdict(Counter), 
    'sents': defaultdict(lambda: [])
}

# Only entries flagged has_edit are considered.
ef_edit_sents = filter(lambda obj: obj['has_edit'], ef_remains)
for obj in ef_edit_sents:
    origin, bef_sent, aft_sent = obj['origin'], obj['bef_sent'], obj['aft_sent']
    
    # Reset per sentence; is_pp / bef_ngram / aft_ngram are NOT reset, but they
    # are only read below when both patterns were set in this iteration.
    bef_ptn, aft_ptn = None, None
    # If the target word occurs more than once, the last occurrence wins.
    for token in bef_sent: 
        if token.lemma_ == TARGET_WORD:
            # The passive flag is taken from the "before" sentence only.
            is_pp = is_past_passive(token)
            bef_ngram = get_ngram(token)
            bef_ptn = get_pattern(token)

    for token in aft_sent:
        if token.lemma_ == TARGET_WORD:
            aft_ngram = get_ngram(token)
            aft_ptn = get_pattern(token)

    ### For now, ignore sentences where the target word appears only before or only after
    if bef_ptn and aft_ptn: # the target word exists in both versions
        if bef_ptn != aft_ptn: # the pattern changed between before and after
            ef_edit['ngrams'][bef_ngram][aft_ngram] += 1
            ef_edit['patterns'][is_pp][bef_ptn][aft_ptn] += 1
            ef_edit['sents'][bef_ptn][aft_ptn].append(origin)
            
        elif bef_ptn == aft_ptn: # the pattern is the same before and after
            ef_right['ngrams'][aft_ngram] += 1
            ef_right['patterns'][is_pp][aft_ptn] += 1
            ef_right['sents'][aft_ptn].append(origin)

In [None]:
# For each passive flag, total how often every "before" pattern was edited
# (summed over all the "after" patterns it turned into) and keep the frequent ones.
high_ef_edit = {
    'patterns': {
        flag: get_high_freq({
            ptn: sum(ctn.values())
            for ptn, ctn in ef_edit['patterns'][flag].items()
        })
        for flag in (True, False)
    }
}
high_ef_edit['patterns'][False]

In [None]:
# ef_edit['patterns'][False]

## Rank

In [None]:
def transform(table):
    """Map every pattern in ``table`` to ``(rank, count)``.

    Ranks start at 1 for the highest count, following ``sort_dict`` order.
    """
    ranked = {}
    for position, (ptn, ctn) in enumerate(sort_dict(table), start=1):
        ranked[ptn] = (position, ctn)
    return ranked
        
def ranking(bnc_table, ef_table):
    """Print how each EF pattern ranks in EF versus BNC, with the count ratio.

    Args:
        bnc_table: Mapping pattern -> count for the BNC corpus.
        ef_table: Mapping pattern -> count for the EF corpus.

    Side effects:
        Prints a tab-separated table with one row per EF pattern that is also
        in ``bnc_table``. Patterns missing from ``bnc_table`` are appended to
        the module-level list ``not_exist``, which must be defined before
        this function is called.
    """
    bnc_rank = transform(bnc_table)

    # The last column is BNC count divided by EF count; the header used to
    # claim the inverse (EF/BNC), so it is corrected to match the value.
    print("Pattern\tRank(EF->BNC)\tRatio(BNC/EF)")
    for ef_position, (ptn, ef_count) in enumerate(sort_dict(ef_table), start=1):
        if ptn not in bnc_rank:
            not_exist.append(ptn)
            continue
        bnc_position, bnc_count = bnc_rank[ptn]
        print("{}\t{}->{}\t{}".format(ptn, ef_position, bnc_position, bnc_count / ef_count))

In [None]:
# Compare the tables filed under is_past_passive == False. ranking() appends
# the EF patterns that are missing from BNC to not_exist, so the list must be
# reset before the call.
is_PP = False
not_exist = []
ranking(high_bnc['patterns'][is_PP], high_ef_bef['patterns'][is_PP])
print(not_exist)

In [None]:
# bnc['patterns'][False]
# temp = nlp(bnc['sents']['V V V'][0])[6]
# temp.is_punct

## 觀察區

In [None]:
# Passive flag selecting which pattern table the cells below display.
is_PP = False

In [None]:
sort_dict(high_bnc['patterns'][is_PP]) # here VBN means the perfect tense

In [None]:
# BNC sentence texts recorded under the pattern 'V N of'.
bnc['sents']['V N of']

In [None]:
sort_dict(high_ef_bef['patterns'][is_PP]) # here VBN means the perfect tense

In [None]:
sort_dict(high_ef_aft['patterns'][is_PP]) # here VBN means the perfect tense

In [None]:
def is_exist(token):
    """Tell whether the token's pattern is a high-frequency BNC pattern.

    The pattern from ``get_pattern`` is looked up in the ``high_bnc`` pattern
    table selected by the token's ``is_past_passive`` flag.
    """
    ptn = get_pattern(token)
    frequent_patterns = high_bnc['patterns'][is_past_passive(token)]
    return ptn in frequent_patterns

# One entry per target-word token in the test set: True when the BNC-based
# prediction (is_exist) agrees with the sentence's label.
y_test = [
    is_exist(token) == answer
    for line, answer in test_data
    for token in line
    if token.lemma_ == TARGET_WORD
]
# Guard against an empty evaluation set, which used to raise ZeroDivisionError.
if y_test:
    print(sum(y_test) / len(y_test))
else:
    print("No {!r} tokens found in test_data; accuracy is undefined.".format(TARGET_WORD))