In [14]:
from get_data import *
from collections import Counter, defaultdict
from pprint import pprint
from operator import itemgetter
import nltk
import numpy as np
import spacy
import explacy # https://github.com/tylerneylon/explacy
# explacy.print_parse_info(nlp, line)

In [2]:
# Shared spaCy pipeline; the cells below call it to parse every sentence.
nlp = spacy.load('en_core_web_lg') # ('en') -- presumably an alternative model name that was tried; confirm

## 從 BNC 中取得有 discuss 的句子
* read all lines
* strip() and lower()
* filter out target sentences
* **Use `list()` to materialize the lazy `map`/`filter` result so it can be iterated more than once**

In [3]:
def clean_data(file, reserved=None):
    """Read a text file and return its lines stripped and lower-cased.

    Parameters
    ----------
    file : str
        Path of a UTF-8 encoded text file; each line is treated as one record.
    reserved : str, optional
        When truthy, only lines containing this substring are kept. The check
        runs after lower-casing, so `reserved` should itself be lower-case.

    Returns
    -------
    list of str
        The normalized (and optionally filtered) lines, in file order.
    """
    # The context manager closes the handle deterministically; previously the
    # file object was left open until garbage collection.
    with open(file, 'r', encoding='utf8') as handle:
        lines = [line.strip().lower() for line in handle]

    if reserved:
        return [line for line in lines if reserved in line]
    return lines

In [4]:
# BNC sentences containing 'discuss', each one parsed by the spaCy pipeline.
remains = [nlp(line) for line in clean_data('../dataset/bnc.txt', 'discuss')]

## 針對含有 discuss 單字的句子做處理，產生正確與錯誤的資料

In [231]:
discuss_remains = clean_data('../dataset/efcamp/ef.diff.simplize.despace.txt', 'discuss')

rights, wrongs = [], []
for line in discuss_remains:
    tokens = line.split(' ')

    # A sentence has a relevant edit when an edit marker (token starting with
    # '[-' or '{+') directly follows a token containing 'discuss'. Pairing each
    # token with its predecessor skips index 0, where tokens[i-1] would have
    # wrapped around to the LAST token of the sentence.
    has_edit = any(
        tk[:2] in ('[-', '{+') and 'discuss' in prev
        for prev, tk in zip(tokens, tokens[1:])
    )

    # to_before / to_after come from get_data; presumably they render the
    # sentence before and after the edits are applied -- confirm.
    if has_edit:
        wrongs.append((nlp(' '.join(to_before(tokens))), False))
    # The post-edit version of every sentence is kept as a positive example.
    rights.append((nlp(' '.join(to_after(tokens))), True))

In [233]:
# Peek at the first three (parsed sentence, label) pairs.
rights[:3]

[(your family should discuss about my suggestion and we should find a date for a meeting in your favourite objects .,
  True),
 (i m writing to discuss about the cards that we are selling ., True),
 (therefore we could set up a meeting to discuss this issue and settling it out of court .,
  True)]

## Count bigrams and trigrams
* **NO** stemming or other processing
* split only by **space**

In [236]:
def get_bi_tri(sents):
    """Count space-delimited bigrams and trigrams over an iterable of sentences.

    Parameters
    ----------
    sents : iterable of str
        Sentences; each is split on a single space character only (no other
        tokenization or normalization is applied).

    Returns
    -------
    (Counter, Counter)
        Bigram counts and trigram counts, keyed by the space-joined n-gram.
    """
    bigrams, trigrams = Counter(), Counter()
    # Single pass: the previous version iterated `sents` twice, so a generator
    # (or any one-shot iterator) produced empty trigram counts.
    for sent in sents:
        words = sent.split(' ')
        bigrams.update(' '.join(bi) for bi in nltk.bigrams(words))
        trigrams.update(' '.join(tri) for tri in nltk.trigrams(words))

    return bigrams, trigrams

In [239]:
# bigrams, trigrams = get_bi_tri(remains)

In [240]:
def get_high_freq(counts):
    """Keep only the entries whose count is strictly above the mean count.

    Prints the total, mean and standard deviation of the counts as a side
    effect, then returns a plain dict of the above-average entries.
    """
    freqs = list(counts.values())
    total = np.sum(freqs)
    avg = np.mean(freqs)
    std = np.std(freqs)
    print("Total: {}, Avg: {}, Std: {}".format(total, avg, std))

    return {ngram: count for ngram, count in counts.items() if count > avg}

def sort_dict(counts):
    """Return the mapping's (key, value) pairs ordered by value, largest first."""
    pairs = list(counts.items())
    # Stable sort: entries with equal values keep their original relative order.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [241]:
# bigram_counts = dict([(ngram, count) for ngram, count in bigrams.items() if ngram.startswith('discuss')])
# trigram_counts = dict([(ngram, count) for ngram, count in trigrams.items() if ngram.startswith('discuss')])
# bigram_counts = get_high_freq(bigram_counts)
# trigram_counts = get_high_freq(trigram_counts)

In [279]:
def is_past_passive(main_verb):
    """Return True when the token is tagged VBN and has a child whose lemma is 'be'.

    Any other tag, or a VBN without such a child, yields False.
    """
    if main_verb.tag_ == 'VBN':
        for child in main_verb.children:
            if child.lemma_ == 'be':
                return True
    return False

def ptn_transform(token):
    """Map a token to its pattern symbol.

    * tag 'IN'               -> the token's own text
    * tag 'VBN'              -> 'VBN'
    * any other 'VB*' tag    -> 'VB'
    * 'NN*' tags and 'DT'    -> 'NN'
    * everything else        -> the tag unchanged
    """
    tag = token.tag_
    if tag == 'IN':
        return token.text
    if tag == 'VBN':
        return 'VBN'
    if tag.startswith('VB'):
        return 'VB'
    if tag == 'DT' or tag.startswith('NN'):
        return 'NN'
    return tag
    
def is_noun_chunk(token):
    """Tell whether the token lies inside one of its document's noun chunks.

    Returns a pair ``(inside, next_index)``: when the token falls within a
    chunk, ``next_index`` is that chunk's end index; otherwise it is the index
    right after the token.
    """
    idx = token.i
    for chunk in token.doc.noun_chunks:
        if chunk.start <= idx < chunk.end:
            return True, chunk.end
    return False, idx + 1

def get_ptn(token):
    """Build a pattern string of up to three symbols starting at `token`.

    The first symbol is the token's own pattern symbol. Each following slot is
    'NN' when the token at that position lies in a noun chunk, otherwise that
    token's pattern symbol. After a noun chunk the next slot starts at the
    chunk's end index rather than at the next token.
    """
    doc = token.doc

    def _slot(position):
        # Symbol for the token at `position`, plus the index where the next slot starts.
        candidate = doc[position]
        in_chunk, resume_at = is_noun_chunk(candidate)
        symbol = 'NN' if in_chunk else ptn_transform(candidate)
        return symbol, resume_at

    symbols = [ptn_transform(token)]
    following = token.i + 1
    if following < len(doc):
        symbol, resume_at = _slot(following)
        symbols.append(symbol)

        if resume_at < len(doc):
            symbol, _ = _slot(resume_at)
            symbols.append(symbol)
    return ' '.join(symbols)

In [280]:
# patterns: is_past_passive flag -> Counter of pattern strings
# ngrams:   Counter of raw "discuss _ _" text windows
# sents:    pattern string -> sentences in which it was observed
patterns, ngrams = defaultdict(Counter), Counter()
sents = defaultdict(list)

WINDOW_SIZE = 3
for line in remains:
    for token in line:
        # Only tokens whose lemma is 'discuss' are analysed (the optional
        # is_past_passive(token) filter is left disabled).
        if token.lemma_ != 'discuss':
            continue

        ptn = get_ptn(token)
        window = line[token.i : token.i + WINDOW_SIZE]
        ngram = ' '.join(tk.text for tk in window)  # discuss _ _

        ngrams[ngram] += 1
        patterns[is_past_passive(token)][ptn] += 1
        sents[ptn].append(line.text)

In [281]:
high_ngrams = get_high_freq(ngrams)
# Create high_ptns here: it was previously assigned by key without ever being
# defined, which raises NameError on a fresh kernel.
# Keys mirror `patterns`: True = is_past_passive, False = everything else.
high_ptns = {
    True: get_high_freq(patterns[True]),
    False: get_high_freq(patterns[False]),
}
sort_dict(high_ptns[False])  # VBN in this group is the perfect tense

Total: 10754, Avg: 1.8332765086941698, Std: 7.727115221942229
Total: 2606, Avg: 11.136752136752136, Std: 54.50035771589748
Total: 8148, Avg: 16.975, Std: 81.10633067153266


[('VB NN of', 1209),
 ('VB NN .', 803),
 ('VB NN with', 615),
 ('VB NN CC', 426),
 ('VB NN ,', 389),
 ('VB NN in', 337),
 ('VB NN NN', 234),
 ('VB NN RB', 227),
 ('VB with NN', 222),
 ('VB NN for', 189),
 ('VBN in NN', 152),
 ('VB .', 132),
 ('VB NN VB', 127),
 ('VB WRB NN', 97),
 ('VB NN WDT', 96),
 ('VBN NN of', 87),
 ('VB NN at', 83),
 ('VB NN on', 83),
 ('VBN NN with', 75),
 ('VBN RB .', 75),
 ('VB CC VB', 73),
 ('VB NN TO', 71),
 ('VBN .', 69),
 ('VB NN to', 67),
 ('VB in NN', 65),
 ('VB NN JJ', 52),
 ('VBN RB ,', 50),
 ('VBN by NN', 50),
 ('VB VB NN', 49),
 ('VB NN VBN', 47),
 ('VB NN :', 43),
 ('VBN with NN', 42),
 ('VB NN RBR', 41),
 ('VB NN between', 39),
 ('VB NN from', 35),
 ('VB NN as', 32),
 ('VB `` NN', 31),
 ('VB NN MD', 30),
 ('VB RB .', 29),
 ('VB , NN', 28),
 ("VB . ''", 27),
 ('VB whether NN', 27),
 ('VB RB NN', 27),
 ('VB NN that', 26),
 ('VBN , NN', 26),
 ('VBN NN CC', 26),
 ('VB NN CD', 25),
 ('VBN NN ,', 25),
 ('VB NN', 25),
 ('VB , CC', 24),
 ('VBN NN in', 22),


In [288]:
# Example sentences recorded for the 'VB NN TO' pattern.
sents['VB NN TO']

["the 39 signatories will be discussing what to do about pressures for development of the continent 's resources .",
 'this emerged yesterday the two companies finally they were discussing a deal to thwart the us car , which intends to build a 15 per cent in jaguar in preparation for a full bid .',
 ", america 's third largest car maker , and mitsubishi of japan are discussing plans to set up vehicle manufacturing plants in europe .",
 "with johnsonian vigour , eliot discussed the way to run a society ; he surveyed with distanced irony `` the literature of fascism `` , also printing macdiarmid 's `` second hymn to lenin ' .",
 'mr poole said national leaders would meet on thursday to discuss ways to intensify pressure on health service managers to re-open talks .',
 'he has set himself the task of trying to discuss the agenda to develop the next two decades on issues such as training , education and poverty .',
 'president alan of peru for a planned summit on february 15 with president

In [284]:
# Evaluate the pattern lookup on up to 500 right and 500 wrong sentences:
# a 'discuss' token is predicted correct when its pattern is among the
# high-frequency patterns of the matching is_past_passive group.
total, correct = 0, 0
for line, is_correct in rights[:500] + wrongs[:500]:
    for token in line:
        if token.lemma_ != 'discuss':
            continue

        ptn = get_ptn(token)
        predicted = ptn in high_ptns[is_past_passive(token)]

        if predicted == is_correct:
            correct += 1
        else:
            # Show the actual prediction. The old print tested `ptn in high_ptns`,
            # i.e. membership in the outer {True, False} dict, which is always False.
            print(predicted, line)
        total += 1

# Guard against an empty evaluation set instead of raising ZeroDivisionError.
print(correct / total if total else float('nan'))

False your family should discuss about my suggestion and we should find a date for a meeting in your favourite objects .
False i m writing to discuss about the cards that we are selling .
False isabella discusses that she loves john with her boyfriend because she does n't want to marry him .
False the story of the adjustment bureau discusses on whether we can control our fate or some invisible power controls us .
False maybe we could use this time to discuss about the project if you want .
False word limit if you want to discuss more about the conditions for this permission , you would contact me by e - mail or telephone # # # # -####.
False as discussed i will be at your department building at 11 am next monday .
False we discussed for a long time and our consensus seems to be that the best option is to make ads with an important actor .
False we 'll have enough time to discuss some issues after dinner .
False this topic is too important to discuss , once we live in a free society and

In [285]:
# Sanity check on a single learner sentence: show its noun chunks, the pattern
# extracted at 'discuss', and whether that pattern is in the high-frequency set.
test = nlp('we discussed about many themes , she is adorable and very smart !')
print(list(test.noun_chunks))
for tk in test:
    if tk.lemma_ != 'discuss':
        continue
    ptn = get_ptn(tk)
    print(ptn)
    print(ptn in high_ptns[is_past_passive(tk)])
high_ptns[False]

[we, about many themes, she]
VB NN ,
True


{'VB , CC': 24,
 'VB , NN': 28,
 'VB .': 132,
 "VB . ''": 27,
 'VB CC VB': 73,
 'VB NN': 25,
 "VB NN ''": 17,
 'VB NN ,': 389,
 'VB NN -LRB-': 17,
 'VB NN .': 803,
 'VB NN :': 43,
 'VB NN CC': 426,
 'VB NN CD': 25,
 'VB NN JJ': 52,
 'VB NN MD': 30,
 'VB NN NN': 234,
 'VB NN RB': 227,
 'VB NN RBR': 41,
 'VB NN TO': 71,
 'VB NN VB': 127,
 'VB NN VBN': 47,
 'VB NN WDT': 96,
 'VB NN about': 18,
 'VB NN as': 32,
 'VB NN at': 83,
 'VB NN between': 39,
 'VB NN by': 17,
 'VB NN for': 189,
 'VB NN from': 35,
 'VB NN in': 337,
 'VB NN of': 1209,
 'VB NN on': 83,
 'VB NN that': 26,
 'VB NN to': 67,
 'VB NN with': 615,
 'VB RB ,': 19,
 'VB RB .': 29,
 'VB RB NN': 27,
 'VB VB NN': 49,
 'VB WRB NN': 97,
 'VB WRB TO': 22,
 'VB `` NN': 31,
 'VB at NN': 21,
 'VB in NN': 65,
 'VB whether NN': 27,
 'VB with NN': 222,
 'VBN , NN': 26,
 'VBN .': 69,
 'VBN NN ,': 25,
 'VBN NN .': 18,
 'VBN NN CC': 26,
 'VBN NN RB': 17,
 'VBN NN in': 22,
 'VBN NN of': 87,
 'VBN NN with': 75,
 'VBN RB ,': 50,
 'VBN RB .': 75,

In [263]:
# Example sentence used to inspect spaCy's noun chunking.
line = 'this zone has meeting in the year to discuss the economic , the society and other important issues to latin america countries .'
tokens = nlp(line)

In [264]:
# List every noun chunk spaCy found in the example sentence, one per line.
for nc in tokens.noun_chunks:
    print(nc)

this zone
the year
the society
other important issues
latin america countries


In [278]:
# Print the dependency parse of the sentence as an ASCII tree.
# NOTE(review): the saved output below appears to show a different sentence; re-run to refresh.
explacy.print_parse_info(nlp, 'we discussed about many themes , she is adorable and very smart !')

Dep tree           Token   Dep type Lemma   Tag Part of Sp
────────────────── ─────── ──────── ─────── ─── ──────────
              ┌──> i       nsubj    i       PRP PRON      
              │┌─> m       npadvmod m       XX  X         
┌┬────────────┴┴── writing ROOT     write   VBG VERB      
││             ┌─> to      aux      to      TO  PART      
│└─>┌──────────┴── discuss advcl    discuss VB  VERB      
│   └─>┌────────── about   prep     about   IN  ADP       
│      │       ┌─> the     det      the     DT  DET       
│      └─>┌────┴── cards   pobj     card    NNS NOUN      
│         │  ┌───> that    dobj     that    WDT ADJ       
│         │  │┌──> we      nsubj    -PRON-  PRP PRON      
│         │  ││┌─> are     aux      be      VBP VERB      
│         └─>└┴┴── selling relcl    sell    VBG VERB      
└────────────────> .       punct    .       .   PUNCT     
