In [69]:
import nltk, re
import fileinput
from collections import Counter

In [6]:
def to_after(tokens, reserved='about'):
    def to_after_token(token):
        token = token.replace('\u3000', ' ')
        if token == ' ': return ''
        
        if token.endswith('-]'):
            return None
        # key part, may happen is+} get his+}
        elif reserved and (token.endswith(reserved + '+}') or token.startswith('[-' + reserved)):
            return token
        elif token.endswith('+}'):
            return token[token.rfind('>>')+2:-2]  if token.startswith('[-') else token[2:-2]  
        else:
            return token
    return [e for token in map(to_after_token, tokens) if token for e in token.split(' ')]

def to_before(tokens, reserved=''):
    def to_before_token(token):
        token = token.replace('\u3000', ' ')
        if token == ' ': return ''
        
        if token.endswith('-]'):
            return token[2:-2]
        elif token.endswith('+}'):
            return token[2:token.rfind('>>')]  if token.startswith('[-') else None
        else:
            return token
    return [ token for token in map(to_before_token, tokens) if token ]

In [7]:
to_after(['hello','[-about-]', '[-asdf\u3000asdfccc>>about\u3000qqqq+}', '[-about>>ddd+}', 'asdfdf'], reserved='about')
# to_before(['hello','[-about-]', '[-asdf>>about+}', '[-vvv>>ddd+}', 'asdfdf'])

['hello', 'about', 'qqqq', '[-about>>ddd+}', 'asdfdf']

In [8]:
# 擔心有一句包含兩個 about+} 所以換個邏輯
def divide_triedit_noedit(aft_tokens, reserved='about'):
    noedit_list, triedit_list = [[]], []

    for i, tk in enumerate(aft_tokens):
        if '+}' in tk: # reserved word ，應該是不會有 [--] 出現
            if i > 0 and i+1 < len(aft_tokens):
                triedit_list.append(tuple(aft_tokens[i-1:i+2]))
            
            noedit_list.append([])
        else: 
            noedit_list[-1].append(tk)

    return noedit_list, triedit_list

In [9]:
divide_triedit_noedit(['hello', '[-asdf>>about+}', 'ddd', 'asdfdf', '[-fff>>about+}', 'asdss'])
divide_triedit_noedit(['hello', '[-asdf>>about+}', 'ddd', 'asdfdf', 'asdss'])
divide_triedit_noedit(['hello', '[-about>>ddd+}', 'ddd', 'asdfdf', 'asdss', '[-asdsfas>>about+}'])

([['hello'], ['ddd', 'asdfdf', 'asdss'], []],
 [('hello', '[-about>>ddd+}', 'ddd')])

In [10]:
def get_bigram(tokens):
    return list(nltk.bigrams(tokens))

def get_trigram(tokens):
    return list(nltk.trigrams(tokens))

In [11]:
get_trigram(['one'])
get_trigram(['one', 'asd', 'aaa'])

[('one', 'asd', 'aaa')]

In [36]:
edit_list = [] # trigram edit
bigrams, trigrams = [], []
    
# for line in open('ef.diff.simplize.despace.txt', 'r', encoding='utf8').readlines():# fileinput.input():
for line in open('test.txt', 'r', encoding='utf8').readlines():# fileinput.input():
    tokens = line.strip().split(' ')
    # print(tokens)
    aft_tokens = to_after(tokens, reserved='about')

    noedit_list, edit_tokens = divide_triedit_noedit(aft_tokens)
    # print(noedit_list)
    # print(triedit_tokens)
    edit_list.extend(edit_tokens)
    # print(edit_list)

    # 會有 [] 出現，若是長度不夠
    for no_edit in noedit_list:
        bigrams.extend(get_bigram(no_edit))
        trigrams.extend(get_trigram(no_edit))

    # print(edit_list)  
    # print(bigrams)

uniq_edit = set(edit_list)

[('I', 'had', 'some'), ('had', 'some', 'money'), ('some', 'money', 'in'), ('money', 'in', 'my'), ('in', 'my', 'bag'), ('my', 'bag', '('), ('bag', '(', 'about'), ('(', 'about', '100'), ('about', '100', 'dollars'), ('100', 'dollars', ')')]


In [55]:
def bi_vs_edit(bigram, uniq_edit_list, reserved='about'):
    group = set()
    for edit in uniq_edit_list:
        if edit[1].endswith(reserved + '+}'): # only insertion and replace
            if bigram[0] == edit[0] and bigram[1] == edit[2]:
                group.add(edit)
    return group

def tri_vs_edit(trigram, uniq_edit_list, reserved='about'):
    group = set()
    for edit in uniq_edit_list:
        tri_befs = []
        for token in edit:
            # care deletion and replace and retrace
            tri_befs.append(token[2:-2].split('>>')[0] if token.startswith('[-' + reserved) else token)

        if trigram == tuple(tri_befs):
            group.add(edit)
    return group

In [56]:
tri_vs_edit(('discuss', 'about', 'it'), 
           [('discuss','[-about-]','it'), ('discuss','[-about>>abc+}','it'), 
            ('discuss','[-abc>>about+}','it'), ('discuss', '{+about+}', 'it')])

[('discuss', '[-about-]', 'it'), ('discuss', '[-about>>abc+}', 'it')]

In [64]:
bi_edit_group = dict()
for bi in set(bigrams):
    group = list(bi_vs_edit(bi, uniq_edit))
    if group: bi_edit_group[bi] = group
    # print(bi_edit_group[bi])
    # print('='*50) 
    
tri_edit_group = dict()
for tri in set(trigrams):
    group = list(tri_vs_edit(tri, uniq_edit))
    if group: tri_edit_group[tri] = group
    # print(tri_edit_group[tri])
    # print('='*50) 
    
tri_edit_group

{('How', 'about', 'you'): [('How', '[-about>>are+}', 'you')],
 ('apologize', 'about', 'the'): [('apologize', '[-about>>for+}', 'the')],
 ('care', 'about', 'you'): [('care', '[-about>>of+}', 'you')],
 ('control', 'about', 'their'): [('control', '[-about>>of+}', 'their')],
 ('flattered', 'about', 'your'): [('flattered', '[-about>>by+}', 'your')],
 ('information', 'about', 'the'): [('information', '[-about>>under+}', 'the')],
 ('results', 'about', 'the'): [('results', '[-about>>that+}', 'the')],
 ('thinking', 'about', 'going'): [('thinking', '[-about>>of+}', 'going')],
 ('you', 'about', 'your'): [('you', '[-about>>with+}', 'your')]}

In [104]:
def count_bi_edit(bi_edit, bigrams_count, edit_list_count):
    dic = {}
    dic[bi_edit[0]] = bigrams_count[bi_edit[0]]
    for ngram in bi_edit[1:]:
        dic[ngram] = [edit_list_count[ngram], edit_list_count[ngram] / dic[bi_edit[0]]]
    return sorted(dic.items(), key = lambda item: len(item[0]))


def count_tri_edit(tri_edit, trigrams_count, edit_list_count):
    dict = {}
    dict[tri_edit[0]] = trigrams_count[tri_edit[0]]
    for ngram in tri_edit[1:]:
        dict[ngram] = [edit_list_count[ngram], edit_list_count[ngram] / dict[tri_edit[0]]]
    return sorted(dict.items(), key = lambda item: len(item[0]))


def sort(count_bi):
    return sorted(count_bi, key=lambda item: (item[1][1][1]))

In [105]:
count_bi, count_tri = [], []

bigrams_count = Counter(bigrams)
trigrams_count = Counter(trigrams)
edit_list_count = Counter(edit_list)

for bi, edit in bi_edit_group.items():
    c_bi_edit = count_bi_edit([bi] + edit, bigrams_count, edit_list_count)
    count_bi.append(c_bi_edit)
    print(c_bi_edit)
    # print('='*50)
    
for tri, edit in tri_edit_group.items():
    c_tri_edit = count_tri_edit([tri] + edit, trigrams_count, edit_list_count)
    count_tri.append(c_tri_edit)
    # print(c_tri_edit)

    for freq in c_tri_edit:
        print(' '.join(freq[0]), ': ', freq[1], sep='')
    print('='*50)

[(('was', 'the'), 3), (('was', '{+about+}', 'the'), [1, 0.3333333333333333])]
[(('you', 'the'), 2), (('you', '{+about+}', 'the'), [1, 0.5])]
[(('me', 'the'), 2), (('me', '{+about+}', 'the'), [1, 0.5])]
[(('think', 'it'), 3), (('think', '[-for>>about+}', 'it'), [1, 0.3333333333333333])]
[(('you', 'some'), 6), (('you', '{+about+}', 'some'), [1, 0.16666666666666666])]
[(('hear', 'your'), 1), (('hear', '{+about+}', 'your'), [1, 1.0]), (('hear', 'about+}', 'your'), [1, 1.0])]
[(('me', 'a'), 1), (('me', '{+about+}', 'a'), [1, 1.0])]
[(('you', 'my'), 2), (('you', '{+about+}', 'my'), [1, 0.5])]
[(('think', 'a'), 1), (('think', '[-in>>about+}', 'a'), [1, 1.0])]
[(('and', 'his'), 4), (('and', '{+about+}', 'his'), [1, 0.25])]
[((',', 'his'), 2), ((',', '{+about+}', 'his'), [1, 0.5])]
[(('you', 'something'), 4), (('you', '{+about+}', 'something'), [1, 0.25])]
information about the: 5
information [-about>>under+} the: [1, 0.2]
results about the: 1
results [-about>>that+} the: [1, 1.0]
apologize abo

In [106]:
if __name__ == '__main__':
#     print(count_bi)
    print(sort(count_bi))
    print(count_tri)
    # print(sort(counts_tri))


    for count_bi in sort(counts_bi):
        for freq in count_bi:
            print(' '.join(freq[0]), ': ', freq[1], sep = '')
        print('='*50)

    # for count_tri in sort(counts_tri):
    #     for freq in count_tri:
    #         print(' '.join(freq[0]), ': ', freq[1], sep = '')
    #     print('='*50)


        # for freq in count_bvst:
        #     print(' '.join(freq[0]), ': ', freq[1], sep = '')
        # print('='*50)

   
        # for freq in count_tvst:
        #     print(' '.join(freq[0]), ': ', freq[1], sep = '')
        # print('='*50)



[[(('you', 'some'), 6), (('you', '{+about+}', 'some'), [1, 0.16666666666666666])], [(('and', 'his'), 4), (('and', '{+about+}', 'his'), [1, 0.25])], [(('you', 'something'), 4), (('you', '{+about+}', 'something'), [1, 0.25])], [(('was', 'the'), 3), (('was', '{+about+}', 'the'), [1, 0.3333333333333333])], [(('think', 'it'), 3), (('think', '[-for>>about+}', 'it'), [1, 0.3333333333333333])], [(('you', 'the'), 2), (('you', '{+about+}', 'the'), [1, 0.5])], [(('me', 'the'), 2), (('me', '{+about+}', 'the'), [1, 0.5])], [(('you', 'my'), 2), (('you', '{+about+}', 'my'), [1, 0.5])], [((',', 'his'), 2), ((',', '{+about+}', 'his'), [1, 0.5])], [(('hear', 'your'), 1), (('hear', '{+about+}', 'your'), [1, 1.0]), (('hear', 'about+}', 'your'), [1, 1.0])], [(('me', 'a'), 1), (('me', '{+about+}', 'a'), [1, 1.0])], [(('think', 'a'), 1), (('think', '[-in>>about+}', 'a'), [1, 1.0])]]
[[(('information', 'about', 'the'), 5), (('information', '[-about>>under+}', 'the'), [1, 0.2])], [(('results', 'about', 'the'),

NameError: name 'counts_bi' is not defined