In [1]:
import nltk, re
import fileinput
from collections import Counter

In [2]:
def to_after(tokens, reserved='about'):
    """Convert diff-annotated tokens into their post-edit ("after") form.

    Markup understood per token (U+3000 inside a token stands for a space
    between words of a multi-word edit):

      * ``[-old-]``       deletion    -> dropped entirely
      * ``{+new+}``       insertion   -> ``new``
      * ``[-old>>new+}``  replacement -> ``new``

    A single-word edit whose old side or new side is *exactly* ``reserved``
    is kept verbatim (markup included) so that callers can locate it later.
    Multi-word edits are always flattened to their "after" text, even when
    they involve the reserved word. Deletions are dropped even when the
    deleted word is the reserved one.

    Args:
        tokens: iterable of token strings, possibly carrying edit markup.
        reserved: word whose single-word edits are preserved; a falsy value
            disables preservation so every edit is flattened.

    Returns:
        ``(after_tokens, boundaries)`` where ``after_tokens`` is the
        flattened token list and ``boundaries`` is
        ``[-1] + positions_of_tokens_still_containing_markup + [len(after_tokens)]``.
    """
    def to_after_token(token):
        token = token.replace('\u3000', ' ')
        if token == ' ':
            return ''

        # Pure deletion: nothing survives on the "after" side.
        if token.endswith('-]'):
            return None

        # Not an insertion/replacement: ordinary token, keep as is.
        if not token.endswith('+}'):
            return token

        # Split the markup into its old/new sides so the reserved word can be
        # compared exactly (a suffix/prefix check would let 'is' match 'his').
        if token.startswith('[-') and '>>' in token:
            sep = token.rfind('>>')
            before, after = token[2:sep], token[sep + 2:-2]
        else:
            # Insertion '{+new+}': there is no old side and no '>>' to search.
            before, after = None, token[2:-2]

        involves_reserved = bool(reserved) and reserved in (before, after)
        # Edits spanning more than one word are flattened on both sides.
        if involves_reserved and ' ' not in token:
            return token
        return after

    after_tokens = [
        piece
        for converted in map(to_after_token, tokens)
        if converted
        for piece in converted.split(' ')
        if piece  # doubled or leading spaces must not yield empty tokens
    ]
    indices = [i for i, t in enumerate(after_tokens) if '+}' in t or '[-' in t]
    return after_tokens, [-1] + indices + [len(after_tokens)]

In [3]:
# Smoke check: a deletion, a multi-word replacement, a reserved replacement.
to_after(
    [
        'hello',
        '[-about-]',
        '[-asdf\u3000asdfccc>>about\u3000qqqq+}',
        '[-about>>ddd+}',
        'asdfdf',
    ],
    reserved='about',
)

(['hello', 'about', 'qqqq', '[-about>>ddd+}', 'asdfdf'], [-1, 3, 5])

In [4]:
# A sentence may contain more than one reserved edit (e.g. two `about+}`),
# so the split is driven by the full list of edit positions.
def divide_triedit_noedit(aft_tokens, indices):
    """Split a token list into edit-free segments and edit context triples.

    Args:
        aft_tokens: token list in which some positions hold edit tokens.
        indices: boundary list; consecutive pairs ``(start, end)`` delimit a
            segment ``aft_tokens[start+1:end]``. The first entry is treated
            as a leading sentinel and never yields a triple.

    Returns:
        ``(noedit_list, triedit_list)``: the segments between boundaries
        (possibly empty lists), and a ``(left, edit, right)`` tuple for every
        non-first boundary that has a token on both sides.
    """
    noedit_list = [
        aft_tokens[start + 1:end]
        for start, end in zip(indices, indices[1:])
    ]
    # Boundaries at position 0 or at the last token lack a neighbour on one
    # side and therefore produce no triple.
    triedit_list = [
        tuple(aft_tokens[pos - 1:pos + 2])
        for pos in indices[1:-1]
        if pos > 0 and pos + 1 < indices[-1]
    ]
    return noedit_list, triedit_list

In [5]:
# Smoke check: a sentence holding two reserved replacements.
aft_tks, ind = to_after(
    [
        'hello',
        '[-asdf>>about+}',
        'ddd',
        'asdfdf',
        '[-fff>>about+}',
        'asdss',
    ]
)

divide_triedit_noedit(aft_tks, ind)

([['hello'], ['ddd', 'asdfdf'], ['asdss']],
 [('hello', '[-asdf>>about+}', 'ddd'), ('asdfdf', '[-fff>>about+}', 'asdss')])

In [6]:
def get_bigram(tokens):
    """Return every pair of adjacent tokens as a list of 2-tuples."""
    pairs = nltk.ngrams(tokens, 2)
    return list(pairs)

def get_trigram(tokens):
    """Return every run of three adjacent tokens as a list of 3-tuples."""
    triples = nltk.ngrams(tokens, 3)
    return list(triples)

In [8]:
edit_list = []  # (left, edit, right) context triples around each kept edit
bigrams, trigrams = [], []

# Alternative input used previously: 'ef.diff.simplize.despace.txt'.
# The `with` block closes the file handle once the corpus has been read;
# iterating the handle avoids loading every line into memory up front.
with open('test.txt', 'r', encoding='utf8') as corpus:
    for line in corpus:
        tokens = line.strip().split(' ')

        aft_tokens, indices = to_after(tokens, reserved='about')
        noedit_list, edit_tokens = divide_triedit_noedit(aft_tokens, indices)
        edit_list.extend(edit_tokens)

        # Segments too short for an n-gram simply contribute an empty list.
        for no_edit in noedit_list:
            bigrams.extend(get_bigram(no_edit))
            trigrams.extend(get_trigram(no_edit))

uniq_edit = set(edit_list)

In [9]:
def bi_vs_edit(bigram, uniq_edit_list):
    """Collect the edit triples whose outer words equal the given bigram.

    An edit ``(left, edit, right)`` is selected when ``left == bigram[0]``
    and ``right == bigram[1]``; the middle element is not inspected.

    Returns:
        A set of the matching edit triples (empty when none match).
    """
    return {
        edit
        for edit in uniq_edit_list
        if bigram[0] == edit[0] and bigram[1] == edit[2]
    }

def tri_vs_edit(trigram, uniq_edit_list, reserved='about'):
    """Collect the edit triples that rewrite or delete the given trigram.

    An edit ``(left, edit, right)`` is selected when its outer words equal
    the trigram's outer words and the text the edit started from equals the
    trigram's middle word. That "before" text is only extracted when the edit
    token starts with ``'[-' + reserved``; any other edit token is compared
    as-is against the middle word.

    Returns:
        A set of the matching edit triples (empty when none match).
    """
    def before_text(marker):
        # '[-old-]' and '[-old>>new+}' both reduce to 'old' here.
        if marker.startswith('[-' + reserved):
            return marker[2:-2].split('>>')[0]
        return marker

    return {
        edit
        for edit in uniq_edit_list
        if edit[0] == trigram[0]
        and edit[2] == trigram[2]
        and before_text(edit[1]) == trigram[1]
    }

In [10]:
# Smoke check: only edits that start from 'about' should be selected.
tri_vs_edit(
    ('discuss', 'about', 'it'),
    [
        ('discuss', '[-about-]', 'it'),
        ('discuss', '[-about>>abc+}', 'it'),
        ('discuss', '[-abc>>about+}', 'it'),
        ('discuss', '{+about+}', 'it'),
    ],
)

{('discuss', '[-about-]', 'it'), ('discuss', '[-about>>abc+}', 'it')}

In [16]:
# Map each edit-free bigram to the edits that share its outer words,
# printing every non-empty group as it is found.
bi_edit_group = {}
for bi in set(bigrams):
    group = list(bi_vs_edit(bi, uniq_edit))
    if not group:
        continue
    print(bi, group)
    bi_edit_group[bi] = group

# Same mapping for trigrams, shown as the cell's result instead of printed.
tri_edit_group = {}
for tri in set(trigrams):
    group = list(tri_vs_edit(tri, uniq_edit))
    if group:
        tri_edit_group[tri] = group

tri_edit_group

('you', 'some') [('you', '{+about+}', 'some')]
('me', 'a') [('me', '{+about+}', 'a')]
('was', 'the') [('was', '{+about+}', 'the')]
(',', 'his') [(',', '{+about+}', 'his')]
('me', 'the') [('me', '{+about+}', 'the')]
('think', 'a') [('think', '[-in>>about+}', 'a')]
('and', 'his') [('and', '{+about+}', 'his')]
('you', 'my') [('you', '{+about+}', 'my')]
('you', 'something') [('you', '{+about+}', 'something')]
('you', 'the') [('you', '{+about+}', 'the')]
('hear', 'your') [('hear', '{+about+}', 'your')]
('know', '.') [('know', '[-about>>of+}', '.')]
('think', 'it') [('think', '[-for>>about+}', 'it')]


{('How', 'about', 'you'): [('How', '[-about>>are+}', 'you')],
 ('apologize', 'about', 'the'): [('apologize', '[-about>>for+}', 'the')],
 ('care', 'about', 'you'): [('care', '[-about>>of+}', 'you')],
 ('control', 'about', 'their'): [('control', '[-about>>of+}', 'their')],
 ('flattered', 'about', 'your'): [('flattered', '[-about>>by+}', 'your')],
 ('information', 'about', 'the'): [('information', '[-about>>under+}', 'the')],
 ('results', 'about', 'the'): [('results', '[-about>>that+}', 'the')],
 ('thinking', 'about', 'going'): [('thinking', '[-about>>of+}', 'going')],
 ('you', 'about', 'your'): [('you', '[-about>>with+}', 'your')]}

In [18]:
def count_bi_edit(bi_edit, bigrams_count, edit_list_count):
    """Tabulate how often a bigram and each of its edits occur.

    Args:
        bi_edit: sequence whose first item is the bigram and whose remaining
            items are the edit triples grouped under it.
        bigrams_count: mapping from bigram to its occurrence count.
        edit_list_count: mapping from edit triple to its occurrence count.

    Returns:
        A list of ``(ngram, value)`` pairs ordered by the length of
        ``ngram``: the bigram with its count, followed by each edit with
        ``[edit_count, edit_count / bigram_count]``.
    """
    base = bi_edit[0]
    table = {base: bigrams_count[base]}
    for edit in bi_edit[1:]:
        table[edit] = [edit_list_count[edit], edit_list_count[edit] / table[base]]
    return sorted(table.items(), key=lambda pair: len(pair[0]))


def count_tri_edit(tri_edit, trigrams_count, edit_list_count):
    """Tabulate how often a trigram and each of its edits occur.

    Args:
        tri_edit: sequence whose first item is the trigram and whose
            remaining items are the edit triples grouped under it.
        trigrams_count: mapping from trigram to its occurrence count.
        edit_list_count: mapping from edit triple to its occurrence count.

    Returns:
        A list of ``(ngram, value)`` pairs: the trigram with its count and
        each edit with ``[edit_count, edit_count / trigram_count]``. The list
        is stably sorted by ``len(ngram)``, so equal-length entries keep
        their insertion order.
    """
    head = tri_edit[0]
    stats = {head: trigrams_count[head]}
    for edit in tri_edit[1:]:
        stats[edit] = [edit_list_count[edit], edit_list_count[edit] / stats[head]]

    ordered = list(stats.items())
    ordered.sort(key=lambda entry: len(entry[0]))
    return ordered


def sort(count_bi):
    """Order count groups by the ratio of their first edit, ascending.

    Each group is a list of ``(ngram, value)`` pairs in which the second
    pair's value is ``[count, ratio]``; that ``ratio`` is the sort key.
    Returns a new list and leaves the input untouched.
    """
    def first_edit_ratio(group):
        first_edit = group[1]
        return first_edit[1][1]

    return sorted(count_bi, key=first_edit_ratio)

In [25]:
# One table per matched n-gram: the n-gram's own count followed by the
# count and relative frequency of every edit grouped under it.
count_bi, count_tri = [], []

bigrams_count, trigrams_count, edit_list_count = (
    Counter(bigrams),
    Counter(trigrams),
    Counter(edit_list),
)

for bi, edit in bi_edit_group.items():
    c_bi_edit = count_bi_edit([bi, *edit], bigrams_count, edit_list_count)
    count_bi.append(c_bi_edit)

for tri, edit in tri_edit_group.items():
    c_tri_edit = count_tri_edit([tri, *edit], trigrams_count, edit_list_count)
    count_tri.append(c_tri_edit)

In [26]:
from pprint import pprint

if __name__ == '__main__':
    pprint(count_bi)

    # Loop names are deliberately distinct from `count_bi` / `count_tri`:
    # reusing those names as loop variables rebinds the result lists to their
    # last group, which breaks a re-run of this cell and any later use.
    for grouped_counts in (count_bi, count_tri):
        for group in sort(grouped_counts):
            for ngram, stat in group:
                print(' '.join(ngram), ': ', stat, sep='')
            print('=' * 50)


[[(('you', 'some'), 6),
  (('you', '{+about+}', 'some'), [1, 0.16666666666666666])],
 [(('me', 'a'), 1), (('me', '{+about+}', 'a'), [1, 1.0])],
 [(('was', 'the'), 3), (('was', '{+about+}', 'the'), [1, 0.3333333333333333])],
 [((',', 'his'), 2), ((',', '{+about+}', 'his'), [1, 0.5])],
 [(('me', 'the'), 2), (('me', '{+about+}', 'the'), [1, 0.5])],
 [(('think', 'a'), 1), (('think', '[-in>>about+}', 'a'), [1, 1.0])],
 [(('and', 'his'), 4), (('and', '{+about+}', 'his'), [1, 0.25])],
 [(('you', 'my'), 2), (('you', '{+about+}', 'my'), [1, 0.5])],
 [(('you', 'something'), 4), (('you', '{+about+}', 'something'), [1, 0.25])],
 [(('you', 'the'), 3), (('you', '{+about+}', 'the'), [1, 0.3333333333333333])],
 [(('hear', 'your'), 1), (('hear', '{+about+}', 'your'), [1, 1.0])],
 [(('know', '.'), 2), (('know', '[-about>>of+}', '.'), [1, 0.5])],
 [(('think', 'it'), 3),
  (('think', '[-for>>about+}', 'it'), [1, 0.3333333333333333])]]
you some: 6
you {+about+} some: [1, 0.16666666666666666]
and his: 4
and