In [32]:
# Sample input in the raw diff markup, one passage per line.
#   [-text//TAG-]  marks deleted text, {+text//TAG+} marks inserted text, and a deletion
#   immediately followed by an insertion is a replacement. TAG is an error-type code
#   (PU, C, MW, AR, ... as seen below).
#   Words inside one edit are joined with U+3000 (ideographic space) so that every edit
#   stays a single token when the line is split on ASCII spaces.
test_data = '''
Hello [-.//PU-]{+!//PU+} {+I　'm//MW+} [-Fine//C-]{+fine//C+} {+,//PU+} thanks {+.//PU+} [-and//C-]{+And//C+} you ?
Line up the bottles in rows of 4 , then 3 , then 2 , then 1 {+.//PU+} Get a frisbee per [-each//D-] player and allow to take two shots on each turn .
Each pin {+knocked　down//MW+} is one point .
{+For　a//MW+} [-Strike//C-]{+strike//C+} {+,//PU+} {+the　player//MW+} gets to take two more shots and [-add//AG-]{+adds//AG+} all the points together .
The player with the most points is {+the//AR+} winner .
Honey and ginger : [-it//PS-]{+they//PS+} [-is//AG-]{+are//AG+} [-natual//SP-]{+natural//SP+} food and good for sore throats [-,　no//PU-]{+,　no//PU+} side - effects , take a [-aspoonful//SP-]{+spoonful//SP+} anytime when you need .
Garlic and Echinacea tea : drink it when {+you//PS+} have infection [-,　it//PU-]{+,　it//PU+} is simple but {+an//AR+} excellent antibiotic .
Hot mixture of vinegar , olive oil and eucalyptus : place it on aches and pains [-,　it//PU-]{+,　it//PU+} is {+a//AR+} fast and effective way to relieve aches and pains .
[-everyone//C-]{+Everyone//C+} may use it in {+his　or　her//PS+} daily life .
First , John asked Isabella not to marry him and [-giving　a　chance//XC-]{+her　to　give　him　the　chance//XC+} to prove himself to have {+the//AR+} ability to make [-the//AR-]{+a//AR+} happy life for her .
'''

In [33]:
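# 1. Resolve punctuation (PU) edit tokens to their corrected ("after") form
# 2. Simplify the edit markup to: {+word+}, [-word-], [-word>>word+}
# 3. Run sentence segmentation again
# 4. Split each sentence that has several errors into several sentences with one error each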

In [35]:
# -*- coding: utf-8 -*-
import fileinput
import re
from pprint import pprint
from nltk.tokenize import sent_tokenize

def simple_tag(tags):
    """Render one parsed edit in the simplified markup (error-type codes removed).

    Args:
        tags: mapping with at least the keys ``'d'`` (deleted text) and ``'i'``
            (inserted text); either value may be empty/``None``.

    Returns:
        ``'[-d>>i+}'`` when both texts are present, ``'[-d-]'`` for a deletion only,
        ``'{+i+}'`` for an insertion only. When neither is present a diagnostic is
        printed and ``None`` is returned.
    """
    deleted, inserted = tags['d'], tags['i']

    if not deleted and not inserted:
        # Nothing to render; callers are not expected to pass such a mapping.
        print("Should not be here in simple_tag()")
        return None

    if not deleted:
        return '{{+{i}+}}'.format(i=inserted)
    if not inserted:
        return '[-{d}-]'.format(d=deleted)
    # Replacement: deleted text followed by its correction.
    return '[-{d}>>{i}+}}'.format(d=deleted, i=inserted)

re_tag = r'(\[-(?P<d>.+)//(?P<d_tag>.+)-\])?({\+(?P<i>.+)//(?P<i_tag>.+)\+})?'
def correct_punc(line):
    """Resolve punctuation edits and simplify every other edit token.

    The line is split on single ASCII spaces and each token is parsed with
    ``re_tag`` (optional ``[-text//TAG-]`` followed by optional ``{+text//TAG+}``):

    * token without any tag        -> kept verbatim;
    * insertion tagged ``PU``      -> replaced by the inserted text, so the
      punctuation appears in its corrected form;
    * deletion tagged ``PU`` (and no ``PU`` insertion) -> dropped;
    * anything else                -> rewritten by ``simple_tag``.

    Returns the resulting tokens joined with single spaces.
    """
    pieces = []
    for raw in line.split(' '):
        parsed = re.match(re_tag, raw).groupdict()
        deleted_type, inserted_type = parsed['d_tag'], parsed['i_tag']

        if not (deleted_type or inserted_type):
            # Plain word, not an edit.
            pieces.append(raw)
            continue

        if inserted_type == 'PU':
            # str.split() with no argument splits on any Unicode whitespace,
            # including the U+3000 that joins multi-word edits, so each inserted
            # word becomes its own plain token.
            pieces.extend(parsed['i'].split())
            continue

        if deleted_type != 'PU':
            # Non-punctuation edit: keep it, in simplified markup.
            pieces.append(simple_tag(parsed))
        # else: the deleted part is punctuation and nothing is emitted.
    return ' '.join(pieces)

def restore_line_break(text):
    """Replace the HTML break tags ``<br/>``, ``<br>`` and ``<br />`` with newlines."""
    for br_tag in ('<br/>', '<br>', '<br />'):
        text = text.replace(br_tag, '\n')
    return text

def restore_xmlescape(text):
    """Undo a fixed set of XML/HTML character escapes in ``text``.

    ``&amp;`` is replaced repeatedly until none is left, so multiply-escaped input
    such as ``&amp;amp;lt;`` collapses all the way down. After that ``&quote;``,
    ``&quot;``, ``&nbsp;``, ``&lt;`` and ``&gt;`` are replaced, in that order.
    """
    # One pass can expose a new '&amp;' (e.g. '&amp;amp;'), hence the loop.
    while text.find('&amp;') != -1:
        text = text.replace('&amp;', '&')

    replacements = (
        ('&quote;', '"'),
        ('&quot;', '"'),
        ('&nbsp;', ' '),
        ('&lt;', '<'),
        ('&gt;', '>'),
    )
    for entity, char in replacements:
        text = text.replace(entity, char)
    return text

def mask_edits(text):
    """Swap edit tokens for ``str.format`` placeholders.

    Tokens (split on single spaces) that start with ``{+`` or ``[-`` are replaced by
    ``{0}``, ``{1}``, ... in order of appearance; in every other token the braces are
    doubled so that a later ``str.format`` call restores them literally.

    Returns:
        ``(masked_text, edits)`` where ``edits[n]`` is the token hidden behind ``{n}``.
    """
    hidden, out = [], []
    for piece in text.split(' '):
        if piece.startswith(('{+', '[-')):
            # The placeholder index is the position the edit will take in `hidden`.
            out.append("{{{0}}}".format(len(hidden)))
            hidden.append(piece)
            continue
        escaped = piece.replace('{', '{{')
        out.append(escaped.replace('}', '}}'))
    return ' '.join(out), hidden


def tokenize_doc(text):
    """Yield the sentences of ``text`` with edit tokens kept intact.

    The text is first cleaned with ``restore_line_break`` and ``restore_xmlescape``.
    Edit tokens are then masked (``I have {+a+} pen.`` -> ``I have {0} pen.``) so the
    sentence tokenizer cannot cut through them, each line is segmented with
    ``sent_tokenize``, and the placeholders are filled back in before a sentence is
    yielded.
    """
    cleaned = restore_xmlescape(restore_line_break(text))
    masked, edits = mask_edits(cleaned)

    for raw_line in masked.splitlines():
        stripped = raw_line.strip()
        for sentence in sent_tokenize(stripped):
            # format() puts the original edit tokens back and un-doubles plain braces.
            yield sentence.format(*edits)

def to_after(tokens):
    """Return the corrected ("after") text for a list of simplified-markup tokens.

    Deletions (``[-x-]``) vanish, insertions (``{+y+}``) and replacements
    (``[-x>>y+}``) contribute ``y``, plain tokens are kept. U+3000 inside a token is
    turned into an ordinary space. Empty results are skipped and the rest is joined
    with single spaces.
    """
    def corrected_text(token):
        token = token.replace('\u3000', ' ')
        if token.endswith('-]'):
            # Pure deletion: nothing survives in the corrected sentence.
            return None
        if not token.endswith('+}'):
            return token
        if token.startswith('[-'):
            # Replacement: keep what follows the last '>>'.
            return token[token.rfind('>>') + 2:-2]
        return token[2:-2]

    kept = []
    for token in tokens:
        text = corrected_text(token)
        if text:
            kept.append(text)
    return ' '.join(kept)

# Build `new_data`: one entry per edit. Each entry is a sentence in which exactly one
# edit is left in markup form while everything around it is shown in corrected form.
new_data = []
if __name__ == '__main__':
    for line in test_data.split('\n'):  # alternative input source: fileinput.input()
        simple_line = correct_punc(line.strip())  # punctuation edits resolved
        for sent in tokenize_doc(simple_line):
            tokens = sent.split(' ')
            for i, token in enumerate(tokens):
                if not token.startswith(('[-', '{+')):
                    continue
                # Left and right context in corrected form, the edit itself untouched.
                new_sent = ' '.join((to_after(tokens[:i]), token, to_after(tokens[i+1:])))
                new_data.append(new_sent.strip())


pprint(new_data)

["{+I\u3000'm+} fine , thanks .",
 "I 'm [-Fine>>fine+} , thanks .",
 '[-and>>And+} you ?',
 'Get a frisbee per [-each-] player and allow to take two shots on each turn .',
 'Each pin {+knocked\u3000down+} is one point .',
 '{+For\u3000a+} strike , the player gets to take two more shots and adds all '
 'the points together .',
 'For a [-Strike>>strike+} , the player gets to take two more shots and adds '
 'all the points together .',
 'For a strike , {+the\u3000player+} gets to take two more shots and adds all '
 'the points together .',
 'For a strike , the player gets to take two more shots and [-add>>adds+} all '
 'the points together .',
 'The player with the most points is {+the+} winner .',
 'Honey and ginger : [-it>>they+} are natural food and good for sore throats , '
 'no side - effects , take a spoonful anytime when you need .',
 'Honey and ginger : they [-is>>are+} natural food and good for sore throats , '
 'no side - effects , take a spoonful anytime when you need .',
 'Ho

In [None]:
# to pattern

In [4]:
import fileinput
import spacy
from spacy.tokens import Doc

In [44]:
# Load the spaCy English pipeline and swap its tokenizer for WhitespaceTokenizer, so the
# parser sees exactly the space-separated tokens produced earlier in this notebook.
# NOTE(review): WhitespaceTokenizer is defined in the cell *below* this one; on a fresh
# kernel that cell has to run first, otherwise the last line raises NameError.
# NOTE(review): the 'en' shortcut presumably only resolves on older spaCy releases --
# TODO confirm against the installed version.
nlp = spacy.load('en')
# nlp = spacy.load('en_core_web_lg')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

In [5]:
class WhitespaceTokenizer(object):
    """Tokenizer callable that cuts text at single space characters only."""

    def __init__(self, vocab):
        # Vocabulary handed to every Doc this tokenizer builds.
        self.vocab = vocab

    def __call__(self, text):
        """Build a Doc whose words are exactly ``text.split(' ')``."""
        pieces = text.split(' ')
        # Every token, including the last, is declared to be followed by a space.
        followed_by_space = [True for _ in pieces]
        return Doc(self.vocab, words=pieces, spaces=followed_by_space)


In [48]:
re_words = r'(\[-(?P<d>.+)-\]|{\+(?P<i>.+)\+}|\[-(?P<rd>.+)>>(?P<ri>.+)\+})?'
def diff2before_after(line):
    """Split a simplified-markup sentence into its original and corrected word lists.

    ``line`` is split on single spaces; U+3000 inside a token becomes an ordinary
    space, and the words of a multi-word edit are separated with ``str.split()``.

    Returns:
        ``(before, alignment_bef, after, alignment_aft)``. ``before`` / ``after`` are
        the word lists of the original and the corrected sentence. Each alignment
        dict maps a word index in its list to the index of the markup token (in
        ``line.split(' ')``) that the word came from.
    """
    before, after = [], []
    alignment_bef, alignment_aft = {}, {}

    def record(side, alignment, pieces, position):
        # Append every piece to `side`, remembering which markup token produced it.
        for piece in pieces:
            alignment[len(side)] = position
            side.append(piece)

    for position, raw in enumerate(line.split(' ')):
        token = raw.replace('\u3000', ' ')  # ideographic space -> ASCII space
        parts = re.match(re_words, token).groupdict()

        if parts['i']:                        # '{+i+}': exists only after correction
            record(after, alignment_aft, parts['i'].split(), position)
        elif parts['rd'] and parts['ri']:     # '[-rd>>ri+}': present on both sides
            record(after, alignment_aft, parts['ri'].split(), position)
            record(before, alignment_bef, parts['rd'].split(), position)
        elif parts['d']:                      # '[-d-]': exists only before correction
            record(before, alignment_bef, parts['d'].split(), position)
        else:                                 # plain word, shared by both sides
            record(before, alignment_bef, [token], position)
            record(after, alignment_aft, [token], position)
    return before, alignment_bef, after, alignment_aft

def spacy_aft(after, nlp):
    """Parse the corrected ("after") sentence and collect per-token annotations.

    Args:
        after: list of word strings; they are joined with single spaces and the
            resulting string is passed to ``nlp``.
        nlp: callable that takes the sentence string and returns an iterable of
            parsed tokens exposing ``lemma_``, ``tag_``, ``dep_``, ``head`` (with
            ``text``, ``lemma_``, ``tag_``) and ``children``.

    Returns:
        An 8-tuple of parallel lists, one entry per parsed token:
        ``(lemmas, tags, deps, heads, head_lemmas, head_tags, childs, childs_texts)``.
        ``heads`` holds the head's text, ``childs`` the list of child tokens and
        ``childs_texts`` the same children converted with ``str()``. All eight lists
        are empty when the parse yields no tokens.
    """
    doc_aft = nlp(' '.join(after))

    # Each list is built exactly once. The previous version rebuilt all of them on
    # every loop iteration and left them unbound when the parse was empty.
    lemmas = [token.lemma_ for token in doc_aft]
    tags = [token.tag_ for token in doc_aft]
    deps = [token.dep_ for token in doc_aft]
    heads = [token.head.text for token in doc_aft]
    head_lemmas = [token.head.lemma_ for token in doc_aft]
    head_tags = [token.head.tag_ for token in doc_aft]
    childs = [list(token.children) for token in doc_aft]
    childs_texts = [[str(child) for child in children] for children in childs]
    return lemmas, tags, deps, heads, head_lemmas, head_tags, childs, childs_texts


def spacy_bef(before, nlp):
    """Parse the original ("before") sentence and return its lemmas and tags.

    Args:
        before: list of word strings; they are joined with single spaces and the
            resulting string is passed to ``nlp``.
        nlp: callable that takes the sentence string and returns an iterable of
            parsed tokens exposing ``lemma_`` and ``tag_``.

    Returns:
        ``(lemmas_bef, tags_bef)``, two parallel lists with one entry per parsed
        token. Both are empty when the parse yields no tokens. The previous version
        returned from inside a loop and therefore gave ``None`` in that case, which
        breaks tuple unpacking at the call site.
    """
    doc_bef = nlp(' '.join(before))
    lemmas_bef = [token.lemma_ for token in doc_bef]
    tags_bef = [token.tag_ for token in doc_bef]
    return lemmas_bef, tags_bef


# Driver: for every single-edit sentence in `new_data`, parse the original and the
# corrected word lists and print one pattern line per dependency relation touching the
# edit. Output line kinds:
#   RepH / InsH : the edited (replaced / inserted) word together with its head
#   RepC / InsC : the edited (replaced / inserted) word together with one of its children
#   D           : a deleted word with the corrected words around it
# Word notation: '(text)lemma_TAG' for the edited word, 'TAG_lemma(text)' for the related
# word, joined by '||dep::' (towards the head) or '::dep||' (towards a child).
# A blank line is printed after each sentence.
if __name__ == '__main__':
    for line in new_data: # fileinput.input():
        line = line.strip()
        
        before_tokens, alignment_bef, after_tokens, alignment_aft = diff2before_after(line)
        # NOTE(review): when one of the lists is empty its parse results are not
        # refreshed, so the names below keep the previous sentence's values (or are
        # undefined on the first sentence).
        if after_tokens:
            lemmas, tags, deps, heads, head_lemmas, head_tags, childs, childs_texts = spacy_aft(after_tokens, nlp)
        if before_tokens:
            lemmas_bef, tags_bef = spacy_bef(before_tokens, nlp)

        for i, token in enumerate(after_tokens):
            # Markup token that corrected word i came from.
            diff_token = line.split(' ')[alignment_aft[i]]
#             print(diff_token)
            # Candidate positions of word i's head, found by comparing *text*; a word
            # that occurs several times in the sentence yields several candidates.
            head_is = [head_i for head_i, token in enumerate(after_tokens) if token == heads[i]]
#             print(head_is)
            # --- word i is itself part of an insertion/replacement: report its head ---
            if diff_token.endswith('+}'):
                edit_spacy = '(' + after_tokens[i] + ')' + lemmas[i] + '_' + tags[i]
                head_spacy = head_tags[i] + '_' +  head_lemmas[i] + '('+ heads[i] + ')'
                edit_head_dep = '||' + deps[i] + '::'
                if diff_token.startswith('[-'):
                    # Replacement: one line per original word aligned to the same markup token.
                    for j, token in enumerate(before_tokens):
                        if alignment_bef[j] == alignment_aft[i]:
                            edit_bef_spacy = '(' + before_tokens[j] + ')' + lemmas_bef[j] + '_' + tags_bef[j]
                            print('RepH: ' + edit_spacy + '[' + edit_bef_spacy + ']' + edit_head_dep + head_spacy)
                else: 
                    print('InsH: ' + edit_spacy + edit_head_dep + head_spacy)
            
            # --- deletions: look for a '[-x-]' markup token directly after word i's token ---
            # (this loop rebinds `diff_token`; the value computed above is not used again)
            for k, diff_token in enumerate(line.split(' ')):
                pre_token_spacys = []
                post_token_spacys = []
                if diff_token.endswith('-]'):
                    if alignment_aft[i]+1 == k:
                        # NOTE(review): `i > 1` leaves out the preceding-word context when
                        # i is 0 or 1 -- confirm whether a different bound was intended.
                        if i > 1:
                            pre_spacy = '(' + after_tokens[i] + ')' + lemmas[i] + '_' + tags[i]
                            pre_head_spacy = head_tags[i] + '_' +  head_lemmas[i] + '('+ heads[i] + ')'
                            pre_head_dep = '||' + deps[i] + '::'
                            pre_token_spacys.append(pre_spacy + pre_head_dep + pre_head_spacy)
                            
                        # Corrected word that follows the deletion, if there is one.
                        if i < len(after_tokens)-1:
                            post_spacy = '(' + after_tokens[i+1] + ')' + lemmas[i+1] + '_' + tags[i+1]
                            post_head_spacy = head_tags[i+1] + '_' +  head_lemmas[i+1] + '('+ heads[i+1] + ')'
                            post_head_dep = '||' + deps[i+1] + '::'
                            post_token_spacys.append(post_spacy + post_head_dep + post_head_spacy)
                            
                        # One 'D:' line per original word that belongs to the deleted token.
                        for j, token in enumerate(before_tokens):
                            if alignment_bef[j] == k:
                                edit_spacy = '(' + before_tokens[j] + ')' + lemmas_bef[j] + '_' + tags_bef[j]
                                print('D: ' + ''.join(pre_token_spacys) + '[' + edit_spacy + ']' + ''.join(post_token_spacys))


            # --- word i's head is an insertion/replacement: report word i as its child ---
            for head_i in head_is:
                # Keep only candidates whose children (by text) actually include word i.
                if after_tokens[i] in childs_texts[head_i]:  
                    diff_token_h = line.split(' ')[alignment_aft[head_i]]
                    if diff_token_h.endswith('+}'):
                        edit_spacy = '(' + after_tokens[head_i] + ')' + lemmas[head_i] + '_' + tags[head_i]
                        child_spacy = tags[i] + '_' + lemmas[i] + '('+ after_tokens[i] + ')'
                        edit_child_dep = '::' + deps[i] + '||'
                        if diff_token_h.startswith('[-'):
                            for j, token in enumerate(before_tokens):
                                if alignment_bef[j] == alignment_aft[head_i]:
                                    edit_bef_spacy = '(' + before_tokens[j] + ')' + lemmas_bef[j] + '_' + tags_bef[j]
                                    # return 'replace; child'
                                    print('RepC: ' + edit_spacy + '[' + edit_bef_spacy + ']' + edit_child_dep + child_spacy)
                        else:
                            # return 'insert; child'
                            print('InsC: ' + edit_spacy + edit_child_dep + child_spacy)
        print('')



InsH: (I)-PRON-_PRP||nsubj::VBP_'m('m)
InsC: ('m)'m_VBP::nsubj||PRP_-PRON-(I)
InsH: ('m)'m_VBP||ROOT::VBP_'m('m)
InsC: ('m)'m_VBP::acomp||JJ_fine(fine)
InsC: ('m)'m_VBP::punct||,_,(,)
InsC: ('m)'m_VBP::npadvmod||NNS_thank(thanks)
InsC: ('m)'m_VBP::punct||._.(.)

RepH: (fine)fine_JJ[(Fine)fine_JJ]||acomp::VBP_'m('m)

RepH: (And)and_CC[(and)and_CC]||cc::PRP_-PRON-(you)

D: (per)per_IN||prep::NN_frisbee(frisbee)[(each)each_DT](player)player_NN||pobj::IN_per(per)

InsC: (knocked)knock_VBD::nsubj||NN_pin(pin)
InsH: (knocked)knock_VBD||csubj::VBZ_be(is)
InsH: (down)down_RP||prt::VBD_knock(knocked)
InsC: (knocked)knock_VBD::prt||RP_down(down)

InsH: (For)for_IN||prep::VBZ_get(gets)
InsH: (a)a_DT||det::NN_strike(strike)
InsC: (For)for_IN::pobj||NN_strike(strike)

RepC: (strike)strike_NN[(Strike)strike_NN]::det||DT_a(a)
RepH: (strike)strike_NN[(Strike)strike_NN]||pobj::IN_for(For)

InsH: (the)the_DT||det::NN_player(player)
InsC: (player)player_NN::det||DT_the(the)
InsH: (player)player_NN||nsubj