In [51]:
# Sample annotated learner sentences used as input by the cells below.
# Markup, as parsed by re_tag in correct_punc():
#   [-text//TAG-]   deleted text
#   {+text//TAG+}   inserted text
#   a deletion immediately followed by an insertion is a replacement
# TAG is the error-type code after "//" (e.g. PU, which correct_punc() treats
# specially). Multi-word edits are joined with U+3000 (ideographic space),
# presumably so that each edit stays one space-delimited token; later stages
# turn U+3000 back into a normal space.
test_data = '''
Hello [-.//PU-]{+!//PU+} {+I　'm//MW+} [-Fine//C-]{+fine//C+} {+,//PU+} thanks {+.//PU+} [-and//C-]{+And//C+} you ?
Line up the bottles in rows of 4 , then 3 , then 2 , then 1 {+.//PU+} Get a frisbee per [-each//D-] player and allow to take two shots on each turn .
Each pin {+knocked　down//MW+} is one point .
{+For　a//MW+} [-Strike//C-]{+strike//C+} {+,//PU+} {+the　player//MW+} gets to take two more shots and [-add//AG-]{+adds//AG+} all the points together .
The player with the most points is {+the//AR+} winner .
Honey and ginger : [-it//PS-]{+they//PS+} [-is//AG-]{+are//AG+} [-natual//SP-]{+natural//SP+} food and good for sore throats [-,　no//PU-]{+,　no//PU+} side - effects , take a [-aspoonful//SP-]{+spoonful//SP+} anytime when you need .
Garlic and Echinacea tea : drink it when {+you//PS+} have infection [-,　it//PU-]{+,　it//PU+} is simple but {+an//AR+} excellent antibiotic .
Hot mixture of vinegar , olive oil and eucalyptus : place it on aches and pains [-,　it//PU-]{+,　it//PU+} is {+a//AR+} fast and effective way to relieve aches and pains .
[-everyone//C-]{+Everyone//C+} may use it in {+his　or　her//PS+} daily life .
First , John asked Isabella not to marry him and [-giving　a　chance//XC-]{+her　to　give　him　the　chance//XC+} to prove himself to have {+the//AR+} ability to make [-the//AR-]{+a//AR+} happy life for her .
'''


In [52]:
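# 1. Resolve punctuation (PU) edit tokens to their corrected ("after") form
# 2. Simplify the edit markup to:  {+word+}, [-word-], [-word>>word+}
# 3. Run sentence segmentation again
# 4. Split a sentence containing several errors into several sentences, each containing exactly one error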

In [63]:
# -*- coding: utf-8 -*-
import fileinput
import re
from pprint import pprint
from nltk.tokenize import sent_tokenize

def simple_tag(tags):
    """Render a parsed edit in the simplified markup.

    Args:
        tags: mapping with keys ``'d'`` (deleted text) and ``'i'``
            (inserted text); either may be falsy when absent.

    Returns:
        ``'[-d>>i+}'`` for a replacement, ``'[-d-]'`` for a deletion,
        ``'{+i+}'`` for an insertion. When neither text is present a
        diagnostic is printed and ``None`` is returned.
    """
    deleted = tags['d']
    if deleted:
        inserted = tags['i']
        if inserted:
            # both sides present: deletion >> insertion
            return '[-{d}>>{i}+}}'.format(d=deleted, i=inserted)
        return '[-{d}-]'.format(d=deleted)

    inserted = tags['i']
    if inserted:
        return '{{+{i}+}}'.format(i=inserted)

    print("Should not be here in simple_tag()")

# Parses one annotated token: optional [-deleted//TAG-] followed by optional {+inserted//TAG+}.
re_tag = r'(\[-(?P<d>.+)//(?P<d_tag>.+)-\])?({\+(?P<i>.+)//(?P<i_tag>.+)\+})?'
def correct_punc(line):
    """Apply punctuation (PU) edits and simplify every other edit token.

    The line is split on single spaces and each token is handled as follows:
      * no edit markup          -> kept unchanged;
      * inserted side tagged PU -> replaced by the inserted text only;
      * deleted side tagged PU (and no PU insertion) -> dropped;
      * anything else           -> rewritten by simple_tag().

    Returns the resulting tokens joined by single spaces.
    """
    kept = []
    for word in line.split(' '):
        parsed = re.match(re_tag, word).groupdict()
        deleted_type, inserted_type = parsed['d_tag'], parsed['i_tag']

        if not (deleted_type or inserted_type):
            # plain word, no edit markup
            kept.append(word)
            continue

        if inserted_type == 'PU':
            # PU errors are not tracked: keep only the corrected punctuation.
            # split() with no argument breaks on any whitespace, so a
            # multi-word insertion contributes one token per word.
            kept.extend(parsed['i'].split())
            continue

        if deleted_type != 'PU':
            # a real (non-punctuation) edit: keep it in simplified form
            kept.append(simple_tag(parsed))

    return ' '.join(kept)

def restore_line_break(text):
    """Return ``text`` with ``<br/>``, ``<br>`` and ``<br />`` replaced by newlines."""
    for br_tag in ('<br/>', '<br>', '<br />'):
        text = text.replace(br_tag, '\n')
    return text

def restore_xmlescape(text):
    """Undo XML/HTML entity escaping in ``text``.

    ``&amp;`` is collapsed repeatedly until none is left (so ``&amp;amp;``
    also becomes ``&``), then ``&quote;``, ``&quot;``, ``&nbsp;``, ``&lt;``
    and ``&gt;`` are each replaced once, in that order.
    """
    # Keep unescaping ampersands: one pass can expose another "&amp;".
    while text.find('&amp;') != -1:
        text = text.replace('&amp;', '&')

    replacements = (
        ('&quote;', '"'),
        ('&quot;', '"'),
        ('&nbsp;', ' '),
        ('&lt;', '<'),
        ('&gt;', '>'),
    )
    for entity, char in replacements:
        text = text.replace(entity, char)
    return text

def mask_edits(text):
    """Replace edit tokens with ``str.format`` placeholders.

    ``text`` is split on single spaces. Every token starting with ``{+`` or
    ``[-`` is swapped for ``{n}`` (``n`` = its position in the returned edit
    list); braces in all other tokens are doubled so that a later
    ``masked.format(*edits)`` reproduces the original text.

    Returns:
        ``(masked_text, edits)``.
    """
    edits = []
    masked = []
    for piece in text.split(' '):
        if piece.startswith(('{+', '[-')):
            masked.append("{{{0}}}".format(len(edits)))
            edits.append(piece)
            continue
        # escape literal braces so format() leaves them alone
        masked.append(piece.replace('{', '{{').replace('}', '}}'))
    return ' '.join(masked), edits


def tokenize_doc(text):
    """Yield the sentences of ``text`` with edit tokens left intact.

    Line-break tags and XML escapes are restored first. Edit tokens are then
    masked as ``{n}`` placeholders so the sentence tokenizer does not split
    them (``I have {+a+} pen.`` becomes ``I have {0} pen.``), each line is
    segmented with ``sent_tokenize``, and the placeholders are filled back
    in before a sentence is yielded.
    """
    cleaned = restore_xmlescape(restore_line_break(text))
    masked, edits = mask_edits(cleaned)

    for raw_line in masked.splitlines():
        sentences = sent_tokenize(raw_line.strip())
        # unmask: put the original edit tokens back into each sentence
        yield from (sentence.format(*edits) for sentence in sentences)

def to_after(tokens):
    """Build the corrected ("after") text for a sequence of tokens.

    For each token, U+3000 is first turned into a normal space. Then:
      * a token ending in ``-]`` (deletion) is dropped;
      * a token ending in ``+}`` yields its inserted text: the part after the
        last ``>>`` for a ``[-old>>new+}`` replacement, otherwise the part
        between ``{+`` and ``+}``;
      * any other token is kept as is.

    Empty results are skipped; the rest are joined with single spaces.
    """
    corrected = []
    for raw in tokens:
        word = raw.replace('\u3000', ' ')
        if word.endswith('-]'):
            continue
        if word.endswith('+}'):
            if word.startswith('[-'):
                start = word.rfind('>>') + 2
            else:
                start = 2
            word = word[start:-2]
        if word:
            corrected.append(word)
    return ' '.join(corrected)

# One entry per edit: the sentence with that single edit left in markup form
# and every other edit already applied.
new_data = []
if __name__ == '__main__':
    for line in test_data.split('\n'):  # alternatively: fileinput.input()
        # resolve PU edits and simplify the remaining edit markup
        simple_line = correct_punc(line.strip())
        for sent in tokenize_doc(simple_line):
            tokens = sent.split(' ')
            for i, token in enumerate(tokens):
                if not token.startswith(('[-', '{+')):
                    continue
                # keep only this edit; render both sides in corrected form
                new_sent = ' '.join([to_after(tokens[:i]), token, to_after(tokens[i+1:])])
                new_data.append(new_sent.strip())


# pprint(new_data)

In [54]:
import fileinput
import spacy
from spacy.tokens import Doc

In [55]:
class WhitespaceTokenizer(object):
    """Tokenizer that splits text on single space characters only.

    Instances are callable: given a string they return a ``Doc`` built on
    the vocabulary passed to the constructor.
    """

    def __init__(self, vocab):
        """Store the vocabulary used to build each ``Doc``."""
        self.vocab = vocab

    def __call__(self, text):
        """Split ``text`` on ``' '`` and wrap the pieces in a ``Doc``."""
        pieces = text.split(' ')
        # Every token, including the last one, is marked as followed by a space.
        trailing_space = [True for _ in pieces]
        return Doc(self.vocab, words=pieces, spaces=trailing_space)

In [56]:
# Load the spaCy pipeline registered under the name 'en'.
# NOTE(review): 'en' looks like a shortcut name; newer spaCy releases may
# require the full package name instead - confirm against the installed version.
nlp = spacy.load('en')
# nlp = spacy.load('en_core_web_lg')
# Replace the default tokenizer so that text is split on single spaces only;
# the token positions then match the space-joined word lists fed to nlp().
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

In [62]:
# Captures the words of a simplified edit token: [-d-], {+i+} or [-rd>>ri+}.
re_words = r'(\[-(?P<d>.+)-\]|{\+(?P<i>.+)\+}|\[-(?P<rd>.+)>>(?P<ri>.+)\+})?'
def correct(origin_tokens):
    """Apply the edits in ``origin_tokens`` and record what was changed.

    U+3000 in a token is turned into a normal space before parsing.

    Returns:
        ``(correct_tokens, pairs)`` where ``correct_tokens`` is the list of
        words after applying every edit and ``pairs`` is a list of
        ``(edit_type, old_word, new_word, index)`` tuples:
          * ``('Replace', old, new, index)`` for every combination of an old
            word and a new word of a replacement;
          * ``('Insert', None, new, index)`` for every inserted word;
          * ``('Delete', deleted_text, None, index)`` once per deletion.
        ``index`` is the length of ``correct_tokens`` at the moment the pair
        is recorded, i.e. the position of the new word (or, for a deletion,
        of whatever word comes next).
    """
    fixed, edits = [], []
    for raw in origin_tokens:
        word = raw.replace('\u3000', ' ')
        groups = re.match(re_words, word).groupdict()
        replaced, replacement = groups['rd'], groups['ri']

        if replaced and replacement:
            old_words = replaced.split()
            for new_word in replacement.split():
                position = len(fixed)  # slot the new word is about to occupy
                edits.extend([('Replace', old_word, new_word, position) for old_word in old_words])
                fixed.append(new_word)
        elif groups['i']:
            for new_word in groups['i'].split():
                edits.append(('Insert', None, new_word, len(fixed)))
                fixed.append(new_word)
        elif groups['d']:
            edits.append(('Delete', groups['d'], None, len(fixed)))
        else:
            fixed.append(word)

    return fixed, edits

    
def format_edit(edit):
    """Format one edit as a multi-line text report.

    Args:
        edit: ``(edit_type, origin_token, new_token, correct_token,
            correct_sent)``. ``correct_token`` must expose ``text``,
            ``lemma_``, ``tag_``, ``dep_``, ``head`` and ``children``;
            ``correct_sent`` is rendered with ``str.format``.

    Returns:
        A string listing the edit, the sentence, the target token and its
        head, then one ``Child:`` row per child of the target. When
        ``edit_type`` starts with ``"Delete"``, ``origin_token`` is run
        through ``nlp`` and one ``Delete:`` row per resulting token is
        appended.
    """
    edit_type, origin_token, new_token, correct_token, correct_sent = edit
    head_token = correct_token.head

    fields = {
        'edit_type': edit_type,
        'origin_token': origin_token,
        'new_token': new_token,
        'correct_sent': correct_sent,
        'target_token': correct_token.text,
        'target_token_lemma': correct_token.lemma_,
        'target_token_tag': correct_token.tag_,
        'target_token_dep': correct_token.dep_,
        'head': head_token.text,
        'head_lemma': head_token.lemma_,
        'head_tag': head_token.tag_,
    }
    header = '''
        ({edit_type}) {origin_token}\t->\t{new_token}
        Sent:\t{correct_sent}
        
        \tToken\tLemma\tTag\tDep(to head)
        Head:\t{head}\t{head_lemma}\t{head_tag}
        Target:\t{target_token}\t{target_token_lemma}\t{target_token_tag}\t{target_token_dep}'''.format(**fields)

    sections = [header]

    # one row per syntactic child of the target token
    child_row = "\n\tChild:\t{child}\t{child_lemma}\t{child_tag}\t{child_dep}"
    for kid in correct_token.children:
        sections.append(child_row.format(
            child=kid.text, child_lemma=kid.lemma_, child_tag=kid.tag_, child_dep=kid.dep_))

    if edit_type.startswith("Delete"):
        # the deleted text is not part of the corrected sentence, so it is
        # analysed on its own
        deleted_row = "\n\tDelete:\t{ot}\t{ot_lemma}\t{ot_tag}"
        for piece in nlp(origin_token):
            sections.append(deleted_row.format(
                ot=piece.text, ot_lemma=piece.lemma_, ot_tag=piece.tag_))

    return ''.join(sections)
        
        
if __name__ == "__main__":
    # Collect one report entry per recorded edit of every single-edit sentence.
    all_edits = []
    for line in new_data: # fileinput.input():
        origin_tokens = line.strip().split(' ')
        correct_tokens, pairs = correct(origin_tokens)
        if not correct_tokens or not pairs: continue # skip no edit or empty string

        # Parse the corrected sentence; the whitespace tokenizer keeps token
        # positions aligned with the indices recorded in `pairs`.
        correct_tokens = nlp(' '.join(correct_tokens))
        for pair in pairs:
            edit_type, origin_token, new_token, index = pair
            if edit_type == "Delete":
                # A deleted word has no token of its own in the corrected
                # sentence, so report its neighbours instead. `index` is the
                # position of the token that FOLLOWS the deletion point.
                # The preceding token exists only when index > 0 (otherwise
                # index-1 would wrap around to the last token), and the
                # following token exists only when index is inside the
                # sentence (otherwise indexing raises IndexError).
                if index > 0:
                    all_edits.append((edit_type + "/pre", origin_token, new_token, correct_tokens[index-1], correct_tokens.doc))
                if index < len(correct_tokens):
                    all_edits.append((edit_type + "/post", origin_token, new_token, correct_tokens[index], correct_tokens.doc))
            else:
                all_edits.append((edit_type, origin_token, new_token, correct_tokens[index], correct_tokens.doc))

    for edit in all_edits:
        print("====================================")
        print(format_edit(edit))


        (Insert) None	->	I
        Sent:	I 'm fine , thanks . 
        
        	Token	Lemma	Tag	Dep(to head)
        Head:	'm	'm	VBP
        Target:	I	-PRON-	PRP	nsubj

        (Insert) None	->	'm
        Sent:	I 'm fine , thanks . 
        
        	Token	Lemma	Tag	Dep(to head)
        Head:	'm	'm	VBP
        Target:	'm	'm	VBP	ROOT
	Child:	I	-PRON-	PRP	nsubj
	Child:	fine	fine	JJ	acomp
	Child:	,	,	,	punct
	Child:	thanks	thank	NNS	npadvmod
	Child:	.	.	.	punct

        (Replace) Fine	->	fine
        Sent:	I 'm fine , thanks . 
        
        	Token	Lemma	Tag	Dep(to head)
        Head:	'm	'm	VBP
        Target:	fine	fine	JJ	acomp

        (Replace) and	->	And
        Sent:	And you ? 
        
        	Token	Lemma	Tag	Dep(to head)
        Head:	you	-PRON-	PRP
        Target:	And	and	CC	cc

        (Delete/pre) each	->	None
        Sent:	Get a frisbee per player and allow to take two shots on each turn . 
        
        	Token	Lemma	Tag	Dep(to head)
        Head:	frisbee	frisbee	NN
  