In [28]:
# Read the whole dump and split it on the '=' separator line.
# The context manager closes the handle as soon as the text has been read;
# `fs` stays bound (as a closed file) for any later cell that refers to it.
with open('test.txt', 'r', encoding='utf8') as fs:
    blocks = fs.read().split("====================================")

In [69]:
# Turn one tab-separated row into a JSON-like record
def line2obj(line):
    """Parse one tab-separated row into ``(label, fields)``.

    Args:
        line: A single row whose columns are separated by tabs. Column 0 is
            the row label; its last character is dropped (presumably a
            trailing ':' such as in ``Head:`` -- confirm against the data).

    Returns:
        A tuple ``(label, fields)`` where ``fields`` is a dict with the keys
        ``'token'``, ``'lemma'``, ``'tag'`` and ``'dep'`` taken from columns
        1-4. ``'dep'`` is ``None`` when the row has no fifth column.

    Raises:
        IndexError: If the row has fewer than four columns.
    """
    col = line.split('\t')
    label = col[0][:-1]
    # Take the dependency whenever column 4 exists. An exact `== 5` check would
    # discard it for rows that carry an extra trailing column.
    dep = col[4] if len(col) > 4 else None
    return label, {
        'token': col[1],
        'lemma': col[2],
        'tag': col[3],
        'dep': dep,
    }

# Convert one edit section into a JSON-like dict
def parse_edit(edit):
    """Parse one edit section into a dict keyed by row label.

    The column-header row is removed first, then every remaining non-blank
    line is parsed with ``line2obj``.

    Args:
        edit: Text of one edit section, one tab-separated row per line.

    Returns:
        A dict that always has ``'Head'`` and ``'Target'`` (a field dict, or
        ``{}`` if no such row was seen) and ``'Child'`` and ``'Delete'``
        (lists of field dicts, one per row). A row with any other label is
        stored under that label, the last such row winning.
    """
    edit = edit.replace("\tToken\tLemma\tTag\tDep(to head)\n", "")

    template = {'Head': {}, 'Target': {}, 'Child': [], 'Delete': []}
    for line in edit.split('\n'):
        # Blank lines (trailing newline, stray empty section) carry no columns
        # and would make line2obj fail with IndexError.
        if not line.strip():
            continue
        node, content = line2obj(line)
        if node in ('Child', 'Delete'):
            template[node].append(content)
        else:
            template[node] = content

    return template

# Get the edit type and the sentence
def parse_info(meta):
    """Pull the edit type and the sentence out of a block's first section.

    Args:
        meta: Text with at least two lines. The edit type is the first
            space-delimited word of line one with its first and last
            characters removed (presumably surrounding brackets -- confirm
            against the data). The sentence is the second tab-separated
            field of line two.

    Returns:
        A tuple ``(edit_type, sent)``.

    Raises:
        IndexError: If there is no second line, or it has no tab.
    """
    lines = meta.split('\n')
    first_word = lines[0].split(' ')[0]
    sentence_fields = lines[1].split('\t')
    return first_word[1:-1], sentence_fields[1]

In [70]:
# Build one record per non-empty block: the first section holds the edit type
# and the sentence, every following section is one parsed edit.
all_edits = []
for block in blocks:
    block = block.strip()
    if not block:
        # Only whitespace between two separators: nothing to parse.
        continue

    sections = block.split('\n\n')
    edit_type, sent = parse_info(sections[0])

    all_edits.append(dict(
        edit_type=edit_type,
        sent=sent,
        edits=list(map(parse_edit, sections[1:])),
    ))
        
# pprint(all_edits)

In [71]:
def search(edit_type_q=None, head_q=None, target_q=None, child_q=None, delete_q=None):
    """Return the groups in the module-level ``all_edits`` that pass every filter.

    A filter that is ``None`` or empty is not applied. A node query is a dict
    of ``field -> value``; a node satisfies it when it has every queried field
    with an equal value. A group passes a node filter when at least one node
    in the corresponding slot, across all of its edits, satisfies the query.

    Args:
        edit_type_q: Keep only groups whose ``'edit_type'`` equals this value.
        head_q: Query matched against each edit's ``'Head'`` node.
        target_q: Query matched against each edit's ``'Target'`` node.
        child_q: Query matched against every node in each edit's ``'Child'`` list.
        delete_q: Query matched against every node in each edit's ``'Delete'`` list.

    Returns:
        A new list of the matching groups, in their original order.
    """
    def satisfies(node, query):
        # A missing field is a non-match rather than a KeyError: a slot can be
        # an empty dict when the edit had no such row.
        return all(key in node and node[key] == wanted for key, wanted in query.items())

    def single(group, slot):
        # One node per edit (Head / Target).
        return [edit[slot] for edit in group['edits']]

    def flattened(group, slot):
        # Slots that hold a list per edit (Child / Delete), flattened.
        return [node for edit in group['edits'] for node in edit[slot]]

    groups = all_edits
    if edit_type_q:
        groups = [g for g in groups if g['edit_type'] == edit_type_q]

    filters = (
        (head_q, single, 'Head'),
        (target_q, single, 'Target'),
        (child_q, flattened, 'Child'),
        (delete_q, flattened, 'Delete'),
    )
    for query, collect, slot in filters:
        if query:
            groups = [g for g in groups
                      if any(satisfies(node, query) for node in collect(g, slot))]

    return list(groups)


In [76]:
# An empty target query is falsy, so no filter is applied and every parsed group is returned.
search(target_q={})

[{'edit_type': 'Insert',
  'edits': [{'Child': [],
    'Delete': [],
    'Head': {'dep': None, 'lemma': "'m", 'tag': 'VBP', 'token': "'m"},
    'Target': {'dep': 'nsubj', 'lemma': '-PRON-', 'tag': 'PRP', 'token': 'I'}},
   {'Child': [{'dep': 'nsubj', 'lemma': '-PRON-', 'tag': 'PRP', 'token': 'I'},
     {'dep': 'acomp', 'lemma': 'fine', 'tag': 'JJ', 'token': 'fine'},
     {'dep': 'punct', 'lemma': ',', 'tag': ',', 'token': ','},
     {'dep': 'npadvmod', 'lemma': 'thank', 'tag': 'NNS', 'token': 'thanks'},
     {'dep': 'punct', 'lemma': '.', 'tag': '.', 'token': '.'}],
    'Delete': [],
    'Head': {'dep': None, 'lemma': "'m", 'tag': 'VBP', 'token': "'m"},
    'Target': {'dep': 'ROOT', 'lemma': "'m", 'tag': 'VBP', 'token': "'m"}}],
  'sent': "{+I\u3000'm+} fine , thanks ."},
 {'edit_type': 'Replace',
  'edits': [{'Child': [],
    'Delete': [],
    'Head': {'dep': None, 'lemma': "'m", 'tag': 'VBP', 'token': "'m"},
    'Target': {'dep': 'acomp', 'lemma': 'fine', 'tag': 'JJ', 'token': 'fine'