In [1]:
import os
import json
import numpy as np

In [10]:
# Directory that receives the processed dataset files.
hyperace05_processed_path = "./datasets/hyperace05"
# makedirs(..., exist_ok=True) keeps this cell re-runnable and also creates a
# missing "./datasets" parent; plain os.mkdir raises in both situations.
os.makedirs(hyperace05_processed_path, exist_ok=True)

In [18]:
# Label vocabularies: one fixed entity label; the relation and qualifier sets start empty.
entity_set, relation_set, qualifier_set = {"Entity"}, set(), set()

In [19]:
def _convert_hyperfact(hyperfact, relation_labels, qualifier_labels):
    """Convert one raw record into a list of output documents.

    One document is produced per entry of ``hyperfact['event']`` that has at
    least three fields: ``item[0][1]`` is used as the event type, ``item[1]``
    as the head span, ``item[2]`` as the tail span and ``item[3:]`` as the
    qualifier attributes. Every span offset is reduced by ``s_start``
    (presumably the sentence's start offset in the document -- TODO confirm).

    The record itself is not modified: shifted spans are built as new lists,
    so entity offsets stay correct when a record holds more than one event.

    Args:
        hyperfact: dict decoded from one line of the raw file.
        relation_labels: set that receives every '[r]'-prefixed event type.
        qualifier_labels: set that receives every '[q]'-prefixed attribute label.

    Returns:
        List of dicts with the keys 'sentences', 'ner', 'relations',
        'clusters' and 'doc_key'.
    """
    sentence = hyperfact.get('sentence')
    s_start = hyperfact.get('s_start')

    # All entities of the record, re-based and relabelled to the single 'Entity' type.
    ners = [
        [entity[0] - s_start, entity[1] - s_start, 'Entity', *entity[3:]]
        for entity in hyperfact.get('ner') or []
    ]

    documents = []
    for item in hyperfact.get('event') or []:
        if len(item) < 3:
            continue  # needs at least type, head and tail

        event_type = '[r]' + item[0][1]
        relation_labels.add(event_type)

        head, tail = item[1], item[2]
        attributes = [
            [attribute[0] - s_start, attribute[1] - s_start, '[q]' + attribute[2], *attribute[3:]]
            for attribute in item[3:]
        ]
        qualifier_labels.update(attribute[2] for attribute in attributes)

        relation = [
            head[0] - s_start, head[1] - s_start,
            tail[0] - s_start, tail[1] - s_start,
            event_type,
            attributes,
        ]
        documents.append({
            'sentences': [sentence],
            'ner': [ners],
            'relations': [[relation]],
            'clusters': [],
            'doc_key': "",
        })
    return documents


# A tuple (not a set) so the splits are always processed in the same order.
for dataset in ("train", "dev", "test"):
    hyperace05_raw_file = f"./datasets/raw/hyperace05-raw/{dataset}_convert.json"
    hyperace05_processed_file = f"./datasets/hyperace05/{dataset}.json"
    # The input is opened first so a missing raw file does not leave an
    # empty, truncated output file behind.
    with open(hyperace05_raw_file, 'r', encoding='utf-8') as rf, \
            open(hyperace05_processed_file, 'w', encoding='utf-8') as pf:
        for line in rf:
            if not line.strip():
                continue  # tolerate blank lines in the JSON-lines input
            hyperfact = json.loads(line)
            # One output line per event of the record.
            for output in _convert_hyperfact(hyperfact, relation_set, qualifier_set):
                pf.write(json.dumps(output, ensure_ascii=False) + '\n')

In [23]:
# Build the label vocabulary: 'id' maps each label string to an integer id,
# and 'entity' / 'relation' / 'qualifier' list the ids belonging to each group.
label = {
    'id': {'None': 0},  # id 0 is reserved for the 'None' label
    'symmetric': [],
    'asymmetric': [],
    'entity': [],
    'relation': [],
    'qualifier': [],
}

number = 0
# Iteration order of a set of strings can change between interpreter runs
# (hash randomisation), which would make the ids in label.json differ from
# run to run. Sorting each group makes the assignment reproducible.
for group, names in (
    ('entity', entity_set),
    ('relation', relation_set),
    ('qualifier', qualifier_set),
):
    for name in sorted(names):
        number += 1
        label['id'][name] = number
        label[group].append(number)

# One more than the highest assigned id, i.e. the total number of ids including 'None'.
label['q_num_logit'] = number + 1

hyperace05_label_file = "./datasets/hyperace05/label.json"
with open(hyperace05_label_file, 'w', encoding='utf-8') as lf:
    json.dump(label, lf, ensure_ascii=False, indent=2, separators=(',', ':'))