In [1]:
import os
import json

# Entity Linking Benchmarks

In [2]:
input_dir = "/harddisk/user/keminglu/evaluation_corpus/benchmarks"
output_dir = "/harddisk/user/keminglu/evaluation_corpus/processed_benchmarks"

In [13]:
with open(os.path.join(input_dir, "genre", "aida-dev-kilt.jsonl")) as f:
    data = [json.loads(line) for line in f]

In [14]:
def transform_train(sample):
    target = f"{{\"entities\": [{{\"mention\": \"{sample['meta']['mention']}\", \"title\": \"{sample['output'][0]['answer']}\"}}], \"triplets\": []}}"
    output = {
        'rephrased_input': '"' + sample['input'].replace("[START_ENT] ", "").replace(" [END_ENT]", "").strip() + '."' + "\n\n" + "Please extract 1 entity from the context.",
        'target': target,
        'id': sample['id'],
    }
    return output

In [15]:
with open(os.path.join(output_dir, "aida-dev-kilt"), "w") as f:
    for sample in data:
        f.write(json.dumps(transform_train(sample)) + "\n")

In [3]:
def transform(record, data_file):
    context = record['input'].replace("[START_ENT] ", "").replace(" [END_ENT]", "").strip()
    prompt = '{"entities": [{"mention": "%s",' % record['meta']['mention']
    return {
        "dataset": data_file,
        "rephrased_inputs": f"\"{context}\"\n\nPlease identify 1 entity in the context." + " " + prompt,
        "prompt": prompt,
        "true": [each['answer'] for each in record['output']],
        "orig_id": record["id"]
    }

In [4]:
data_files = [
    "ace2004-test-kilt.jsonl",
    "aida-test-kilt.jsonl",
    "aquaint-test-kilt.jsonl",
    "clueweb-test-kilt.jsonl",
    "msnbc-test-kilt.jsonl",
]
all_ie_data = []
for data_file in data_files:
    input_path = os.path.join(input_dir, "entity_linking", data_file)
    for line in open(input_path).readlines():
        processed_line = transform(json.loads(line), data_file) 
        all_ie_data.append(processed_line)

In [5]:
with open(os.path.join(output_dir, "entity_linking", "all_ie_data"), "w") as f:
    for line in all_ie_data:
        f.write(json.dumps(line) + "\n")

In [110]:
with open(os.path.join(input_dir, "entity_linking", "msnbc-test-kilt.jsonl")) as f:
    data = [json.loads(line) for line in f.readlines()]
transform(data[320])

{'input_text': '"30 panda cubs born in China in 2006 Current total of giant pandas bred in captivity now 217 BEIJING A mini baby boom last year has pushed up the number of pandas bred in captivity in China to 217 state media said Wednesday Some 34 pandas were born by artificial insemination in 2006 and 30 survived both record numbers for the endangered species Cao Qingyao a spokesman for the State Forestry Administration was quoted as saying by the Xinhua News Agency The previous record was the 21 baby pandas born in China s zoos and breeding centers in 2005 China has been raising pandas through artificial insemination for nearly 50 years mostly at two research facilities in the southwestern province of Sichuan In 2006 17 cubs were born at the Wolong Giant Panda Protection and Research Center and 12 at the Chengdu Research Base The other panda was bred at the zoo in the southwestern city of Chongqing The panda is one of the world s rarest animals with about 1 590 living in the wild in 

# Entity Typing

In [5]:
data_file = "/harddisk/user/keminglu/evaluation_corpus/benchmarks/open_entity/crowd/test.json"

In [6]:
def transform(record):
    return {
        'input_text': " ".join(record['left_context_token'] + [record['mention_span']] + record['right_context_token']) + "\n\nPlease identify 1 entity in the sentence.",
        'prompt': '{"entities": [{"mention": "%s",' % record['mention_span'],
        'true': record['y_str'],
        'orig_id': record['annot_id']
    }

In [7]:
with open(os.path.join(output_dir, "entity_typing", "ufet_test.json"), "w") as f:
    for line in open(data_file).readlines():
        processed_line = transform(json.loads(line))
        f.write(json.dumps(processed_line) + "\n")

# NER

In [6]:
import datasets

In [7]:
dataset = datasets.load_dataset("conllpp", cache_dir="/harddisk/user/keminglu/evaluation_corpus/")

Found cached dataset conllpp (/harddisk/user/keminglu/evaluation_corpus/conllpp/conllpp/1.0.0/04f15f257dff3fe0fb36e049b73d51ecdf382698682f5e590b7fb13898206ba2)


A Jupyter Widget

In [8]:
type_dict = {
    1: "person",
    3: "organization",
    5: "location",
    7: "miscellaneous"
}

def transform(record, hint="none"):
    tokens = record['tokens']
    input_text = " ".join(tokens)
    mentions = []
    flag = True
    for token, tag in zip(tokens, record['ner_tags']):
        if tag != 0:
            if flag:
                mentions.append([(token, tag)])
                flag = False
            else:
                mentions[-1].append((token, tag))
        else:
            flag = True
    mentions = [(" ".join([item[0] for item in each]), type_dict[each[0][1]]) for each in mentions]
    if hint == "none":
        prompt = 'Please identify all entities in the types Person, Organization, Location, Miscellaneous.'
    elif hint == "gold":
        prompt = 'Please identify %d entities in the types Person, Organization, Location, Miscellaneous.' % len(mentions)
    elif hint == "large":
        prompt = 'Please identify %d entities in the types Person, Organization, Location, Miscellaneous.' % 5
    return {
        "dataset": "conllpp",
        "split": hint,
        "rephrased_inputs": f"\"{input_text}\"\n\n{prompt}",
        "true": mentions,
        "orig_id": record["id"]
    }

In [9]:
transform(dataset['test'][20], "none")

{'dataset': 'conllpp',
 'split': 'none',
 'rephrased_inputs': '"Hosts UAE play Kuwait and South Korea take on Indonesia on Saturday in Group A matches ."\n\nPlease identify all entities in the types Person, Organization, Location, Miscellaneous.',
 'true': [('UAE', 'location'),
  ('Kuwait', 'location'),
  ('South Korea', 'location'),
  ('Indonesia', 'location')],
 'orig_id': '20'}

In [10]:
all_ner_data = []
for hint in ("none", "gold"):
    for sample in dataset["test"]:
        all_ner_data.append(transform(sample, hint)) 

In [11]:
all_ner_data[4000]

{'dataset': 'conllpp',
 'split': 'gold',
 'rephrased_inputs': '"BUFFALO AT SEATTLE"\n\nPlease identify 2 entities in the types Person, Organization, Location, Miscellaneous.',
 'true': [('BUFFALO', 'organization'), ('SEATTLE', 'location')],
 'orig_id': '547'}

In [15]:
with open(os.path.join(output_dir, "ner", "conllpp_large.jsonl"), "w") as f:
    for sample in dataset["test"]:
        f.write(json.dumps(transform(sample, "large")) + "\n")

# CrossNER

In [12]:
type_mapping = {
        'programlang': 'Programming language',
        'country': 'Country',
        'field': 'Field',
        'algorithm': 'Algorithm',
        'organisation': 'Organization',
        'task': 'Task',
        'university': 'University',
        'product': 'Product',
        'misc': 'Miscellaneous',
        'conference': 'Conference',
        'location': 'Location',
        'researcher': 'Researcher',
        'metrics': 'Metrics',
        'person': 'Person',
        'magazine': 'Magazine',
        'award': 'Award',
        'event': 'Event',
        'poem': 'Poem',
        'book': 'Book',
        'literarygenre': 'Literary genre',
        'writer': 'Writer',
        'song': 'Song',
        'musicalinstrument': 'Muscial instrument',
        'album': 'Album',
        'musicalartist': 'Musical artist',
        'band': 'Band',
        'musicgenre': 'Music genre',
        'politician': 'Politician',
        'politicalparty': 'Political party',
        'election': 'Election',
        'astronomicalobject': 'Astronomical object',
        'enzyme': 'Enzyme',
        'scientist': 'Scientist',
        'chemicalelement': 'Chemical element',
        'academicjournal': 'Academic journal',
        'university': 'University',
        'theory': 'Theory',
        'protein': 'Protein',
        'chemicalcompound': 'Chemical compound',
        'discipline': 'Discipline',
    }

In [14]:
import copy

def transform(split, hint="none"):
    with open(os.path.join("/harddisk/user/keminglu/evaluation_corpus/benchmarks/CrossNER/ner_data", split, "test.txt")) as f:
        data = []
        tmp = []
        line = f.readline()
        while line:
            if line == '\n':
                data.append(copy.deepcopy(tmp))
                tmp = []
            else:
                tmp.append(line.strip().split("\t"))
            line = f.readline()
    
    processed_data = []
    for idx, sample in enumerate(data):
        mentions = []
        flag = True
        for token, tag in sample:
            if tag != 'O':
                if flag:
                    mentions.append([(token, tag)])
                    flag = False
                else:
                    mentions[-1].append((token, tag))
            else:
                flag = True
        mentions = [(" ".join([item[0] for item in each]), each[0][1].split("-")[1]) for each in mentions]
        tokens = [each[0] for each in sample]
        string = " ".join(tokens)
        processed_data.append((string, mentions))
    
    types = set([item[1] for each in processed_data for item in each[1]])
    print(types)
    types = ", ".join([type_mapping[t] for t in types])

    final_processed_data = []
    for idx, (string, mentions) in enumerate(processed_data):
        if hint == "none":
            prompt = f"Please identify all entities in the types {types}."
        elif hint == "gold":
            prompt = f"Please identify %d entities in the types {types}." % len(mentions)
        elif hint == "large":
            prompt = f"Please identify %d entities in the types {types}." % 5
        final_processed_data.append( {
            "dataset": f"crossner_{split}",
            "split": hint,
            "rephrased_inputs": f"\"{string}\"\n\n{prompt}",
            "true": mentions,
            "orig_id": idx
        })
    return final_processed_data

In [15]:
tmp = transform("science", "none")

{'chemicalcompound', 'award', 'protein', 'organisation', 'discipline', 'theory', 'scientist', 'location', 'person', 'academicjournal', 'chemicalelement', 'university', 'enzyme', 'country', 'event', 'misc', 'astronomicalobject'}


In [16]:
for split in ("ai", "literature", "music", "politics", "science"):
    for hint in ("none", "gold"):
        processed_data = transform(split, hint)
        all_ner_data.extend(processed_data)

{'product', 'conference', 'country', 'metrics', 'organisation', 'programlang', 'task', 'location', 'person', 'field', 'university', 'researcher', 'algorithm', 'misc'}
{'product', 'conference', 'country', 'metrics', 'organisation', 'programlang', 'task', 'location', 'person', 'field', 'university', 'researcher', 'algorithm', 'misc'}
{'literarygenre', 'award', 'poem', 'magazine', 'organisation', 'book', 'location', 'person', 'country', 'event', 'misc', 'writer'}
{'literarygenre', 'award', 'poem', 'magazine', 'organisation', 'book', 'location', 'person', 'country', 'event', 'misc', 'writer'}
{'song', 'award', 'musicalartist', 'organisation', 'band', 'musicgenre', 'person', 'location', 'country', 'album', 'event', 'misc', 'musicalinstrument'}
{'song', 'award', 'musicalartist', 'organisation', 'band', 'musicgenre', 'person', 'location', 'country', 'album', 'event', 'misc', 'musicalinstrument'}
{'election', 'organisation', 'politicalparty', 'politician', 'location', 'person', 'country', 'eve

In [17]:
from collections import defaultdict
cnt = defaultdict(lambda: 0)
for each in all_ner_data:
    cnt[(each['dataset'], each['split'])] += 1
cnt


defaultdict(<function __main__.<lambda>()>,
            {('conllpp', 'none'): 3453,
             ('conllpp', 'gold'): 3453,
             ('crossner_ai', 'none'): 431,
             ('crossner_ai', 'gold'): 431,
             ('crossner_literature', 'none'): 416,
             ('crossner_literature', 'gold'): 416,
             ('crossner_music', 'none'): 465,
             ('crossner_music', 'gold'): 465,
             ('crossner_politics', 'none'): 651,
             ('crossner_politics', 'gold'): 651,
             ('crossner_science', 'none'): 543,
             ('crossner_science', 'gold'): 543})

In [18]:
with open("/harddisk/user/keminglu/evaluation_corpus/processed_benchmarks/ner/all_ner_data", "w") as f:
    for sample in all_ner_data:
        f.write(json.dumps(sample) + "\n")

In [40]:
for split in ("ai", "literature", "music", "politics", "science"):
    for hint in ("none", "gold", "large"):
        processed_data = transform(split, hint)
        with open(os.path.join("/harddisk/user/keminglu/evaluation_corpus/processed_benchmarks/ner", f"crossner_{split}_{hint}.jsonl"), "w") as f:
            for sample in processed_data:
                f.write(json.dumps(sample) + "\n")

# RE

In [60]:
data_file = "/harddisk/user/keminglu/evaluation_corpus/benchmarks/Re-TACRED/data/test.json"
data = json.load(open(data_file))

In [61]:
def replace_briskets(input_str):
    input_str = input_str.replace("-LRB-", "(").replace("-RRB-", ")")
    input_str = input_str.replace("-LSB-", "[").replace("-RSB-", "]")
    return input_str

In [64]:
with open(os.path.join(output_dir, "relation_extraction", "retacred_test.jsonl"), "w") as f:
    for each in data:
        token = each['token']
        sent = replace_briskets(" ".join(token))
        subj = replace_briskets(" ".join(token[each['subj_start']:each['subj_end']+1]))
        obj = replace_briskets(" ".join(token[each['obj_start']:each['obj_end']+1]))
        subj_type = each['subj_type']
        obj_type = each['obj_type']
        ent_string = ['{"mention": "%s", "title": "%s", "type": ["%s"]}' % (ent[0], ent[0], ent[1].lower()) for ent in [(subj, subj_type), (obj, obj_type)]]
        ent_string = ", ".join(ent_string)
        record = {
            "input_text": sent,
            #"prompt": '{"number of entities": 2, "entities": [%s], "triplets": [{"head": "%s", "tail": "%s", "relations":' % (ent_string, subj, obj),
            "prompt": '{"number of entities": 2, "entities": [%s], "triplets":' % (ent_string),
            "true": each['relation']
        }
        f.write(json.dumps(record) + "\n")

In [65]:
data_file = "/harddisk/user/keminglu/evaluation_corpus/benchmarks/redocred/test_revised.json"
data = json.load(open(data_file))
rel_map = json.load(open("/harddisk/user/keminglu/evaluation_corpus/benchmarks/redocred/rel_info.json"))

In [66]:
TYPE_DICT = {
    "PER": "person",
    "MISC": "other",
    "LOC": "location",
    "NUM": "number",
    "TIME": "time",
    "ORG": "organization",
}

In [59]:
with open(os.path.join(output_dir, "relation_extraction", "redocred_test.jsonl"), "w") as f:
    for each in data:
        sent = " ".join(map(" ".join, each['sents']))
        ent_string = ['{"mention": "%s", "title": "%s", "type": ["%s"]}' % (ent['name'], ent['name'], TYPE_DICT[ent['type']]) for ent in sum(each['vertexSet'], [])]
        ent_string = list(set(ent_string))
        num_of_ent = len(ent_string)
        ent_string = ", ".join(ent_string)
        true = [(each['vertexSet'][triplet['h']][0]['name'], each['vertexSet'][triplet['t']][0]['name'], rel_map[triplet['r']]) for triplet in each['labels']]
        record = {
                "input_text": sent,
                "prompt": '{"number of entities": %d, "entities": [%s], "triplets":' % (num_of_ent, ent_string),
                "true": true
            }
        f.write(json.dumps(record) + "\n")