### Convert predicted entities to the evaluation format

Define the imports

In [190]:
import json

Define paths to the prediction files

In [191]:
# Prediction files read by this notebook (paths are relative to the notebook's working directory).
PATH_NER_PREDICTIONS_EVAL_FORMAT = "../Predictions/NER/predicted_entities_dev_eval_format.json"
# NOTE(review): the constant is named *_EVAL_FORMAT but the file name has no "_eval_format" suffix — confirm this is the intended file.
PATH_NEL_PREDICTIONS_EVAL_FORMAT = "../Predictions/NEL/predicted_entities_dev.json"
PATH_RE_PREDICTIONS = "../Predictions/RE/predicted_relations_dev.json"

Define the output paths

In [192]:
# Single file holding metadata, entities, relations and the derived relation lists for every article.
PATH_OUTPUT_MERGED_PREDICTIONS = "../Predictions/predictions_dev_eval_format.json"
# One output file per task, written by the export cells at the end of the notebook.
# NOTE(review): "teamID", "runID" and "systemDesc" look like placeholders to be filled in — confirm the naming scheme.
PATH_OUTPUT_NER = "../Eval/teamID_NER_runID_systemDesc.json"
PATH_OUTPUT_NERD = "../Eval/teamID_NERD_runID_systemDesc.json"
PATH_OUTPUT_MENTION_LEVEL_RE = "../Eval/teamID_MENTION_LEVEL_RE_runID_systemDesc.json"  
PATH_OUTPUT_CONCEPT_LEVEL_RE = "../Eval/teamID_CONCEPT_LEVEL_RE_runID_systemDesc.json"      

Load the input files into dictionary variables

In [193]:
def _load_json(path):
    """Parse the UTF-8 encoded JSON document stored at `path` and return the result."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# One parsed JSON object per prediction file.
ner_predictions = _load_json(PATH_NER_PREDICTIONS_EVAL_FORMAT)
nel_predictions = _load_json(PATH_NEL_PREDICTIONS_EVAL_FORMAT)
re_predictions = _load_json(PATH_RE_PREDICTIONS)

#### Define the functions to process RE predictions

Map subjects and objects of predicted relations to entities

In [194]:
def map_predicted_relations_to_entities(re_data, nel_data):
    """Attach every predicted relation to the article(s) it belongs to.

    A relation belongs to an article when the article's key in ``nel_data``
    equals the relation's PMID (the part of ``rel['title']`` before the first
    ``||``, stripped), or when the article's ``metadata['title']`` equals
    ``rel['title']`` exactly. ``h_idx`` / ``t_idx`` index the article's
    ``entities`` list to select the subject and the object; their fields are
    copied into a flat relation dict that is appended to the article's
    ``relations`` list (created on demand if the article has none).

    Args:
        re_data: iterable of dicts with the keys ``title``, ``h_idx``,
            ``t_idx`` and ``r`` (the predicate, lower-cased here).
        nel_data: dict mapping PMID to an article dict that has an
            ``entities`` list; each entity needs ``start_idx``, ``end_idx``,
            ``location``, ``text_span``, ``label`` and ``uri``.

    Returns:
        The very same ``nel_data`` object. It is modified IN PLACE, so calling
        this function twice on the same object appends every relation twice.

    Raises:
        IndexError: if ``h_idx`` or ``t_idx`` is out of range for the matched
            article's entity list.
        KeyError: if a relation or a referenced entity lacks a required key.
    """
    entity_fields = ('start_idx', 'end_idx', 'location', 'text_span', 'label', 'uri')

    def _with_prefix(role, entity):
        # Copies the entity fields as "<role>_<field>", preserving field order.
        return {f'{role}_{field}': entity[field] for field in entity_fields}

    for rel in re_data:
        title = rel['title']
        h_idx = rel['h_idx']
        t_idx = rel['t_idx']
        predicate = rel['r'].lower()
        doc_pmid = title.split('||')[0].strip()

        article_found = False
        for pmid, article in nel_data.items():
            # Tolerate articles without metadata/title: they can still match by PMID.
            metadata = article.get('metadata') or {}
            if pmid != doc_pmid and metadata.get('title') != title:
                continue

            article_found = True
            subject_entity = article['entities'][h_idx]
            object_entity = article['entities'][t_idx]
            # setdefault: articles loaded without a 'relations' list get one here.
            article.setdefault('relations', []).append({
                **_with_prefix('subject', subject_entity),
                'predicate': predicate,
                **_with_prefix('object', object_entity),
            })

        if not article_found:
            print(f'NO MATCH FOR RELATION: {rel}')

    return nel_data

In [195]:
# The function appends into nel_predictions in place and returns that same object, so
# merged_predictions is an alias of nel_predictions. Re-running this cell without first
# re-running the loading cell appends every relation a second time.
merged_predictions = map_predicted_relations_to_entities(re_predictions, nel_predictions)

Remove relations not defined in the annotation guidelines and complete conversion to evaluation format

In [196]:
# Allowed (subject label, predicate, object label) combinations; labels and
# predicates are written in lower case. Anything else is discarded below.
LEGAL_RELATIONS = [
    ("ddf", "affect", "ddf"),
    ("microbiome", "is linked to", "ddf"),
    ("ddf", "target", "human"),
    ("drug", "change effect", "ddf"),
    ("ddf", "is a", "ddf"),
    ("microbiome", "located in", "human"),
    ("chemical", "influence", "ddf"),
    ("dietary supplement", "influence", "ddf"),
    ("ddf", "target", "animal"),
    ("chemical", "impact", "microbiome"),
    ("anatomical location", "located in", "animal"),
    ("microbiome", "located in", "animal"),
    ("chemical", "located in", "anatomical location"),
    ("bacteria", "part of", "microbiome"),
    ("ddf", "strike", "anatomical location"),
    ("drug", "administered", "animal"),
    ("bacteria", "influence", "ddf"),
    ("drug", "impact", "microbiome"),
    ("ddf", "change abundance", "microbiome"),
    ("microbiome", "located in", "anatomical location"),
    ("microbiome", "used by", "biomedical technique"),
    ("chemical", "produced by", "microbiome"),
    ("dietary supplement", "impact", "microbiome"),
    ("bacteria", "located in", "animal"),
    ("animal", "used by", "biomedical technique"),
    ("chemical", "impact", "bacteria"),
    ("chemical", "located in", "animal"),
    ("food", "impact", "bacteria"),
    ("microbiome", "compared to", "microbiome"),
    ("human", "used by", "biomedical technique"),
    ("bacteria", "change expression", "gene"),
    ("chemical", "located in", "human"),
    ("drug", "interact", "chemical"),
    ("food", "administered", "human"),
    ("ddf", "change abundance", "bacteria"),
    ("chemical", "interact", "chemical"),
    ("chemical", "part of", "chemical"),
    ("dietary supplement", "impact", "bacteria"),
    ("ddf", "interact", "chemical"),
    ("food", "impact", "microbiome"),
    ("food", "influence", "ddf"),
    ("bacteria", "located in", "human"),
    ("dietary supplement", "administered", "human"),
    ("bacteria", "interact", "chemical"),
    ("drug", "change expression", "gene"),
    ("drug", "impact", "bacteria"),
    ("drug", "administered", "human"),
    ("anatomical location", "located in", "human"),
    ("dietary supplement", "change expression", "gene"),
    ("chemical", "change expression", "gene"),
    ("bacteria", "interact", "bacteria"),
    ("drug", "interact", "drug"),
    ("microbiome", "change expression", "gene"),
    ("bacteria", "interact", "drug"),
    ("food", "change expression", "gene"),
    ("chemical", "administered", "human"),
    ("chemical", "administered", "animal"),
    ("dietary supplement", "administered", "animal"),
    ("food", "administered", "animal"),
]


def remove_illegal_relations(data):
    """Build a fresh per-article dict that keeps only relations listed in LEGAL_RELATIONS.

    For each article the metadata object is carried over as is, every entity
    is copied down to its six output fields, and each relation is kept only
    when its (subject_label, predicate, object_label) triple appears in
    LEGAL_RELATIONS. In the copies, the label 'ddf' is rewritten to 'DDF';
    all other labels are left untouched. The input is not modified.

    Prints the total / kept / discarded relation counts followed by the
    distinct discarded triples.

    Args:
        data: dict mapping PMID to an article dict with the keys
            'metadata', 'entities' and 'relations'.

    Returns:
        A new dict mapping PMID to {'metadata', 'entities', 'relations'}.
    """
    def _eval_label(label):
        # Only the 'ddf' label is spelled differently in the output.
        return 'DDF' if label == 'ddf' else label

    dump_dict = {}
    n_total = 0
    n_kept = 0
    n_dropped = 0
    dropped_triples = set()

    for pmid, article in data.items():
        metadata = article['metadata']

        entities_out = [
            {
                "start_idx": ent["start_idx"],
                "end_idx": ent["end_idx"],
                "location": ent["location"],
                "text_span": ent["text_span"],
                "label": _eval_label(ent["label"]),
                "uri": ent["uri"],
            }
            for ent in article['entities']
        ]

        relations_out = []
        for rel in article['relations']:
            n_total += 1
            # The legality check uses the labels as they come in (before the DDF rewrite).
            triple = (rel["subject_label"], rel["predicate"], rel["object_label"])
            if triple not in LEGAL_RELATIONS:
                n_dropped += 1
                dropped_triples.add(triple)
                continue

            n_kept += 1
            relations_out.append({
                "subject_start_idx": rel["subject_start_idx"],
                "subject_end_idx": rel["subject_end_idx"],
                "subject_location": rel["subject_location"],
                "subject_text_span": rel["subject_text_span"],
                "subject_label": _eval_label(rel["subject_label"]),
                "subject_uri": rel["subject_uri"],
                "predicate": rel["predicate"],
                "object_start_idx": rel["object_start_idx"],
                "object_end_idx": rel["object_end_idx"],
                "object_location": rel["object_location"],
                "object_text_span": rel["object_text_span"],
                "object_label": _eval_label(rel["object_label"]),
                "object_uri": rel["object_uri"],
            })

        dump_dict[pmid] = {
            'metadata': metadata,
            'entities': entities_out,
            'relations': relations_out,
        }

    print(f'total_rels: {n_total}')
    print(f'kept_rels: {n_kept}')
    print(f'discared_rels: {n_dropped}')
    print()
    print(f'discared_rels_set: {dropped_triples}')
    for triple in dropped_triples:
        print(triple)

    return dump_dict


In [197]:
# Builds a new dict (merged_predictions is left unmodified) and prints how many
# relations were kept or discarded, plus the distinct discarded triples.
dump_dict = remove_illegal_relations(merged_predictions)

total_rels: 1151
kept_rels: 1091
discared_rels: 60

discared_rels_set: {('chemical', 'influence', 'chemical'), ('bacteria', 'is linked to', 'ddf'), ('microbiome', 'is linked to', 'gene'), ('ddf', 'change abundance', 'ddf'), ('chemical', 'change effect', 'ddf'), ('ddf', 'influence', 'ddf'), ('drug', 'change effect', 'human'), ('bacteria', 'influence', 'animal'), ('food', 'change abundance', 'microbiome'), ('bacteria', 'interact', 'gene'), ('chemical', 'change expression', 'chemical'), ('biomedical technique', 'is linked to', 'ddf'), ('ddf', 'is linked to', 'ddf'), ('chemical', 'interact', 'bacteria'), ('biomedical technique', 'change effect', 'biomedical technique'), ('dietary supplement', 'interact', 'chemical'), ('ddf', 'located in', 'human'), ('animal', 'located in', 'human'), ('gene', 'located in', 'animal'), ('food', 'change effect', 'ddf'), ('drug', 'influence', 'ddf'), ('gene', 'impact', 'bacteria'), ('gene', 'located in', 'anatomical location'), ('bacteria', 'located in', 'anato

Sort entities and relations

In [198]:
def sort_entities(release_dict):
    """Sort each article's ``entities`` list in place.

    Entities whose ``location`` is "title" come first, all others after;
    within each group entities are ordered by ascending ``start_idx``.
    Nothing is returned.
    """
    for article in release_dict.values():
        # False sorts before True, so title entities end up in front.
        article["entities"].sort(
            key=lambda ent: (ent["location"] != "title", ent["start_idx"])
        )

In [199]:
# Sorts the entity lists inside dump_dict in place; the function returns nothing.
sort_entities(dump_dict)

In [200]:
def sort_relations(release_dict):
    """Sort each article's ``relations`` list in place by its subject.

    Relations whose ``subject_location`` is "title" come first, all others
    after; within each group relations are ordered by ascending
    ``subject_start_idx``. The object side is not part of the key, so
    relations sharing a subject position keep their relative order.
    Nothing is returned.
    """
    for article in release_dict.values():
        # False sorts before True, so title subjects end up in front.
        article["relations"].sort(
            key=lambda rel: (rel["subject_location"] != "title", rel["subject_start_idx"])
        )

In [201]:
# Sorts the relation lists inside dump_dict in place; the function returns nothing.
sort_relations(dump_dict)

Generate Mention Level Relations

In [202]:
def add_mention_level_relations_to_release_dict(release_dict):
    """Add a de-duplicated ``mention_level_relations`` list to every article.

    Each relation in ``article["relations"]`` is reduced to the tuple
    (subject_text_span, subject_label, predicate, object_text_span,
    object_label). Distinct tuples are appended, as dicts, to the article's
    ``mention_level_relations`` list, which is created if missing.

    The output order follows the first occurrence of each tuple in
    ``article["relations"]``, so it is the same on every run. Entries that are
    already in the target list are not appended again, which makes calling the
    function repeatedly on the same dict safe.

    Args:
        release_dict: dict mapping PMID to an article dict with a
            ``relations`` list. Modified in place.

    Returns:
        None.
    """
    for article in release_dict.values():
        # dict.fromkeys de-duplicates like a set but keeps first-seen order.
        unique_tuples = dict.fromkeys(
            (
                relation["subject_text_span"],
                relation["subject_label"],
                relation["predicate"],
                relation["object_text_span"],
                relation["object_label"],
            )
            for relation in article["relations"]
        )

        mention_relations = article.setdefault("mention_level_relations", [])
        for subj_span, subj_label, predicate, obj_span, obj_label in unique_tuples:
            entry = {
                "subject_text_span": subj_span,
                "subject_label": subj_label,
                "predicate": predicate,
                "object_text_span": obj_span,
                "object_label": obj_label,
            }
            # Skip what an earlier call already wrote.
            if entry not in mention_relations:
                mention_relations.append(entry)

In [203]:
# Adds a "mention_level_relations" list to every article of dump_dict (in place).
add_mention_level_relations_to_release_dict(dump_dict)

Generate Concept Level Relations

In [204]:
def add_concept_level_relations_to_release_dict(release_dict):
    """Add a de-duplicated ``concept_level_relations`` list to every article.

    Each relation in ``article["relations"]`` is reduced to the tuple
    (subject_uri, subject_label, predicate, object_uri, object_label).
    Distinct tuples are appended, as dicts, to the article's
    ``concept_level_relations`` list, which is created if missing.

    The output order follows the first occurrence of each tuple in
    ``article["relations"]``, so it is the same on every run. Entries that are
    already in the target list are not appended again, which makes calling the
    function repeatedly on the same dict safe.

    Args:
        release_dict: dict mapping PMID to an article dict with a
            ``relations`` list. Modified in place.

    Returns:
        None.
    """
    for article in release_dict.values():
        # dict.fromkeys de-duplicates like a set but keeps first-seen order.
        unique_tuples = dict.fromkeys(
            (
                relation["subject_uri"],
                relation["subject_label"],
                relation["predicate"],
                relation["object_uri"],
                relation["object_label"],
            )
            for relation in article["relations"]
        )

        concept_relations = article.setdefault("concept_level_relations", [])
        for subj_uri, subj_label, predicate, obj_uri, obj_label in unique_tuples:
            entry = {
                "subject_uri": subj_uri,
                "subject_label": subj_label,
                "predicate": predicate,
                "object_uri": obj_uri,
                "object_label": obj_label,
            }
            # Skip what an earlier call already wrote.
            if entry not in concept_relations:
                concept_relations.append(entry)

In [205]:
# Adds a "concept_level_relations" list to every article of dump_dict (in place).
add_concept_level_relations_to_release_dict(dump_dict)

In [206]:
# Write the complete dump_dict (metadata, entities, relations and both derived
# relation lists) to a single JSON file, overwriting any existing file.
with open(PATH_OUTPUT_MERGED_PREDICTIONS, 'w', encoding='utf-8') as file:
    json.dump(dump_dict, file, indent=2)

In [207]:
# NER export: per PMID, the entity mentions without their "uri" field.
NER_task = {
    pmid: {
        'entities': [
            {
                "start_idx": entity["start_idx"],
                "end_idx": entity["end_idx"],
                "location": entity["location"],
                "text_span": entity["text_span"],
                "label": entity["label"],
            }
            for entity in article['entities']
        ]
    }
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_NER, 'w', encoding='utf-8') as file:
    json.dump(NER_task, file, indent=2)

In [208]:
# NERD export: per PMID, the entity mentions including their "uri" field.
NERD_task = {
    pmid: {
        'entities': [
            {
                "start_idx": entity["start_idx"],
                "end_idx": entity["end_idx"],
                "location": entity["location"],
                "text_span": entity["text_span"],
                "label": entity["label"],
                "uri": entity["uri"],
            }
            for entity in article['entities']
        ]
    }
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_NERD, 'w', encoding='utf-8') as file:
    json.dump(NERD_task, file, indent=2)

In [209]:
# Mention-level RE export: per PMID, only the "mention_level_relations" list
# (the same list object that lives in dump_dict, not a copy).
MENTION_LEVEL_RE_task = {
    pmid: {'mention_level_relations': article['mention_level_relations']}
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_MENTION_LEVEL_RE, 'w', encoding='utf-8') as file:
    json.dump(MENTION_LEVEL_RE_task, file, indent=2)

In [210]:
# Concept-level RE export: per PMID, only the "concept_level_relations" list
# (the same list object that lives in dump_dict, not a copy).
CONCEPT_LEVEL_RE_task = {
    pmid: {'concept_level_relations': article['concept_level_relations']}
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_CONCEPT_LEVEL_RE, 'w', encoding='utf-8') as file:
    json.dump(CONCEPT_LEVEL_RE_task, file, indent=2)