In [1]:
import ast
import random

import numpy as np
import pandas as pd

from deeponto.onto import Ontology
from deeponto.align.bertmap import BERTMapPipeline
from deeponto.align.evaluation import AlignmentEvaluator
from deeponto.utils import FileUtils
from deeponto.align.mapping import EntityMapping, ReferenceMapping

Please enter the maximum memory located to JVM [8g]:
8g maximum memory allocated to JVM.
JVM started successfully.


In [8]:
# Candidate table for the NCIT-to-DOID test set. Later cells read its
# "SrcEntity", "TgtEntity" and "TgtCandidates" columns.
test_cands_path = "data/ncit2doid/test_cands.tsv"
test_cands = FileUtils.read_table(test_cands_path)

In [9]:
# Locations of the two ontologies being aligned (source: NCIT, target: DOID).
src_onto_path = "data/ncit2doid/ncit.owl"
tgt_onto_path = "data/ncit2doid/doid.owl"

# Start from the packaged default BERTMap configuration and override two fields.
config = BERTMapPipeline.load_bertmap_config()
# Global matching is switched off: the later cells only score the candidate
# pairs listed in `test_cands` through `bertmap.mapping_predictor`.
config.global_matching.enabled = False
# Per the recorded output, the pipeline picks up an existing fine-tuned
# checkpoint from under this directory.
config.output_path = "experiments/ncit2doid.us/"

src_onto = Ontology(src_onto_path)
tgt_onto = Ontology(tgt_onto_path)

# Constructing the pipeline exposes `src_annotation_index`,
# `tgt_annotation_index` and `mapping_predictor`, which the scoring cell uses.
bertmap = BERTMapPipeline(src_onto, tgt_onto, config)

Use the default configuration at /home/yuan/anaconda3/envs/deeponto/lib/python3.8/site-packages/deeponto/align/bertmap/default_config.yaml.


[Time: 00:03:24] - [PID: 2304892] - [Model: bertmap] 
Load the following configurations:
{
    "model": "bertmap",
    "output_path": "/home/yuan/projects/LLMap/experiments/ncit2doid.us",
    "annotation_property_iris": [
        "http://www.w3.org/2000/01/rdf-schema#label",
        "http://www.geneontology.org/formats/oboInOwl#hasSynonym",
        "http://www.geneontology.org/formats/oboInOwl#hasExactSynonym",
        "http://www.w3.org/2004/02/skos/core#exactMatch",
        "http://www.ebi.ac.uk/efo/alternative_term",
        "http://www.orpha.net/ORDO/Orphanet_#symbol",
        "http://purl.org/sig/ont/fma/synonym",
        "http://www.w3.org/2004/02/skos/core#prefLabel",
        "http://www.w3.org/2004/02/skos/core#altLabel",
        "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P108",
        "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P90"
    ],
    "known_mappings": null,
    "auxiliary_ontos": [],
    "bert": {
        "pretrained_path": "emilyalsentzer/Bio_Clini

Loading a BERT model from: /home/yuan/projects/LLMap/experiments/ncit2doid.us/bertmap/bert/checkpoint-45300.
The BERT model is set to eval mode for making predictions.
There are 2 GPU(s) available.
We will use the GPU: Quadro RTX 8000


[Time: 00:03:46] - [PID: 2304892] - [Model: bertmap] 
Fine-tuning finished, found best checkpoint at /home/yuan/projects/LLMap/experiments/ncit2doid.us/bertmap/bert/checkpoint-45300.
[Time: 00:00:21] - [PID: 2304892] - [Model: bertmap] 
Fine-tuning finished, found best checkpoint at /home/yuan/projects/LLMap/experiments/ncit2doid.us/bertmap/bert/checkpoint-45300.
INFO:bertmap:Fine-tuning finished, found best checkpoint at /home/yuan/projects/LLMap/experiments/ncit2doid.us/bertmap/bert/checkpoint-45300.


[Time: 00:03:46] - [PID: 2304892] - [Model: bertmap] 
Build inverted annotation index for candidate selection.
[Time: 00:00:21] - [PID: 2304892] - [Model: bertmap] 
Build inverted annotation index for candidate selection.
INFO:bertmap:Build inverted annotation index for candidate selection.


0.9 0.9


In [17]:
# Score every (source class, target candidate) pair listed in `test_cands` with
# two scorers exposed by the pipeline's mapping predictor:
#   * `bert_mapping_score`            -> "bertmap" results
#   * `edit_similarity_mapping_score` -> "bertmaplt" results
# `bertmap_results[k]` / `bertmaplt_results[k]` hold the score-sorted candidate
# mappings for the k-th row of `test_cands`; later cells rely on this
# positional correspondence.
predictor = bertmap.mapping_predictor  # loop-invariant lookup, hoisted

bertmap_results = []
bertmaplt_results = []
for _, dp in test_cands.iterrows():

    src_class_iri = dp["SrcEntity"]
    # "TgtCandidates" is stored as text in the TSV. Parse it as a Python
    # literal instead of calling eval(), so file content is never executed.
    tgt_cands = ast.literal_eval(dp["TgtCandidates"])

    # retrieve class annotations
    src_class_labels = bertmap.src_annotation_index[src_class_iri]

    bertmap_mappings = []
    bertmaplt_mappings = []
    for tgt_cand_iri in tgt_cands:
        tgt_cand_labels = bertmap.tgt_annotation_index[tgt_cand_iri]

        # the bertmap score
        bertmap_score = predictor.bert_mapping_score(src_class_labels, tgt_cand_labels)
        bertmap_mappings.append(EntityMapping(src_class_iri, tgt_cand_iri, "=", bertmap_score))

        # the bertmaplt score
        bertmaplt_score = predictor.edit_similarity_mapping_score(src_class_labels, tgt_cand_labels)
        bertmaplt_mappings.append(EntityMapping(src_class_iri, tgt_cand_iri, "=", bertmaplt_score))

    # Store the candidates sorted by score; the evaluation cells treat
    # index 0 of each list as the top-ranked candidate.
    bertmap_results.append(EntityMapping.sort_entity_mappings_by_score(bertmap_mappings))
    bertmaplt_results.append(EntityMapping.sort_entity_mappings_by_score(bertmaplt_mappings))

In [18]:
import math
def mean_reciprocal_rank(prediction_and_candidates):
    r"""Compute $MRR$ for a list of `(prediction_mapping, candidate_mappings)` pairs.

    $$MRR = \frac{1}{N} \sum_{i=1}^{N} rank_i^{-1}$$

    For each pair, the candidates are ordered with
    `EntityMapping.sort_entity_mappings_by_score` and the 1-based position of
    the prediction among them is its rank. Prediction and candidates are
    compared through their `to_tuple()` forms. A prediction that does not
    appear among its candidates contributes a reciprocal rank of 0.

    Args:
        prediction_and_candidates: A sized sequence of
            `(prediction_mapping, candidate_mappings)` pairs.

    Returns:
        The mean reciprocal rank as a float, or `0.0` when the input is empty
        (the original implementation raised `ZeroDivisionError` in that case).
    """
    num_pairs = len(prediction_and_candidates)
    if num_pairs == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    sum_inverted_ranks = 0.0
    for pred, cands in prediction_and_candidates:
        target = pred.to_tuple()  # computed once, reused for lookup and index
        ordered_candidates = [c.to_tuple() for c in EntityMapping.sort_entity_mappings_by_score(cands)]
        if target in ordered_candidates:
            rank = ordered_candidates.index(target) + 1
            sum_inverted_ranks += 1 / rank
        # else: rank is effectively infinite, so the reciprocal rank is 0.
    return sum_inverted_ranks / num_pairs

In [19]:
# bertmap
# Key each ranked candidate list by its (source IRI, reference target IRI)
# pair: the k-th entry of `bertmap_results` belongs to the k-th row of
# `test_cands`.
src_col = test_cands["SrcEntity"]
tgt_col = test_cands["TgtEntity"]
results_dict = {
    (src_col.iloc[idx], tgt_col.iloc[idx]): ranked
    for idx, ranked in enumerate(bertmap_results)
}

# Persist the results and read them straight back from the working directory.
# Moving the pickle into the result folder is a manual step.
FileUtils.save_file(results_dict, "bertmap_ncit2doid_results.pkl")
ranked_results = FileUtils.load_file("bertmap_ncit2doid_results.pkl")

# Only the first 50 reference mappings are matched mappings, so the
# reference set is truncated to them.
all_refs = ReferenceMapping.read_table_mappings("data/ncit2doid/refs/test_refs.tsv")
refs = all_refs[:50]

In [31]:
threshold = 0.9995

# --- Precision, Recall, F1 ---
# Every candidate mapping scoring at or above the threshold is a prediction.
# The third argument to `f1` is left empty, as in the original call.
preds = [
    cand
    for ranked in ranked_results.values()
    for cand in ranked
    if cand.score >= threshold
]
print(AlignmentEvaluator.f1(preds, refs, []))

# --- Accuracy (reported as raw counts) ---
# yes_correct: candidates whose tail equals the reference target and whose
# score reaches the threshold.
yes_correct = sum(
    1
    for (_, ref_tgt), ranked in ranked_results.items()
    for cand in ranked
    if ref_tgt == cand.tail and cand.score >= threshold
)
# no_correct: rows labelled "UnMatched" whose first-ranked candidate stays
# below the threshold.
no_correct = sum(
    1
    for (_, ref_tgt), ranked in ranked_results.items()
    if ref_tgt == "UnMatched" and ranked[0].score < threshold
)
print(yes_correct, no_correct, yes_correct + no_correct)

# --- MRR ---
# Pair a reference mapping (score fixed at 1.0) with each ranked candidate list.
formatted_results = [
    (EntityMapping(src_iri, ref_tgt, "=", 1.0), ranked)
    for (src_iri, ref_tgt), ranked in ranked_results.items()
]
# again, only the first 50 has a match
print(mean_reciprocal_rank(formatted_results[:50]))

{'P': 0.759, 'R': 0.44, 'F1': 0.557}
22 46 68
0.9337777777777779


In [32]:
# bertmaplt
# Key each ranked candidate list by its (source IRI, reference target IRI)
# pair: the k-th entry of `bertmaplt_results` belongs to the k-th row of
# `test_cands`.
src_col = test_cands["SrcEntity"]
tgt_col = test_cands["TgtEntity"]
results_dict = {
    (src_col.iloc[idx], tgt_col.iloc[idx]): ranked
    for idx, ranked in enumerate(bertmaplt_results)
}

# Persist the results and read them straight back from the working directory.
# Moving the pickle into the result folder is a manual step.
FileUtils.save_file(results_dict, "bertmaplt_ncit2doid_results.pkl")
ranked_results = FileUtils.load_file("bertmaplt_ncit2doid_results.pkl")

In [36]:
threshold = 0.9

# --- Precision, Recall, F1 ---
# Every candidate mapping scoring at or above the threshold is a prediction.
# The third argument to `f1` is left empty, as in the original call.
preds = [
    cand
    for ranked in ranked_results.values()
    for cand in ranked
    if cand.score >= threshold
]
print(AlignmentEvaluator.f1(preds, refs, []))

# --- Accuracy (reported as raw counts) ---
# yes_correct: candidates whose tail equals the reference target and whose
# score reaches the threshold.
yes_correct = sum(
    1
    for (_, ref_tgt), ranked in ranked_results.items()
    for cand in ranked
    if ref_tgt == cand.tail and cand.score >= threshold
)
# no_correct: rows labelled "UnMatched" whose first-ranked candidate stays
# below the threshold.
no_correct = sum(
    1
    for (_, ref_tgt), ranked in ranked_results.items()
    if ref_tgt == "UnMatched" and ranked[0].score < threshold
)
print(yes_correct, no_correct, yes_correct + no_correct)

# --- MRR ---
# Pair a reference mapping (score fixed at 1.0) with each ranked candidate list.
formatted_results = [
    (EntityMapping(src_iri, ref_tgt, "=", 1.0), ranked)
    for (src_iri, ref_tgt), ranked in ranked_results.items()
]
# again, only the first 50 has a match
print(mean_reciprocal_rank(formatted_results[:50]))

{'P': 0.196, 'R': 0.18, 'F1': 0.187}
9 46 55
0.5160657125437978
