In [1]:
from deeponto.onto import Ontology
from deeponto.align.bertmap import BERTMapPipeline
from deeponto.utils import Tokenizer
from deeponto.align.mapping import ReferenceMapping, EntityMapping
from collections import defaultdict
import random
import pandas as pd

Please enter the maximum memory located to JVM [8g]:
8g maximum memory allocated to JVM.
JVM started successfully.


In [2]:
# Load the source (SNOMED body) and target (FMA body) ontologies from the working directory.
src_onto_path, tgt_onto_path = "./snomed.body.owl", "./fma.body.owl"
src_onto, tgt_onto = (Ontology(owl_path) for owl_path in (src_onto_path, tgt_onto_path))
# No config file is passed, so BERTMap reports using its packaged default configuration.
config = BERTMapPipeline.load_bertmap_config()

[main] WARN uk.ac.manchester.cs.owl.owlapi.OWLOntologyManagerImpl - Illegal redeclarations of entities: reuse of entity http://purl.org/sig/ont/fma/has_direct_shape_type in punning not allowed [Declaration(DataProperty(<http://purl.org/sig/ont/fma/has_direct_shape_type>)), Declaration(ObjectProperty(<http://purl.org/sig/ont/fma/has_direct_shape_type>))]


Use the default configuration at /home/yuan/anaconda3/envs/deeponto/lib/python3.8/site-packages/deeponto/align/bertmap/default_config.yaml.


In [3]:
# Annotation indices map each class IRI to its labels: {class_iri: class_labels}.
# Only the first element returned by build_annotation_index() is used in this notebook.
annotation_iris = config.annotation_property_iris
src_annotation_index, _ = src_onto.build_annotation_index(annotation_iris)
tgt_annotation_index, _ = tgt_onto.build_annotation_index(annotation_iris)

# Inverted index over the target labels, used later for candidate selection.
sub_word_tokenizer = Tokenizer.from_pretrained("google/flan-t5-xxl")
tgt_inv_annotation_index = tgt_onto.build_inverted_annotation_index(
    tgt_annotation_index, sub_word_tokenizer
)

Selecting source ontology classes based on the reference mappings, ensuring:

    (1) all the reference mappings for a matched source ontology class are included
    (2) the matched source and target classes share no label, i.e. the mapping cannot be found by exact string matching
    (3) 50% of the classes have a match and 50% don't 

In [4]:
# Read the complete set of reference mappings as (source IRI, target IRI) tuples.
full_refs = ReferenceMapping.as_tuples(
    ReferenceMapping.read_table_mappings("refs/full_refs.tsv")
)

# Keep only the mappings whose source and target classes have no label in common,
# grouped by source class.
non_string_matched = defaultdict(list)
for src_iri, tgt_iri in full_refs:
    shared_labels = src_annotation_index[src_iri].intersection(tgt_annotation_index[tgt_iri])
    if shared_labels:
        continue  # string-matchable mapping: skip
    non_string_matched[src_iri].append(tgt_iri)
print(len(non_string_matched))

# Keep the source classes left with exactly one non-string-matched target.
# NOTE(review): a source class that also has a string-matched reference still passes
# this filter with only part of its references - confirm this is intended.
selected_refs_dict = {
    src_iri: tgt_iris
    for src_iri, tgt_iris in non_string_matched.items()
    if len(tgt_iris) == 1
}
print(len(selected_refs_dict))

1846
1639


In [16]:
# Select 50 matched source classes.
# random.sample() requires a sequence: passing a dict keys view is deprecated since
# Python 3.9 and raises TypeError from Python 3.11. Sorting the keys and drawing from
# a dedicated seeded generator makes the selection reproducible across kernel restarts.
matched_src_classes = random.Random(42).sample(sorted(selected_refs_dict), k=50)

# Expand each sampled source class into its (source, target) reference pairs.
selected_refs = [(s, t) for s in matched_src_classes for t in selected_refs_dict[s]]

In [17]:
# Select 50 unmatched source classes: classes in the source annotation index that
# never appear as the source side of any reference mapping.
# The pool is sorted because set iteration order of strings changes between
# interpreter runs (hash randomisation); together with the seeded generator this
# makes the sample reproducible. Assumes the index keys are IRI strings.
srcs_with_reference = {s for s, _ in full_refs}
unmatched_pool = sorted(set(src_annotation_index.keys()) - srcs_with_reference)
unmatched_src_classes = random.Random(42).sample(unmatched_pool, k=50)

In [18]:
# Combine the matched and unmatched source classes; unmatched ones get the
# placeholder target "UnMatched".
# The list is rebuilt instead of extended in place so that re-running this cell
# does not append the unmatched rows a second time.
selected_refs = [(s, t) for s, t in selected_refs if t != "UnMatched"] + [
    (s, "UnMatched") for s in unmatched_src_classes
]

In [19]:
# Persist the selected test references as a tab-separated table without the row index.
test_refs_table = pd.DataFrame(selected_refs, columns=["SrcEntity", "TgtEntity"])
test_refs_table.to_csv("refs/test_refs.tsv", sep="\t", index=False)

Assuming a perfect recall of candidate selection.

Use the sub-word inverted index to select 100 target candidates for each source class, ensuring that the reference target (if any) is among them.

In [20]:
# Read the saved test references back as (source, target) tuples.
test_ref_mappings = ReferenceMapping.read_table_mappings("refs/test_refs.tsv")
test_refs = ReferenceMapping.as_tuples(test_ref_mappings)

In [23]:
# Gather target candidates for each test reference and measure how often the
# inverted index alone retrieves the reference target.
POOL_SIZE = 100  # number of candidates requested per source class

test_cands = []
recalled = 0     # matched references whose target was retrieved by the index
num_matched = 0  # references that have a real target (not "UnMatched")
for s, t in test_refs:
    s_labels = src_annotation_index[s]
    t_cands = [c for c, _ in tgt_inv_annotation_index.idf_select(list(s_labels), pool_size=POOL_SIZE)]
    if t != "UnMatched":
        num_matched += 1
        if t in t_cands:
            recalled += 1
        else:
            # Force the reference target into the pool (perfect-recall assumption).
            # Slicing to POOL_SIZE - 1 only drops a candidate when the pool is full.
            # NOTE(review): the injected target always lands at position 0 - confirm
            # that downstream consumers are not sensitive to candidate order.
            t_cands = [t] + t_cands[:POOL_SIZE - 1]
    test_cands.append((s, t, t_cands))

# Recall of the raw candidate selection over matched references only; the
# denominator is counted from the data rather than hard-coded.
recalled / num_matched if num_matched else 0.0

0.82

In [None]:
# Persist each test reference together with its candidate list as a tab-separated table.
cand_columns = ["SrcEntity", "TgtEntity", "TgtCandidates"]
test_cands_table = pd.DataFrame(data=test_cands, columns=cand_columns)
test_cands_table.to_csv("test_cands.tsv", sep="\t", index=False)