In [1]:
import rdflib
from rdflib.namespace import RDF, OWL, RDFS
from rdflib import URIRef, Literal
from rapidfuzz import fuzz, process
# these tools which are taken from the rdflib will help with the handeling of the RDF imports


# what it does it look in the RDF graphs and looks at the extracted entities which havbe rdf types. then what it 
# does is collect the URIs which represent the classes or any defined types in the graph. this will allow to align 
# acros the ontologies by identifing the concepts that help with this.
def extract_entities(graph):
    entities = set()
    for s, p, o in graph.triples((None, RDF.type, None)):
        entities.add(str(s))
    return entities

# this function helps with the readability for the mathcing by cleaning the URIs or literals where they are placed after / and #
# for example like "https://website.com#example" where it will convert it to the "example"
def clean_uri(uri):
    if isinstance(uri, URIRef):
        return str(uri).split("/")[-1].split("#")[-1]
    elif isinstance(uri, Literal):
        return str(uri)
    else:
        return str(uri)

# what this function does is take two lists and finds the most similar list 2 to the list 1 by using the fuzzy method that even works
# when order and form of words are different from each other. after that it will complete will give if matching was successful if the 
# threshold that is set is meet.
def match_labels(labels1, labels2, threshold = 50):
    matches = []
    for label in labels1:
        result = process.extractOne(label, labels2, scorer=fuzz.token_sort_ratio)
        if result is not None:
            match, score, _ = result
            if score >= threshold:
                matches.append((label, match, score))
    return matches

# this function allows to get from the given etities rdfs label values but if they are not found then it will take the clean URIs instead
# because thats the next best thing to use. this method is comfortable if for the alignment you want to use the rdf labels instead of the URI
def get_labels(graph, entity):
    labels = []
    for _, _, label in graph.triples((entity, RDFS.label, None)):
        labels.append(str(label))
    if not labels:
        labels.append(str(entity).split("/")[-1].split("#")[-1])
    return labels

# this will load under the ontology1 the first ontology file in the rdf graph format
ontology1 = rdflib.Graph()
ontology1.parse("clariah-tools.ttl", format="turtle")

# similar as the ontology1 the ontology2 will do the same
ontology2 = rdflib.Graph()
ontology2.parse("code-lib.ttl", format="turtle")

# it will extract the class types from the onotlogy1 and ontology2
classes1 = extract_entities(ontology1)
classes2 = extract_entities(ontology2)

# this will use the function "clean_uri" to make it readable for the matching process
labels1 = [clean_uri(c) for c in classes1]
labels2 = [clean_uri(c) for c in classes2]

# it will match the ontology1 and ontology2
matches = match_labels(labels1, labels2)

# this will print the final matched classes with the paired labels and their scores based on the similarities in the percentages
for l1, l2, score in matches:
    print(f"Matched\n" f"{l1} ↔ {l2}\n" f"with score {score}\n")


Matched
https://doi.org/10.5281/zenodo.5984171 ↔ https://doi.org/10.5281/zenodo.11225732
with score 85.71428571428572

Matched
https://github.com/UtrechtUniversity/ricgraph/blob/main/README.md ↔ https://github.com/UtrechtUniversity/streetview-greenery
with score 69.42148760330579

Matched
https://orcid.org/0000-0001-7218-7642 ↔ https://doi.org/10.1016/j.lanepe.2023.100749
with score 54.32098765432099

Matched
https://iso639-3.sil.org/code/eng ↔ https://osf.io/gmcjv/
with score 55.55555555555556

Matched
https://docs.rs/stam ↔ https://osf.io/7semh/
with score 68.29268292682926

Matched
https://github.com/CLARIAH/grlc/wiki ↔ https://github.com/rivm-syso/DIAPER
with score 59.154929577464785

Matched
https://tools.clariah.nl/asr_nl/0.6.2 ↔ https://osf.io/8mw5s/
with score 51.72413793103448

Matched
https://doi.org/10.5281/zenodo.3250720 ↔ https://doi.org/10.5281/zenodo.11225732
with score 90.9090909090909

Matched
https://orcid.org/0000-0001-9510-0802 ↔ https://doi.org/10.7910/DVN/P39QGO
w

In [2]:
import pandas as pd


threshold = 50
rows = []

for label1 in labels1:
    
    res = process.extractOne(label1, labels2, scorer=fuzz.token_sort_ratio)
    if res:
        label2, score, _ = res
    else:
        label2, score = "—", 0

    rows.append({
        "File 1": "clariah-tools.ttl",
        "File 2": "code-lib.ttl" if label2 != "—" else "",
        "Entity from File 1": label1,
        "Entity from File 2": label2,
        "Match Score (%)": score,
        "Matched": "Yes" if score >= threshold else "No"
    })

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.DataFrame(rows)

df

Unnamed: 0,File 1,File 2,Entity from File 1,Entity from File 2,Match Score (%),Matched
0,clariah-tools.ttl,code-lib.ttl,https://doi.org/10.5281/zenodo.5984171,https://doi.org/10.5281/zenodo.11225732,85.714286,Yes
1,clariah-tools.ttl,code-lib.ttl,https://github.com/UtrechtUniversity/ricgraph/...,https://github.com/UtrechtUniversity/streetvie...,69.421488,Yes
2,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/MarkDVerhagen/Dutch_healthc...,41.791045,No
3,clariah-tools.ttl,code-lib.ttl,https://orcid.org/0000-0001-7218-7642,https://doi.org/10.1016/j.lanepe.2023.100749,54.320988,Yes
4,clariah-tools.ttl,code-lib.ttl,https://annorepo-python-client.readthedocs.io/...,https://github.com/MarkDVerhagen/Dutch_healthc...,40.28777,No
5,clariah-tools.ttl,code-lib.ttl,https://iso639-3.sil.org/code/eng,https://osf.io/gmcjv/,55.555556,Yes
6,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/UtrechtUniversity/streetvie...,40.625,No
7,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/PerlineDemange/CBS-MR,42.718447,No
8,clariah-tools.ttl,code-lib.ttl,https://docs.rs/stam,https://osf.io/7semh/,68.292683,Yes
9,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/PerlineDemange/CBS-MR,41.176471,No


In [3]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

matched_df   = df[df['Matched'] == 'Yes']
unmatched_df = df[df['Matched'] == 'No']

print("Matched Entities")
display(matched_df)

print("Unmatched Entities")
display(unmatched_df)

Matched Entities


Unnamed: 0,File 1,File 2,Entity from File 1,Entity from File 2,Match Score (%),Matched
0,clariah-tools.ttl,code-lib.ttl,https://doi.org/10.5281/zenodo.5984171,https://doi.org/10.5281/zenodo.11225732,85.714286,Yes
1,clariah-tools.ttl,code-lib.ttl,https://github.com/UtrechtUniversity/ricgraph/...,https://github.com/UtrechtUniversity/streetvie...,69.421488,Yes
3,clariah-tools.ttl,code-lib.ttl,https://orcid.org/0000-0001-7218-7642,https://doi.org/10.1016/j.lanepe.2023.100749,54.320988,Yes
5,clariah-tools.ttl,code-lib.ttl,https://iso639-3.sil.org/code/eng,https://osf.io/gmcjv/,55.555556,Yes
8,clariah-tools.ttl,code-lib.ttl,https://docs.rs/stam,https://osf.io/7semh/,68.292683,Yes
18,clariah-tools.ttl,code-lib.ttl,https://github.com/CLARIAH/grlc/wiki,https://github.com/rivm-syso/DIAPER,59.15493,Yes
20,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/asr_nl/0.6.2,https://osf.io/8mw5s/,51.724138,Yes
21,clariah-tools.ttl,code-lib.ttl,https://doi.org/10.5281/zenodo.3250720,https://doi.org/10.5281/zenodo.11225732,90.909091,Yes
22,clariah-tools.ttl,code-lib.ttl,https://orcid.org/0000-0001-9510-0802,https://doi.org/10.7910/DVN/P39QGO,53.521127,Yes
23,clariah-tools.ttl,code-lib.ttl,https://doi.org/10.5281/zenodo.4011801,https://doi.org/10.5281/zenodo.7443895,86.842105,Yes


Unmatched Entities


Unnamed: 0,File 1,File 2,Entity from File 1,Entity from File 2,Match Score (%),Matched
2,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/MarkDVerhagen/Dutch_healthc...,41.791045,No
4,clariah-tools.ttl,code-lib.ttl,https://annorepo-python-client.readthedocs.io/...,https://github.com/MarkDVerhagen/Dutch_healthc...,40.28777,No
6,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/UtrechtUniversity/streetvie...,40.625,No
7,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/PerlineDemange/CBS-MR,42.718447,No
9,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/PerlineDemange/CBS-MR,41.176471,No
10,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/PerlineDemange/CBS-MR,44.230769,No
11,clariah-tools.ttl,code-lib.ttl,https://folia.readthedocs.io/en/latest/form.html,https://osf.io/7semh/,49.275362,No
12,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/PerlineDemange/CBS-MR,43.564356,No
13,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/PerlineDemange/CBS-MR,43.396226,No
14,clariah-tools.ttl,code-lib.ttl,https://tools.clariah.nl/commandlineapplicatio...,https://github.com/MarkDVerhagen/Dutch_healthc...,45.801527,No
