In [1]:
import rdflib
from rdflib.namespace import RDF, OWL, RDFS
from rdflib import URIRef, Literal
from rapidfuzz import fuzz, process
# these tools which are taken from the rdflib will help with the handeling of the RDF imports

# form the RDF graphs it takes all of the unique predicates and then using that we can get the schema elements that we will use
# such as "label", "date", "author", and so on.
def extract_properties(graph):
    return set(str(p) for _, p, _ in graph)

# this function helps with the readability for the mathcing by cleaning the URIs or literals where they are placed after / and #
# for example like "https://website.com#example" where it will convert it to the "example"
def clean_uri(uri):
    if isinstance(uri, URIRef):
        return str(uri).split("/")[-1].split("#")[-1]
    elif isinstance(uri, Literal):
        return str(uri)
    else:
        return str(uri)

# what this function does is take two lists and finds the most similar list 2 to the list 1 by using the fuzzy method that even works
# when order and form of words are different from each other. after that it will complete will give if matching was successful if the 
# threshold that is set is meet.
def match_labels(labels1, labels2, threshold = 50):
    matches = []
    for label in labels1:
        result = process.extractOne(label, labels2, scorer=fuzz.token_sort_ratio)
        if result is not None:
            match, score, _ = result
            if score >= threshold:
                matches.append((label, match, score))
            
    return matches

# this function allows to get from the given etities rdfs label values but if they are not found then it will take the clean URIs instead
# because thats the next best thing to use. this method is comfortable if for the alignment you want to use the rdf labels instead of the URI
def get_labels(graph, entity):
    labels = []
    for _, _, label in graph.triples((entity, RDFS.label, None)):
        labels.append(str(label))
    if not labels:
        labels.append(str(entity).split("/")[-1].split("#")[-1])
    return labels

# this will load under the ontology1 the first ontology file in the rdf graph format
ontology1 = rdflib.Graph()
ontology1.parse("clariah-tools.ttl", format="turtle")

# similar as the ontology1 the ontology2 will do the same
ontology2 = rdflib.Graph()
ontology2.parse("code-lib.ttl", format="turtle")

# it will extract all of the predicates URIs from the ontologies
properties1 = extract_properties(ontology1)
properties2 = extract_properties(ontology2)

# this will use the function "clean_uri" to make it readable for the matching process
prop_labels1 = [clean_uri(p) for p in properties1]
prop_labels2 = [clean_uri(p) for p in properties2]

# this will use the fuzzy string matching between two ontology sets for the property labels
property_matches = match_labels(prop_labels1, prop_labels2)

# this will print the final matched classes with the paired labels and their scores based on the similarities in the percentages
for p1, p2, score in property_matches:
    print(f"Matched\n" f"{p1} ↔ {p2}\n" f"(score: {score})\n")


Matched
http://schema.org/codeRepository ↔ https://schema.org/produces
(score: 74.57627118644068)

Matched
https://codemeta.github.io/terms/readme ↔ https://codemeta.github.io/terms/referencePublication
(score: 78.26086956521739)

Matched
http://schema.org/repository ↔ https://schema.org/author
(score: 79.24528301886792)

Matched
http://schema.org/name ↔ https://schema.org/name
(score: 97.77777777777777)

Matched
http://schema.org/text ↔ https://schema.org/name
(score: 84.44444444444444)

Matched
http://schema.org/license ↔ https://schema.org/name
(score: 83.33333333333334)

Matched
https://w3id.org/software-types#executableName ↔ https://w3id.org/software-iodata#producesData
(score: 70.32967032967032)

Matched
http://schema.org/releaseNotes ↔ https://schema.org/produces
(score: 77.19298245614034)

Matched
https://codemeta.github.io/terms/contIntegration ↔ https://codemeta.github.io/terms/referencePublication
(score: 79.20792079207921)

Matched
http://schema.org/review ↔ https://schema

In [2]:
import pandas as pd
from rapidfuzz import fuzz, process

threshold = 50
rows = []

for label1 in prop_labels1:
    res = process.extractOne(label1, prop_labels2, scorer=fuzz.token_sort_ratio)
    
    if res:
        label2, score, _ = res
    else:
        label2, score = "—", 0

    rows.append({
        "File 1": "clariah-tools.ttl",
        "File 2": "code-lib.ttl" if label2 != "—" else "",
        "Property from File 1": label1,
        "Property from File 2": label2,
        "Match Score (%)": score,
        "Matched": "Yes" if score >= threshold else "No"
    })

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df_schema = pd.DataFrame(rows)
df_schema

Unnamed: 0,File 1,File 2,Property from File 1,Property from File 2,Match Score (%),Matched
0,clariah-tools.ttl,code-lib.ttl,http://schema.org/codeRepository,https://schema.org/produces,74.576271,Yes
1,clariah-tools.ttl,code-lib.ttl,https://codemeta.github.io/terms/readme,https://codemeta.github.io/terms/referencePubl...,78.26087,Yes
2,clariah-tools.ttl,code-lib.ttl,http://schema.org/repository,https://schema.org/author,79.245283,Yes
3,clariah-tools.ttl,code-lib.ttl,http://schema.org/name,https://schema.org/name,97.777778,Yes
4,clariah-tools.ttl,code-lib.ttl,http://schema.org/text,https://schema.org/name,84.444444,Yes
5,clariah-tools.ttl,code-lib.ttl,http://schema.org/license,https://schema.org/name,83.333333,Yes
6,clariah-tools.ttl,code-lib.ttl,https://w3id.org/software-types#executableName,https://w3id.org/software-iodata#producesData,70.32967,Yes
7,clariah-tools.ttl,code-lib.ttl,http://schema.org/releaseNotes,https://schema.org/produces,77.192982,Yes
8,clariah-tools.ttl,code-lib.ttl,https://codemeta.github.io/terms/contIntegration,https://codemeta.github.io/terms/referencePubl...,79.207921,Yes
9,clariah-tools.ttl,code-lib.ttl,http://schema.org/review,https://schema.org/name,80.851064,Yes
