In [None]:
import rdflib
from rdflib.namespace import RDF, OWL, RDFS
from rdflib import URIRef, Literal
from rapidfuzz import fuzz, process

File1 = "clariah-tools.ttl"
File2 = "code-lib.ttl"
setThreshold  = 80

def extract_properties(graph):

    props = set()

    for s, _, _ in graph.triples((None, RDF.type, OWL.ObjectProperty)):
        props.add(str(s))
    for s, _, _ in graph.triples((None, RDF.type, OWL.DatatypeProperty)):
        props.add(str(s))
    for s, _, _ in graph.triples((None, RDF.type, RDF.Property)):
        props.add(str(s))
    if not props:
        props = set(str(p) for p in graph.predicates())

    return props

def clean_uri(uri):
    txt = str(uri)
    return txt.split("#")[-1].split("/")[-1]

def get_prefix(graph, uri):
    qn = graph.namespace_manager.qname(URIRef(uri))
    if ':' in qn:
        return qn.split(':', 1)[0]
    else:
        return qn

def match_labels(labels1, labels2, threshold=setThreshold):
    matches = []
    for label in labels1:
        result = process.extractOne(label, labels2, scorer=fuzz.token_sort_ratio)
        if result:
            match, score, _ = result
            if score >= threshold:
                matches.append((label, match, score))
    return matches

def get_labels(graph, property):
    labels = [str(lbl) for _, _, lbl in graph.triples((URIRef(property), RDFS.label, None))]
    if not labels:
        labels = [property.split("/")[-1].split("#")[-1]]
    return labels

extraction1 = rdflib.Graph()
extraction2 = rdflib.Graph()
extraction1.parse(File1, format="turtle")
extraction2.parse(File2, format="turtle")

props1 = extract_properties(extraction1)
props2 = extract_properties(extraction2)

labels1 = [clean_uri(p) for p in props1]
labels2 = [clean_uri(p) for p in props2]

matches = match_labels(labels1, labels2)

for l1, l2, score in matches:
    uri1 = next(p for p in props1 if clean_uri(p) == l1)
    uri2 = next(p for p in props2 if clean_uri(p) == l2)

    prefix1 = get_prefix(extraction1, uri1)
    prefix2 = get_prefix(extraction2, uri2)

    print(f"Matched: {l1} ↔ {l2} (score: {score})")
    print(f"In {File1}: {prefix1}:{clean_uri(uri1)}  ←  {uri1}")
    print(f"In {File2}: {prefix2}:{clean_uri(uri2)}  ←  {uri2}")
    print("\n Triples using each property:\n")

    print(f" → {File1}:")
    for s, p, o in extraction1.triples((None, URIRef(uri1), None)):
        print(f"{s} {p} {o}")
    print(f"\n → {File2}:")
    for s, p, o in extraction2.triples((None, URIRef(uri2), None)):
        print(f"{s} {p} {o}")

    print("\n" + "-"*60 + "\n")

Matched: programmingLanguage ↔ programmingLanguage (score: 100.0)
In clariah-tools.ttl: schema1:programmingLanguage  ←  http://schema.org/programmingLanguage
In code-lib.ttl: sdo:programmingLanguage  ←  https://schema.org/programmingLanguage

 Triples using each property:

 → clariah-tools.ttl:
https://tools.clariah.nl/alpino-service/2.4.1 http://schema.org/programmingLanguage Python
https://tools.clariah.nl/annorepo-client/0.1.3 http://schema.org/programmingLanguage Python
https://tools.clariah.nl/asrservice/0.3 http://schema.org/programmingLanguage Python
https://tools.clariah.nl/auchann/0.2.0 http://schema.org/programmingLanguage Python
https://tools.clariah.nl/chamd/0.5.12 http://schema.org/programmingLanguage Python
https://tools.clariah.nl/alpinograph/1.0.5 http://schema.org/programmingLanguage Go
https://tools.clariah.nl/alud/2.14.0 http://schema.org/programmingLanguage Go
https://tools.clariah.nl/cesar/unknown http://schema.org/programmingLanguage https://tools.clariah.nl/stub/

In [None]:
import pandas as pd


threshold = setThreshold
rows = []

for label1 in labels1:
    
    res = process.extractOne(label1, labels2, scorer=fuzz.token_sort_ratio)
    if res:
        label2, score, _ = res
    else:
        label2, score = "—", 0
    
    uri1 = next(p for p in props1 if clean_uri(p) == label1)
    prefix1 = get_prefix(extraction1, uri1)

    if label2 != "—":
        uri2 = next(p for p in props2 if clean_uri(p) == label2)
        prefix2 = get_prefix(extraction2, uri2)
    else:
        prefix2 = ""

    rows.append({
        "File 1": "clariah-tools.ttl",
        "File 2": "code-lib.ttl" if label2 != "—" else "",
        "Prefix File 1": prefix1,
        "Property from File 1": label1,
        "Prefix File 2": prefix2,
        "Property from File 2": label2,
        "Match Score (%)": score,
        "Matched": "Yes" if score >= threshold else "No"
    })

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.DataFrame(rows)

df


Unnamed: 0,File 1,File 2,Prefix File 1,Property from File 1,Prefix File 2,Property from File 2,Match Score (%),Matched
0,clariah-tools.ttl,code-lib.ttl,schema1,programmingLanguage,sdo,programmingLanguage,100.0,Yes
1,clariah-tools.ttl,code-lib.ttl,schema1,softwareHelp,dct,isReferencedBy,38.461538,No
2,clariah-tools.ttl,code-lib.ttl,schema1,codeRepository,dct,creator,47.619048,No
3,clariah-tools.ttl,code-lib.ttl,codemeta,developmentStatus,codemeta,referencePublication,32.432432,No
4,clariah-tools.ttl,code-lib.ttl,schema1,dateModified,sdo,maintainer,36.363636,No
5,clariah-tools.ttl,code-lib.ttl,schema1,softwareRequirements,dct,requires,57.142857,No
6,clariah-tools.ttl,code-lib.ttl,codemeta,contIntegration,dct,creator,45.454545,No
7,clariah-tools.ttl,code-lib.ttl,schema1,repository,dct,creator,58.823529,No
8,clariah-tools.ttl,code-lib.ttl,dct,relation,dct,creator,66.666667,No
9,clariah-tools.ttl,code-lib.ttl,ns1,executableName,codemeta,referencePublication,41.176471,No


In [14]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

matched_df   = df[df['Matched'] == 'Yes']
unmatched_df = df[df['Matched'] == 'No']

print("Matched Properties")
display(matched_df)

print("Unmatched Properties")
display(unmatched_df)

Matched Properties


Unnamed: 0,File 1,File 2,Prefix File 1,Property from File 1,Prefix File 2,Property from File 2,Match Score (%),Matched
0,clariah-tools.ttl,code-lib.ttl,schema1,programmingLanguage,sdo,programmingLanguage,100.0,Yes
12,clariah-tools.ttl,code-lib.ttl,codemeta,referencePublication,codemeta,referencePublication,100.0,Yes
13,clariah-tools.ttl,code-lib.ttl,schema1,author,sdo,author,100.0,Yes
23,clariah-tools.ttl,code-lib.ttl,rdf,type,rdf,type,100.0,Yes
48,clariah-tools.ttl,code-lib.ttl,schema1,name,sdo,name,100.0,Yes
50,clariah-tools.ttl,code-lib.ttl,schema1,funding,sdo,funding,100.0,Yes
59,clariah-tools.ttl,code-lib.ttl,schema1,producer,sdo,producer,100.0,Yes
60,clariah-tools.ttl,code-lib.ttl,schema1,maintainer,sdo,maintainer,100.0,Yes


Unmatched Properties


Unnamed: 0,File 1,File 2,Prefix File 1,Property from File 1,Prefix File 2,Property from File 2,Match Score (%),Matched
1,clariah-tools.ttl,code-lib.ttl,schema1,softwareHelp,dct,isReferencedBy,38.461538,No
2,clariah-tools.ttl,code-lib.ttl,schema1,codeRepository,dct,creator,47.619048,No
3,clariah-tools.ttl,code-lib.ttl,codemeta,developmentStatus,codemeta,referencePublication,32.432432,No
4,clariah-tools.ttl,code-lib.ttl,schema1,dateModified,sdo,maintainer,36.363636,No
5,clariah-tools.ttl,code-lib.ttl,schema1,softwareRequirements,dct,requires,57.142857,No
6,clariah-tools.ttl,code-lib.ttl,codemeta,contIntegration,dct,creator,45.454545,No
7,clariah-tools.ttl,code-lib.ttl,schema1,repository,dct,creator,58.823529,No
8,clariah-tools.ttl,code-lib.ttl,dct,relation,dct,creator,66.666667,No
9,clariah-tools.ttl,code-lib.ttl,ns1,executableName,codemeta,referencePublication,41.176471,No
10,clariah-tools.ttl,code-lib.ttl,owl,sameAs,sdo,name,60.0,No


In [15]:
# import pandas as pd

# matched_df   = df[df['Matched'] == 'Yes']
# unmatched_df = df[df['Matched'] == 'No']

# matched_df.to_csv('matched_table.csv')
# unmatched_df.to_csv('unmatched_table.csv')
# df.to_csv('df_table.csv')

In [16]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

matched_df   = df[df['Matched'] == 'Yes']
unmatched_df = df[df['Matched'] == 'No']

total = len(df)
matched = len(matched_df)
unmatched = len(unmatched_df)

if total > 0:
    matched_pct = (matched / total) * 100
    unmatched_pct = (unmatched / total) * 100
else:
    matched_pct = 0
    unmatched_pct = 0

summary_df = pd.DataFrame({
    "Category": ["Matched", "Unmatched", "Total"],
    "Count": [matched, unmatched, total],
    "Percentage": [f"{matched_pct:.2f}"" %", f"{unmatched_pct:.2f}"" %", "100.00 %"]
})

print("Matching Summary")
display(summary_df)

Matching Summary


Unnamed: 0,Category,Count,Percentage
0,Matched,8,13.11 %
1,Unmatched,53,86.89 %
2,Total,61,100.00 %
