In [4]:
!pip install rdflib pandas rapidfuzz requests pyyaml




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Cell 0 — Setup (install & imports)

In [1]:
# If needed, uncomment to install:
# !pip install rdflib pandas rapidfuzz requests pyyaml

import os, json, math, re, uuid, time, pathlib, textwrap
from typing import List, Dict, Any, Tuple
import pandas as pd
from rdflib import Graph, RDF, RDFS, OWL, SKOS, URIRef
from rapidfuzz import process, fuzz
import numpy as np

print("Imports ready.")


Imports ready.


Cell 1 — Configuration

In [2]:
# --- REQUIRED: set your file paths here ---
# TTL_PATH is parsed as Turtle into the ontology graph; DOC_MD_PATH is read as
# UTF-8 text and split into chunks for the LLM.
TTL_PATH = r"sws.ttl"     # e.g., r"C:\projects\base.ttl"
DOC_MD_PATH = r"swsmd.md"        # e.g., r"C:\projects\usecase.md"

# Ollama local model (e.g., 'llama3.1:8b', 'qwen2.5:7b', etc.)
# Used for the chat calls that perform the mapping.
OLLAMA_MODEL = "mistral"

# Choose "flat_json" (default, simpler) or "jsonld"
# Any value other than "flat_json" (case-insensitive) selects the JSON-LD prompt.
MAPPING_FORMAT = "flat_json"

# Embedding settings
USE_EMBEDDINGS = True                 # turn off to fall back to fuzzy-only
OLLAMA_EMBED_MODEL = "nomic-embed-text"
EMBED_TOPK = 5                        # how many nearest ontology terms to consider
EMBED_MIN = 0.75                      # cosine threshold to accept a match directly
COMBO_ALPHA = 0.65                    # blend: alpha*cosine + (1-alpha)*fuzzy
COMBO_MIN = 0.80                      # accept if blended score >= this
EMBED_BATCH = 64                      # batch size for embedding ontology surface forms


# Output directory
# Created on the spot (no error if it already exists) so later cells can write into it.
OUT_DIR = "./outputs"
os.makedirs(OUT_DIR, exist_ok=True)

print("Config set. Edit TTL_PATH and DOC_MD_PATH before running.")


Config set. Edit TTL_PATH and DOC_MD_PATH before running.


Cell 2 — Load ontology & build lookup index

In [3]:
# Parse the Turtle file at TTL_PATH into the module-level rdflib graph `g`.
# `qname_safe` below reads this global directly.
g = Graph()
g.parse(TTL_PATH, format="turtle")

def qname_safe(uri: URIRef) -> str:
    """Return ``uri`` shortened via the namespace manager of the global graph ``g``.

    If the namespace manager raises for any reason, the plain string form of
    ``uri`` is returned instead.
    """
    try:
        shortened = g.namespace_manager.normalizeUri(uri)
    except Exception:
        shortened = str(uri)
    return shortened

def collect_terms(graph: Graph):
    """Return ``(classes, object_properties, datatype_properties)`` declared in ``graph``.

    Each element is a sorted, de-duplicated list of the subjects typed as
    ``owl:Class``, ``owl:ObjectProperty`` and ``owl:DatatypeProperty`` respectively.
    """
    def _subjects_typed_as(owl_type):
        # Unique subjects carrying an rdf:type triple for the given OWL type.
        return sorted(set(graph.subjects(RDF.type, owl_type)))

    return (
        _subjects_typed_as(OWL.Class),
        _subjects_typed_as(OWL.ObjectProperty),
        _subjects_typed_as(OWL.DatatypeProperty),
    )

def get_label(graph: Graph, term: URIRef) -> str:
    """Return the ``rdfs:label`` of ``term`` as a string.

    When the graph has no (truthy) label for the term, the shortened name
    produced by ``qname_safe`` is used instead.
    """
    label_value = graph.value(term, RDFS.label)
    if label_value:
        return str(label_value)
    return qname_safe(term)

# Sorted lists of the class / object-property / datatype-property subjects found in `g`.
classes, obj_props, data_props = collect_terms(g)

def build_index(graph: Graph, terms: List[URIRef], kind: str):
    """Build one lookup record per ontology term.

    Args:
        graph: Graph the labels and ``skos:altLabel`` values are read from.
        terms: Terms to describe.
        kind: Tag stored verbatim in every record's ``"kind"`` field.

    Returns:
        A list of dicts with the keys ``kind``, ``iri``, ``qname``, ``label``
        and ``alt_labels`` (a list of strings), in the order of ``terms``.
    """
    return [
        {
            "kind": kind,
            "iri": str(term),
            "qname": qname_safe(term),
            "label": get_label(graph, term),
            "alt_labels": [str(alt) for alt in graph.objects(term, SKOS.altLabel)],
        }
        for term in terms
    ]

# Index records for every term, grouped by kind in the order
# classes -> object properties -> data properties.
rows = [
    record
    for term_list, kind_tag in (
        (classes, "class"),
        (obj_props, "object_property"),
        (data_props, "data_property"),
    )
    for record in build_index(g, term_list, kind_tag)
]

# One row per ontology term; missing values become empty strings.
onto_df = pd.DataFrame(rows).fillna("")
print(f"Loaded {len(classes)} classes, {len(obj_props)} object properties, {len(data_props)} data properties.")
onto_df

Loaded 71 classes, 26 object properties, 13 data properties.


Unnamed: 0,kind,iri,qname,label,alt_labels
0,class,http://data.europa.eu/esco/model#Skill,esco:Skill,Skill,[]
1,class,http://www.sws.org/sws#AIAcceptance,:AIAcceptance,AIAcceptance,[]
2,class,http://www.sws.org/sws#Accuracy,:Accuracy,Accuracy,[]
3,class,http://www.sws.org/sws#Actor,:Actor,Actor,[]
4,class,http://www.sws.org/sws#ActorStatus,:ActorStatus,ActorStatus,[]
...,...,...,...,...,...
105,data_property,http://www.sws.org/sws#hasSkillLevel,:hasSkillLevel,hasSkillLevel,[]
106,data_property,http://www.sws.org/sws#hasSocialPerformanceScore,:hasSocialPerformanceScore,hasSocialPerformanceScore,[]
107,data_property,http://www.sws.org/sws#hasTaskComplexity,:hasTaskComplexity,hasTaskComplexity,[]
108,data_property,http://www.sws.org/sws#hasTaskStatusDescriptor,:hasTaskStatusDescriptor,hasTaskStatusDescriptor,[]


Cell 3 — Build fuzzy matchers

In [4]:
def normalize(s: str) -> str:
    """Lower-case ``s``, trim it, and collapse every whitespace run to one space."""
    trimmed = s.lower().strip()
    collapsed = re.sub(r"\s+", " ", trimmed)
    return collapsed

def build_search_space(df: pd.DataFrame):
    """Build the list of searchable surface forms and their ontology metadata.

    For every row the qname, the label and each alternative label are passed
    through ``normalize`` and registered as lookup keys.

    Args:
        df: Ontology index with the columns ``kind``, ``iri``, ``qname``,
            ``label`` and ``alt_labels`` (a list per row; non-list values are
            treated as "no alternative labels").

    Returns:
        ``(vocab, key_to_meta)`` where ``vocab`` is the list of unique,
        non-empty normalized keys in first-seen order, and ``key_to_meta`` maps
        each key to ``{"kind", "iri", "qname", "label"}``. When several rows
        produce the same key, the metadata of the last such row is kept.
    """
    vocab, key_to_meta = [], {}
    seen = set()
    for _, row in df.iterrows():
        alt_labels = row["alt_labels"] if isinstance(row["alt_labels"], list) else []
        # A fixed qname -> label -> alt-label order keeps `vocab` (and any
        # embedding matrix built from it) reproducible between runs.
        for surface in (row["qname"], row["label"], *alt_labels):
            key = normalize(str(surface))
            if not key:
                # An empty surface form cannot be matched or embedded meaningfully.
                continue
            if key not in seen:
                # Register each key once so `vocab` carries no duplicate entries.
                seen.add(key)
                vocab.append(key)
            key_to_meta[key] = {
                "kind": row["kind"],
                "iri": row["iri"],
                "qname": row["qname"],
                "label": row["label"],
            }
    return vocab, key_to_meta

# `vocab` holds the normalized surface forms; `key_to_meta` maps each one back
# to the ontology term it came from. Both are reused by later cells.
vocab, key_to_meta = build_search_space(onto_df)
print(f"Search space built with {len(vocab)} surface forms.")


Search space built with 220 surface forms.


In [5]:
import requests
import numpy as np

def _try_embed_batch(texts, model, timeout=120):
    """Embed ``texts`` with a local Ollama server.

    The batched ``/api/embed`` endpoint (payload key ``input``) is tried first.
    If it is unavailable or its answer does not contain exactly one vector per
    text, the function falls back to one ``/api/embeddings`` request per text
    (payload key ``prompt``).

    Args:
        texts: Sequence of strings to embed.
        model: Name of the Ollama embedding model.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of embedding vectors with ``len(result) == len(texts)``.

    Raises:
        requests.HTTPError: If a per-text fallback request fails.
        ValueError: If the server returns no vector for one of the texts.
    """
    texts = list(texts)
    if not texts:
        return []

    base_url = "http://localhost:11434"

    # Preferred: one batched request.
    try:
        r = requests.post(
            f"{base_url}/api/embed",
            json={"model": model, "input": texts},
            timeout=timeout,
        )
        if r.ok:
            data = r.json()
            embs = data.get("embeddings") if isinstance(data, dict) else None
            # Accept the batch only when it has one usable vector per input text.
            if isinstance(embs, list) and len(embs) == len(texts):
                # Shape 1: { "embeddings": [ { "embedding": [...] }, ... ] }
                if all(isinstance(e, dict) and e.get("embedding") for e in embs):
                    return [e["embedding"] for e in embs]
                # Shape 2: { "embeddings": [[...], [...], ...] }
                if all(isinstance(e, list) and e for e in embs):
                    return embs
    except (requests.RequestException, ValueError):
        # Network trouble or an unparsable body: best-effort, use the fallback below.
        pass

    # Fallback: per-text requests with 'prompt' against the legacy endpoint.
    out = []
    for t in texts:
        rr = requests.post(
            f"{base_url}/api/embeddings",
            json={"model": model, "prompt": t},
            timeout=timeout,
        )
        rr.raise_for_status()
        dj = rr.json()
        vec = dj.get("embedding")
        if vec is None:
            data = dj.get("data")
            if isinstance(data, list) and data and isinstance(data[0], dict):
                vec = data[0].get("embedding")
        if not vec:
            # Fail here with a clear message instead of handing None to numpy later.
            raise ValueError(f"Ollama returned no embedding for text {t!r} (model={model!r})")
        out.append(vec)
    return out

def embed_texts_ollama(texts, model, batch=64):
    """Embed ``texts`` in slices of ``batch`` items and return a float32 array.

    Each slice is sent through ``_try_embed_batch``; the resulting vectors are
    concatenated in input order.
    """
    slice_starts = range(0, len(texts), batch)
    vectors = [
        vector
        for start in slice_starts
        for vector in _try_embed_batch(texts[start:start + batch], model)
    ]
    return np.array(vectors, dtype=np.float32)

def l2_normalize(mat):
    """Return ``mat`` as float32 with every row divided by its L2 norm.

    A small constant (1e-9) is added to each norm so all-zero rows do not
    cause a division by zero.
    """
    rows = np.array(mat, dtype=np.float32)
    row_norms = np.linalg.norm(rows, axis=1, keepdims=True)
    return rows / (row_norms + 1e-9)

# Embedding index over the ontology surface forms in `vocab`:
# one L2-normalized row per surface form, or None when embeddings are disabled.
if not USE_EMBEDDINGS:
    E_onto = None
else:
    print(f"Embedding {len(vocab)} ontology surface forms with {OLLAMA_EMBED_MODEL} ...")
    E_onto = l2_normalize(
        embed_texts_ollama(vocab, OLLAMA_EMBED_MODEL, batch=EMBED_BATCH)
    )
    print("Ontology embedding index ready:", E_onto.shape)


Embedding 220 ontology surface forms with nomic-embed-text ...
Ontology embedding index ready: (220, 768)


Cell 4 — Load Markdown and chunk

In [6]:
# Read the whole Markdown document as UTF-8 text.
with open(DOC_MD_PATH, "r", encoding="utf-8") as f:
    md_text = f.read()

# Greedy paragraph packing: paragraphs (separated by one or more blank lines) are
# appended to the current buffer while the buffer stays within CHUNK_CHAR_LIMIT
# characters; otherwise the buffer is emitted as a chunk and a new one is started.
CHUNK_CHAR_LIMIT = 2500  # adjust if needed
paras = [p.strip() for p in re.split(r"\n{2,}", md_text) if p.strip()]
chunks, buf = [], ""
for p in paras:
    # The +2 accounts for the "\n\n" separator inserted between paragraphs.
    if len(buf) + len(p) + 2 <= CHUNK_CHAR_LIMIT:
        buf = (buf + "\n\n" + p).strip()
    else:
        # Paragraphs are never split: one longer than the limit becomes a
        # chunk of its own that exceeds CHUNK_CHAR_LIMIT.
        if buf: chunks.append(buf)
        buf = p
# Flush whatever is left in the buffer after the last paragraph.
if buf: chunks.append(buf)

print(f"Document split into {len(chunks)} chunks.")


Document split into 22 chunks.


Cell 5 — Ollama helper

In [7]:
import requests

def ollama_chat(model: str, system_prompt: str, user_prompt: str, as_json: bool = True, timeout: int = 120):
    """Call Ollama's ``/api/chat`` endpoint (non-streaming) on localhost.

    Args:
        model: Name of the Ollama chat model.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        as_json: When true, ask the server for JSON output and try to parse the
            reply into a dict.
        timeout: Request timeout in seconds.

    Returns:
        The parsed JSON object (``dict``) when ``as_json`` is true and the
        reply contains one; otherwise the raw reply text (``str``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = "http://localhost:11434/api/chat"
    payload = {
        "model": model,
        "stream": False,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "options": {"temperature": 0.2},
    }
    if as_json:
        # Ollama's JSON mode constrains the model to emit syntactically valid JSON.
        payload["format"] = "json"

    # `json=` serializes the payload and sets the Content-Type header.
    r = requests.post(url, json=payload, timeout=timeout)
    r.raise_for_status()
    content = (r.json().get("message") or {}).get("content") or ""
    if not as_json:
        return content

    # Try the whole reply first, then the outermost {...} span in case the
    # model wrapped the object in extra text.
    candidates = [content]
    m = re.search(r"\{.*\}", content, flags=re.DOTALL)
    if m:
        candidates.append(m.group(0))
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    # Nothing parseable: hand back the raw text so the caller can decide.
    return content

print("Ollama helper ready.")


Ollama helper ready.


Cell 6 — Mapping prompts (Flat JSON default, JSON-LD optional)

In [8]:
# System message sent with every chunk: fixes the model's role (ontology mapper)
# and the two kinds of result it must extract (aligned mentions, new candidates).
SYSTEM_PROMPT = (
    "You are an expert ontology mapper. Given an ontology dictionary and a text chunk, "
    "extract: (A) mentions aligned to existing ontology classes/properties, and (B) new concept candidates. "
    "Return the requested output format exactly."
)

def make_user_prompt(chunk_text: str, ontology_terms: pd.DataFrame, format: str = "flat_json") -> str:
    """Assemble the user prompt for one text chunk.

    The prompt lists up to 300 ontology terms (first rows of ``ontology_terms``),
    then the chunk verbatim, then the output specification and the rules.

    Args:
        chunk_text: Text chunk to be mapped.
        ontology_terms: Frame with the columns ``kind``, ``qname``, ``label``
            and ``alt_labels``.
        format: ``"flat_json"`` selects the flat JSON spec; any other value
            selects the JSON-LD spec.

    Returns:
        The complete prompt string.
    """
    flat_json_spec = """
Return strictly valid JSON with two arrays:
{
  "mapped": [
    {
      "span": "<exact text>",
      "sentence": "<the sentence>",
      "concept_kind": "class|object_property|data_property|instance",
      "ontology_term": "<qname or iri>",
      "confidence": 0-1
    }
  ],
  "new_candidates": [
    {
      "term": "<preferred label>",
      "type_guess": "class|object_property|data_property",
      "definition": "<one-sentence definition inferred from text>",
      "example_sentence": "<verbatim sentence from chunk>",
      "rationale": "<why it seems new>",
      "confidence": 0-1
    }
  ]
}
"""
    jsonld_spec = """
Return strictly valid JSON-LD with an @graph array; also add top-level "new_candidates".
Use @type for class and properties as keys; keep nesting minimal.
{
  "@graph": [ { "@type": "ex:YourClass", "ex:someProperty": "value" } ],
  "new_candidates": [ { "term": "...", "type_guess": "...", "definition": "...", "example_sentence": "...", "confidence": 0-1 } ]
}
"""
    out_spec = flat_json_spec if format == "flat_json" else jsonld_spec

    def _dictionary_line(term_row) -> str:
        # One bullet per term; the "alt" part only appears when alt labels exist.
        alt_source = term_row["alt_labels"]
        alt_text = ", ".join(alt_source) if isinstance(alt_source, list) else ""
        bullet = f"- [{term_row['kind']}] {term_row['qname']} | label: {term_row['label']}"
        if alt_text:
            bullet += f" | alt: {alt_text}"
        return bullet

    term_limit = 300
    dictionary_lines = [
        _dictionary_line(term_row)
        for _, term_row in ontology_terms.head(term_limit).iterrows()
    ]

    sections = [
        "ONTOLOGY DICTIONARY (subset):",
        "\n".join(dictionary_lines),
        "",
        "TEXT CHUNK (verbatim):",
        "<<<",
        f"{chunk_text}",
        "<<<",
        "",
        "OUTPUT FORMAT SPEC:",
        out_spec,
        "",
        "Rules:",
        '- Use only terms that appear in the dictionary for "ontology_term".',
        '- If no matching term exists for a mention, put it under "new_candidates".',
        "- Be conservative; prefer precision over recall.",
        "- Ensure the JSON is syntactically valid.",
        "",
    ]
    return "\n".join(sections)

print("Prompt templates ready.")


Prompt templates ready.


Cell 7 — Run mapping over chunks

In [9]:
def _dict_items(value):
    """Return the dict entries of ``value`` when it is a list, otherwise ``[]``.

    LLM output is not guaranteed to follow the requested schema, so anything
    that is not a list of JSON objects is discarded instead of being passed on.
    """
    if not isinstance(value, list):
        return []
    return [entry for entry in value if isinstance(entry, dict)]


# Accumulators over all chunks: aligned mentions and new-concept candidates.
all_mapped, all_new = [], []
fmt = "flat_json" if MAPPING_FORMAT.lower() == "flat_json" else "jsonld"

for i, ch in enumerate(chunks, 1):
    print(f"Processing chunk {i}/{len(chunks)} ...")
    up = make_user_prompt(ch, onto_df, format=fmt)
    resp = ollama_chat(OLLAMA_MODEL, SYSTEM_PROMPT, up, as_json=True)

    if not isinstance(resp, dict):
        # ollama_chat returns the raw text when no JSON object could be parsed.
        print(f"  Chunk {i}: no JSON object in the model reply; nothing collected.")
        continue

    newc = _dict_items(resp.get("new_candidates"))

    if "mapped" in resp:  # flat_json path
        mapped = _dict_items(resp.get("mapped"))
    else:  # jsonld path → flatten a bit
        mapped = []
        for item in _dict_items(resp.get("@graph")):
            cls = item.get("@type", "")
            for k, v in item.items():
                if k.startswith("@"):
                    # JSON-LD keywords (@type, @id, ...) are not ontology properties.
                    continue
                # Treat scalar and list values uniformly: one row per value.
                for el in (v if isinstance(v, list) else [v]):
                    mapped.append({"span": str(el), "sentence": "", "concept_kind": "object_property", "ontology_term": k, "confidence": 0.6})
            if cls:
                mapped.append({"span": str(cls), "sentence": "", "concept_kind": "class", "ontology_term": str(cls), "confidence": 0.6})

    all_mapped.extend(mapped)
    all_new.extend(newc)

print(f"Collected {len(all_mapped)} mapped mentions and {len(all_new)} new concept candidates.")


Processing chunk 1/22 ...
Processing chunk 2/22 ...
Processing chunk 3/22 ...
Processing chunk 4/22 ...
Processing chunk 5/22 ...
Processing chunk 6/22 ...
Processing chunk 7/22 ...
Processing chunk 8/22 ...
Processing chunk 9/22 ...
Processing chunk 10/22 ...
Processing chunk 11/22 ...
Processing chunk 12/22 ...
Processing chunk 13/22 ...
Processing chunk 14/22 ...
Processing chunk 15/22 ...
Processing chunk 16/22 ...
Processing chunk 17/22 ...
Processing chunk 18/22 ...
Processing chunk 19/22 ...
Processing chunk 20/22 ...
Processing chunk 21/22 ...
Processing chunk 22/22 ...
Collected 158 mapped mentions and 66 new concept candidates.


Cell 8 — Post-process (align & filter)

In [10]:
# Exact-match lookup tables: lower-cased qname / IRI -> ontology row. Built from
# onto_df so an IRI match resolves to the proper qname and kind as well.
_onto_records = onto_df[["kind", "iri", "qname", "label"]].to_dict("records")
_qname_lookup = {str(rec["qname"]).lower(): rec for rec in _onto_records}
_iri_lookup = {str(rec["iri"]).lower(): rec for rec in _onto_records}

# Hoisted so the key list is not rebuilt for every row.
_fuzzy_keys = list(key_to_meta.keys())

# Query text -> normalized embedding; avoids re-embedding repeated spans.
_query_vec_cache = {}


def _clean_text(value) -> str:
    """Return ``value`` as a stripped string; ``None`` and NaN become ``""``."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def pick_meta(term: str, span_text: str):
    """Align one LLM mention with an ontology term.

    Strategy, in order: exact qname/IRI match on ``term``; embedding
    similarity over the top ``EMBED_TOPK`` surface forms, each blended with
    the fuzzy score of that same surface form; plain fuzzy match.

    Args:
        term: The ontology term proposed by the LLM (qname or IRI).
        span_text: The text span the mention was taken from.

    Returns:
        ``{"qname": ..., "kind": ...}`` for the matched ontology term, or
        ``None`` when no strategy produces an acceptable match.
    """
    term_txt = _clean_text(term)
    span_txt = _clean_text(span_text)

    # 1) exact qname/IRI
    if term_txt:
        lowered = term_txt.lower()
        hit = _qname_lookup.get(lowered) or _iri_lookup.get(lowered)
        if hit:
            return {"qname": hit["qname"], "kind": hit["kind"]}

    query = normalize(span_txt or term_txt)  # prefer the span text
    if not query:
        return None

    # 2) embeddings, blended with fuzzy for stability
    if USE_EMBEDDINGS and E_onto is not None and len(vocab):
        qv = _query_vec_cache.get(query)
        if qv is None:
            qv = l2_normalize(embed_texts_ollama([query], OLLAMA_EMBED_MODEL, batch=1))[0]
            _query_vec_cache[query] = qv
        sims = E_onto @ qv
        top_k = max(1, min(int(EMBED_TOPK), len(sims)))
        top_idx = np.argsort(sims)[-top_k:][::-1]

        best = None  # (blended score, vocab key)
        for idx in top_idx:
            key = vocab[int(idx)]
            sim = float(sims[int(idx)])
            # Score the SAME candidate with both signals so the blend never
            # mixes the embedding's favourite with another key's fuzzy score.
            fuzzy_score = fuzz.WRatio(query, key) / 100.0
            blended = COMBO_ALPHA * sim + (1.0 - COMBO_ALPHA) * fuzzy_score
            accepted = (sim >= EMBED_MIN) or (blended >= COMBO_MIN)
            if accepted and (best is None or blended > best[0]):
                best = (blended, key)
        if best is not None:
            meta = key_to_meta.get(best[1])
            if meta:
                return {"qname": meta["qname"], "kind": meta["kind"]}

    # 3) fuzzy fallback
    if _fuzzy_keys:
        cand = process.extractOne(query, _fuzzy_keys, scorer=fuzz.WRatio)
        if cand and cand[1] >= 90:
            meta = key_to_meta[cand[0]]
            return {"qname": meta["qname"], "kind": meta["kind"]}
    return None


mapped_df = pd.DataFrame(all_mapped)
if not mapped_df.empty:
    # The LLM may omit keys; make sure the columns read below exist.
    for required_col in ("span", "ontology_term"):
        if required_col not in mapped_df.columns:
            mapped_df[required_col] = ""

    mapped_df["span_norm"] = mapped_df["span"].astype(str).str.lower().str.strip()
    mapped_df["match_meta"] = [
        pick_meta(term, span)
        for term, span in zip(mapped_df["ontology_term"], mapped_df["span"])
    ]
    # .copy() so the column assignments below act on an independent frame.
    mapped_df = mapped_df[mapped_df["match_meta"].notnull()].copy()
    mapped_df["matched_qname"] = mapped_df["match_meta"].apply(lambda m: m["qname"] if m else "")
    mapped_df["matched_kind"] = mapped_df["match_meta"].apply(lambda m: m["kind"] if m else "")
else:
    mapped_df = pd.DataFrame(columns=[
        "span","sentence","concept_kind","ontology_term","confidence","span_norm","match_meta","matched_qname","matched_kind"
    ])

new_df = pd.DataFrame(all_new).fillna("")

print("Aligned mapped mentions:", len(mapped_df))
print("New concept candidates:", len(new_df))
mapped_df.head(10)


Aligned mapped mentions: 32
New concept candidates: 66


Unnamed: 0,span,sentence,concept_kind,ontology_term,confidence,span_norm,match_meta,matched_qname,matched_kind
1,Operational Equipment Effectiveness (OEE),Packaging material production lines are genera...,class,esco:Performance,0.8,operational equipment effectiveness (oee),"{'qname': ':Operational', 'kind': 'class'}",:Operational,class
7,"less skilled, inexperienced operators",Given the repetitive and demanding nature of m...,instance,esco:HumanActor,0.6,"less skilled, inexperienced operators","{'qname': 'esco:Skill', 'kind': 'class'}",esco:Skill,class
8,Human actor,The addition of the human actor inside DTs pos...,class,http://www.w3.org/ns/activitystreams#Actor,0.8,human actor,"{'qname': ':Actor', 'kind': 'class'}",:Actor,class
23,working skills,advanced automatic equipment presents a differ...,class,http://purl.org/ontology/mo/WorkingSkill,0.8,working skills,"{'qname': 'esco:Skill', 'kind': 'class'}",esco:Skill,class
34,IMA E-CO Flex machine’s current status,1. AI4WORK shall provide a troubleshooting too...,instance,esco:Resource,0.8,ima e-co flex machine’s current status,"{'qname': ':Status', 'kind': 'class'}",:Status,class
41,digital twin,3. The digital twin should model the functioni...,class,DigitalTwin,1.0,digital twin,"{'qname': ':DigitalTwin', 'kind': 'class'}",:DigitalTwin,class
44,AI/robot,The SWS Management component aims to facilitat...,NonHumanActor,:NonHumanActor,0.9,ai/robot,"{'qname': ':NonHumanActor', 'kind': 'class'}",:NonHumanActor,class
45,human,"Depending on the respective work situation, th...",HumanActor,:HumanActor,0.9,human,"{'qname': ':HumanActor', 'kind': 'class'}",:HumanActor,class
46,current level of uncertainty of the AI/robot,"Depending on the respective work situation, th...",data_property,:hasConfidence,0.8,current level of uncertainty of the ai/robot,"{'qname': ':hasConfidence', 'kind': 'data_prop...",:hasConfidence,data_property
47,the current work situation,"Depending on the respective work situation, th...",Context,:Context,0.8,the current work situation,"{'qname': ':Context', 'kind': 'class'}",:Context,class


In [12]:
# Sorted, de-duplicated qnames of the ontology terms actually matched, per kind.
used_classes, used_obj_props, used_data_props = (
    sorted(set(mapped_df.loc[mapped_df["matched_kind"] == kind_tag, "matched_qname"].tolist()))
    for kind_tag in ("class", "object_property", "data_property")
)

def group_candidates(df: pd.DataFrame):
    """Group new-concept candidates by their guessed type.

    Args:
        df: Candidate rows; the columns ``term``, ``type_guess``,
            ``definition``, ``example_sentence``, ``rationale`` and
            ``confidence`` are read when present.

    Returns:
        A dict with up to four keys, in the order ``"class"``,
        ``"object_property"``, ``"data_property"``, ``"unspecified"``. Each
        value is a list of dicts (``term``, ``definition``,
        ``example_sentence``, ``rationale``, ``confidence``) in row order;
        missing fields default to ``""``. Empty groups are omitted. A
        ``type_guess`` that is missing, not a string, or not one of the three
        known values (compared case-insensitively, ignoring surrounding
        whitespace) lands in ``"unspecified"`` so no candidate is lost.
    """
    groups = {}
    if df.empty:
        return groups

    known_types = ("class", "object_property", "data_property")
    item_fields = ("term", "definition", "example_sentence", "rationale", "confidence")
    buckets = {name: [] for name in (*known_types, "unspecified")}

    for record in df.to_dict("records"):
        raw_type = record.get("type_guess", "")
        type_key = raw_type.strip().lower() if isinstance(raw_type, str) else ""
        bucket = type_key if type_key in known_types else "unspecified"
        buckets[bucket].append({field: record.get(field, "") for field in item_fields})

    # Keep only the groups that received at least one candidate.
    groups = {name: items for name, items in buckets.items() if items}
    return groups

groups = group_candidates(new_df)

# --- Markdown outline: reused terms first, then candidate sections ---
dso_lines = ["# Domain-Specific Ontology (Text Outline)\n"]

# Reused sections always show something: the terms, or a "(none)" placeholder.
for heading, reused_terms in (
    ("## Reused Classes", used_classes),
    ("\n## Reused Object Properties", used_obj_props),
    ("\n## Reused Data Properties", used_data_props),
):
    dso_lines.append(heading)
    dso_lines.extend([f"- {name}" for name in reused_terms] or ["- (none)"])

# Candidate sections list one bullet per candidate and stay empty otherwise.
for heading, group_key in (
    ("\n## New Class Candidates", "class"),
    ("\n## New Object Property Candidates", "object_property"),
    ("\n## New Data Property Candidates", "data_property"),
    ("\n## Unspecified-Type Candidates", "unspecified"),
):
    dso_lines.append(heading)
    dso_lines.extend(
        f"- **{item['term']}** — {item['definition']}  (e.g., \"{item['example_sentence']}\")"
        for item in groups.get(group_key, [])
    )

# --- Persist the outline and both tables into OUT_DIR ---
outline_path = os.path.join(OUT_DIR, "dso_outline.md")
with open(outline_path, "w", encoding="utf-8") as f:
    f.write("\n".join(dso_lines))

mapped_csv = os.path.join(OUT_DIR, "mapped_mentions.csv")
new_csv = os.path.join(OUT_DIR, "new_concepts.csv")
mapped_df.to_csv(mapped_csv, index=False, encoding="utf-8")
new_df.to_csv(new_csv, index=False, encoding="utf-8")

print("Wrote:", outline_path)
print("Also wrote:", mapped_csv, "and", new_csv)


Wrote: ./outputs\dso_outline.md
Also wrote: ./outputs\mapped_mentions.csv and ./outputs\new_concepts.csv
