In [1]:
!pip install ollama rdflib networkx faiss-cpu pandas numpy rapidfuzz pyshacl

Collecting ollama
  Downloading ollama-0.5.3-py3-none-any.whl.metadata (4.3 kB)
Collecting networkx
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp313-cp313-win_amd64.whl.metadata (5.1 kB)
Collecting pyshacl
  Downloading pyshacl-0.30.1-py3-none-any.whl.metadata (35 kB)
Collecting httpx>=0.27 (from ollama)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic>=2.9 (from ollama)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting owlrl<8,>=7.1.2 (from pyshacl)
  Downloading owlrl-7.1.4-py3-none-any.whl.metadata (3.8 kB)
Collecting prettytable>=3.7.0 (from pyshacl)
  Downloading prettytable-3.16.0-py3-none-any.whl.metadata (33 kB)
Collecting html5rdf<2,>=1.2 (from rdflib[html]!=7.1.2,<8.0,>=7.1.1->pyshacl)
  Downloading html5rdf-1.2.1-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting anyio (from httpx>=0.27->ollama)
  Using cached anyio-4.10.0-py3-none-any.whl.


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
#Cell 1 — Setup
# If needed, uncomment:
# !pip install ollama rdflib networkx faiss-cpu pandas numpy rapidfuzz pyshacl

import os, re, json, time, uuid, glob
from pathlib import Path
from typing import List, Dict, Any
import numpy as np
import pandas as pd
import ollama
import rdflib
from rdflib import Graph
from rdflib.namespace import RDF, RDFS, OWL, SKOS, XSD
import networkx as nx
import faiss
from rapidfuzz import fuzz

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    na = np.linalg.norm(a) + 1e-9
    nb = np.linalg.norm(b) + 1e-9
    return float((a @ b) / (na * nb))




In [6]:
# Cell 2 — Config
DATA_DIR = Path("data")
ONTOLOGY_TTL = DATA_DIR / "sws.ttl"
CORPUS_DIR = DATA_DIR / "corpus"

LLM_MODEL   = os.environ.get("OLLAMA_LLM", "mistral")
EMBED_MODEL = os.environ.get("OLLAMA_EMBED", "nomic-embed-text")

OUT_DIR = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)

# Retrieval / alignment knobs
K_ENTITY = 60
K_HYPEREDGE = 60
ENTITY_ALIGN_MIN = 0.55
NEW_CONCEPT_FREQ_MIN = 2

DATA_DIR.mkdir(exist_ok=True); CORPUS_DIR.mkdir(exist_ok=True)

print("LLM:", LLM_MODEL, "| EMBED:", EMBED_MODEL)
print("Data:", DATA_DIR.resolve(), "| Output:", OUT_DIR.resolve())


LLM: mistral | EMBED: nomic-embed-text
Data: D:\RAG\onto-rag-koli\data | Output: D:\RAG\onto-rag-koli\outputs


In [7]:
# Cell 3 — Check Ollama
def check_ollama():
    try:
        ms = ollama.list()
        names = [m.get("name") for m in ms.get("models", [])]
        print("Ollama models:", names)
        if LLM_MODEL not in names:
            print(f"Note: pull chat model ->  ollama pull {LLM_MODEL}")
        if EMBED_MODEL not in names:
            print(f"Note: pull embed model -> ollama pull {EMBED_MODEL}")
        return True
    except Exception as e:
        print("Could not reach Ollama. Run: `ollama serve`")
        raise
_ = check_ollama()


Ollama models: [None, None, None, None, None, None, None, None]
Note: pull chat model ->  ollama pull mistral
Note: pull embed model -> ollama pull nomic-embed-text


In [8]:
# Cell 4 — Load upper ontology (TTL → catalogs)
g = Graph(); g.parse(str(ONTOLOGY_TTL))
RDFS = rdflib.namespace.RDFS; OWL = rdflib.namespace.OWL; SKOS = rdflib.namespace.SKOS

def lit(s): return str(s) if s is not None else ""

def label(graph, node):
    for l in graph.objects(node, RDFS.label): return lit(l)
    return None

def alts(graph, node):
    return [lit(a) for a in graph.objects(node, SKOS.altLabel)]

classes = [{"iri": str(c), "label": label(g,c) or str(c), "alt": alts(g,c)}
           for c in g.subjects(rdflib.RDF.type, OWL.Class)]

obj_props = [{"iri": str(p), "label": label(g,p) or str(p), "alt": alts(g,p)} 
             for p in g.subjects(rdflib.RDF.type, OWL.ObjectProperty)]

dt_props  = [{"iri": str(p), "label": label(g,p) or str(p), "alt": alts(g,p)} 
             for p in g.subjects(rdflib.RDF.type, OWL.DatatypeProperty)]

print(f"Loaded classes={len(classes)}, obj_props={len(obj_props)}, dt_props={len(dt_props)}")
ONTO = {"classes": classes, "obj_props": obj_props, "dt_props": dt_props}


Loaded classes=71, obj_props=26, dt_props=13


In [12]:
# Cell 5 — Load Markdown corpus

# Drop a demo file if empty:
if not any(CORPUS_DIR.glob("*.md")):
    print("No docs found")
    
DOCS = [{"path": str(p), "text": p.read_text(encoding="utf-8", errors="ignore")}
        for p in sorted(CORPUS_DIR.glob("*.md"))]
print("Docs:", len(DOCS))


Docs: 1


In [None]:
from string import Template
import json5
!pip install json5

In [21]:
# Cell 6 — N-ary extraction (hyperedges + entities) via Ollama
EXTRACTION_PROMPT_TMPL = Template("""
You extract n-ary relational facts (hyperedges) and entities.

Return ONE JSON object ONLY. No prose, no extra blocks, no code fences.

{
  "hyperedges": [
    {
      "text": "...",
      "score": 0.0,
      "entities": [
        { "name": "...", "type": "...", "description": "...", "key_score": 0.0 }
      ]
    }
  ]
}

- Split the input into coherent knowledge fragments (as hyperedges).
- Include all entities per hyperedge.
- Keep scores as floats.
- Same language as the input.

INPUT:
---
$chunk
---
""")


def _strip_code_fences(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        # remove leading and trailing fenced blocks
        s = s.strip("`")
        # often starts with ```json
        if s.lower().startswith("json"):
            s = s[4:].lstrip()
    return s

def _clean_common_json_issues(s: str) -> str:
    # normalize smart quotes
    s = s.replace("“", "\"").replace("”", "\"").replace("’", "'")
    # remove trailing commas before } or ]
    s = re.sub(r',\s*([}\]])', r'\1', s)
    # remove BOM if present
    s = s.lstrip("\ufeff")
    return s

def _extract_json_objects(s: str) -> list[str]:
    """Extract top-level {...} blocks from a string that may contain multiple JSON objects."""
    objs, in_str, esc, depth, start = [], False, False, 0, None
    for i, ch in enumerate(s):
        if in_str:
            if esc:
                esc = False
            elif ch == '\\':
                esc = True
            elif ch == '"':
                in_str = False
            continue
        else:
            if ch == '"':
                in_str = True
            elif ch == '{':
                if depth == 0:
                    start = i
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0 and start is not None:
                    objs.append(s[start:i+1])
                    start = None
    return objs

def _try_load_json(s: str):
    try:
        return json.loads(s)
    except Exception:
        if json5 is not None:
            try:
                return json5.loads(s)
            except Exception:
                pass
    # final attempt: clean again then try std json
    s2 = _clean_common_json_issues(s)
    try:
        return json.loads(s2)
    except Exception:
        if json5 is not None:
            try:
                return json5.loads(s2)
            except Exception:
                pass
    raise

def ollama_json(prompt: str, model=LLM_MODEL, tries=2) -> dict:
    last_err = None
    for _ in range(tries + 1):
        # Prefer strict JSON mode; some models may ignore it
        try:
            resp = ollama.generate(
                model=model,
                prompt=prompt,
                options={"temperature": 0.2, "format": "json"}
            )
        except Exception:
            resp = ollama.generate(model=model, prompt=prompt, options={"temperature": 0.2})

        raw = resp.get("response", "").strip()
        txt = _strip_code_fences(raw)
        txt = _clean_common_json_issues(txt)

        # 1) If it’s a single JSON object, parse directly
        try:
            obj = _try_load_json(txt)
            if isinstance(obj, dict) and "hyperedges" in obj:
                return obj
        except Exception as e:
            last_err = e

        # 2) Otherwise, merge multiple top-level JSON objects
        merged = {"hyperedges": []}
        ok = False
        for chunk in _extract_json_objects(txt):
            try:
                one = _try_load_json(_clean_common_json_issues(chunk))
                if isinstance(one, dict) and "hyperedges" in one and isinstance(one["hyperedges"], list):
                    merged["hyperedges"].extend(one["hyperedges"])
                    ok = True
            except Exception as e:
                last_err = e
                continue
        if ok:
            return merged

        time.sleep(0.3)

    # Debug help: write the raw text so you can inspect the failing case
    dbg_path = OUT_DIR / "failed_ollama_response.txt"
    dbg_path.write_text(raw, encoding="utf-8")
    raise RuntimeError(f"Failed to parse JSON from model output. Last error: {last_err}. "
                       f"Saved raw response to: {dbg_path}")



def extract_hypergraph(text: str, max_chars=2500):
    # simple line-based chunking
    lines, chunks, buf, n = text.splitlines(), [], [], 0
    for L in lines:
        if n + len(L) + 1 > max_chars and buf:
            chunks.append("\n".join(buf)); buf=[]; n=0
        buf.append(L); n += len(L)+1
    if buf: chunks.append("\n".join(buf))

    out = []
    for ch in chunks:
        prompt = EXTRACTION_PROMPT_TMPL.substitute(chunk=ch)
        obj = ollama_json(prompt)
        for he in obj.get("hyperedges", []):
            he["id"] = str(uuid.uuid4())
            for ent in he.get("entities", []):
                ent["id"] = str(uuid.uuid4())
            out.append(he)
    return out

# cache
CACHE = OUT_DIR / "extraction.json"
if CACHE.exists():
    data = json.loads(CACHE.read_text())
else:
    data = {"docs":[]}
    for d in DOCS:
        hyperedges = extract_hypergraph(d["text"])
        data["docs"].append({"path": d["path"], "hyperedges": hyperedges})
    CACHE.write_text(json.dumps(data, indent=2, ensure_ascii=False))

print("Hyperedges per doc:", [len(d["hyperedges"]) for d in data["docs"]])


Hyperedges per doc: [197]


In [22]:
# Cell 7 — Build bipartite hypergraph + embeddings + FAISS
import networkx as nx

G = nx.Graph()
ENTITY_NODES, HYPER_NODES = {}, {}

def add_hyperedge(doc_path, he):
    hid = he["id"]
    HYPER_NODES[hid] = {"id": hid, "text": he.get("text",""), "score": float(he.get("score",0)), "doc": doc_path}
    G.add_node(hid, kind="hyperedge")
    for ent in he.get("entities", []):
        eid = ent["id"]
        if eid not in ENTITY_NODES:
            ENTITY_NODES[eid] = {"id": eid, "name": ent.get("name",""), "type": ent.get("type","Entity"),
                                 "description": ent.get("description",""), "key_score": float(ent.get("key_score",0))}
            G.add_node(eid, kind="entity")
        G.add_edge(hid, eid, kind="MENTIONS")

for doc in data["docs"]:
    for he in doc["hyperedges"]:
        add_hyperedge(doc["path"], he)

print(f"Entities={len(ENTITY_NODES)} Hyperedges={len(HYPER_NODES)} Edges={G.number_of_edges()}")

def embed_texts(texts: List[str], model=EMBED_MODEL):
    vecs=[]
    for t in texts:
        r = ollama.embeddings(model=model, prompt=t)
        vecs.append(np.array(r["embedding"], dtype=np.float32))
    return np.vstack(vecs) if vecs else np.zeros((0,768), dtype=np.float32)

entity_ids = list(ENTITY_NODES.keys())
hyper_ids  = list(HYPER_NODES.keys())

E_TEXTS = [ENTITY_NODES[e]["name"] + " :: " + ENTITY_NODES[e]["description"] for e in entity_ids]
H_TEXTS = [HYPER_NODES[h]["text"] for h in hyper_ids]

E_MAT = embed_texts(E_TEXTS)
H_MAT = embed_texts(H_TEXTS)

def build_index(mat):
    if mat.size == 0: return None
    d = mat.shape[1]
    index = faiss.IndexFlatIP(d)
    matn = mat / (np.linalg.norm(mat,axis=1,keepdims=True)+1e-9)
    index.add(matn.astype(np.float32))
    return index

E_INDEX = build_index(E_MAT)
H_INDEX = build_index(H_MAT)

def embed_query(q:str):
    r = ollama.embeddings(model=EMBED_MODEL, prompt=q)
    return np.array(r["embedding"], dtype=np.float32)

def faiss_search(index, mat, qvec, k=10):
    qn = qvec / (np.linalg.norm(qvec)+1e-9)
    D, I = index.search(qn[np.newaxis,:].astype(np.float32), k)
    return D[0], I[0]


Entities=620 Hyperedges=197 Edges=620


In [23]:
# Cell 8 — Dual retrieval + expansion
def retrieve_entities(query:str, k=K_ENTITY):
    if E_INDEX is None: return []
    qv = embed_query(query)
    D,I = faiss_search(E_INDEX, E_MAT, qv, k=min(k, len(entity_ids)))
    out=[]
    for d,i in zip(D,I):
        if i<0: continue
        eid = entity_ids[int(i)]
        out.append({"id": eid, "score": float(d), **ENTITY_NODES[eid]})
    return out

def retrieve_hyperedges(query:str, k=K_HYPEREDGE):
    if H_INDEX is None: return []
    qv = embed_query(query)
    D,I = faiss_search(H_INDEX, H_MAT, qv, k=min(k, len(hyper_ids)))
    out=[]
    for d,i in zip(D,I):
        if i<0: continue
        hid = hyper_ids[int(i)]
        out.append({"id": hid, "score": float(d), **HYPER_NODES[hid]})
    return out

def expand_from_entities(eids: List[str]):
    facts=[]
    for eid in eids:
        for nbr in G.neighbors(eid):
            if G.nodes[nbr].get("kind")=="hyperedge":
                ents=[n for n in G.neighbors(nbr) if G.nodes[n].get("kind")=="entity"]
                facts.append({"hyperedge": HYPER_NODES[nbr], "entities":[ENTITY_NODES[x] for x in ents]})
    return facts

def expand_from_hyperedges(hids: List[str]):
    facts=[]
    for hid in hids:
        ents=[n for n in G.neighbors(hid) if G.nodes[n].get("kind")=="entity"]
        facts.append({"hyperedge": HYPER_NODES[hid], "entities":[ENTITY_NODES[x] for x in ents]})
    return facts

def fused_retrieval(query:str, k_entity=K_ENTITY, k_hyper=K_HYPEREDGE):
    ents = retrieve_entities(query, k_entity)
    hyps = retrieve_hyperedges(query, k_hyper)
    eids = [e["id"] for e in ents]
    hids = [h["id"] for h in hyps]
    facts = expand_from_entities(eids) + expand_from_hyperedges(hids)
    uniq, out = set(), []
    for f in facts:
        hid = f["hyperedge"]["id"]
        if hid not in uniq:
            uniq.add(hid); out.append(f)
    return ents, hyps, out

# smoke test
e,h,f = fused_retrieval("hypertension creatinine male mild elevation", 10, 10)
print(len(e), len(h), len(f))




10 10 15


In [24]:
# Cell 9 — Prepare ontology label embeddings
def flatten_labels(items):
    out=[]
    for it in items:
        variants = [it.get("label")] + it.get("alt", [])
        variants = [v for v in variants if v]
        text = " || ".join(dict.fromkeys(variants)) if variants else it["iri"]
        out.append({"iri": it["iri"], "text": text})
    return out

CLASS_LABELS = flatten_labels(ONTO["classes"])
OBJPROP_LABELS = flatten_labels(ONTO["obj_props"])
DTPROP_LABELS  = flatten_labels(ONTO["dt_props"])

def embed_list_texts(rows): 
    return embed_texts([r["text"] for r in rows]) if rows else np.zeros((0,0), dtype=np.float32)

CLS_MAT = embed_list_texts(CLASS_LABELS)
OP_MAT  = embed_list_texts(OBJPROP_LABELS)
DP_MAT  = embed_list_texts(DTPROP_LABELS)

def build_idx(mat):
    if mat.size==0: return None
    idx = faiss.IndexFlatIP(mat.shape[1])
    idx.add((mat/(np.linalg.norm(mat,axis=1,keepdims=True)+1e-9)).astype(np.float32))
    return idx

CLS_IDX = build_idx(CLS_MAT)
OP_IDX  = build_idx(OP_MAT)
DP_IDX  = build_idx(DP_MAT)

def nearest(mat, idx, qvec, k=5):
    if idx is None or mat.size==0: return []
    qn = qvec / (np.linalg.norm(qvec)+1e-9)
    D,I = idx.search(qn[np.newaxis,:].astype(np.float32), k)
    return list(zip(D[0].tolist(), I[0].tolist()))


In [25]:
# Cell 10 — Alignment helpers
from rapidfuzz import fuzz

def lexical_sim(a:str, b:str)->float:
    if not a or not b: return 0.0
    return 0.01 * fuzz.token_set_ratio(a.lower(), b.lower())  # 0..1

def align_entity_to_class(ent:Dict[str,Any], topk=5, alpha=0.65):
    text = (ent.get("name","") + " :: " + ent.get("description","")).strip()
    qv = embed_query(text)
    nns = nearest(CLS_MAT, CLS_IDX, qv, k=topk)
    out=[]
    for score, i in nns:
        cls = CLASS_LABELS[int(i)]
        lex = lexical_sim(text, cls["text"])
        comb = float(alpha*score + (1-alpha)*lex)
        out.append({"class_iri": cls["iri"], "class_text": cls["text"], "embed": float(score), "lex": float(lex), "score": comb})
    return sorted(out, key=lambda x: x["score"], reverse=True)

def suggest_property(ent_type_or_label:str, prefer_dt=False, topk=3):
    if prefer_dt:
        pool, mat, idx = DTPROP_LABELS, DP_MAT, DP_IDX
    else:
        pool, mat, idx = OBJPROP_LABELS, OP_MAT, OP_IDX
    if mat.size==0 or idx is None: return []
    qv = embed_query(ent_type_or_label)
    nns = nearest(mat, idx, qv, k=topk)
    return [{"prop_iri": pool[i]["iri"], "prop_text": pool[i]["text"], "embed": float(s)} for s,i in nns]


In [26]:
def iri_safe(s:str)->str:
    return re.sub(r'[^A-Za-z0-9_]+', '_', s).strip('_') or ("X_"+uuid.uuid4().hex[:6])

BASE = "http://example.org/onto#"

def propose_axioms_for_fact(fact:Dict[str,Any], cls_threshold=ENTITY_ALIGN_MIN):
    he = fact["hyperedge"]; ents = fact["entities"]
    situation_id = "Situation_" + he["id"].split("-")[0]
    abox, tbox, new_classes = [], [], []

    abox.append(f":{situation_id} a :ClinicalSituation .")

    bindings=[]
    for e in ents:
        label = e["name"] or e["type"]
        eid = f"{e['id'].split('-')[0]}_{iri_safe(label)[:24]}"
        best = align_entity_to_class(e, topk=5)
        if best and best[0]["score"] >= cls_threshold:
            cls_iri = best[0]["class_iri"]
        else:
            new_cls = f":{iri_safe(label)}"
            tbox.append(f"{new_cls} a owl:Class ; rdfs:label \"{label}\" .")
            cls_iri = f"{BASE}{iri_safe(label)}"
            new_classes.append({"proposed": cls_iri, "label": label})
        abox.append(f":{eid} a <{cls_iri}> ; rdfs:label \"{label}\" .")
        bindings.append({"id": eid, "label": label, "type": e.get("type","")})

    for b in bindings:
        prefer_dt = "measure" in (b["type"] or b["label"]).lower()
        props = suggest_property(b["type"] or b["label"], prefer_dt=prefer_dt, topk=1)
        if props:
            piri = props[0]["prop_iri"]
        else:
            piri = f"{BASE}relatedTo"
            tbox.append(f"<{piri}> a owl:ObjectProperty ; rdfs:label \"related to\" .")
        abox.append(f":{situation_id} <{piri}> :{b['id']} .")

    abox.append(f":{situation_id} rdfs:comment \"{he['text'].replace('\"','\\\"')}\" .")
    return {"abox":"\n".join(abox), "tbox":"\n".join(sorted(set(tbox))), "situation": situation_id, "new_classes": new_classes}

# Try on first fused fact
_,_,facts = fused_retrieval("hypertension creatinine mild male", 10, 10)
if facts:
    demo = propose_axioms_for_fact(facts[0])
    print(demo["abox"][:400], "\n---\n", demo["tbox"][:400])


:Situation_be2f0654 a :ClinicalSituation .
:9552f7e6_Industry_5_0_paradigm a <http://example.org/onto#Industry_5_0_paradigm> ; rdfs:label "Industry 5.0 paradigm" .
:c207a65b_DTs a <http://www.sws.org/sws#DigitalTwin> ; rdfs:label "DTs" .
:15a3699d_human_actor a <http://www.sws.org/sws#Actor> ; rdfs:label "human actor" .
:be7e7596_physical_parameters a <http://example.org/onto#physical_parameters>  
---
 :Industry_5_0_paradigm a owl:Class ; rdfs:label "Industry 5.0 paradigm" .
:physical_parameters a owl:Class ; rdfs:label "physical parameters" .


In [27]:
from collections import Counter

def best_align_score(e): 
    a = align_entity_to_class(e, topk=3)
    return a[0]["score"] if a else 0.0

def discover_new_concepts(freq_min=NEW_CONCEPT_FREQ_MIN, score_max=ENTITY_ALIGN_MIN):
    counter, ents_by_name = Counter(), {}
    for h in HYPER_NODES.values():
        for eid in [n for n in G.neighbors(h["id"]) if G.nodes[n]["kind"]=="entity"]:
            ent = ENTITY_NODES[eid]; key = (ent["name"] or ent["type"]).strip()
            ents_by_name[key] = ent; counter[key]+=1
    proposals=[]
    for name, freq in counter.items():
        e = ents_by_name[name]; s = best_align_score(e)
        if freq >= freq_min and s < score_max:
            qv = embed_query(name); nns = nearest(CLS_MAT, CLS_IDX, qv, k=1)
            parent = CLASS_LABELS[nns[0][1]]["iri"] if nns else f"{BASE}Entity"
            proposals.append({"label": name, "freq": freq, "best_align": s, "parent_iri": parent})
    return sorted(proposals, key=lambda x:(-x["freq"], x["best_align"]))

NEW_CONCEPTS = discover_new_concepts()
pd.DataFrame(NEW_CONCEPTS)[:10]


Unnamed: 0,label,freq,best_align,parent_iri
0,AI4Work,7,0.392436,http://www.sws.org/sws#Task
1,AI4Work components,7,0.418128,http://www.sws.org/sws#HumanActor
2,AI,7,0.510613,http://www.sws.org/sws#AIAcceptance
3,SWS Management,5,0.449268,http://www.sws.org/sws#Operational
4,HDT,4,0.425655,http://www.sws.org/sws#HardConstraint
5,virtual assistant,4,0.450721,http://www.sws.org/sws#HumanActor
6,SWS Management component,4,0.461684,http://www.sws.org/sws#Task
7,manufacturing,4,0.474152,http://www.sws.org/sws#Productivity
8,Operators,4,0.498976,http://www.sws.org/sws#Operational
9,human,4,0.519578,http://www.sws.org/sws#HumanActor


In [28]:
def export_updates_for_query(query:str, top_facts=10):
    _,_,facts = fused_retrieval(query, K_ENTITY, K_HYPEREDGE)
    facts = facts[:top_facts]
    abox_all, tbox_all, new_rows = [], [], []
    for f in facts:
        ax = propose_axioms_for_fact(f)
        if ax["abox"]: abox_all.append(ax["abox"])
        if ax["tbox"]: tbox_all.append(ax["tbox"])
        new_rows += ax["new_classes"]

    abox_ttl = "@prefix : <http://example.org/onto#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix owl: <http://www.w3.org/2002/07/owl#> .\n\n" + "\n\n".join(abox_all)
    tbox_ttl = "@prefix : <http://example.org/onto#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix owl: <http://www.w3.org/2002/07/owl#> .\n\n" + "\n\n".join(sorted(set(tbox_all)))

    (OUT_DIR / "abox.ttl").write_text(abox_ttl)
    (OUT_DIR / "tbox.ttl").write_text(tbox_ttl)
    pd.DataFrame(NEW_CONCEPTS).to_csv(OUT_DIR / "new_concepts.csv", index=False)
    pd.DataFrame(new_rows).to_csv(OUT_DIR / "new_classes_from_facts.csv", index=False)
    print("Wrote:", (OUT_DIR/"abox.ttl").resolve())
    print("Wrote:", (OUT_DIR/"tbox.ttl").resolve())
    print("Wrote:", (OUT_DIR/"new_concepts.csv").resolve())

export_updates_for_query("hypertension serum creatinine male mild elevation", top_facts=5)


Wrote: D:\RAG\onto-rag-koli\outputs\abox.ttl
Wrote: D:\RAG\onto-rag-koli\outputs\tbox.ttl
Wrote: D:\RAG\onto-rag-koli\outputs\new_concepts.csv
