# ontology

> RDF ontology loading and meta-graph navigation for RLM

In [None]:
#| default_exp ontology

## Overview

This module implements Stage 1 of the trajectory: Define the Ontology "Context Model".

### Design Principles

- **Handles, not dumps**: Return graph handles with bounded view operations
- **Meta-graph scaffolding**: Build navigation indexes (labels, hierarchy, properties)
- **Progressive disclosure**: Small summaries guide exploration
- **RLM-compatible**: Works with namespace-explicit `rlm_run()`

### Context Model

From the trajectory document:
> The *root model never gets a graph dump*. It gets a handle name (e.g. `ont`, `res_0`) and uses bounded view operations.

## Imports

In [None]:
#| export
from rdflib import Graph, Namespace, RDF, RDFS, OWL, URIRef, Literal
from pathlib import Path
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from fastcore.basics import AttrDict

## Graph Loading

In [None]:
#| export
def load_ontology(path: str | Path, ns: dict, name: str = 'ont') -> str:
    """Parse an RDF file into a fresh Graph and bind it in `ns` under `name`.

    Args:
        path: Location of the ontology file; handed unchanged to `Graph.parse`.
        ns: Namespace dict that receives the Graph handle (mutated in place).
        name: Key under which the Graph is stored in `ns`.

    Returns:
        One-line summary giving the triple count, the file's base name and
        the handle name.
    """
    graph = Graph()
    graph.parse(path)
    ns[name] = graph

    # Only the base name is reported so the message stays short.
    triple_total = len(graph)
    file_name = Path(path).name
    return f"Loaded {triple_total} triples from {file_name} into '{name}'"

In [None]:
# Test loading prov.ttl
# Smoke test: parse the file into a scratch namespace and check that the handle
# is an rdflib Graph holding at least one triple.
test_ns = {}
result = load_ontology('ontology/prov.ttl', test_ns, name='prov_ont')
print(result)
assert 'prov_ont' in test_ns
assert isinstance(test_ns['prov_ont'], Graph)
assert len(test_ns['prov_ont']) > 0
print(f"✓ Loaded {len(test_ns['prov_ont'])} triples")

## Meta-Graph Navigation

Build navigation scaffolding from a Graph to enable progressive disclosure.
This is what goes in the REPL environment, not the graph itself.

In [None]:
#| export
@dataclass
class GraphMeta:
    """Navigation indexes over an RDF Graph, each built on first access and cached.

    Meant to sit in the REPL namespace so callers work from small views
    (class lists, label lookups, hierarchy maps) instead of the raw graph.
    The set of indexes follows the exploration in dialogs/inspect_tools.ipynb.

    Caches are never invalidated: triples added to or removed from `graph`
    after an index has been read are not reflected in that index.
    """
    graph: Graph
    name: str = 'ont'

    # One cache slot per index below; None means "not computed yet".
    # All slots are excluded from __init__ and from the generated repr.
    _namespaces: dict = field(default=None, init=False, repr=False)
    _classes: list = field(default=None, init=False, repr=False)
    _properties: list = field(default=None, init=False, repr=False)
    _individuals: list = field(default=None, init=False, repr=False)
    _labels: dict = field(default=None, init=False, repr=False)
    _by_label: dict = field(default=None, init=False, repr=False)
    _subs: dict = field(default=None, init=False, repr=False)
    _supers: dict = field(default=None, init=False, repr=False)
    _doms: dict = field(default=None, init=False, repr=False)
    _rngs: dict = field(default=None, init=False, repr=False)

    def _typed_subjects(self, *type_terms) -> list:
        """Return the sorted string forms of all subjects typed with any of `type_terms`."""
        nodes = set()
        for type_term in type_terms:
            nodes.update(self.graph.subjects(RDF.type, type_term))
        return sorted(str(node) for node in nodes)

    def _subclass_pairs(self):
        """Yield (subclass, superclass) strings for rdfs:subClassOf triples whose object is a URIRef."""
        for child, _, parent in self.graph.triples((None, RDFS.subClassOf, None)):
            if isinstance(parent, URIRef):
                yield str(child), str(parent)

    def _subject_to_object(self, predicate) -> dict:
        """Map str(subject) -> str(object) for `predicate`; a later triple replaces an earlier one."""
        mapping = {}
        for subject, _, obj in self.graph.triples((None, predicate, None)):
            mapping[str(subject)] = str(obj)
        return mapping

    @property
    def triple_count(self) -> int:
        """Number of triples currently in the graph (not cached)."""
        return len(self.graph)

    @property
    def namespaces(self) -> dict:
        """Prefix -> namespace string, taken from the graph's prefix bindings."""
        if self._namespaces is None:
            bindings = {}
            for prefix, namespace in self.graph.namespaces():
                bindings[prefix] = str(namespace)
            self._namespaces = bindings
        return self._namespaces

    @property
    def classes(self) -> list:
        """Sorted identifiers of subjects typed owl:Class or rdfs:Class."""
        if self._classes is None:
            self._classes = self._typed_subjects(OWL.Class, RDFS.Class)
        return self._classes

    @property
    def properties(self) -> list:
        """Sorted identifiers of subjects typed as an OWL object/datatype/annotation property or rdf:Property."""
        if self._properties is None:
            self._properties = self._typed_subjects(
                OWL.ObjectProperty,
                OWL.DatatypeProperty,
                OWL.AnnotationProperty,
                RDF.Property,
            )
        return self._properties

    @property
    def individuals(self) -> list:
        """Sorted identifiers of subjects typed owl:NamedIndividual."""
        if self._individuals is None:
            self._individuals = self._typed_subjects(OWL.NamedIndividual)
        return self._individuals

    @property
    def labels(self) -> dict:
        """Label index: subject string -> rdfs:label text (one label kept per subject)."""
        if self._labels is None:
            self._labels = {
                str(subject): str(text)
                for subject, text in self.graph.subject_objects(RDFS.label)
            }
        return self._labels

    @property
    def by_label(self) -> dict:
        """Inverted label index: lower-cased label text -> list of subject strings."""
        if self._by_label is None:
            inverted = {}
            for uri, text in self.labels.items():
                inverted.setdefault(text.lower(), []).append(uri)
            self._by_label = inverted
        return self._by_label

    @property
    def subs(self) -> dict:
        """Hierarchy index: superclass string -> list of direct subclass strings."""
        if self._subs is None:
            children_of = {}
            for child, parent in self._subclass_pairs():
                children_of.setdefault(parent, []).append(child)
            self._subs = children_of
        return self._subs

    @property
    def supers(self) -> dict:
        """Hierarchy index: subclass string -> list of direct superclass strings."""
        if self._supers is None:
            parents_of = {}
            for child, parent in self._subclass_pairs():
                parents_of.setdefault(child, []).append(parent)
            self._supers = parents_of
        return self._supers

    @property
    def doms(self) -> dict:
        """Domain index: property string -> rdfs:domain string (one kept per property)."""
        if self._doms is None:
            self._doms = self._subject_to_object(RDFS.domain)
        return self._doms

    @property
    def rngs(self) -> dict:
        """Range index: property string -> rdfs:range string (one kept per property)."""
        if self._rngs is None:
            self._rngs = self._subject_to_object(RDFS.range)
        return self._rngs

    def summary(self) -> str:
        """Return a five-line overview: triple count, class/property/individual counts, prefixes."""
        header = f"Graph '{self.name}': {self.triple_count:,} triples"
        class_line = f"Classes: {len(self.classes)}"
        property_line = f"Properties: {len(self.properties)}"
        individual_line = f"Individuals: {len(self.individuals)}"
        namespace_line = f"Namespaces: {', '.join(self.namespaces.keys())}"
        return '\n'.join(
            (header, class_line, property_line, individual_line, namespace_line)
        )

In [None]:
# Test GraphMeta with prov ontology
# Reuses the Graph stored under 'prov_ont' in test_ns by the loading test above.
prov_g = test_ns['prov_ont']
meta = GraphMeta(prov_g, name='prov')

# Print the overview plus a few entries from the sorted class/property lists.
print(meta.summary())
print()
print(f"Sample classes (first 5): {meta.classes[:5]}")
print(f"Sample properties (first 5): {meta.properties[:5]}")
print(f"Namespaces: {list(meta.namespaces.keys())}")

## Bounded View Functions

These operate on GraphMeta and return small, bounded summaries.

In [None]:
#| export
def graph_stats(meta: GraphMeta) -> str:
    """Return the overview text produced by `meta.summary()`.

    Args:
        meta: Object whose `summary()` method yields the statistics text.

    Returns:
        The string returned by `meta.summary()`, unchanged.
    """
    stats_text = meta.summary()
    return stats_text

In [None]:
#| export
def search_by_label(meta: GraphMeta, search: str, limit: int = 10) -> list:
    """Find entities whose label contains `search`, ignoring case.

    Args:
        meta: Object exposing a `labels` mapping of URI -> label text.
        search: Text to look for anywhere inside a label.
        limit: Upper bound on the number of results returned.

    Returns:
        List of (URI, label) tuples in the iteration order of `meta.labels`,
        truncated to `limit` entries.
    """
    needle = search.lower()
    hits = []
    for uri, label in meta.labels.items():
        if needle in label.lower():
            hits.append((uri, label))
    return hits[:limit]

In [None]:
# Test search_by_label
# Uses the `meta` object built in the GraphMeta test cell; prints at most 5 hits.
results = search_by_label(meta, 'activity', limit=5)
print(f"Found {len(results)} matches for 'activity':")
for uri, label in results:
    print(f"  {label}: {uri}")

In [None]:
#| export
def describe_entity(meta: GraphMeta, uri: str, limit: int = 20) -> dict:
    """Return a bounded description of one entity.

    Args:
        meta: GraphMeta whose graph contains the entity.
        uri: Identifier of the entity; a plain string or a URIRef.
        limit: Maximum number of outgoing (predicate, object) pairs to
            include. Negative values are treated as 0; None means no bound.

    Returns:
        Dict with keys:
            'uri': the identifier as a plain string,
            'label': the entity's entry in `meta.labels`, or the URI itself
                when it has none,
            'types': list of rdf:type values as strings,
            'comment': first rdfs:comment as a string, or None,
            'outgoing_sample': up to `limit` (predicate, object) string pairs.
    """
    from itertools import islice

    # GraphMeta.labels is keyed by plain str, so look the label up with a
    # plain str whether the caller passed a str or a URIRef.
    uri_text = str(uri)
    entity = URIRef(uri_text)

    label = meta.labels.get(uri_text, uri_text)
    types = [str(t) for t in meta.graph.objects(entity, RDF.type)]

    # Stop pulling triples once `limit` pairs are collected instead of
    # materialising every pair and slicing afterwards.
    stop = None if limit is None else max(limit, 0)
    outgoing = [
        (str(pred), str(obj))
        for pred, obj in islice(meta.graph.predicate_objects(entity), stop)
    ]

    # Only the first comment encountered is reported.
    first_comment = next(iter(meta.graph.objects(entity, RDFS.comment)), None)
    comment = None if first_comment is None else str(first_comment)

    return {
        'uri': uri_text,
        'label': label,
        'types': types,
        'comment': comment,
        'outgoing_sample': outgoing,
    }

In [None]:
# Test describe_entity
# Find the Activity class
# (the URI is hard-coded; `meta` comes from the GraphMeta test cell)
activity_uri = 'http://www.w3.org/ns/prov#Activity'
desc = describe_entity(meta, activity_uri)

print(f"Label: {desc['label']}")
print(f"Types: {desc['types']}")
# 'comment' is None when the entity has no rdfs:comment, hence the fallback text.
print(f"Comment: {desc['comment'][:100]}..." if desc['comment'] else "No comment")
print(f"Outgoing triples: {len(desc['outgoing_sample'])}")

## Additional Exploration Functions

Functions discovered in `dialogs/inspect_tools.ipynb` for deeper ontology exploration.

In [None]:
#| export
def ont_describe(ont: str, uri: str, name: str = 'desc', ns: dict = None) -> str:
    """Collect every triple that mentions `uri` and store the result in `ns[name]`.

    The stored value is a dict with two lists of (subject, predicate, object)
    string triples: 'as_subject' (URI in subject position) and 'as_object'
    (URI in object position).

    Args:
        ont: Key in `ns` of an object that exposes the rdflib Graph as
            `.graph` (for example the GraphMeta stored as '<name>_meta').
        uri: Identifier to describe; a plain string or a URIRef.
        name: Key under which the result dict is stored.
        ns: Namespace dict; this module's globals are used when omitted.

    Returns:
        Summary string with both triple counts.
    """
    if ns is None:
        ns = globals()
    meta = ns[ont]
    target = uri if isinstance(uri, URIRef) else URIRef(uri)

    def _as_strings(pattern):
        # Stringify all triples matching `pattern`.
        return [
            (str(s), str(p), str(obj))
            for s, p, obj in meta.graph.triples(pattern)
        ]

    outgoing = _as_strings((target, None, None))
    incoming = _as_strings((None, None, target))

    ns[name] = {'as_subject': outgoing, 'as_object': incoming}
    return f"Stored {len(outgoing)} + {len(incoming)} triples about '{uri}' into '{name}'"

In [None]:
#| export
def ont_meta(ont: str, name: str = 'meta', ns: dict = None) -> str:
    """Extract prefixes, literal-valued predicates and imports into `ns[name]`.

    The stored value is an AttrDict with:
        prefixes: dict built from the graph's namespace bindings,
        ann_preds: up to 50 predicate strings, sorted alphabetically,
        imports: list of owl:imports targets as strings.

    Args:
        ont: Key in `ns` of an object that exposes the rdflib Graph as
            `.graph` (for example the GraphMeta stored as '<name>_meta').
        name: Key under which the AttrDict is stored.
        ns: Namespace dict; this module's globals are used when omitted.

    Returns:
        Summary string. The annotation-predicate count in it is the total
        found, which may exceed the 50 that are stored.
    """
    if ns is None:
        ns = globals()
    graph = ns[ont].graph

    prefixes = dict(graph.namespaces())

    # "Annotation predicate" here means any predicate that occurs with a
    # Literal object somewhere in the graph.
    ann_preds = {
        str(p)
        for _, p, obj in graph.triples((None, None, None))
        if isinstance(obj, Literal)
    }
    imports = [str(obj) for _, _, obj in graph.triples((None, OWL.imports, None))]

    ns[name] = AttrDict(
        prefixes=prefixes,
        # Sort before truncating so the same 50 are kept on every run;
        # slicing the set directly would depend on its iteration order.
        ann_preds=sorted(ann_preds)[:50],
        imports=imports,
    )
    return f"Stored metadata into '{name}': {len(prefixes)} prefixes, {len(ann_preds)} annotation predicates, {len(imports)} imports"

In [None]:
#| export
def ont_roots(ont: str, name: str = 'roots', ns: dict = None) -> str:
    """Store the classes that have no entry in the superclass index in `ns[name]`.

    Only class identifiers starting with 'http' are considered.

    Args:
        ont: Key in `ns` of an object exposing `.classes` and `.supers`
            (for example the GraphMeta stored as '<name>_meta').
        name: Key under which the list of root class URIs is stored.
        ns: Namespace dict; this module's globals are used when omitted.

    Returns:
        Summary string with the number of roots found.
    """
    if ns is None:
        ns = globals()
    meta = ns[ont]

    with_parent = set(meta.supers)
    root_uris = []
    for cls in meta.classes:
        cls_uri = str(cls)
        if not cls_uri.startswith('http'):
            continue
        if cls_uri in with_parent:
            continue
        root_uris.append(cls_uri)

    ns[name] = root_uris
    return f"Stored {len(root_uris)} root classes into '{name}'"

## Ontology Sense Building

Build a structured "sense document" for an ontology - a summary that helps LLMs understand the ontology well enough to construct SPARQL queries and reason about knowledge graphs.

This uses a **workflow pattern** (not agentic): gather structure programmatically, then use one LLM call to synthesize findings.

In [None]:
#| export
def build_sense(path: str, name: str = 'sense', ns: dict = None) -> str:
    """Build an ontology "sense document" and store it in `ns[name]`.

    Steps:
    1. Load the ontology and create its GraphMeta via `setup_ontology_context`.
    2. Gather metadata, root classes, a two-level hierarchy, the leading
       properties with domain/range labels, and property characteristics.
    3. Make one `llm_query` call to synthesise a prose summary.
    4. Store everything as an AttrDict under `name`.

    The ontology handle name is derived from `name`: '_sense' is stripped,
    any remaining 'sense' becomes 'ont' ('prov_sense' -> 'prov',
    'sense' -> 'ont'). When `name` contains no 'sense' at all, '_ont' is
    appended so the Graph handle and the sense document get different keys.

    Args:
        path: Path to ontology file.
        name: Key for the sense document (default: 'sense').
        ns: Namespace dict that receives all handles. When omitted, a
            private dict is used, so only the returned string is visible
            to the caller.

    Returns:
        Summary string with the number of root branches and properties.
    """
    if ns is None:
        ns = {}

    # Derive ontology name from sense name
    ont_name = name.replace('_sense', '').replace('sense', 'ont')
    if ont_name == name:
        # No 'sense' marker in `name`: without a distinct handle the Graph
        # stored under `ont_name` would be overwritten by the sense document.
        ont_name = f'{name}_ont'

    # Setup ontology context (loads graph + creates GraphMeta)
    setup_ontology_context(path, ns, name=ont_name)

    # Get metadata and roots
    ont_meta(f'{ont_name}_meta', name=f'{ont_name}_metadata', ns=ns)
    ont_roots(f'{ont_name}_meta', name=f'{ont_name}_roots', ns=ns)

    # Get references
    meta_obj = ns[f'{ont_name}_meta']
    metadata = ns[f'{ont_name}_metadata']
    roots = ns[f'{ont_name}_roots']
    graph = meta_obj.graph

    # Build hierarchy (2 levels deep from roots); bounded to 10 roots,
    # 10 children per root and 5 grandchildren per child.
    hier = {}
    for r in roots[:10]:
        lbl = meta_obj.labels.get(r, r)
        children = meta_obj.subs.get(r, [])
        hier[lbl] = {
            meta_obj.labels.get(c, c): [
                meta_obj.labels.get(gc, gc)
                for gc in meta_obj.subs.get(c, [])[:5]
            ]
            for c in children[:10]
        }

    # Extract top properties with domains/ranges (labels where available)
    top_props = []
    for p in meta_obj.properties[:20]:
        if not p.startswith('http'):
            continue
        prop_label = meta_obj.labels.get(p, p)
        dom_uri = meta_obj.doms.get(p, '')
        rng_uri = meta_obj.rngs.get(p, '')
        dom_label = meta_obj.labels.get(dom_uri, dom_uri) if dom_uri else ''
        rng_label = meta_obj.labels.get(rng_uri, rng_uri) if rng_uri else ''
        top_props.append((prop_label, dom_label, rng_label))

    # Detect property characteristics (OWL axioms). `triple in graph` stops
    # at the first match instead of listing every matching triple.
    prop_chars = {}
    for p in meta_obj.properties[:50]:
        if not p.startswith('http'):
            continue
        p_uri = URIRef(p)
        chars = []

        if (p_uri, RDF.type, OWL.TransitiveProperty) in graph:
            chars.append('transitive')

        if (p_uri, RDF.type, OWL.SymmetricProperty) in graph:
            chars.append('symmetric')

        # owl:inverseOf may be asserted from either side of the pair.
        if (p_uri, OWL.inverseOf, None) in graph or (None, OWL.inverseOf, p_uri) in graph:
            chars.append('has_inverse')

        if chars:
            prop_chars[p] = chars

    # Get URI pattern samples. The pattern is the first sample cut at its
    # last '#' or '/', whichever comes later, so hash namespaces such as
    # '.../prov#Activity' yield '.../prov' rather than losing 'prov'.
    uri_sample = [c for c in meta_obj.classes[:5] if c.startswith('http')]
    uri_pattern = ''
    if uri_sample:
        first_uri = uri_sample[0]
        cut = max(first_uri.rfind('#'), first_uri.rfind('/'))
        uri_pattern = first_uri[:cut] if cut >= 0 else first_uri

    # Build prompt for LLM synthesis
    prompt = f"""Analyze this ontology and provide a sense document:

Stats: {len(meta_obj.classes)} classes, {len(meta_obj.properties)} properties, {len(meta_obj.labels)} labels
Prefixes: {list(metadata.prefixes.keys())[:10]}
Annotation predicates: {metadata.ann_preds[:10]}
Root classes: {[meta_obj.labels.get(r, r) for r in roots[:10]]}
Hierarchy (2 levels): {hier}
Top properties (label, domain, range): {top_props[:10]}
Property characteristics: {prop_chars}
URI pattern examples: {uri_sample[:3]}

Provide a concise sense document with:
1. Domain/scope - what is this ontology about?
2. Key branches - main conceptual areas in the hierarchy
3. Important properties - key relationships to know
4. Detected patterns - reification, measurement patterns, part-whole relationships, etc.
5. SPARQL navigation hints - how to effectively query this ontology"""

    # Imported here rather than at module top. The summary is read back from
    # ns['_sense_summary'] below -- presumably llm_query stores its result
    # under `name` in `ns`; confirm against rlm.core.
    from rlm.core import llm_query
    llm_query(prompt, ns=ns, name='_sense_summary')

    # Build structured sense document
    sense_doc = AttrDict(
        ont=ont_name,
        stats={'cls': len(meta_obj.classes), 'props': len(meta_obj.properties), 'lbls': len(meta_obj.labels)},
        prefixes=metadata.prefixes,
        ann_preds=metadata.ann_preds,
        roots=roots,
        hier=hier,
        top_props=top_props,
        prop_chars=prop_chars,
        uri_pattern=uri_pattern,
        summary=ns['_sense_summary']
    )

    ns[name] = sense_doc
    return f"Built sense document into '{name}' with {len(hier)} root branches, {len(top_props)} properties"

In [None]:
#| eval: false
# Test build_sense with PROV ontology
# Note: Requires API key, marked eval:false to avoid CI failures

# Fresh namespace: build_sense loads the ontology itself from the given path.
test_ns = {}
result = build_sense('ontology/prov.ttl', name='prov_sense', ns=test_ns)
print(result)
print()

# Inspect the sense document
# (an AttrDict, so fields are read with attribute access)
sense = test_ns['prov_sense']
print(f"Ontology: {sense.ont}")
print(f"Stats: {sense.stats}")
print(f"Roots: {sense.roots}")
print(f"Root branches: {list(sense.hier.keys())}")
print(f"Top properties (first 3): {sense.top_props[:3]}")
print(f"Property characteristics: {sense.prop_chars}")
print(f"URI pattern: {sense.uri_pattern}")
print()
print("LLM Summary:")
print(sense.summary)

## Integration with RLM

Helper to set up ontology context for `rlm_run()`.

In [None]:
#| export
def setup_ontology_context(path: str | Path, ns: dict, name: str = 'ont') -> str:
    """Load an ontology and populate `ns` with its Graph, GraphMeta and view helpers.

    Keys written to `ns`:
        `name`: the Graph,
        `{name}_meta`: the GraphMeta,
        `{name}_graph_stats`, `{name}_search_by_label`,
        `{name}_describe_entity`: view functions bound to this GraphMeta,
        `graph_stats`, `search_by_label`, `describe_entity`: the same
        helpers without a prefix, for single-ontology convenience.

    The unprefixed keys are rebound on every call, so after loading several
    ontologies they refer to the most recent one; the prefixed keys persist.

    Args:
        path: Path to ontology file.
        ns: Namespace dict (mutated in place).
        name: Base name for the graph handle and the prefixed helpers.

    Returns:
        The load message followed by a line with the class and property counts.
    """
    from functools import partial

    load_msg = load_ontology(path, ns, name=name)

    meta = GraphMeta(ns[name], name=name)
    ns[f"{name}_meta"] = meta

    view_helpers = (
        ('graph_stats', graph_stats),
        ('search_by_label', search_by_label),
        ('describe_entity', describe_entity),
    )

    # Prefixed bindings first, then the unprefixed ones, each with its own
    # partial object.
    for short_name, func in view_helpers:
        ns[f'{name}_{short_name}'] = partial(func, meta)
    for short_name, func in view_helpers:
        ns[short_name] = partial(func, meta)

    class_total = len(meta.classes)
    property_total = len(meta.properties)
    return f"{load_msg}\nCreated meta-graph '{name}_meta' with {class_total} classes, {property_total} properties"

In [None]:
# Test setup for RLM
# Rebinds test_ns to a fresh dict; the exploration-function tests in the next
# cell read 'prov_meta' from it.
test_ns = {}
result = setup_ontology_context('ontology/prov.ttl', test_ns, name='prov')
print(result)
print()
# List every key the setup call added, with the type of the stored value.
print("Namespace contains:")
for k in test_ns.keys():
    print(f"  {k}: {type(test_ns[k]).__name__}")

In [None]:
# Test new exploration functions
# Reuse the test_ns from previous cell with loaded prov ontology
# Note: prov_meta is a GraphMeta object in test_ns

# Test that new indexes work
meta = test_ns['prov_meta']
assert len(meta.by_label) > 0  # inverted label index
assert len(meta.subs) > 0 or len(meta.supers) > 0  # class hierarchy
print(f"✓ New GraphMeta indexes work: by_label has {len(meta.by_label)} entries")

# Test ont_describe (need to pass GraphMeta object as namespace entry)
# The ont_* functions take the *name* of the GraphMeta entry, not the object.
result = ont_describe('prov_meta', 'http://www.w3.org/ns/prov#Activity', name='activity_desc', ns=test_ns)
assert 'activity_desc' in test_ns
print(f"✓ ont_describe works: {result}")

# Test ont_meta  
result = ont_meta('prov_meta', name='prov_metadata', ns=test_ns)
assert 'prov_metadata' in test_ns
print(f"✓ ont_meta works: {result}")

# Test ont_roots
result = ont_roots('prov_meta', name='prov_roots', ns=test_ns)
assert 'prov_roots' in test_ns
print(f"✓ ont_roots works: {result}")

## Test with RLM

Now let's test asking a question about the PROV ontology using `rlm_run()`.

In [None]:
#| eval: false
from rlm.core import rlm_run

# Setup namespace with PROV ontology
ns = {}
setup_ontology_context('ontology/prov.ttl', ns, name='prov')

# Ask a question
# The context is the GraphMeta summary, not the full graph
context = ns['prov_meta'].summary()

# rlm_run's result is unpacked into three values here; `ns` is rebound to the
# third one.
answer, iterations, ns = rlm_run(
    "What is the Activity class in the PROV ontology?",
    context,
    ns=ns,
    max_iters=3
)

print(f"Answer: {answer}")
print(f"Iterations: {len(iterations)}")