# ontology

> RDF ontology loading and meta-graph navigation for RLM

In [None]:
#| default_exp ontology

## Overview

This module implements Stages 1-2 of the trajectory: Define the Ontology "Context Model" and provide bounded view primitives for progressive disclosure.

**Stage 1**: Meta-graph scaffolding with navigation indexes  
**Stage 2**: Bounded view primitives for safe graph exploration

### Design Principles

- **Handles, not dumps**: Return graph handles with bounded view operations
- **Meta-graph scaffolding**: Build navigation indexes (labels, hierarchy, properties)
- **Progressive disclosure**: Small summaries guide exploration
- **RLM-compatible**: Works with namespace-explicit `rlm_run()`

### Context Model

From the trajectory document:
> The *root model never gets a graph dump*. It gets a handle name (e.g. `ont`, `res_0`) and uses bounded view operations.

## Imports

In [None]:
#| export
from rdflib import Graph, Namespace, RDF, RDFS, OWL, URIRef, Literal, SKOS, DCTERMS, FOAF
from pathlib import Path
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from fastcore.basics import AttrDict
from itertools import islice

# Additional namespaces for ontology metadata (Widoco guide)
VANN = Namespace('http://purl.org/vocab/vann/')
DC = Namespace('http://purl.org/dc/elements/1.1/')

## Graph Loading

In [None]:
#| export
def load_ontology(path: str | Path, ns: dict, name: str = 'ont') -> str:
    """Parse an RDF ontology file and expose it in `ns` as a Graph handle.

    Args:
        path: Location of the ontology file (.ttl, .rdf, .owl)
        ns: Namespace dict that receives the parsed Graph
        name: Key under which the Graph is stored in `ns`

    Returns:
        One-line summary with the triple count, file name and handle name
    """
    graph = Graph()
    graph.parse(path)
    ns[name] = graph

    # Report only the file name, not the whole path, to keep the summary short
    filename = Path(path).name
    triple_count = len(graph)
    return f"Loaded {triple_count} triples from {filename} into '{name}'"

In [None]:
# Smoke-test load_ontology against the bundled PROV ontology
test_ns = {}
result = load_ontology('ontology/prov.ttl', test_ns, name='prov_ont')
print(result)
assert 'prov_ont' in test_ns
assert isinstance(test_ns['prov_ont'], Graph)
assert len(test_ns['prov_ont']) > 0
# The success marker was mis-encoded ("‚úì"); it is meant to be a check mark
print(f"✓ Loaded {len(test_ns['prov_ont'])} triples")

Loaded 1664 triples from prov.ttl into 'prov_ont'
‚úì Loaded 1664 triples


## Meta-Graph Navigation

Build navigation scaffolding from a Graph to enable progressive disclosure.
This is what goes in the REPL environment, not the graph itself.

In [None]:
#| export
# Label-bearing predicates to consult for each bound namespace prefix (Widoco guide).
# Entries are visited in insertion order when GraphMeta.labels fills in labels
# for subjects that have no rdfs:label.
NAMESPACE_LABEL_MAPPINGS = dict(
    skos=[SKOS.prefLabel, SKOS.altLabel],
    dc=[DC.title],
    dcterms=[DCTERMS.title],
    foaf=[FOAF.name],
)

@dataclass
class GraphMeta:
    """Meta-graph navigation scaffolding for an RDF Graph.

    This is REPL-resident and provides bounded views over the graph.
    Indexes discovered in dialogs/inspect_tools.ipynb exploration.

    Each index is computed on first access and cached on the instance, so
    triples added to `graph` after an index was built are not reflected in it.
    """
    graph: Graph
    name: str = 'ont'

    # Lazily computed caches; None means "not built yet"
    _namespaces: dict = field(default=None, init=False, repr=False)
    _classes: list = field(default=None, init=False, repr=False)
    _properties: list = field(default=None, init=False, repr=False)
    _individuals: list = field(default=None, init=False, repr=False)
    _labels: dict = field(default=None, init=False, repr=False)
    _by_label: dict = field(default=None, init=False, repr=False)
    _subs: dict = field(default=None, init=False, repr=False)
    _supers: dict = field(default=None, init=False, repr=False)
    _doms: dict = field(default=None, init=False, repr=False)
    _rngs: dict = field(default=None, init=False, repr=False)
    _pred_freq: Counter = field(default=None, init=False, repr=False)

    # ---- internal helpers -------------------------------------------------

    def _typed_uris(self, *rdf_types) -> list:
        """Sorted URI strings of all subjects typed as any of `rdf_types`.

        Only URIRef subjects are kept: blank nodes (e.g. anonymous class
        expressions) have no stable identifier and are not URIs.
        """
        found = set()
        for rdf_type in rdf_types:
            for subj in self.graph.subjects(RDF.type, rdf_type):
                if isinstance(subj, URIRef):
                    found.add(str(subj))
        return sorted(found)

    @staticmethod
    def _label_rank(node) -> tuple:
        """Sort key for label candidates: untagged first, then English, then the rest."""
        lang = getattr(node, 'language', None)
        if not lang:
            tier = 0
        elif lang.lower().startswith('en'):
            tier = 1
        else:
            tier = 2
        return (tier, str(node))

    @staticmethod
    def _node_rank(node) -> tuple:
        """Sort key for domain/range candidates: named nodes before anonymous ones."""
        return (0 if isinstance(node, URIRef) else 1, str(node))

    def _best_object_index(self, predicate, rank) -> dict:
        """Map subject -> one object string for `predicate`.

        When a subject has several objects, the one with the smallest `rank`
        wins, so the result does not depend on triple iteration order.
        """
        best = {}
        for subj, obj in self.graph.subject_objects(predicate):
            key = str(subj)
            candidate = (rank(obj), str(obj))
            if key not in best or candidate < best[key]:
                best[key] = candidate
        return {key: text for key, (_, text) in best.items()}

    def _build_hierarchy(self) -> None:
        """Fill both subclass indexes from a single pass over rdfs:subClassOf."""
        subs = defaultdict(list)
        supers = defaultdict(list)
        for child, _, parent in self.graph.triples((None, RDFS.subClassOf, None)):
            # Anonymous superclasses (restrictions, unions) are not navigable by URI
            if isinstance(parent, URIRef):
                subs[str(parent)].append(str(child))
                supers[str(child)].append(str(parent))
        self._subs = {uri: sorted(children) for uri, children in subs.items()}
        self._supers = {uri: sorted(parents) for uri, parents in supers.items()}

    # ---- public indexes ---------------------------------------------------

    @property
    def triple_count(self) -> int:
        """Total number of triples in graph."""
        return len(self.graph)

    @property
    def namespaces(self) -> dict:
        """Get namespace prefix bindings (prefix -> namespace URI string)."""
        if self._namespaces is None:
            self._namespaces = {prefix: str(ns) for prefix, ns in self.graph.namespaces()}
        return self._namespaces

    @property
    def classes(self) -> list:
        """Get all OWL/RDFS classes (URIs only, sorted)."""
        if self._classes is None:
            self._classes = self._typed_uris(OWL.Class, RDFS.Class)
        return self._classes

    @property
    def properties(self) -> list:
        """Get all properties (URIs only, sorted)."""
        if self._properties is None:
            self._properties = self._typed_uris(
                OWL.ObjectProperty,
                OWL.DatatypeProperty,
                OWL.AnnotationProperty,
                RDF.Property,
            )
        return self._properties

    @property
    def individuals(self) -> list:
        """Get all named individuals (URIs only, sorted)."""
        if self._individuals is None:
            self._individuals = self._typed_uris(OWL.NamedIndividual)
        return self._individuals

    @property
    def labels(self) -> dict:
        """Get label index: URI -> label string.

        Uses prefix-guided label indexing per Widoco guide:
        - Always indexes rdfs:label
        - Conditionally indexes namespace-specific label properties
          (skos:prefLabel, dc:title, etc.) based on bound namespaces

        rdfs:label takes precedence over the namespace-specific predicates.
        When one predicate supplies several labels for a subject, an untagged
        label is preferred, then an English one, then the alphabetically first.
        """
        if self._labels is None:
            labels = self._best_object_index(RDFS.label, self._label_rank)

            # Prefix-guided indexing for bound namespaces
            for prefix, predicates in NAMESPACE_LABEL_MAPPINGS.items():
                if prefix not in self.namespaces:
                    continue
                for pred in predicates:
                    for uri, text in self._best_object_index(pred, self._label_rank).items():
                        # Never override a label found by an earlier predicate
                        labels.setdefault(uri, text)

            # Publish only the finished index so a failure cannot cache a partial one
            self._labels = labels
        return self._labels

    @property
    def by_label(self) -> dict:
        """Get inverted label index: lower-cased label_text -> list of URIs."""
        if self._by_label is None:
            inv = defaultdict(list)
            for uri, lbl in self.labels.items():
                inv[lbl.lower()].append(uri)
            self._by_label = dict(inv)
        return self._by_label

    @property
    def subs(self) -> dict:
        """Get subclass relationships: superclass_uri -> sorted list of subclass_uris."""
        if self._subs is None:
            self._build_hierarchy()
        return self._subs

    @property
    def supers(self) -> dict:
        """Get superclass relationships: subclass_uri -> sorted list of superclass_uris."""
        if self._supers is None:
            self._build_hierarchy()
        return self._supers

    @property
    def doms(self) -> dict:
        """Get property domains: property_uri -> domain_uri.

        Only one domain is kept per property; a named domain is preferred over
        an anonymous one, ties are broken alphabetically.
        """
        if self._doms is None:
            self._doms = self._best_object_index(RDFS.domain, self._node_rank)
        return self._doms

    @property
    def rngs(self) -> dict:
        """Get property ranges: property_uri -> range_uri.

        Only one range is kept per property; a named range is preferred over
        an anonymous one, ties are broken alphabetically.
        """
        if self._rngs is None:
            self._rngs = self._best_object_index(RDFS.range, self._node_rank)
        return self._rngs

    @property
    def pred_freq(self) -> Counter:
        """Get predicate frequency counts (cached)."""
        if self._pred_freq is None:
            self._pred_freq = Counter(
                str(p) for _, p, _ in self.graph.triples((None, None, None))
            )
        return self._pred_freq

    def summary(self) -> str:
        """Generate a summary of the graph for display."""
        lines = [
            f"Graph '{self.name}': {self.triple_count:,} triples",
            f"Classes: {len(self.classes)}",
            f"Properties: {len(self.properties)}",
            f"Individuals: {len(self.individuals)}",
            f"Namespaces: {', '.join(self.namespaces.keys())}"
        ]
        return '\n'.join(lines)

In [None]:
# Test GraphMeta with prov ontology
prov_g = test_ns['prov_ont']
meta = GraphMeta(prov_g, name='prov')

print(meta.summary())
print()
print(f"Sample classes (first 5): {meta.classes[:5]}")
print(f"Sample properties (first 5): {meta.properties[:5]}")
print(f"Namespaces: {list(meta.namespaces.keys())}")

# Assert the invariants behind the printed summary so a regression fails the
# notebook instead of silently changing the output
assert meta.triple_count == len(prov_g)
assert meta.classes == sorted(meta.classes)
assert meta.properties == sorted(meta.properties)
assert 'http://www.w3.org/ns/prov#Activity' in meta.classes
assert 'prov' in meta.namespaces

Graph 'prov': 1,664 triples
Classes: 59
Properties: 89
Individuals: 1
Namespaces: brick, csvw, dc, dcat, dcmitype, dcterms, dcam, doap, foaf, geo, odrl, org, prof, qb, schema, sh, skos, sosa, ssn, time, vann, void, wgs, owl, rdf, rdfs, xsd, xml, prov

Sample classes (first 5): ['http://www.w3.org/2002/07/owl#Thing', 'http://www.w3.org/ns/prov#Accept', 'http://www.w3.org/ns/prov#Activity', 'http://www.w3.org/ns/prov#ActivityInfluence', 'http://www.w3.org/ns/prov#Agent']
Sample properties (first 5): ['http://www.w3.org/2000/01/rdf-schema#comment', 'http://www.w3.org/2000/01/rdf-schema#isDefinedBy', 'http://www.w3.org/2000/01/rdf-schema#label', 'http://www.w3.org/2000/01/rdf-schema#seeAlso', 'http://www.w3.org/2002/07/owl#topObjectProperty']
Namespaces: ['brick', 'csvw', 'dc', 'dcat', 'dcmitype', 'dcterms', 'dcam', 'doap', 'foaf', 'geo', 'odrl', 'org', 'prof', 'qb', 'schema', 'sh', 'skos', 'sosa', 'ssn', 'time', 'vann', 'void', 'wgs', 'owl', 'rdf', 'rdfs', 'xsd', 'xml', 'prov']


## Bounded View Functions (Stage 1)

Basic operations on GraphMeta that return small, bounded summaries:

- **graph_stats()**: Overall graph statistics
- **search_by_label()**: Simple label-based search
- **describe_entity()**: Get entity description with sample triples

These provide the foundation for progressive disclosure.

In [None]:
#| export
def graph_stats(meta: GraphMeta) -> str:
    """Return the textual overview produced by `meta.summary()`."""
    overview = meta.summary()
    return overview

In [None]:
#| export
def search_entity(meta: GraphMeta, query: str, limit: int = 10,
                  search_in: str = 'all') -> list:
    """Search for entities by label, IRI, or localname.

    Matches are produced in a fixed order: label matches first (in label-index
    order), then IRI matches, then localname matches, the latter two in sorted
    URI order. Each URI is reported at most once, under the first mode that
    matched it. The search stops as soon as `limit` results have been found.

    Args:
        meta: GraphMeta to search
        query: Search string (case-insensitive substring match)
        limit: Maximum results to return
        search_in: Where to search - 'label', 'iri', 'localname', or 'all'
            (any other value yields no results)

    Returns:
        List of dicts: [{'uri': str, 'label': str, 'match_type': str}, ...]
    """
    needle = query.lower()

    def _matches():
        # URIs already reported; set membership keeps de-duplication O(1)
        seen = set()

        # Search in labels
        if search_in in ('label', 'all'):
            for uri, label in meta.labels.items():
                if needle in label.lower():
                    seen.add(uri)
                    yield {'uri': uri, 'label': label, 'match_type': 'label'}

        if search_in not in ('iri', 'localname', 'all'):
            return

        # Sorted so the result order is the same on every run (iterating a
        # set of strings directly is not stable across processes)
        candidates = sorted(set(meta.classes + meta.properties + meta.individuals))

        # Search in full IRIs
        if search_in in ('iri', 'all'):
            for uri in candidates:
                if uri not in seen and needle in uri.lower():
                    seen.add(uri)
                    yield {'uri': uri,
                           'label': meta.labels.get(uri, uri),
                           'match_type': 'iri'}

        # Search in localnames (fragment or last path segment)
        if search_in in ('localname', 'all'):
            for uri in candidates:
                if uri in seen:
                    continue
                # Extract localname (after # or last /)
                localname = uri.split('#')[-1] if '#' in uri else uri.split('/')[-1]
                if needle in localname.lower():
                    seen.add(uri)
                    yield {'uri': uri,
                           'label': meta.labels.get(uri, uri),
                           'match_type': 'localname'}

    # None / negative limits keep plain slice semantics; otherwise stop early
    if limit is None or limit < 0:
        return list(_matches())[:limit]
    return list(islice(_matches(), limit))


def search_by_label(meta: GraphMeta, search: str, limit: int = 10) -> list:
    """Label-only substring search (case-insensitive), returned as tuples.

    Kept for backward compatibility; delegates to search_entity() restricted
    to label matching.

    Args:
        meta: GraphMeta to search
        search: Substring to look for in labels
        limit: Maximum results to return

    Returns:
        List of (URI, label) tuples
    """
    hits = search_entity(meta, search, limit=limit, search_in='label')
    pairs = []
    for hit in hits:
        pairs.append((hit['uri'], hit['label']))
    return pairs

In [None]:
# Test search_entity
results = search_entity(meta, 'activity', limit=5)
print(f"Found {len(results)} matches for 'activity':")
for r in results:
    print(f"  {r['label']}: {r['uri']} ({r['match_type']})")
assert 0 < len(results) <= 5
assert all(r['match_type'] in ('label', 'iri', 'localname') for r in results)

# Test different search modes
print("\nSearch by IRI only:")
iri_results = search_entity(meta, 'prov', search_in='iri', limit=3)
for r in iri_results:
    print(f"  {r['label']}: {r['uri']}")
assert len(iri_results) <= 3
assert all('prov' in r['uri'].lower() for r in iri_results)

# Test backward compatibility
print("\nBackward compatibility test:")
legacy_results = search_by_label(meta, 'activity', limit=5)
print(f"Found {len(legacy_results)} matches using search_by_label():")
for uri, label in legacy_results:
    print(f"  {label}: {uri}")
# The wrapper must agree with a label-only search_entity() call
assert legacy_results == [
    (r['uri'], r['label'])
    for r in search_entity(meta, 'activity', limit=5, search_in='label')
]

Found 5 matches for 'activity':
  Activity: http://www.w3.org/ns/prov#Activity (label)
  ActivityInfluence: http://www.w3.org/ns/prov#ActivityInfluence (label)
  activity: http://www.w3.org/ns/prov#activity (label)
  hadActivity: http://www.w3.org/ns/prov#hadActivity (label)
  activityOfInfluence: http://www.w3.org/ns/prov#activityOfInfluence (label)

Search by IRI only:
  Attribution: http://www.w3.org/ns/prov#Attribution
  invalidatedAtTime: http://www.w3.org/ns/prov#invalidatedAtTime
  Derivation: http://www.w3.org/ns/prov#Derivation

Backward compatibility test:
Found 5 matches using search_by_label():
  Activity: http://www.w3.org/ns/prov#Activity
  ActivityInfluence: http://www.w3.org/ns/prov#ActivityInfluence
  activity: http://www.w3.org/ns/prov#activity
  hadActivity: http://www.w3.org/ns/prov#hadActivity
  activityOfInfluence: http://www.w3.org/ns/prov#activityOfInfluence


In [None]:
#| export
def _expand_uri(meta: GraphMeta, uri: str) -> URIRef:
    """Expand prefixed URI (e.g., 'prov:Activity') to full URI.

    Args:
        meta: GraphMeta containing namespace bindings
        uri: URI string (may be prefixed like 'prov:Activity' or full URI)

    Returns:
        URIRef with expanded URI. Text that cannot be expanded as a CURIE is
        wrapped in a URIRef unchanged.
    """
    # Absolute http(s) IRIs are used verbatim
    if uri.startswith(('http://', 'https://')):
        return URIRef(uri)

    # Try to expand as CURIE (prefix:localname)
    if ':' in uri:
        try:
            return meta.graph.namespace_manager.expand_curie(uri)
        except Exception:
            # Expansion is best-effort: on any failure fall through and treat
            # the text as an opaque URI. Catching Exception rather than using a
            # bare `except:` lets KeyboardInterrupt/SystemExit propagate.
            pass

    return URIRef(uri)

def describe_entity(meta: GraphMeta, uri: str, limit: int = 20) -> dict:
    """Build a small, bounded description of one entity.

    Args:
        meta: GraphMeta containing the entity
        uri: URI of entity to describe (supports prefixed forms like 'prov:Activity')
        limit: Max number of outgoing triples to include

    Returns:
        Dict with keys 'uri', 'label', 'types', 'comment' (first rdfs:comment
        or None) and 'outgoing_sample' (list of (predicate, object) strings)
    """
    # Resolve prefixed forms such as 'prov:Activity' before touching the graph
    entity = _expand_uri(meta, uri)
    uri_str = str(entity)
    graph = meta.graph

    label = meta.labels.get(uri_str, uri_str)
    types = [str(rdf_type) for rdf_type in graph.objects(entity, RDF.type)]

    # islice bounds the sample without materialising every outgoing triple
    outgoing = [
        (str(pred), str(obj))
        for pred, obj in islice(graph.predicate_objects(entity), limit)
    ]

    # First rdfs:comment, if the entity has any
    first_comment = next(iter(list(graph.objects(entity, RDFS.comment))), None)
    comment = None if first_comment is None else str(first_comment)

    return {
        'uri': uri_str,
        'label': label,
        'types': types,
        'comment': comment,
        'outgoing_sample': outgoing
    }


In [None]:
# Test describe_entity
# Find the Activity class
activity_uri = 'http://www.w3.org/ns/prov#Activity'
desc = describe_entity(meta, activity_uri)

print(f"Label: {desc['label']}")
print(f"Types: {desc['types']}")
print(f"Comment: {desc['comment'][:100]}..." if desc['comment'] else "No comment")
print(f"Outgoing triples: {len(desc['outgoing_sample'])}")

# Pin the facts the printout shows so regressions fail the notebook
assert desc['uri'] == activity_uri
assert desc['label'] == 'Activity'
assert 'http://www.w3.org/2002/07/owl#Class' in desc['types']
assert len(desc['outgoing_sample']) <= 20  # default limit

Label: Activity
Types: ['http://www.w3.org/2002/07/owl#Class']
No comment
Outgoing triples: 10


### Stage 2: Progressive Disclosure Primitives

Advanced bounded view operations that enable root models to explore graphs iteratively:

- **search_entity()**: Multi-mode entity search (label/IRI/localname)
- **probe_relationships()**: One-hop neighbor exploration with filtering
- **find_path()**: BFS path finding between entities
- **predicate_frequency()**: Usage analysis for understanding graph structure

These primitives answer questions like:
- "Is X defined?" → `search_entity()`
- "What connects A to B?" → `find_path()`
- "What are the most important predicates?" → `predicate_frequency()`
- "What does X relate to?" → `probe_relationships()`

In [None]:
#| export
def probe_relationships(meta: GraphMeta, uri: str, predicate: str = None,
                        direction: str = 'both', limit: int = 20) -> dict:
    """Get one-hop neighbors of an entity, optionally filtered by predicate.

    Each requested direction is scanned once; the same scan provides both the
    bounded sample and the total count.

    Args:
        meta: GraphMeta containing the entity
        uri: URI of entity to probe (supports prefixed forms like 'prov:Activity')
        predicate: Optional predicate URI to filter by (supports prefixed forms)
        direction: 'out', 'in', or 'both' (default: 'both')
        limit: Maximum neighbors to return per direction

    Returns:
        {
            'uri': str, 'label': str,
            'outgoing': [{'predicate': str, 'pred_label': str,
                          'object': str, 'obj_label': str}, ...],
            'incoming': [{'subject': str, 'subj_label': str,
                          'predicate': str, 'pred_label': str}, ...],
            'outgoing_count': int, 'incoming_count': int
        }
        The counts are totals for the requested direction(s), not just the
        size of the returned sample; a direction that was not requested has
        an empty list and a count of 0.
    """
    # Expand URIs (handles prefixed forms)
    entity = _expand_uri(meta, uri)
    uri_str = str(entity)
    labels = meta.labels

    # Resolve the optional predicate filter once; None acts as a wildcard
    pred_filter = _expand_uri(meta, predicate) if predicate else None

    def _label(node_str):
        # Fall back to the identifier itself when no label is indexed
        return labels.get(node_str, node_str)

    outgoing, incoming = [], []
    outgoing_count = incoming_count = 0

    # Outgoing triples (entity as subject)
    if direction in ('out', 'both'):
        out_triples = list(meta.graph.triples((entity, pred_filter, None)))
        outgoing_count = len(out_triples)
        for _, p, o in out_triples[:limit]:
            pred_uri, obj_uri = str(p), str(o)
            outgoing.append({
                'predicate': pred_uri,
                'pred_label': _label(pred_uri),
                'object': obj_uri,
                'obj_label': _label(obj_uri)
            })

    # Incoming triples (entity as object)
    if direction in ('in', 'both'):
        in_triples = list(meta.graph.triples((None, pred_filter, entity)))
        incoming_count = len(in_triples)
        for s, p, _ in in_triples[:limit]:
            subj_uri, pred_uri = str(s), str(p)
            incoming.append({
                'subject': subj_uri,
                'subj_label': _label(subj_uri),
                'predicate': pred_uri,
                'pred_label': _label(pred_uri)
            })

    return {
        'uri': uri_str,
        'label': _label(uri_str),
        'outgoing': outgoing,
        'incoming': incoming,
        'outgoing_count': outgoing_count,
        'incoming_count': incoming_count
    }


In [None]:
#| export
def find_path(meta: GraphMeta, source: str, target: str,
              max_depth: int = 2, limit: int = 10) -> list:
    """Find predicates connecting two entities using BFS.

    Answers "What predicates connect A to B?"

    Edges are followed in both directions. Paths of up to and including
    `max_depth` steps are reported, shortest first. A path never passes
    through the same node twice, and two paths that reach a node via the same
    sequence of (direction, predicate) steps are treated as duplicates.

    Args:
        meta: GraphMeta to search
        source: Source entity URI (supports prefixed forms like 'prov:Activity')
        target: Target entity URI (supports prefixed forms like 'prov:Entity')
        max_depth: Maximum path length (default: 2)
        limit: Maximum paths to return

    Returns:
        List of paths, each path is list of steps:
        [{'from': uri, 'predicate': uri, 'to': uri, 'direction': 'out'|'in'}, ...]
        If source and target are the same node the single empty path is returned.
    """
    from collections import deque

    # Expand URIs (handles prefixed forms)
    source_uri = _expand_uri(meta, source)
    target_uri = _expand_uri(meta, target)

    # Queue entries: (current_node, steps_so_far, nodes_on_this_path)
    queue = deque([(source_uri, [], (source_uri,))])
    seen = set()
    paths_found = []

    while queue and len(paths_found) < limit:
        current, path, on_path = queue.popleft()

        # De-duplicate by node + step signature, so the same node may still be
        # reached again through a different predicate/direction sequence
        path_key = (current, tuple((step['direction'], step['predicate']) for step in path))
        if path_key in seen:
            continue
        seen.add(path_key)

        # Check the target BEFORE the depth cutoff: a path of exactly
        # max_depth steps that ends on the target is a valid result
        if current == target_uri:
            paths_found.append(path)
            continue

        # Do not extend paths that already use the full depth budget
        if len(path) >= max_depth:
            continue

        # Explore outgoing edges
        for s, p, o in meta.graph.triples((current, None, None)):
            # Literals/blank nodes are not followed; skip nodes already on the path
            if isinstance(o, URIRef) and o not in on_path:
                step = {
                    'from': str(s),
                    'predicate': str(p),
                    'to': str(o),
                    'direction': 'out'
                }
                queue.append((o, path + [step], on_path + (o,)))

        # Explore incoming edges
        for s, p, o in meta.graph.triples((None, None, current)):
            if isinstance(s, URIRef) and s not in on_path:
                step = {
                    'from': str(current),
                    'predicate': str(p),
                    'to': str(s),
                    'direction': 'in'
                }
                queue.append((s, path + [step], on_path + (s,)))

    return paths_found


In [None]:
#| export
def predicate_frequency(meta: GraphMeta, limit: int = 20,
                        predicate_type: str = None) -> list:
    """Rank predicates by how often they are used in the graph.

    Args:
        meta: GraphMeta to analyze
        limit: Maximum predicates to return
        predicate_type: Optional filter - 'object', 'datatype', 'annotation'
            (any other non-empty value matches nothing)

    Returns:
        List of dicts: [{'predicate': str, 'label': str, 'count': int,
                         'sample_subject': str, 'sample_object': str}, ...]
    """
    # Usage counts come from the cached index on the GraphMeta
    counts = meta.pred_freq

    if predicate_type:
        kind_to_class = (
            ('object', OWL.ObjectProperty),
            ('datatype', OWL.DatatypeProperty),
            ('annotation', OWL.AnnotationProperty),
        )
        allowed = set()
        for kind, owl_class in kind_to_class:
            if predicate_type == kind:
                allowed = set(meta.graph.subjects(RDF.type, owl_class))
                break

        # Keep only predicates declared with the requested property type;
        # rebuilt as a Counter so most_common() stays available
        counts = Counter({
            str(pred): n for pred, n in counts.items() if URIRef(pred) in allowed
        })

    ranked = []
    for pred_uri, count in counts.most_common(limit):
        # One example triple per predicate gives the reader a concrete usage
        sample = next(meta.graph.triples((None, URIRef(pred_uri), None)), None)
        if sample:
            sample_subj, sample_obj = str(sample[0]), str(sample[2])
        else:
            sample_subj = sample_obj = None

        ranked.append({
            'predicate': pred_uri,
            'label': meta.labels.get(pred_uri, pred_uri),
            'count': count,
            'sample_subject': sample_subj,
            'sample_object': sample_obj
        })

    return ranked

In [None]:
# Test probe_relationships
activity_uri = 'http://www.w3.org/ns/prov#Activity'
probe_result = probe_relationships(meta, activity_uri, limit=5)

print(f"Probing: {probe_result['label']}")
print(f"Outgoing relationships: {probe_result['outgoing_count']} total, showing {len(probe_result['outgoing'])}")
for rel in probe_result['outgoing'][:3]:
    print(f"  --{rel['pred_label']}--> {rel['obj_label']}")

print(f"\nIncoming relationships: {probe_result['incoming_count']} total, showing {len(probe_result['incoming'])}")
for rel in probe_result['incoming'][:3]:
    print(f"  <--{rel['pred_label']}-- {rel['subj_label']}")

# The sample must respect the limit and never exceed the reported totals
assert len(probe_result['outgoing']) <= 5
assert len(probe_result['incoming']) <= 5
assert probe_result['outgoing_count'] >= len(probe_result['outgoing'])
assert probe_result['incoming_count'] >= len(probe_result['incoming'])

# Test find_path
# Find path between two PROV classes
entity_uri = 'http://www.w3.org/ns/prov#Entity'
paths = find_path(meta, activity_uri, entity_uri, max_depth=2, limit=3)

print(f"\n\nPaths from Activity to Entity:")
if paths:
    for i, path in enumerate(paths, 1):
        print(f"Path {i}:")
        for step in path:
            direction_sym = '-->' if step['direction'] == 'out' else '<--'
            pred_label = meta.labels.get(step['predicate'], step['predicate'])
            print(f"  {direction_sym} {pred_label}")
else:
    print("  No paths found")

# Results must stay within the requested bounds
assert len(paths) <= 3
assert all(len(path) <= 2 for path in paths)
assert all(step['direction'] in ('out', 'in') for path in paths for step in path)


Probing: Activity
Outgoing relationships: 10 total, showing 5
  --http://www.w3.org/1999/02/22-rdf-syntax-ns#type--> http://www.w3.org/2002/07/owl#Class
  --http://www.w3.org/2000/01/rdf-schema#isDefinedBy--> W3C PROVenance Interchange Ontology (PROV-O)
  --http://www.w3.org/2000/01/rdf-schema#label--> Activity

Incoming relationships: 34 total, showing 5
  <--http://www.w3.org/2000/01/rdf-schema#range-- activity
  <--http://www.w3.org/1999/02/22-rdf-syntax-ns#first-- n0fe42a034f254bbc9cc97fe482231e2cb5
  <--http://www.w3.org/2000/01/rdf-schema#domain-- endedAtTime


Paths from Activity to Entity:
Path 1:
  --> http://www.w3.org/2002/07/owl#disjointWith


In [None]:
# Test predicate_frequency
print("Top 10 predicates by frequency:")
freq_results = predicate_frequency(meta, limit=10)
for r in freq_results:
    print(f"  {r['count']:4d} uses - {r['label']}")

# Ranking must be bounded and ordered by descending count
assert len(freq_results) <= 10
freq_counts = [r['count'] for r in freq_results]
assert freq_counts == sorted(freq_counts, reverse=True)

# Test filtering by predicate type
print("\nTop 5 object properties:")
obj_props = predicate_frequency(meta, limit=5, predicate_type='object')
for r in obj_props:
    print(f"  {r['count']:4d} uses - {r['label']}")

assert len(obj_props) <= 5

Top 10 predicates by frequency:
   184 uses - http://www.w3.org/2000/01/rdf-schema#isDefinedBy
   175 uses - http://www.w3.org/1999/02/22-rdf-syntax-ns#type
   161 uses - http://www.w3.org/2000/01/rdf-schema#label
   107 uses - http://www.w3.org/2000/01/rdf-schema#comment
   104 uses - http://www.w3.org/ns/prov#category
    85 uses - http://www.w3.org/ns/prov#component
    64 uses - http://www.w3.org/2000/01/rdf-schema#domain
    63 uses - http://www.w3.org/ns/prov#definition
    60 uses - http://www.w3.org/2000/01/rdf-schema#range
    55 uses - http://www.w3.org/2000/01/rdf-schema#subClassOf

Top 5 object properties:
     7 uses - wasDerivedFrom
     3 uses - wasRevisionOf
     3 uses - specializationOf


## Additional Exploration Functions

Functions discovered in `dialogs/inspect_tools.ipynb` for deeper ontology exploration.

In [None]:
#| export
def ont_describe(ont: str, uri: str, name: str = 'desc', ns: dict = None,
                 limit: int = 100) -> str:
    """Get triples about a URI, store in namespace.

    Returns both triples where URI is subject and where it's object.

    Args:
        ont: Name of the ontology handle in the namespace; either a GraphMeta
            or a bare rdflib Graph is accepted
        uri: URI to describe (full URI or prefixed form like 'prov:Activity')
        name: Variable name for storing result
        ns: Namespace dict (defaults to this module's globals)
        limit: Maximum triples to return per direction (default: 100)

    Returns:
        Summary string
    """
    if ns is None: ns = globals()
    handle = ns[ont]

    # load_ontology() stores a bare Graph while the indexes live on GraphMeta;
    # wrap a Graph so both kinds of handle work here
    meta = GraphMeta(handle, name=ont) if isinstance(handle, Graph) else handle
    graph = meta.graph

    # Same CURIE handling as describe_entity(); URIRef input is used as-is
    node = uri if isinstance(uri, URIRef) else _expand_uri(meta, uri)

    def _bounded(pattern):
        # islice caps the work at `limit` triples per direction
        return [(str(s), str(p), str(obj))
                for s, p, obj in islice(graph.triples(pattern), limit)]

    subj_triples = _bounded((node, None, None))  # URI as subject
    obj_triples = _bounded((None, None, node))   # URI as object

    result = {
        'as_subject': subj_triples,
        'as_object': obj_triples
    }
    ns[name] = result
    return f"Stored {len(subj_triples)} + {len(obj_triples)} triples about '{uri}' into '{name}'"

In [None]:
#| export
def ont_meta(ont: str, name: str = 'meta', ns: dict = None) -> str:
    """Extract ontology metadata (prefixes, annotation predicates, imports).

    "Annotation predicates" here means every predicate that is used with at
    least one literal object.

    Args:
        ont: Name of the ontology handle in the namespace; either a GraphMeta
            or a bare rdflib Graph is accepted
        name: Variable name for storing result
        ns: Namespace dict (defaults to this module's globals)

    Returns:
        Summary string. The stored `ann_preds` list is sorted and capped at 50
        entries, while the summary reports the uncapped total.
    """
    if ns is None: ns = globals()
    handle = ns[ont]

    # load_ontology() stores a bare Graph, setup code stores a GraphMeta: take either
    graph = handle if isinstance(handle, Graph) else handle.graph

    prefixes = dict(graph.namespaces())
    # Sorted so the 50 retained predicates are the same on every run
    ann_preds = sorted({
        str(p) for _, p, obj in graph.triples((None, None, None))
        if isinstance(obj, Literal)
    })
    imports = [str(obj) for _, _, obj in graph.triples((None, OWL.imports, None))]

    res = AttrDict(
        prefixes=prefixes,
        ann_preds=ann_preds[:50],  # Limit to first 50
        imports=imports
    )
    ns[name] = res
    return f"Stored metadata into '{name}': {len(prefixes)} prefixes, {len(ann_preds)} annotation predicates, {len(imports)} imports"

In [None]:
#| export
def ont_roots(ont: str, name: str = 'roots', ns: dict = None) -> str:
    """Find root classes (no declared superclass), store in namespace.

    A class counts as a root when it has no entry in the `supers` index.
    Only identifiers starting with 'http' are considered.

    Args:
        ont: Name of the ontology handle in the namespace; either a GraphMeta
            or a bare rdflib Graph is accepted
        name: Variable name for storing result
        ns: Namespace dict (defaults to this module's globals)

    Returns:
        Summary string
    """
    if ns is None: ns = globals()
    handle = ns[ont]

    # The class/superclass indexes live on GraphMeta; wrap a bare Graph
    # (as stored by load_ontology()) so both kinds of handle work
    meta = GraphMeta(handle, name=ont) if isinstance(handle, Graph) else handle

    has_super = set(meta.supers)
    roots = [
        cls for cls in map(str, meta.classes)
        if cls.startswith('http') and cls not in has_super
    ]

    ns[name] = roots
    return f"Stored {len(roots)} root classes into '{name}'"

In [None]:
#| export
def setup_ontology_context(path: str | Path, ns: dict, name: str = 'ont', dataset_meta=None) -> str:
    """Load an ontology and wire its meta-graph and helpers into a namespace.

    After the call ``ns`` contains:

    - ``ns[name]``: the loaded Graph
    - ``ns[f"{name}_meta"]``: a GraphMeta built from that graph
    - ``{name}_<helper>`` and bare ``<helper>`` callables, each a ``partial``
      pre-bound to the GraphMeta. The bare names are replaced whenever
      another ontology is set up in the same namespace; the prefixed ones
      are not.

    If ``dataset_meta`` is given, the ontology is also mounted into the
    dataset via ``rlm.dataset.mount_ontology``. A failure there is printed
    as a warning and does not abort the setup.

    Args:
        path: Path to ontology file
        ns: Namespace dict
        name: Base name for graph handle
        dataset_meta: Optional DatasetMeta for auto-mounting

    Returns:
        Summary string
    """
    from functools import partial

    # Parse the file into ns[name], then index it.
    summary = load_ontology(path, ns, name=name)
    meta = GraphMeta(ns[name], name=name)
    ns[f"{name}_meta"] = meta

    # Optional dataset integration (best effort).
    if dataset_meta is not None:
        try:
            from rlm.dataset import mount_ontology
            mounted = mount_ontology(dataset_meta, ns, str(path), name)
            summary += f"\n{mounted}"
        except Exception as e:
            print(f"Warning: Failed to mount ontology in dataset: {e}")

    # Helper name -> function; every one takes the GraphMeta as first argument.
    helpers = {
        'graph_stats': graph_stats,
        'search_by_label': search_by_label,
        'describe_entity': describe_entity,
        # Stage 2 bounded view primitives
        'search_entity': search_entity,
        'probe_relationships': probe_relationships,
        'find_path': find_path,
        'predicate_frequency': predicate_frequency,
    }

    # Prefixed bindings first so several ontologies can coexist ...
    for helper_name, helper in helpers.items():
        ns[f'{name}_{helper_name}'] = partial(helper, meta)

    # ... then the unprefixed convenience bindings for the single-ontology case.
    for helper_name, helper in helpers.items():
        ns[helper_name] = partial(helper, meta)

    return f"{summary}\nCreated meta-graph '{name}_meta' with {len(meta.classes)} classes, {len(meta.properties)} properties"

## Ontology Sense Building

### What is a "Sense Document"?

When an LLM needs to work with an ontology, loading the entire graph into context is wasteful and may exceed limits. Instead, we build a **sense document** - a compact summary that captures:

- **Formalism**: Which OWL/RDFS/SKOS constructs are used
- **Metadata structure**: Which annotation properties exist (labels, descriptions, etc.)
- **Domain/scope**: What the ontology is about
- **Navigation hints**: How to effectively search and traverse

This approach was developed through experiments in `dialogs/inspect_tools.ipynb` exploring progressive disclosure patterns.

### Why Sense Building Matters

**Design Decision Response** (from ISSUE_ANALYSIS.md):
> *GraphMeta.labels only uses rdfs:label* - This is a limitation because different ontologies use different annotation properties:
> - `rdfs:label`, `skos:prefLabel`, `skos:altLabel` for labels
> - `rdfs:comment`, `skos:definition`, `dcterms:description` for descriptions  
> - `vann:preferredNamespacePrefix`, `owl:versionInfo` for metadata

Rather than hardcode support for all possible properties, `build_sense()` **detects which annotation properties this specific ontology uses**, enabling intelligent search.

### References

- [Widoco Metadata Guide](https://github.com/dgarijo/Widoco/blob/master/doc/metadataGuide/guide.md) - Recommended ontology metadata properties
- [Anthropic: Building Effective Agents](https://www.anthropic.com/engineering/building-effective-agents) - Orchestrator-workers pattern
- [Anthropic: Progressive Disclosure](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents) - Context engineering strategy

### Implementation Pattern

The sense-building workflow (not agentic):
1. **Metadata collection** - Extract prefixes, detect annotation predicates, find ontology-level metadata
2. **Structural exploration** - Build hierarchy, property signatures, detect OWL axioms
3. **LLM synthesis** - One LLM call to identify domain, patterns, navigation hints
4. **Structured storage** - Store as retrievable AttrDict in REPL namespace

In [None]:
#| export
def build_sense(path: str, name: str = 'sense', ns: dict = None) -> str:
    """Build ontology sense document using workflow + LLM synthesis.

    Detects annotation properties per Widoco metadata guide:
    - Label properties: rdfs:label, skos:prefLabel, skos:altLabel,
      dcterms:title, dc:title
    - Description properties: rdfs:comment, skos:definition, skos:note,
      skos:example, dcterms:description, dc:description
    - Ontology metadata: title, description, owl:versionInfo,
      vann:preferredNamespacePrefix, vann:preferredNamespaceUri

    This function:
    1. Loads ontology and extracts metadata/roots programmatically
    2. Detects which annotation properties are actually used
    3. Builds hierarchy (2 levels), property info, characteristics
    4. Makes one LLM call to synthesize domain/scope/patterns/hints
    5. Stores a structured AttrDict in the namespace under `name`

    The ontology handle name is derived from `name`: 'prov_sense' -> 'prov',
    'sense' -> 'ont'. Besides `name`, the namespace also receives everything
    `setup_ontology_context()` binds, plus '<ont>_metadata', '<ont>_roots'
    and '_sense_summary'.

    Args:
        path: Path to ontology file
        name: Variable name for sense document (default: 'sense')
        ns: Namespace dict. If omitted, a fresh dict is used, so the sense
            document is not reachable by the caller afterwards.

    Returns:
        Summary string
    """
    if ns is None: ns = {}

    # Derive ontology name from sense name
    ont_name = name.replace('_sense', '').replace('sense', 'ont')

    # Setup ontology context (loads graph + creates GraphMeta)
    setup_ontology_context(path, ns, name=ont_name)

    # Get metadata and roots
    ont_meta(f'{ont_name}_meta', name=f'{ont_name}_metadata', ns=ns)
    ont_roots(f'{ont_name}_meta', name=f'{ont_name}_roots', ns=ns)

    # Get references
    meta_obj = ns[f'{ont_name}_meta']
    metadata = ns[f'{ont_name}_metadata']
    roots = ns[f'{ont_name}_roots']
    g = meta_obj.graph

    def _has(pattern) -> bool:
        # Existence test that stops at the first matching triple.
        return any(True for _ in g.triples(pattern))

    def _first_object(subject, preds):
        # str() of the first object found, trying `preds` in priority order.
        for pred in preds:
            obj = next(g.objects(subject, pred), None)
            if obj is not None:
                return str(obj)
        return None

    # Detect ontology-level metadata (Widoco guide properties)
    ont_uri = next((str(s) for s in g.subjects(RDF.type, OWL.Ontology)), None)

    ont_metadata = {}
    if ont_uri:
        ont_ref = URIRef(ont_uri)
        ont_metadata['uri'] = ont_uri

        # (key in ont_metadata, candidate predicates in priority order)
        single_valued = [
            ('title', [DCTERMS.title, DC.title, RDFS.label]),
            ('description', [DCTERMS.description, DC.description, RDFS.comment]),
            ('version', [OWL.versionInfo]),
            ('preferred_prefix', [VANN.preferredNamespacePrefix]),
            ('preferred_namespace', [VANN.preferredNamespaceUri]),
        ]
        for key, preds in single_valued:
            value = _first_object(ont_ref, preds)
            if value is not None:
                ont_metadata[key] = value

    # Detect which annotation properties are actually used (per Widoco guide)
    label_props = [
        str(label_prop)
        for label_prop in [RDFS.label, SKOS.prefLabel, SKOS.altLabel, DCTERMS.title, DC.title]
        if _has((None, label_prop, None))
    ]
    desc_props = [
        str(desc_prop)
        for desc_prop in [RDFS.comment, SKOS.definition, SKOS.note, SKOS.example,
                          DCTERMS.description, DC.description]
        if _has((None, desc_prop, None))
    ]

    # Build hierarchy (2 levels deep from roots)
    hier = {}
    for r in roots[:10]:  # Limit to first 10 roots
        lbl = meta_obj.labels.get(r, r)
        children = meta_obj.subs.get(r, [])
        hier[lbl] = {
            meta_obj.labels.get(c, c): [
                meta_obj.labels.get(gc, gc)
                for gc in meta_obj.subs.get(c, [])[:5]
            ]
            for c in children[:10]
        }

    # Extract top properties with domains/ranges
    top_props = []
    for p in meta_obj.properties[:20]:
        if p.startswith('http'):
            prop_label = meta_obj.labels.get(p, p)
            dom_uri = meta_obj.doms.get(p, '')
            rng_uri = meta_obj.rngs.get(p, '')
            dom_label = meta_obj.labels.get(dom_uri, dom_uri) if dom_uri else ''
            rng_label = meta_obj.labels.get(rng_uri, rng_uri) if rng_uri else ''
            top_props.append((prop_label, dom_label, rng_label))

    # Detect property characteristics (OWL axioms)
    characteristic_types = [
        ('transitive', OWL.TransitiveProperty),
        ('symmetric', OWL.SymmetricProperty),
        ('functional', OWL.FunctionalProperty),
        ('inverse_functional', OWL.InverseFunctionalProperty),
    ]
    prop_chars = {}
    for p in meta_obj.properties[:50]:
        if not p.startswith('http'):
            continue
        p_uri = URIRef(p)
        chars = [
            label
            for label, owl_type in characteristic_types
            if _has((p_uri, RDF.type, owl_type))
        ]
        if _has((p_uri, OWL.inverseOf, None)):
            chars.append('has_inverse')
        if chars:
            prop_chars[p] = chars

    # Detect OWL constructs usage (counted without building intermediate lists)
    owl_constructs = {
        'restrictions': sum(1 for _ in g.subjects(RDF.type, OWL.Restriction)),
        'unions': sum(1 for _ in g.subjects(OWL.unionOf, None)),
        'intersections': sum(1 for _ in g.subjects(OWL.intersectionOf, None)),
        'disjointness': sum(1 for _ in g.triples((None, OWL.disjointWith, None))),
        'equivalence': sum(1 for _ in g.triples((None, OWL.equivalentClass, None)))
    }

    # Get URI pattern samples
    uri_sample = [c for c in meta_obj.classes[:5] if c.startswith('http')]
    if uri_sample:
        first_uri = uri_sample[0]
        # Hash namespaces (".../ns/prov#Activity") must be cut at '#'; cutting
        # at the last '/' would drop the final path segment of the namespace.
        separator = '#' if '#' in first_uri else '/'
        uri_pattern = first_uri.rsplit(separator, 1)[0]
    else:
        uri_pattern = ''

    # Build prompt for LLM synthesis
    prompt = f"""Analyze this ontology and provide a sense document:

**Ontology Metadata:**
{ont_metadata}

**Stats:** {len(meta_obj.classes)} classes, {len(meta_obj.properties)} properties, {len(meta_obj.labels)} labels

**Annotation Properties Detected:**
- Label properties: {label_props}
- Description properties: {desc_props}

**Structure:**
- Prefixes: {list(metadata.prefixes.keys())[:10]}
- Root classes: {[meta_obj.labels.get(r, r) for r in roots[:10]]}
- Hierarchy (2 levels): {hier}

**Properties:**
- Top properties (label, domain, range): {top_props[:10]}
- Property characteristics: {prop_chars}

**OWL Constructs Usage:**
{owl_constructs}

**URI Pattern:** {uri_pattern}
- Sample URIs: {uri_sample[:3]}

Provide a concise sense document with:
1) Domain/scope - what is this ontology about?
2) Key branches - main conceptual areas in the hierarchy
3) Important properties - key relationships to know
4) Detected patterns - reification, measurement patterns, part-whole relationships, restriction patterns, etc.
5) SPARQL navigation hints - how to effectively query this ontology given the annotation properties available"""

    # Use llm_query from rlm.core (imported lazily so the module loads without it).
    # The synthesized text is read back from ns['_sense_summary'] below.
    from rlm.core import llm_query
    llm_query(prompt, ns=ns, name='_sense_summary')

    # Build structured sense document
    sense_doc = AttrDict(
        ont=ont_name,
        ont_metadata=ont_metadata,
        stats={'cls': len(meta_obj.classes), 'props': len(meta_obj.properties), 'lbls': len(meta_obj.labels)},
        prefixes=metadata.prefixes,
        label_properties=label_props,  # detected label properties
        description_properties=desc_props,  # detected description properties
        ann_preds=metadata.ann_preds,
        roots=roots,
        hier=hier,
        top_props=top_props,
        prop_chars=prop_chars,
        owl_constructs=owl_constructs,  # OWL construct usage
        uri_pattern=uri_pattern,
        summary=ns['_sense_summary']
    )

    ns[name] = sense_doc
    return f"Built sense document into '{name}': {len(label_props)} label properties, {len(desc_props)} description properties, {len(hier)} root branches"

In [None]:
#| eval: false
# Exercise build_sense() against the PROV ontology.
# Needs an LLM API key, hence eval:false so CI skips this cell.

test_ns = {}
result = build_sense('ontology/prov.ttl', name='prov_sense', ns=test_ns)
print(result, end="\n\n")

# The sense document is stored in the namespace, not returned.
sense = test_ns['prov_sense']
print(
    f"Ontology: {sense.ont}",
    f"Ontology Metadata: {sense.ont_metadata}",
    f"Stats: {sense.stats}",
    "",
    # Annotation properties detected for this specific ontology
    f"Label properties detected: {sense.label_properties}",
    f"Description properties detected: {sense.description_properties}",
    "",
    f"Roots: {sense.roots}",
    f"Root branches: {list(sense.hier.keys())}",
    f"Top properties (first 3): {sense.top_props[:3]}",
    f"Property characteristics: {sense.prop_chars}",
    f"OWL constructs: {sense.owl_constructs}",
    f"URI pattern: {sense.uri_pattern}",
    "",
    "LLM Summary:",
    sep="\n",
)
print(sense.summary)

## Structured Sense Data

**NEW**: JSON-schemaed sense data for ReasoningBank integration.

The original `build_sense()` produces free-form prose in the `summary` field. This new system creates:
- **sense_card**: Compact, always-injected structured data (~500 chars)
- **sense_brief**: Detailed sections retrieved when needed (~2000 chars)
- **Grounding validation**: All URIs must exist in the ontology

See `docs/ont-sense-improvements.md` for full specification.

In [None]:
#| export
import json

# Sense Card schema (always injected, compact).
# Maps each sense_card field name to the Python type of its value. The size
# limits noted below are the ones build_sense_structured() applies when it
# fills the card; this dict itself only records types.
SENSE_CARD_SCHEMA = {
    'ontology_id': str,
    'domain_scope': str,  # truncated to at most 300 chars
    'triple_count': int,
    'class_count': int,
    'property_count': int,
    'key_classes': list,   # at most 5 dicts with keys uri / label / why_important
    'key_properties': list, # at most 5 dicts with keys uri / label / domain / range / role
    'label_predicates': list,
    'description_predicates': list,
    'available_indexes': dict,  # entry counts keyed by by_label, hierarchy, domains, ranges, pred_freq
    'quick_hints': list,   # at most 3 short strings
    'uri_pattern': str
}

def validate_sense_grounding(sense: dict, meta: "GraphMeta") -> dict:
    """Validate all URIs in sense exist in the ontology.

    Three places are checked:

    - ``sense_card.key_classes[*].uri`` must be in ``meta.classes``
    - ``sense_card.key_properties[*].uri`` must be in ``meta.properties``
    - every ``http(s)://`` string in
      ``sense_brief.patterns.detected_patterns[*].entities_involved`` must be
      in ``meta.classes`` or ``meta.properties`` (plain labels are ignored)

    Missing or ``None`` sections are treated as empty. Malformed entries
    (non-dict items, non-string entities) are skipped instead of raising.

    Args:
        sense: Sense document with sense_card (and optional sense_brief)
        meta: GraphMeta to validate against

    Returns:
        {'valid': bool, 'errors': list[str], 'error_count': int}
    """
    errors = []

    # Build the lookup sets once instead of scanning the lists for every URI.
    known_classes = set(meta.classes)
    known_properties = set(meta.properties)

    def _known(uri, pool) -> bool:
        # Non-string (possibly unhashable) values can never match a URI.
        return isinstance(uri, str) and uri in pool

    # Extract sense_card
    card = sense.get('sense_card') or {}

    # Check key_classes URIs
    for cls in card.get('key_classes') or []:
        if isinstance(cls, dict) and 'uri' in cls:
            if not _known(cls['uri'], known_classes):
                errors.append(f"key_class URI not found: {cls['uri']}")

    # Check key_properties URIs
    for prop in card.get('key_properties') or []:
        if isinstance(prop, dict) and 'uri' in prop:
            if not _known(prop['uri'], known_properties):
                errors.append(f"key_property URI not found: {prop['uri']}")

    # Check sense_brief patterns if present
    brief = sense.get('sense_brief') or {}
    patterns = (brief.get('patterns') or {}).get('detected_patterns') or []
    for pattern in patterns:
        if not isinstance(pattern, dict):
            continue
        for entity in pattern.get('entities_involved') or []:
            # Entity might be a label or URI; only URIs can be grounded.
            if not isinstance(entity, str):
                continue
            if entity.startswith(('http://', 'https://')):
                if entity not in known_classes and entity not in known_properties:
                    errors.append(f"pattern entity URI not found: {entity}")

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'error_count': len(errors)
    }


In [None]:
#| export
def build_sense_structured(
    path: str,
    name: str = 'sense',
    ns: dict = None
) -> dict:
    """Build structured sense document with card and brief.

    Returns JSON-schemaed output instead of free-form prose. Everything is
    derived programmatically from the graph and its GraphMeta indexes; no
    LLM call is made.

    The ontology handle name is derived from `name`: 'prov_sense' -> 'prov',
    'sense' -> 'ont'. The result is also stored in `ns[name]`, alongside the
    entries created by `setup_ontology_context()`, `ont_meta()` and
    `ont_roots()`.

    Args:
        path: Path to ontology file
        name: Variable name for sense document
        ns: Namespace dict (a fresh dict is used when omitted)

    Returns:
        Dict with 'sense_card', 'sense_brief', and '_validation' keys
    """
    if ns is None: ns = {}

    # Derive ontology name from sense name
    ont_name = name.replace('_sense', '').replace('sense', 'ont')

    # Setup ontology context
    setup_ontology_context(path, ns, name=ont_name)

    # Populate '<ont>_metadata' and '<ont>_roots' in the namespace. Only the
    # roots are used below; the metadata entry is kept for callers that read it.
    ont_meta(f'{ont_name}_meta', name=f'{ont_name}_metadata', ns=ns)
    ont_roots(f'{ont_name}_meta', name=f'{ont_name}_roots', ns=ns)

    # Get references
    meta_obj = ns[f'{ont_name}_meta']
    roots = ns[f'{ont_name}_roots']
    g = meta_obj.graph

    def _has(pattern) -> bool:
        # Existence test that stops at the first matching triple.
        return any(True for _ in g.triples(pattern))

    def _label(uri: str) -> str:
        # Indexed label, falling back to the URI's local name.
        return meta_obj.labels.get(uri, uri.split('/')[-1].split('#')[-1])

    # Domain scope comes from the ontology header's description, if any
    domain_scope = "No description available"
    ont_uri = next((str(s) for s in g.subjects(RDF.type, OWL.Ontology)), None)
    if ont_uri:
        ont_ref = URIRef(ont_uri)
        for desc_prop in [DCTERMS.description, DC.description, RDFS.comment]:
            first_desc = next(g.objects(ont_ref, desc_prop), None)
            if first_desc is not None:
                domain_scope = str(first_desc)[:300]
                break

    # Detect annotation properties
    label_props = [
        str(label_prop)
        for label_prop in [RDFS.label, SKOS.prefLabel, SKOS.altLabel, DCTERMS.title, DC.title]
        if _has((None, label_prop, None))
    ]
    desc_props = [
        str(desc_prop)
        for desc_prop in [RDFS.comment, SKOS.definition, SKOS.note, DCTERMS.description, DC.description]
        if _has((None, desc_prop, None))
    ]

    # Get URI pattern
    uri_sample = [c for c in meta_obj.classes[:5] if c.startswith('http')]
    if uri_sample:
        first_uri = uri_sample[0]
        # Hash namespaces (".../ns/prov#Activity") must be cut at '#'; cutting
        # at the last '/' would drop the final path segment of the namespace.
        separator = '#' if '#' in first_uri else '/'
        uri_pattern = first_uri.rsplit(separator, 1)[0]
    else:
        uri_pattern = ''

    # Build sense_card programmatically (100% grounded)
    sense_card = {
        'ontology_id': ont_name,
        'domain_scope': domain_scope,
        'triple_count': len(g),
        'class_count': len(meta_obj.classes),
        'property_count': len(meta_obj.properties),
        'key_classes': [],
        'key_properties': [],
        'label_predicates': label_props,
        'description_predicates': desc_props,
        'available_indexes': {
            'by_label': len(meta_obj.labels),
            'hierarchy': len(meta_obj.subs) + len(meta_obj.supers),
            'domains': len(meta_obj.doms),
            'ranges': len(meta_obj.rngs),
            'pred_freq': len(meta_obj.pred_freq)
        },
        'quick_hints': [],
        'uri_pattern': uri_pattern
    }

    # Extract key classes from roots (programmatic, grounded)
    for r in roots[:5]:
        sense_card['key_classes'].append({
            'uri': r,
            'label': _label(r),
            'why_important': 'Root class in hierarchy'
        })

    # Extract key properties from metadata (programmatic, grounded)
    for p in meta_obj.properties[:5]:
        if not p.startswith('http'):
            continue
        dom = meta_obj.doms.get(p, '')
        rng = meta_obj.rngs.get(p, '')
        dom_label = _label(dom) if dom else None
        rng_label = _label(rng) if rng else None

        sense_card['key_properties'].append({
            'uri': p,
            'label': _label(p),
            'domain': dom_label,
            'range': rng_label,
            'role': f"Connects {dom_label} to {rng_label}" if dom_label and rng_label else "Common property"
        })

    # Generate quick_hints (simple programmatic rules, no LLM)
    index_sizes = sense_card['available_indexes']
    hints = []
    if label_props:
        hints.append(f"Use {label_props[0].split('/')[-1].split('#')[-1]} for entity labels")
    if index_sizes['hierarchy'] > 50:
        hints.append(f"Hierarchy index has {index_sizes['hierarchy']} relationships")
    if index_sizes['by_label'] > 10:
        hints.append(f"Label index has {index_sizes['by_label']} entries for quick lookup")

    sense_card['quick_hints'] = hints[:3]

    # Build sense_brief with hierarchy overview
    sense_brief = {
        'ontology_id': ont_name,
        'hierarchy_overview': {
            'root_classes': [
                {
                    'uri': r,
                    'label': _label(r),
                    'direct_subclasses': [
                        _label(c) for c in meta_obj.subs.get(r, [])[:5]
                    ]
                }
                for r in roots[:5]
            ],
            # NOTE(review): these two values are fixed constants, not measured
            # from the graph.
            'max_depth': 2,
            'branching_factor': 'medium'
        }
    }

    # Package result
    result = {
        'sense_card': sense_card,
        'sense_brief': sense_brief
    }

    # Validate grounding
    result['_validation'] = validate_sense_grounding(result, meta_obj)

    # Store in namespace
    ns[name] = result

    return result


## Integration with RLM

Helper to set up ontology context for `rlm_run()`.

In [None]:
#| eval: false
# Test RLM with structured sense context
# Note: Requires API key, marked eval:false to avoid CI failures
# Note: format_sense_card() is defined in a later cell of this notebook;
# that cell must have been run before this one.

from rlm.core import rlm_run

MAX_ITERS = 5  # iteration budget passed to rlm_run() and used in the report

print("=" * 70)
print(" RLM INTEGRATION TEST: Structured Sense as Context")
print("=" * 70)

# Setup: Build structured sense for PROV ontology
ns = {}
sense_result = build_sense_structured('ontology/prov.ttl', name='prov_sense', ns=ns)

# Get formatted sense card as context
sense_context = format_sense_card(sense_result['sense_card'])

print(f"\n📋 Context Type: Structured Sense Card")
print(f"   Size: {len(sense_context)} chars")
print(f"   Grounding: {'PASS' if sense_result['_validation']['valid'] else 'FAIL'}")

# Test query
query = "What is the Activity class in PROV?"

print(f"\n❓ Query: {query}")
print("\n" + "-" * 70)
print("Running RLM with sense card context...")
print("-" * 70)

# Run RLM with sense context
answer, iterations, final_ns = rlm_run(
    query,
    sense_context,
    ns=ns,
    max_iters=MAX_ITERS
)

print(f"\n✓ Answer: {answer}")
print(f"\n📊 Iterations: {len(iterations)}")
print(f"   Max allowed: {MAX_ITERS}")

# Show iteration details
print(f"\n🔍 Iteration Breakdown:")
for i, iteration in enumerate(iterations, 1):
    print(f"   {i}. {iteration.get('action', 'unknown action')}")

print("\n" + "=" * 70)
print(" TEST RESULT")
print("=" * 70)

# NOTE(review): if rlm_run() never records more than max_iters iterations,
# this check cannot report FAIL - confirm, and consider checking `answer` instead.
if len(iterations) <= MAX_ITERS:
    print(f"\n✓ PASS: RLM converged in {len(iterations)} iterations")
    print(f"  The structured sense card provides sufficient context for RLM")
else:
    print(f"\n✗ FAIL: RLM did not converge within iteration limit")

print("\n💡 Benefits of Structured Sense:")
print("  • Compact context (~600 chars vs full ontology)")
print("  • 100% grounded (no hallucinated URIs)")
print("  • Ontology-aware (detects label/description predicates)")
print("  • Progressive disclosure ready (can add hierarchy brief)")

### Test RLM Integration with Structured Sense

Test if `rlm_run()` works with the new structured sense documents as context.

In [None]:
#| eval: false
# Test formatting functions.
# The cell builds its own structured sense document so it does not rely on a
# `result` variable left behind by some other cell.
# Note: format_sense_card(), format_sense_brief_section() and
# get_sense_context() are defined in a later cell of this notebook; that cell
# must have been run before this one.
test_ns = {}
result = build_sense_structured('ontology/prov.ttl', name='prov_sense_structured', ns=test_ns)

print("=" * 60)
print("FORMATTING FUNCTIONS TEST")
print("=" * 60)

card = result['sense_card']
brief = result['sense_brief']

# Test format_sense_card
formatted_card = format_sense_card(card)
assert formatted_card.startswith("# Ontology: ")
print(f"\n✓ Formatted Sense Card ({len(formatted_card)} chars):")
print("-" * 60)
print(formatted_card)
print("-" * 60)

# Test format_sense_brief_section
formatted_hier = format_sense_brief_section(brief, 'hierarchy_overview')
assert formatted_hier.startswith("## Hierarchy Overview")
print(f"\n✓ Formatted Hierarchy Overview ({len(formatted_hier)} chars):")
print("-" * 60)
print(formatted_hier)
print("-" * 60)

# Test get_sense_context: a hierarchy question must pull in the hierarchy section
query = "What are the subclasses of Activity?"
context = get_sense_context(query, result)
assert context.startswith(formatted_card)
assert 'Hierarchy Overview' in context
print(f"\n✓ Auto-detected Context for: '{query}'")
print(f"  Context length: {len(context)} chars")
print(f"  Includes hierarchy: {('Hierarchy Overview' in context)}")

print(f"\n{'=' * 60}")
print("FORMATTING TESTS PASSED")
print("=" * 60)

In [None]:
#| eval: false
# Test build_sense_structured with PROV ontology (self-contained: builds its
# own namespace and result)
test_ns = {}
result = build_sense_structured('ontology/prov.ttl', name='prov_sense_structured', ns=test_ns)

print("=" * 60)
print("STRUCTURED SENSE TEST")
print("=" * 60)

# Check validation
print(f"\n✓ Validation: {'PASS' if result['_validation']['valid'] else 'FAIL'}")
if not result['_validation']['valid']:
    print(f"  Errors: {result['_validation']['errors']}")
else:
    print("  All URIs grounded in ontology")
# Fail loudly: without this the cell would still print "ALL TESTS PASSED".
assert result['_validation']['valid'], result['_validation']['errors']
assert test_ns['prov_sense_structured'] is result

# Check sense_card
card = result['sense_card']
print(f"\n✓ Sense Card:")
print(f"  Ontology ID: {card['ontology_id']}")
print(f"  Triple count: {card['triple_count']:,}")
print(f"  Class count: {card['class_count']}")
print(f"  Property count: {card['property_count']}")
print(f"  Label predicates: {len(card['label_predicates'])}")
print(f"  Key classes: {len(card['key_classes'])}")
print(f"  Key properties: {len(card['key_properties'])}")
print(f"  Quick hints: {len(card['quick_hints'])}")

# Verify key_classes are grounded
print(f"\n✓ Key Classes (grounded URIs):")
for cls in card['key_classes'][:3]:
    print(f"  - {cls['label']}")
    print(f"    URI: {cls['uri'][:50]}...")

# Verify key_properties are grounded
print(f"\n✓ Key Properties (grounded URIs):")
for prop in card['key_properties'][:3]:
    print(f"  - {prop['label']}: {prop['role']}")
    print(f"    URI: {prop['uri'][:50]}...")

# Check sense_brief
brief = result['sense_brief']
print(f"\n✓ Sense Brief:")
print(f"  Hierarchy roots: {len(brief['hierarchy_overview']['root_classes'])}")
print(f"  Max depth: {brief['hierarchy_overview']['max_depth']}")

print(f"\n{'=' * 60}")
print("ALL TESTS PASSED")
print("=" * 60)

In [None]:
#| export
def format_sense_card(card: dict) -> str:
    """Format sense card for context injection (~500 chars).

    Renders a markdown block with the ontology id, the domain (first 200
    chars), the triple/class/property counts, up to three key classes and
    key properties, the local names of the label predicates and any quick
    hints.

    The scalar fields ('ontology_id', 'domain_scope' and the three counts)
    are required. The list sections ('key_classes', 'key_properties',
    'label_predicates', 'quick_hints') may be missing or None and are then
    rendered as empty.

    Args:
        card: sense_card dict

    Returns:
        Formatted markdown string
    """
    lines = [
        f"# Ontology: {card['ontology_id']}",
        "",
        f"**Domain**: {card['domain_scope'][:200]}",
        "",
        f"**Stats**: {card['triple_count']:,} triples, {card['class_count']} classes, {card['property_count']} properties",
        "",
        "**Key Classes**:"
    ]

    for cls in (card.get('key_classes') or [])[:3]:
        lines.append(f"- {cls['label']}: {cls['why_important']}")

    lines.append("")
    lines.append("**Key Properties**:")
    for prop in (card.get('key_properties') or [])[:3]:
        domain, range_ = prop.get('domain'), prop.get('range')
        if domain and range_:
            # Signature form when both ends are known ...
            lines.append(f"- {prop['label']}: {domain} → {range_}")
        else:
            # ... otherwise fall back to the free-text role.
            lines.append(f"- {prop['label']}: {prop['role']}")

    # Show only the local name of each label predicate (after the last '/' or '#').
    label_names = [p.split('/')[-1].split('#')[-1] for p in card.get('label_predicates') or []]
    lines.append("")
    lines.append(f"**Labels via**: {', '.join(label_names)}")

    hints = card.get('quick_hints') or []
    if hints:
        lines.append("")
        lines.append("**Quick Hints**:")
        for hint in hints:
            lines.append(f"- {hint}")

    return '\n'.join(lines)


def format_sense_brief_section(brief: dict, section: str) -> str:
    """Render one section of a sense brief as markdown.

    Only 'hierarchy_overview' has a renderer. A section that is absent from
    `brief`, or any other section name, yields an empty string.

    Args:
        brief: sense_brief dict
        section: Section name (e.g., 'hierarchy_overview', 'patterns')

    Returns:
        Formatted markdown string
    """
    if section not in brief or section != 'hierarchy_overview':
        return ""

    overview = brief[section]
    depth = overview.get('max_depth', 'unknown')
    rendered = [
        "## Hierarchy Overview",
        "",
        f"**Root Classes** (max depth: {depth})",
    ]

    # At most five roots, each followed by at most three direct subclasses.
    for root in overview.get('root_classes', [])[:5]:
        rendered.append(f"- **{root['label']}**")
        rendered.extend(f"  - {sub}" for sub in root.get('direct_subclasses', [])[:3])

    return '\n'.join(rendered)


def get_sense_context(query: str, sense: dict) -> str:
    """Auto-detect and return relevant sense sections for a query.

    The formatted sense card is always included. The hierarchy overview from
    the brief is appended when the lower-cased query contains one of the
    hierarchy keywords as a substring and the brief actually has that section.

    Args:
        query: User query
        sense: Full sense document (with sense_card and sense_brief)

    Returns:
        Formatted context string
    """
    # Always include card
    context = format_sense_card(sense['sense_card'])

    # Auto-detect relevant brief sections
    query_lower = query.lower()
    brief = sense.get('sense_brief', {})

    hierarchy_terms = ('subclass', 'superclass', 'hierarchy', 'type', 'parent', 'child')
    if any(term in query_lower for term in hierarchy_terms):
        section = format_sense_brief_section(brief, 'hierarchy_overview')
        # Skip the separator when there is nothing to append, so the context
        # does not end in dangling blank lines.
        if section:
            context += "\n\n" + section

    return context


In [None]:
# NOTE: setup_ontology_context() is defined above in cell-27
# This cell previously contained a duplicate definition

In [None]:
# Smoke test: load PROV and list everything setup_ontology_context() bound
test_ns = {}
result = setup_ontology_context('ontology/prov.ttl', test_ns, name='prov')
print(result, end="\n\n")

print("Namespace contains:")
for key, value in test_ns.items():
    print(f"  {key}: {type(value).__name__}")

Loaded 1664 triples from prov.ttl into 'prov'
Created meta-graph 'prov_meta' with 59 classes, 89 properties

Namespace contains:
  prov: Graph
  prov_meta: GraphMeta
  prov_graph_stats: partial
  prov_search_by_label: partial
  prov_describe_entity: partial
  prov_search_entity: partial
  prov_probe_relationships: partial
  prov_find_path: partial
  prov_predicate_frequency: partial
  graph_stats: partial
  search_by_label: partial
  describe_entity: partial
  search_entity: partial
  probe_relationships: partial
  find_path: partial
  predicate_frequency: partial


In [None]:
# Test new exploration functions
# Reuse the test_ns from previous cell with loaded prov ontology
# Note: prov_meta is a GraphMeta object in test_ns

# Test that new indexes work
meta = test_ns['prov_meta']
assert len(meta.by_label) > 0  # inverted label index
assert len(meta.subs) > 0 or len(meta.supers) > 0  # class hierarchy
print(f"✓ New GraphMeta indexes work: by_label has {len(meta.by_label)} entries")

# Test ont_describe (need to pass GraphMeta object as namespace entry)
result = ont_describe('prov_meta', 'http://www.w3.org/ns/prov#Activity', name='activity_desc', ns=test_ns)
assert 'activity_desc' in test_ns
print(f"✓ ont_describe works: {result}")

# Test ont_meta
result = ont_meta('prov_meta', name='prov_metadata', ns=test_ns)
assert 'prov_metadata' in test_ns
print(f"✓ ont_meta works: {result}")

# Test ont_roots
result = ont_roots('prov_meta', name='prov_roots', ns=test_ns)
assert 'prov_roots' in test_ns
print(f"✓ ont_roots works: {result}")

✓ New GraphMeta indexes work: by_label has 156 entries
✓ ont_describe works: Stored 10 + 34 triples about 'http://www.w3.org/ns/prov#Activity' into 'activity_desc'
✓ ont_meta works: Stored metadata into 'prov_metadata': 29 prefixes, 16 annotation predicates, 9 imports
✓ ont_roots works: Stored 10 root classes into 'prov_roots'


## Test with RLM

Now let's test asking a question about the PROV ontology using `rlm_run()`.

In [None]:
#| eval: false
from rlm.core import rlm_run

# Fresh namespace holding the PROV graph, its GraphMeta and the bound helpers
ns = {}
setup_ontology_context('ontology/prov.ttl', ns, name='prov')

# The model is handed the GraphMeta summary as context, never the full graph
context = ns['prov_meta'].summary()

# Ask a question; rlm_run hands back the namespace, which replaces `ns`
answer, iterations, ns = rlm_run(
    "What is the Activity class in the PROV ontology?", context, ns=ns, max_iters=3
)

print(f"Answer: {answer}", f"Iterations: {len(iterations)}", sep="\n")

## Sense Validation Gate

Validate sense data before RLM operations (precondition check).

In [None]:
#| export
def validate_sense_precondition(sense: dict, meta, max_card_chars: int = 800) -> dict:
    """Gate 0: Validate sense data before RLM operations.
    
    Checks:
    - URI grounding (delegated to validate_sense_grounding)
    - Card size (formatted card is at most `max_card_chars` characters)
    - Required fields present in the sense card
    
    A sense document without a 'sense_card' entry is treated as having an
    empty card, so it is reported through the "missing required fields"
    check rather than failing on the dictionary lookup.
    
    Args:
        sense: Sense document from build_sense_structured()
        meta: GraphMeta object for grounding validation
        max_card_chars: Largest accepted length (inclusive) of the formatted
            sense card. Defaults to 800.
    
    Returns:
        Dictionary with keys:
        - 'proceed': truthy only when all three checks pass
        - 'grounding_valid': the 'valid' entry of the grounding result
        - 'card_size': length of the formatted card text
        - 'card_size_ok': whether 'card_size' is within `max_card_chars`
        - 'has_required_fields': whether every required card field is present
        - 'reason': empty string on success, otherwise a message listing
          each failed check
    """
    # Re-run grounding validation
    grounding = validate_sense_grounding(sense, meta)
    
    # Look the card up once and use the same object for both the size check
    # and the required-field check; a missing or None card becomes {}.
    card = sense.get('sense_card') or {}
    
    # Check card size
    # NOTE(review): format_sense_card is defined elsewhere in this module;
    # confirm it tolerates an empty or partial card.
    card_text = format_sense_card(card)
    card_chars = len(card_text)
    card_size_ok = card_chars <= max_card_chars
    
    # Check required fields
    required_fields = (
        'ontology_id', 'domain_scope', 'key_classes',
        'key_properties', 'label_predicates'
    )
    # `key` rather than `field`, so the dataclasses.field import is not shadowed
    has_required = all(key in card for key in required_fields)
    
    proceed = grounding['valid'] and card_size_ok and has_required
    
    # Collect one entry per failed check; the list is non-empty exactly when
    # `proceed` is falsy.
    issues = []
    if not grounding['valid']:
        issues.append(f"grounding failed ({grounding['error_count']} errors)")
    if not card_size_ok:
        issues.append(f"card too large ({card_chars} chars)")
    if not has_required:
        issues.append("missing required fields")
    
    reason = ''
    if issues:
        issues_text = ', '.join(issues)
        reason = f"Validation failed: {issues_text}"
    
    return {
        'proceed': proceed,
        'grounding_valid': grounding['valid'],
        'card_size': card_chars,
        'card_size_ok': card_size_ok,
        'has_required_fields': has_required,
        'reason': reason
    }


In [None]:
#| eval: false
# Test sense validation gate (requires real ontology)
from rlm.ontology import setup_ontology_context, build_sense_structured

print("Test: validate_sense_precondition()")
print("=" * 60)

# Fresh namespace: load the PROV ontology, then build the sense document from the same file
ns = {}
setup_ontology_context('ontology/prov.ttl', ns, name='prov')
sense = build_sense_structured('ontology/prov.ttl', name='prov_sense', ns=ns)

result = validate_sense_precondition(sense, ns['prov_meta'])

print(f"Proceed: {result['proceed']}")
print(f"Grounding valid: {result['grounding_valid']}")
print(f"Card size: {result['card_size']} chars (ok: {result['card_size_ok']})")
print(f"Has required fields: {result['has_required_fields']}")

if result['proceed']:
    print("\n✓ Sense validation gate passed")
else:
    # result['reason'] already begins with "Validation failed: ", so it is
    # printed as-is instead of being prefixed a second time.
    print(f"\n✗ {result['reason']}")
