# SHACL Examples and Shape Indexing

> Enables a 'retrieve example → adapt → run → inspect' workflow for discovering how to query unfamiliar datasets.

This module provides tools for detecting SHACL content in RDF graphs, indexing shapes, and retrieving shape information in a bounded manner suitable for progressive disclosure.

In [None]:
#| default_exp shacl_examples

In [None]:
#| export
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from rdflib import Graph, Namespace, RDF, RDFS, URIRef, Literal
from rdflib.namespace import SH

## SHACLIndex Dataclass

The `SHACLIndex` holds indexed SHACL shapes for efficient retrieval.

In [None]:
#| export
@dataclass
class SHACLIndex:
    """Container for SHACL shapes indexed for lookup and keyword search.

    Attributes:
        shapes: URIs of the indexed shapes
        targets: Shape URI -> URIs of the classes that shape targets
        properties: Shape URI -> list of property constraint dicts
        keywords: Inverted index, keyword -> URIs of shapes carrying it
        paradigm: Label describing how SHACL is used in the source graph;
            stays 'unknown' unless the code building the index sets it
    """
    shapes: List[str] = field(default_factory=list)
    targets: Dict[str, List[str]] = field(default_factory=dict)
    properties: Dict[str, List[dict]] = field(default_factory=dict)
    keywords: Dict[str, List[str]] = field(default_factory=dict)
    paradigm: str = 'unknown'

    def summary(self) -> str:
        """Return a one-line overview whose size does not grow with the index."""
        shape_count = len(self.shapes)
        keyword_count = len(self.keywords)
        return (
            f"SHACLIndex: {shape_count} shapes, "
            f"{keyword_count} keywords, paradigm={self.paradigm}"
        )

## SHACL Detection

Detect whether a graph contains SHACL shapes and determine the usage paradigm.

In [None]:
#| export
def detect_shacl(graph: Graph) -> dict:
    """Report whether a graph declares SHACL shapes and how they are used.

    Only subjects explicitly typed as sh:NodeShape or sh:PropertyShape are
    counted as shapes.

    Args:
        graph: RDF graph to inspect

    Returns:
        Dict with:
            has_shacl: True when at least one typed shape exists
            node_shapes: Number of sh:NodeShape subjects
            property_shapes: Number of sh:PropertyShape subjects
            paradigm: 'none' when no shapes were found; otherwise
                'shacl-first' (a dash:ShapeClass instance is present),
                'mixed' (some node shape is also typed owl:Class or has
                an rdfs:subClassOf statement), or 'validation'
    """
    typed_node_shapes = list(graph.subjects(RDF.type, SH.NodeShape))
    typed_property_shapes = list(graph.subjects(RDF.type, SH.PropertyShape))

    # Nothing typed as a shape: report the empty result immediately
    if not typed_node_shapes and not typed_property_shapes:
        return {
            'has_shacl': False,
            'node_shapes': 0,
            'property_shapes': 0,
            'paradigm': 'none'
        }

    dash_ns = Namespace("http://datashapes.org/dash#")
    owl_ns = Namespace("http://www.w3.org/2002/07/owl#")

    def doubles_as_class(shape) -> bool:
        # A shape that is also declared as a class, or sits in a class hierarchy
        if (shape, RDF.type, owl_ns.Class) in graph:
            return True
        return (shape, RDFS.subClassOf, None) in graph

    has_dash_shape_class = bool(list(graph.subjects(RDF.type, dash_ns.ShapeClass)))

    # dash:ShapeClass takes precedence over the shape-is-a-class check
    if has_dash_shape_class:
        paradigm = 'shacl-first'
    elif any(doubles_as_class(shape) for shape in typed_node_shapes):
        paradigm = 'mixed'
    else:
        paradigm = 'validation'

    return {
        'has_shacl': True,
        'node_shapes': len(typed_node_shapes),
        'property_shapes': len(typed_property_shapes),
        'paradigm': paradigm
    }

## Keyword Extraction

Extract searchable keywords from shape metadata.

In [None]:
#| export
def extract_keywords(graph: Graph, shape: URIRef, target_classes: List[str], props: List[dict]) -> List[str]:
    """Extract searchable keywords from a shape.

    Keywords come from four places: the local name of the shape URI, the
    shape's rdfs:label values (kept as whole phrases), the local names of
    the target classes, and the local names of the property paths. Every
    keyword is lowercased and empty strings are discarded.

    Args:
        graph: RDF graph containing the shape
        shape: Shape URI
        target_classes: Target class URIs
        props: Property constraint dicts; only the 'path' entry is read

    Returns:
        Sorted list of unique lowercase keywords
    """
    def local_name(uri: str) -> str:
        # Text after the last '/', and within that, after the last '#'
        return uri.split('/')[-1].split('#')[-1]

    candidates = [local_name(str(shape))]
    candidates.extend(str(label) for label in graph.objects(shape, RDFS.label))
    candidates.extend(local_name(tc) for tc in target_classes)
    candidates.extend(
        local_name(prop['path']) for prop in props if prop.get('path')
    )

    # A URI ending in '/' or '#' has an empty local name; an empty keyword
    # carries no search value, so it is left out of the result.
    keywords = {candidate.lower() for candidate in candidates if candidate}

    # Sorted so the result is reproducible: iteration order of a set of
    # strings can differ from one interpreter run to the next.
    return sorted(keywords)

## Shape Index Building

Build a searchable index from SHACL shapes in a graph.

In [None]:
#| export
def build_shacl_index(graph: Graph) -> SHACLIndex:
    """Build searchable index from SHACL shapes in graph.

    Every subject typed sh:NodeShape is indexed with its sh:targetClass
    values, the constraints found on its sh:property nodes, and the keywords
    produced by `extract_keywords`.

    Args:
        graph: RDF graph containing SHACL shapes

    Returns:
        SHACLIndex with indexed shapes. When `detect_shacl` reports no SHACL
        content, an empty index with paradigm 'none' is returned.
    """
    detection = detect_shacl(graph)

    if not detection['has_shacl']:
        return SHACLIndex(paradigm='none')

    shapes = []
    targets = {}
    properties = {}
    keywords = {}

    # Index all NodeShapes
    for shape in graph.subjects(RDF.type, SH.NodeShape):
        shape_uri = str(shape)
        shapes.append(shape_uri)

        # Get target classes
        target_classes = [str(t) for t in graph.objects(shape, SH.targetClass)]
        targets[shape_uri] = target_classes

        # Get property constraints
        props = []
        for prop_node in graph.objects(shape, SH.property):
            path = graph.value(prop_node, SH.path)
            datatype = graph.value(prop_node, SH.datatype)
            node_kind = graph.value(prop_node, SH.nodeKind)
            min_count = graph.value(prop_node, SH.minCount)
            max_count = graph.value(prop_node, SH.maxCount)
            class_constraint = graph.value(prop_node, SH['class'])

            props.append({
                'path': str(path) if path else None,
                'datatype': str(datatype) if datatype else None,
                'nodeKind': str(node_kind) if node_kind else None,
                # Test against None, not truthiness: a count of 0 is a
                # meaningful constraint, and an rdflib literal holding 0
                # evaluates as false, which would silently drop it.
                'minCount': int(min_count) if min_count is not None else None,
                'maxCount': int(max_count) if max_count is not None else None,
                'class': str(class_constraint) if class_constraint else None,
            })
        properties[shape_uri] = props

        # Build keyword index (keyword -> shapes), listing each shape once
        for kw in extract_keywords(graph, shape, target_classes, props):
            shape_list = keywords.setdefault(kw, [])
            if shape_uri not in shape_list:
                shape_list.append(shape_uri)

    return SHACLIndex(
        shapes=shapes,
        targets=targets,
        properties=properties,
        keywords=keywords,
        paradigm=detection['paradigm']
    )

## Bounded View Functions

Progressive disclosure primitives for exploring SHACL shapes.

In [None]:
#| export
def describe_shape(index: SHACLIndex, shape_uri: str, limit: int = 10) -> dict:
    """Get bounded description of a SHACL shape.

    Args:
        index: SHACL index to query
        shape_uri: URI of shape to describe
        limit: Maximum number of properties to return; negative values
            are treated as 0

    Returns:
        Dict with:
            uri: Shape URI
            targets: List of target class URIs
            properties: First `limit` property constraints
            property_count: Total property count
            truncated: True if property list was truncated
        If the shape is not in the index, a dict with a single 'error'
        key is returned instead.
    """
    if shape_uri not in index.targets:
        return {'error': f'Shape {shape_uri} not found in index'}

    # Without clamping, a negative limit would turn the slice below into
    # "everything except the last N items" rather than a bound.
    limit = max(limit, 0)

    props = index.properties.get(shape_uri, [])
    return {
        'uri': shape_uri,
        'targets': index.targets[shape_uri],
        'properties': props[:limit],
        'property_count': len(props),
        'truncated': len(props) > limit
    }

In [None]:
#| export
def search_shapes(index: SHACLIndex, keyword: str, limit: int = 5) -> list:
    """Find shapes matching keyword.

    A shape matches when the search term, compared case-insensitively, is a
    substring of one of its indexed keywords. Each shape appears at most
    once, paired with the first keyword that matched it in index order.

    Args:
        index: SHACL index to search
        keyword: Search term
        limit: Maximum number of results; zero or negative yields no results

    Returns:
        List of dicts with:
            uri: Shape URI
            targets: Target class URIs
            matched_keyword: The keyword that matched
    """
    # A negative limit used as a slice bound would drop results from the
    # end instead of bounding the list, so handle non-positive limits here.
    if limit <= 0:
        return []

    needle = keyword.lower()
    seen = set()
    results = []

    for kw, shape_uris in index.keywords.items():
        if needle not in kw.lower():
            continue
        for uri in shape_uris:
            if uri in seen:
                continue
            seen.add(uri)
            results.append({
                'uri': uri,
                'targets': index.targets.get(uri, []),
                'matched_keyword': kw
            })
            # Stop scanning once the bound is reached
            if len(results) >= limit:
                return results

    return results

In [None]:
#| export
def shape_constraints(index: SHACLIndex, shape_uri: str) -> str:
    """Get human-readable property constraints for a shape.

    URIs are shortened to their local name (the text after the last '/' and
    '#'); a URI with an empty local part is shown in full.

    Args:
        index: SHACL index to query
        shape_uri: URI of shape

    Returns:
        Formatted string with one header line followed by one line per
        property constraint, or a "not found" message when the shape is
        not in the index
    """
    if shape_uri not in index.properties:
        return f"Shape {shape_uri} not found in index"

    def local_name(uri: str) -> str:
        # Same shortening rule for every URI kind (path, datatype, class,
        # node kind); fall back to the full URI rather than a blank name.
        return uri.split('/')[-1].split('#')[-1] or uri

    lines = [f"Constraints for {local_name(shape_uri)}:"]

    props = index.properties[shape_uri]
    if not props:
        lines.append("  (no property constraints)")
        return '\n'.join(lines)

    for prop in props:
        path = prop.get('path')
        path_local = local_name(path) if path else '?'

        parts = []
        if prop.get('datatype'):
            parts.append(f"type={local_name(prop['datatype'])}")
        if prop.get('class'):
            parts.append(f"class={local_name(prop['class'])}")
        if prop.get('nodeKind'):
            parts.append(f"kind={local_name(prop['nodeKind'])}")
        # Counts are checked against None so that a count of 0 is shown
        if prop.get('minCount') is not None:
            parts.append(f"min={prop['minCount']}")
        if prop.get('maxCount') is not None:
            parts.append(f"max={prop['maxCount']}")

        constraint = ', '.join(parts) if parts else 'no constraints'
        lines.append(f"  {path_local}: {constraint}")

    return '\n'.join(lines)

## Tests

Basic tests for SHACL detection and indexing.

In [None]:
# An empty graph must be reported as containing no SHACL at all
g = Graph()
result = detect_shacl(g)
assert not result['has_shacl']
assert 'none' == result['paradigm']
print("✓ Empty graph detection works")

In [None]:
# A single typed NodeShape with a target class and nothing else
# should be detected and classified as plain validation usage
EX = Namespace("http://example.org/")
g = Graph()
g.add((EX.PersonShape, RDF.type, SH.NodeShape))
g.add((EX.PersonShape, SH.targetClass, EX.Person))

result = detect_shacl(g)
assert result['has_shacl']
assert 1 == result['node_shapes']
assert 'validation' == result['paradigm']
print("✓ NodeShape detection works")

In [None]:
# Test build_shacl_index
g = Graph()
EX = Namespace("http://example.org/")
g.add((EX.PersonShape, RDF.type, SH.NodeShape))
g.add((EX.PersonShape, SH.targetClass, EX.Person))
g.add((EX.PersonShape, RDFS.label, Literal("Person Shape")))

index = build_shacl_index(g)
assert len(index.shapes) == 1
assert str(EX.PersonShape) in index.shapes
assert str(EX.Person) in index.targets[str(EX.PersonShape)]
# Each keyword source must contribute: shape local name, label, target class
assert {'personshape', 'person shape', 'person'} <= set(index.keywords)
assert index.keywords['person'] == [str(EX.PersonShape)]
# The shape declares no sh:property nodes, and nothing marks it as a class
assert index.properties[str(EX.PersonShape)] == []
assert index.paradigm == 'validation'
print("✓ Index building works")
print(f"  {index.summary()}")

In [None]:
# Keyword search over the index built in the previous cell
results = search_shapes(index, 'person')
assert results
assert results[0]['uri'] == str(EX.PersonShape)
print("✓ Shape search works")
print(f"  Found {len(results)} shapes for 'person'")

In [None]:
# Bounded description of the shape indexed in the earlier cell
desc = describe_shape(index, str(EX.PersonShape))
assert str(EX.PersonShape) == desc['uri']
assert str(EX.Person) in desc['targets']
print("✓ Shape description works")
print(f"  Targets: {desc['targets']}")
print(f"  Property count: {desc['property_count']}")

In [None]:
# Smoke test against the DCAT-AP shapes file; skipped when the file is absent
from pathlib import Path
dcat_path = Path('../ontology/dcat-ap/dcat-ap-SHACL.ttl')
if not dcat_path.exists():
    print("\n(DCAT-AP shapes not found, skipping test)")
else:
    g_dcat = Graph()
    g_dcat.parse(dcat_path)

    # Detection summary
    detection = detect_shacl(g_dcat)
    print(f"\n✓ DCAT-AP detection: {detection['node_shapes']} node shapes, paradigm={detection['paradigm']}")

    # Index summary
    index_dcat = build_shacl_index(g_dcat)
    print(f"  {index_dcat.summary()}")

    # Look up shapes related to datasets and list their local names
    dataset_shapes = search_shapes(index_dcat, 'dataset', limit=3)
    print(f"  Found {len(dataset_shapes)} shapes matching 'dataset':")
    for s in dataset_shapes[:3]:
        shape_name = s['uri'].split('/')[-1].split('#')[-1]
        print(f"    - {shape_name}")