# sparql_handles

> SPARQL query execution with first-class result handles

In [None]:
#| default_exp sparql_handles

## Overview

This module implements Stage 3 from the trajectory: SPARQL query execution with first-class result handles.

### Result Handle Pattern

Every SPARQL execution produces a `SPARQLResultHandle` with:
- `meta`: query, endpoint/local, timestamp, row count, columns
- `rows`: stored internally as a list of dicts (SELECT), a Graph (CONSTRUCT/DESCRIBE), or a bool (ASK)
- Bounded view operations (standalone functions that take a handle): `res_head()`, `res_where()`, `res_group()`, `res_distinct()`, `res_sample()`

### Progressive Disclosure

Result handles enable the root model to refine queries by inspecting metadata and small slices, not rerunning blind queries.

### Dataset Integration

SPARQL results can optionally be stored in dataset work graphs with full provenance tracking.

## Imports

In [None]:
#| export
import random
from dataclasses import dataclass, field
from datetime import datetime, timezone
from functools import partial
from pathlib import Path
from typing import Optional

from rdflib import Graph, URIRef, Literal
from sparqlx import SPARQLWrapper

## SPARQLResultHandle

Unified wrapper for all SPARQL result types with metadata and bounded view operations.

In [None]:
#| export
@dataclass
class SPARQLResultHandle:
    """Wrapper for SPARQL results with metadata and bounded view operations.

    One handle describes one query execution. The payload is kept in ``rows``
    and its shape depends on ``result_type``:

    - ``'select'``: list of row dicts
    - ``'ask'``: a single bool
    - ``'construct'`` / ``'describe'``: an ``rdflib.Graph``

    ``len()`` and iteration work for all three shapes; an ASK handle behaves
    like a one-element sequence holding the boolean answer. Because
    ``__len__`` is defined, a handle with zero rows/triples is falsy.
    """

    # Result data (never dumped wholesale to LLM)
    rows: list | Graph | bool   # SELECT rows, CONSTRUCT/DESCRIBE graph, or ASK answer
    result_type: str            # 'select' | 'ask' | 'construct' | 'describe'

    # Metadata
    query: str                  # Original SPARQL query
    endpoint: str               # Where executed (URL or 'local')
    # UTC creation time as ISO-8601 with a trailing 'Z'. Built from an aware
    # clock (datetime.utcnow() is deprecated); tzinfo is dropped before
    # formatting so the string keeps the '...Z' form instead of '+00:00'.
    timestamp: str = field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'
    )

    # For SELECT results
    columns: Optional[list] = None  # Column names (None when not a SELECT result)
    total_rows: int = 0             # Total before limit

    # For Graph results
    triple_count: int = 0       # Number of triples

    def summary(self) -> str:
        """Return a short, bounded description of the result for the LLM."""
        if self.result_type == 'select':
            return f"SELECT: {len(self.rows)} rows, columns={self.columns}"
        elif self.result_type == 'ask':
            return f"ASK: {self.rows}"
        else:
            # CONSTRUCT / DESCRIBE: report the stored triple count, not len(rows)
            return f"{self.result_type.upper()}: {self.triple_count} triples"

    def __len__(self):
        """Number of rows/triples; an ASK answer counts as one item."""
        if isinstance(self.rows, bool):
            return 1  # ASK result
        return len(self.rows) if hasattr(self.rows, '__len__') else 0

    def __iter__(self):
        """Iterate rows/triples; an ASK handle yields its single boolean."""
        if isinstance(self.rows, bool):
            return iter([self.rows])
        return iter(self.rows)

    def __repr__(self):
        """Compact repr built from ``summary()`` so the payload is never dumped."""
        return f"SPARQLResultHandle({self.summary()})"

Test SPARQLResultHandle with different result types:

In [None]:
# Test SELECT result
select_handle = SPARQLResultHandle(
    rows=[{'s': 'http://ex.org/alice', 'age': '30'}],
    result_type='select',
    query='SELECT ?s ?age WHERE { ?s :age ?age }',
    endpoint='local',
    columns=['s', 'age'],
    total_rows=1
)
assert select_handle.summary() == "SELECT: 1 rows, columns=['s', 'age']"
assert len(select_handle) == 1
# Iteration yields the stored row dicts; repr wraps summary()
assert list(select_handle) == select_handle.rows
assert repr(select_handle) == "SPARQLResultHandle(SELECT: 1 rows, columns=['s', 'age'])"
# Default timestamp is an ISO string with a trailing 'Z'
assert select_handle.timestamp.endswith('Z')
print(f"✓ SELECT handle: {select_handle}")

# Test ASK result
ask_handle = SPARQLResultHandle(
    rows=True,
    result_type='ask',
    query='ASK { ?s ?p ?o }',
    endpoint='local'
)
assert ask_handle.summary() == "ASK: True"
# An ASK handle behaves like a one-element sequence holding the answer
assert len(ask_handle) == 1
assert list(ask_handle) == [True]
print(f"✓ ASK handle: {ask_handle}")

# Test CONSTRUCT result
g = Graph()
g.add((URIRef('http://ex.org/alice'), URIRef('http://ex.org/age'), Literal('30')))
construct_handle = SPARQLResultHandle(
    rows=g,
    result_type='construct',
    query='CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }',
    endpoint='local',
    triple_count=1
)
assert construct_handle.summary() == "CONSTRUCT: 1 triples"
# len()/iteration delegate to the wrapped graph
assert len(construct_handle) == 1
assert list(construct_handle) == list(g)
print(f"✓ CONSTRUCT handle: {construct_handle}")

## Remote SPARQL Query

Execute SPARQL queries against remote endpoints and return result handles.

In [None]:
#| export
def sparql_query(
    query: str,
    endpoint: str = "https://query.wikidata.org/sparql",
    max_results: int = 100,
    name: str = 'res',
    ns: dict = None,
    timeout: float = 30.0,
    # Dataset integration
    ds_meta = None,
    store_in_work: bool = False,
    work_task_id: str = None
) -> str:
    """Execute a SPARQL query on a remote endpoint and store a SPARQLResultHandle.

    The handle is written to ``ns[name]``; only a short summary string is
    returned so the full result is never echoed back to the caller.

    For SELECT: handle rows are a list of dicts (at most ``max_results``)
    For CONSTRUCT/DESCRIBE: handle rows are an rdflib.Graph (at most ``max_results`` triples)
    For ASK: handle rows are the boolean answer

    If ds_meta is provided and store_in_work=True, graph results
    (CONSTRUCT or DESCRIBE) are additionally:
    - copied into the work graph returned by ``work_create(ds_meta, task_id)``
    - logged as a ``SPARQLQuery`` event in ``ds_meta.prov``
    SELECT and ASK results ignore these options.

    Args:
        query: SPARQL query string
        endpoint: SPARQL endpoint URL
        max_results: Maximum rows/triples kept in the handle
        name: Key under which the handle is stored in ``ns``
        ns: Namespace dict (defaults to this module's globals())
        timeout: Query timeout in seconds, passed to the HTTP client config
        ds_meta: Optional DatasetMeta for dataset integration
        store_in_work: If True and ds_meta provided, store graph results in a work graph
        work_task_id: Task ID for work graph (auto-generated if None)

    Returns:
        Summary string describing the result. When the endpoint returned more
        than ``max_results`` rows/triples the summary ends with
        ``(truncated from N)``.

    Errors raised by the wrapper are not caught here and propagate to the caller.
    """
    if ns is None:
        ns = globals()

    # Configure wrapper with timeout and headers
    headers = {"User-Agent": "RLM/1.0 (https://github.com/LA3D/rlm)"}
    wrapper = SPARQLWrapper(
        sparql_endpoint=endpoint,
        client_config=dict(timeout=timeout, headers=headers)
    )

    # Execute query with rdflib conversion
    result = wrapper.query(query, convert=True)

    # --- ASK: the converted result is a plain bool ---
    if isinstance(result, bool):
        ns[name] = SPARQLResultHandle(
            rows=result,
            result_type='ask',
            query=query,
            endpoint=endpoint
        )
        return f"ASK result: {result}, stored in '{name}'"

    # --- CONSTRUCT / DESCRIBE: the converted result is an rdflib.Graph ---
    if hasattr(result, 'serialize'):
        available = len(result)
        g = Graph()
        for t in list(result)[:max_results]:
            g.add(t)
        truncated_note = f" (truncated from {available})" if available > max_results else ""

        # NOTE: substring check; anything that is not CONSTRUCT is labelled DESCRIBE
        result_type = 'construct' if 'CONSTRUCT' in query.upper() else 'describe'

        handle = SPARQLResultHandle(
            rows=g,
            result_type=result_type,
            query=query,
            endpoint=endpoint,
            triple_count=len(g)
        )
        ns[name] = handle

        if ds_meta is None or not store_in_work:
            return f"Graph with {len(g)} triples stored in '{name}'" + truncated_note

        # Dataset integration: copy the (bounded) graph into a work graph
        from rlm.dataset import work_create
        import uuid

        task_id = work_task_id if work_task_id else f"sparql_{uuid.uuid4().hex[:8]}"
        graph_uri, work_graph = work_create(ds_meta, task_id)

        for s, p, o in g:
            work_graph.add((s, p, o))

        # Log query to prov
        from rdflib import Namespace, RDF, XSD
        RLM_PROV = Namespace('urn:rlm:prov:')
        event_uri = URIRef(f'urn:rlm:prov:sparql_{uuid.uuid4().hex[:8]}')
        ds_meta.prov.add((event_uri, RDF.type, RLM_PROV.SPARQLQuery))
        ds_meta.prov.add((event_uri, RLM_PROV.query, Literal(query)))
        ds_meta.prov.add((event_uri, RLM_PROV.endpoint, Literal(endpoint)))
        ds_meta.prov.add((event_uri, RLM_PROV.resultGraph, URIRef(graph_uri)))
        ds_meta.prov.add((event_uri, RLM_PROV.timestamp, Literal(handle.timestamp, datatype=XSD.dateTime)))
        ds_meta.prov.add((event_uri, RLM_PROV.session, Literal(ds_meta.session_id)))

        return f"Graph with {len(g)} triples stored in '{name}' and work/{task_id}" + truncated_note

    # --- SELECT: the converted result is a list of dicts ---
    # Record the size before slicing so total_rows really is "total before limit".
    available = len(result)
    rows = result[:max_results]
    cols = list(rows[0].keys()) if rows else []

    ns[name] = SPARQLResultHandle(
        rows=rows,
        result_type='select',
        query=query,
        endpoint=endpoint,
        columns=cols,
        total_rows=available
    )

    summary = f"SELECT result with {len(rows)} rows, columns: {cols}, stored in '{name}'"
    if available > len(rows):
        summary += f" (truncated from {available})"
    return summary

Test against Wikidata:

In [None]:
#| eval: false
# Test SELECT query against Wikidata
# Not evaluated automatically (see the `eval: false` directive on this cell):
# no `endpoint` is passed, so sparql_query() uses its default Wikidata URL
# and the cell needs network access.
test_ns = {}
result = sparql_query(
    "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 5",
    ns=test_ns,
    name='wikidata_test'
)
print(result)
# The handle is stored under `name` in the namespace that was passed in
assert 'wikidata_test' in test_ns
assert isinstance(test_ns['wikidata_test'], SPARQLResultHandle)
assert test_ns['wikidata_test'].result_type == 'select'
assert len(test_ns['wikidata_test'].rows) == 5
print(f"✓ SELECT query works: {test_ns['wikidata_test'].summary()}")

# Test CONSTRUCT query
# Graph-shaped results are stored as an rdflib.Graph in `rows`
result = sparql_query(
    "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o } LIMIT 3",
    ns=test_ns,
    name='graph_test'
)
print(result)
assert test_ns['graph_test'].result_type == 'construct'
assert isinstance(test_ns['graph_test'].rows, Graph)
print(f"✓ CONSTRUCT query works: {test_ns['graph_test'].summary()}")

## Local Graph Query

Execute SPARQL queries against local rdflib graphs (mounted ontologies or work graphs).

In [None]:
#| export
def sparql_local(
    query: str,
    graph: Graph | str,
    max_results: int = 100,
    name: str = 'res',
    ns: dict = None
) -> str:
    """Execute a SPARQL query on a local rdflib Graph and store a SPARQLResultHandle.

    Useful for querying mounted ontologies or work graphs. The handle is
    written to ``ns[name]`` with ``endpoint='local'``, mirroring sparql_query().

    The query form (ASK / CONSTRUCT / DESCRIBE / SELECT) is taken from the
    ``type`` reported by rdflib's query result rather than from searching the
    query text, so identifiers that merely contain a keyword (for example a
    ``?task`` variable) do not change how the result is interpreted.

    Args:
        query: SPARQL query string
        graph: rdflib.Graph object, or the name of a graph stored in ``ns``.
            A named object exposing a ``.graph`` attribute is unwrapped.
        max_results: Maximum rows/triples kept in the handle
        name: Key under which the handle is stored in ``ns``
        ns: Namespace dict (defaults to this module's globals())

    Returns:
        Summary string describing the result, or an ``"Error: ..."`` string
        when the graph cannot be resolved. When more than ``max_results``
        rows/triples matched, the summary ends with ``(truncated from N)``.

    Exceptions raised by ``Graph.query`` (e.g. for a malformed query) are not
    caught here.
    """
    if ns is None:
        ns = globals()

    # Resolve graph if string name provided
    if isinstance(graph, str):
        if graph not in ns:
            return f"Error: Graph '{graph}' not found in namespace"
        graph_obj = ns[graph]
        # Handle GraphMeta wrapper
        if hasattr(graph_obj, 'graph'):
            graph_obj = graph_obj.graph
    else:
        graph_obj = graph

    if not isinstance(graph_obj, Graph):
        return f"Error: Expected rdflib.Graph, got {type(graph_obj)}"

    # Execute query on local graph
    result = graph_obj.query(query)

    # Query form as determined by rdflib's parser; falls through to the
    # SELECT handling if the attribute is missing or empty.
    query_form = (getattr(result, 'type', None) or '').upper()

    if query_form == 'ASK':
        ask_result = bool(result)
        ns[name] = SPARQLResultHandle(
            rows=ask_result,
            result_type='ask',
            query=query,
            endpoint='local'
        )
        return f"ASK result: {ask_result}, stored in '{name}'"

    if query_form in ('CONSTRUCT', 'DESCRIBE'):
        triples = list(result)
        g = Graph()
        for triple in triples[:max_results]:
            g.add(triple)

        ns[name] = SPARQLResultHandle(
            rows=g,
            result_type=query_form.lower(),
            query=query,
            endpoint='local',
            triple_count=len(g)
        )
        summary = f"Graph with {len(g)} triples stored in '{name}'"
        if len(triples) > max_results:
            summary += f" (truncated from {len(triples)})"
        return summary

    # SELECT query
    variables = result.vars or []
    cols = [str(v) for v in variables]
    all_rows = list(result)
    # Keep each bound term as-is. Only genuinely unbound variables are None;
    # a truthiness test here would also discard falsy literals such as 0,
    # "" or false.
    rows = [
        {str(var): row[var] for var in variables}
        for row in all_rows[:max_results]
    ]

    ns[name] = SPARQLResultHandle(
        rows=rows,
        result_type='select',
        query=query,
        endpoint='local',
        columns=cols,
        total_rows=len(all_rows)  # size before max_results truncation
    )

    summary = f"SELECT result with {len(rows)} rows, columns: {cols}, stored in '{name}'"
    if len(all_rows) > len(rows):
        summary += f" (truncated from {len(all_rows)})"
    return summary

Test with local graph:

In [None]:
# Create test graph
# Three triples: two <http://ex.org/age> statements and one <http://ex.org/city>
test_graph = Graph()
test_graph.add((URIRef('http://ex.org/alice'), URIRef('http://ex.org/age'), Literal('30')))
test_graph.add((URIRef('http://ex.org/bob'), URIRef('http://ex.org/age'), Literal('25')))
test_graph.add((URIRef('http://ex.org/alice'), URIRef('http://ex.org/city'), Literal('Boston')))

# The graph is registered under a name so it can be looked up by string below
test_ns = {'my_graph': test_graph}

# Test SELECT query on local graph
# Here the graph is passed by name and resolved through `test_ns`
result = sparql_local(
    "SELECT ?s ?age WHERE { ?s <http://ex.org/age> ?age }",
    'my_graph',
    ns=test_ns,
    name='local_res'
)
print(result)
assert 'local_res' in test_ns
assert test_ns['local_res'].result_type == 'select'
# Only the two age triples match the pattern
assert len(test_ns['local_res'].rows) == 2
print(f"✓ Local SELECT query works: {test_ns['local_res'].rows}")

# Test CONSTRUCT on local graph
# Here the Graph object itself is passed instead of a name
result = sparql_local(
    "CONSTRUCT { ?s <http://ex.org/age> ?age } WHERE { ?s <http://ex.org/age> ?age }",
    test_graph,
    ns=test_ns,
    name='local_graph'
)
print(result)
assert test_ns['local_graph'].result_type == 'construct'
assert len(test_ns['local_graph'].rows) == 2
print(f"✓ Local CONSTRUCT query works")

## View Operations

Bounded view functions for progressive disclosure over result sets.

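These functions work with `SPARQLResultHandle`, `ResultTable`, or plain lists. Only `res_sample()` is defined in this module; `res_head()`, `res_where()`, `res_group()`, and `res_distinct()` are imported from `rlm.dataset` when `setup_sparql_context()` runs.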

In [None]:
#| export
def res_sample(result, n: int = 10, seed: int = None) -> list:
    """Get a random sample of up to N rows from a result.

    Args:
        result: SPARQLResultHandle, an object with a ``rows`` attribute
            (e.g. ResultTable), or a plain list
        n: Number of rows to sample; values <= 0 yield an empty list
        seed: Optional random seed for reproducibility. A private generator
            is seeded, so the process-wide ``random`` state is left untouched.

    Returns:
        List of sampled rows. If the result holds ``n`` or fewer rows, all of
        them are returned in their original order. For an ASK handle the
        single boolean answer is returned as a one-element list regardless
        of ``n``. For graph handles the items are triples.
    """
    # Extract rows from different result types
    if isinstance(result, SPARQLResultHandle):
        if result.result_type == 'ask':
            # ASK has no rows to sample
            return [result.rows]
        # For graphs this materialises the triples; SELECT rows are already a list
        rows = result.rows
    elif hasattr(result, 'rows'):
        # ResultTable or similar
        rows = result.rows
    else:
        # Plain list
        rows = result

    pool = list(rows)
    if n <= 0:
        # random.sample() rejects negative sizes; treat them as "nothing requested"
        return []
    if len(pool) <= n:
        return pool

    # random.Random(seed) draws the same sequence as random.seed(seed) would,
    # without reseeding the shared module-level generator.
    rng = random.Random(seed) if seed is not None else random
    return rng.sample(pool, n)

Test res_sample:

In [None]:
# Test with list
test_list = [{'x': i} for i in range(20)]
sample = res_sample(test_list, n=5, seed=42)
assert len(sample) == 5
assert all(isinstance(item, dict) for item in sample)
# Same seed must give the same sample (this is what `seed` is for)
assert res_sample(test_list, n=5, seed=42) == sample
print(f"✓ res_sample works with list: {len(sample)} items")

# Test with SPARQLResultHandle
handle = SPARQLResultHandle(
    rows=[{'s': f'http://ex.org/item{i}'} for i in range(15)],
    result_type='select',
    query='SELECT ?s WHERE { ?s ?p ?o }',
    endpoint='local',
    columns=['s'],
    total_rows=15
)
sample = res_sample(handle, n=3, seed=42)
assert len(sample) == 3
# Sampled rows come from the handle's own rows
assert all(row in handle.rows for row in sample)
print(f"✓ res_sample works with SPARQLResultHandle")

# Test with ASK handle: the boolean answer is returned as a single item
ask_sample_handle = SPARQLResultHandle(
    rows=False,
    result_type='ask',
    query='ASK { ?s ?p ?o }',
    endpoint='local'
)
assert res_sample(ask_sample_handle, n=3) == [False]

# Test with small result (no sampling needed)
small_list = [1, 2, 3]
sample = res_sample(small_list, n=10)
assert len(sample) == 3
# Nothing is shuffled when the input already fits within n
assert sample == small_list
print(f"✓ res_sample handles small results correctly")

## Setup Function

Initialize SPARQL tools in namespace for RLM sessions.

In [None]:
#| export
def setup_sparql_context(
    ns: dict,
    default_endpoint: str = "https://query.wikidata.org/sparql",
    ds_meta = None
) -> str:
    """Populate a namespace with the SPARQL tools used in RLM sessions.

    Bound into ``ns``:
    - ``sparql_query``: pre-bound to ``default_endpoint`` and ``ns``
      (and to ``ds_meta`` when one is given)
    - ``sparql_local``: pre-bound to ``ns`` (always bound)
    - ``res_head``, ``res_where``, ``res_group``, ``res_distinct``: only when
      they can be imported from ``rlm.dataset``
    - ``res_sample``

    Args:
        ns: Namespace dict that receives the bindings
        default_endpoint: Endpoint URL baked into the bound ``sparql_query``
        ds_meta: Optional DatasetMeta enabling dataset integration

    Returns:
        Multi-line status message naming the endpoint, the dataset session
        (if any) and the functions that were bound
    """
    # View helpers live in the dataset module; they are optional here.
    try:
        from rlm.dataset import res_head, res_where, res_group, res_distinct
    except ImportError:
        view_ops = {}
    else:
        if res_head is not None:
            view_ops = {
                'res_head': res_head,
                'res_where': res_where,
                'res_group': res_group,
                'res_distinct': res_distinct,
            }
        else:
            view_ops = {}

    # ds_meta is only forwarded when it was actually supplied
    query_defaults = {'endpoint': default_endpoint, 'ns': ns}
    if ds_meta is not None:
        query_defaults['ds_meta'] = ds_meta

    ns['sparql_query'] = partial(sparql_query, **query_defaults)
    ns['sparql_local'] = partial(sparql_local, ns=ns)
    for op_name, op in view_ops.items():
        ns[op_name] = op
    ns['res_sample'] = res_sample

    # Reported order: core functions first, optional view helpers last
    bound_funcs = ['sparql_query', 'sparql_local', 'res_sample', *view_ops]

    msg = f"SPARQL context initialized with endpoint: {default_endpoint}"
    if ds_meta is not None:
        msg += f"\nDataset integration enabled (session: {ds_meta.session_id})"
    msg += f"\nBound functions: {', '.join(bound_funcs)}"
    return msg

Test setup function:

In [None]:
# Test basic setup
# With no ds_meta, the three functions defined in this module must be bound;
# the res_head/res_where/... helpers are optional and therefore not asserted.
test_ns = {}
result = setup_sparql_context(test_ns)
print(result)
assert 'sparql_query' in test_ns
assert 'sparql_local' in test_ns
assert 'res_sample' in test_ns
print(f"✓ Setup function works")

# Test with dataset integration
# Skipped (not failed) when the dataset module cannot be imported.
try:
    from rlm.dataset import DatasetMeta
    from rdflib import Dataset
    
    ds = Dataset()
    # NOTE(review): DatasetMeta's constructor is not visible in this notebook;
    # the (dataset, name=...) call shape is assumed — confirm against rlm.dataset.
    ds_meta = DatasetMeta(ds, name='test')
    
    test_ns2 = {}
    result = setup_sparql_context(test_ns2, ds_meta=ds_meta)
    print(result)
    # The status message includes "(session: ...)" only when ds_meta is given
    assert 'session:' in result
    print(f"✓ Setup with dataset integration works")
except ImportError:
    print("⊘ Dataset module not available, skipping integration test")

## Usage Examples

End-to-end examples showing SPARQL handles in RLM context.

In [None]:
#| eval: false
# Example 1: Basic SPARQL workflow
ns = {}
setup_sparql_context(ns)

# Execute query (LLM would do this)
ns['sparql_query']('SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10', name='results')

# Inspect results
print(ns['results'].summary())
# The view helpers are bound inside `ns` by setup_sparql_context(); `res_head`
# is not a notebook-level name, so call it through the namespace.
# (`res_head` is only bound when rlm.dataset is importable.)
print(ns['res_head'](ns['results'], 5))
print(ns['res_sample'](ns['results'], 3))

In [None]:
#| eval: false
# Example 2: Dataset integration
from rlm.dataset import setup_dataset_context

ns = {}
# Order matters: the dataset context must be set up first because the next
# line reads ns['ds_meta'].
# NOTE(review): assumes setup_dataset_context() binds 'ds_meta' and
# 'dataset_stats' in ns — confirm against rlm.dataset.
setup_dataset_context(ns)
setup_sparql_context(ns, ds_meta=ns['ds_meta'])

# Query and store in work graph
# store_in_work=True makes sparql_query() copy the graph result into
# work/<work_task_id> and log the query in the provenance graph.
ns['sparql_query'](
    'CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o } LIMIT 5',
    name='discovered_triples',
    store_in_work=True,
    work_task_id='discovery_1'
)

# Check provenance
print(ns['dataset_stats']())

In [None]:
#| eval: false
# Example 3: Local graph queries
from rlm.ontology import setup_ontology_context

ns = {}
setup_sparql_context(ns)
setup_ontology_context('ontology/prov.ttl', ns, name='prov')

# Query mounted ontology
# The graph is referenced by its name in `ns`; sparql_local() resolves it
ns['sparql_local'](
    'SELECT ?c WHERE { ?c a <http://www.w3.org/2002/07/owl#Class> }',
    'prov',
    name='classes'
)

print(f"Found {len(ns['classes'].rows)} classes")
# `res_head` is bound inside `ns` by setup_sparql_context() (when rlm.dataset
# is importable); it is not defined at notebook level.
print(ns['res_head'](ns['classes'], 10))