# core

> RLM core implementation using claudette with namespace-explicit design

In [None]:
#| default_exp core

## Overview

This module implements the RLM (Recursive Language Model) protocol from [rlmpaper](https://github.com/alexzhang13/rlm) using [claudette](https://claudette.answer.ai) as the LLM backend.

### Design Principles

- **Namespace-explicit**: All functions take `ns: dict` parameter (no frame walking)
- **Protocol-faithful**: Uses prompts and types from `_rlmpaper_compat.py`
- **Solveit-independent**: Core works anywhere; Solveit integration is separate
- **Return everything useful**: `rlm_run` returns `(answer, iterations, ns)` for inspection; the `llm_query*` helpers store their results in `ns` and return a short summary string

## Imports

In [None]:
#| export
from claudette import Chat, contents
from functools import partial
import asyncio
import sys
import time
from io import StringIO

# Protocol artifacts from rlmpaper
from rlm._rlmpaper_compat import (
    RLM_SYSTEM_PROMPT,
    QueryMetadata,
    REPLResult, CodeBlock, RLMIteration,
    build_rlm_system_prompt, build_user_prompt,
    find_code_blocks, find_final_answer, format_iteration,
)

## Environment Detection

In [None]:
#| export
def in_solveit() -> bool:
    """Report whether the code is executing inside a Solveit session.

    Detection relies on `dialoghelper`'s `_find_frame_dict` finding a
    `__msg_id` entry, which Solveit injects into the call stack.

    Returns:
        True when the lookup succeeds; False when `dialoghelper` cannot be
        imported or the lookup raises `ValueError`.
    """
    try:
        from dialoghelper.inspecttools import _find_frame_dict
        _find_frame_dict('__msg_id')
    except (ValueError, ImportError):
        # Either the helper is unavailable or no frame carries `__msg_id`.
        return False
    return True

In [None]:
# Test environment detection.
# in_solveit() must always hand back a plain bool: it returns True when the
# `__msg_id` lookup succeeds and False when the lookup or the `dialoghelper`
# import fails, so this cell is safe to run in any environment.
is_solveit = in_solveit()
print(f"Running in Solveit: {is_solveit}")
assert isinstance(is_solveit, bool)

## Core LLM Functions

These functions provide the REPL environment's `llm_query` and `llm_query_batched` capabilities.
They require an explicit namespace dict and store results there.

In [None]:
#| export
def llm_query(prompt: str, ns: dict, name: str = 'llm_res', 
              model: str = 'claude-sonnet-4-5') -> str:
    """Send one prompt to a sub-LLM and keep the reply in `ns`.

    A new `Chat(model)` is constructed on every call. The reply text is
    written to `ns[name]`; only a short summary is returned, so callers
    read the actual reply from the namespace.

    Args:
        prompt: Text sent to the LLM.
        ns: Namespace dict that receives the reply under `name`.
        name: Key in `ns` under which the reply is stored.
        model: Claude model identifier passed to `Chat`.

    Returns:
        Summary string naming the variable and the reply length in chars.
    """
    sub_chat = Chat(model)
    reply_text = contents(sub_chat(prompt))
    ns[name] = reply_text
    n_chars = len(reply_text)
    return f"Stored response in '{name}' ({n_chars} chars)"

In [None]:
# Test llm_query's signature without touching the network.
# The keyword names matter: rlm_run binds `ns` and `model` via functools.partial.
import inspect

test_ns = {}
_sig = inspect.signature(llm_query)
assert list(_sig.parameters) == ['prompt', 'ns', 'name', 'model']
assert _sig.parameters['name'].default == 'llm_res'
# Live round-trip check, left commented out to avoid API calls during CI:
# result = llm_query("Say 'hello' and nothing else", test_ns, name='greeting')
# assert 'greeting' in test_ns
# assert len(test_ns['greeting']) > 0
print("✓ llm_query signature test passed")

In [None]:
#| export
async def _query_one(prompt: str, model: str) -> str:
    """Execute a single LLM query without blocking the event loop.

    `Chat(model)(prompt)` is a synchronous call, so invoking it directly
    inside a coroutine would stall the loop and make `asyncio.gather` run
    the queries one after another. The call is therefore handed to the
    loop's default thread-pool executor and awaited.

    Args:
        prompt: Text sent to the LLM.
        model: Claude model identifier passed to `Chat`.

    Returns:
        The reply text extracted with `contents`.
    """
    loop = asyncio.get_running_loop()
    # Each call builds its own Chat, so worker threads share no chat state.
    return await loop.run_in_executor(
        None, lambda: contents(Chat(model)(prompt))
    )

In [None]:
#| export
def llm_query_batched(prompts: list, ns: dict, name: str = 'batch_res',
                      model: str = 'claude-sonnet-4-5') -> str:
    """Query the LLM with several prompts in one batch.

    Intended for multiple independent queries. Results are stored in the
    same order as `prompts`.

    Works both from plain synchronous code (no event loop running) and from
    inside an already-running loop such as a notebook kernel; the latter
    path needs `nest_asyncio` to allow re-entrant `run_until_complete`.

    Args:
        prompts: List of prompt strings.
        ns: Namespace dict where results will be stored.
        name: Key in `ns` under which the list of results is stored.
        model: Claude model identifier forwarded to each query.

    Returns:
        Summary string describing how many responses were stored and where.
    """
    async def _gather_all() -> list:
        # asyncio.run() only accepts a coroutine, and gather() needs a
        # running loop, so the gather call is wrapped in a coroutine that
        # is created here but only executed once a loop is driving it.
        return await asyncio.gather(*[_query_one(p, model) for p in prompts])

    # Detect the running loop separately from running the queries, so a
    # RuntimeError raised by a query is not mistaken for "no loop running"
    # (which would silently re-send the whole batch).
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None

    if loop is None:
        results = asyncio.run(_gather_all())
    else:
        import nest_asyncio
        nest_asyncio.apply()
        results = loop.run_until_complete(_gather_all())

    ns[name] = list(results)
    return f"Stored {len(results)} responses in '{name}'"

In [None]:
# Test llm_query_batched's signature without touching the network.
# The keyword names matter: rlm_run binds `ns` and `model` via functools.partial.
import inspect

test_ns = {}
_sig = inspect.signature(llm_query_batched)
assert list(_sig.parameters) == ['prompts', 'ns', 'name', 'model']
assert _sig.parameters['name'].default == 'batch_res'
# Live round-trip check, left commented out to avoid API calls during CI:
# prompts = ["Say 'one'", "Say 'two'"]
# result = llm_query_batched(prompts, test_ns, name='batch')
# assert 'batch' in test_ns
# assert len(test_ns['batch']) == 2
print("✓ llm_query_batched signature test passed")

## REPL Execution

Execute Python code in a namespace and capture stdout/stderr.

In [None]:
#| export
def exec_code(code: str, ns: dict) -> REPLResult:
    """Execute code in namespace and return result.

    Captures stdout and stderr while the code runs. Any exception raised by
    the code (including a `SyntaxError` from compilation) is not propagated;
    it is appended to the captured stderr as `"<Type>: <message>"`.
    `SystemExit` is handled the same way so that executed code calling
    `exit()` / `sys.exit()` cannot terminate the host process. The namespace
    is mutated with any variables created during execution.

    NOTE(security): the code runs unsandboxed in this process via `exec`.
    Only feed it code from a source you are prepared to trust.

    Args:
        code: Python code to execute.
        ns: Namespace dict used as the globals for execution.

    Returns:
        REPLResult with stdout, stderr, locals (the same `ns` object, not a
        copy) and execution_time in seconds.
    """
    stdout_capture = StringIO()
    stderr_capture = StringIO()
    old_stdout, old_stderr = sys.stdout, sys.stderr
    # perf_counter is monotonic and high-resolution, unlike wall-clock time().
    start = time.perf_counter()

    try:
        sys.stdout, sys.stderr = stdout_capture, stderr_capture
        exec(compile(code, '<repl>', 'exec'), ns)
        stderr_out = stderr_capture.getvalue()
    except (Exception, SystemExit) as e:
        stderr_out = stderr_capture.getvalue() + f"\n{type(e).__name__}: {e}"
    finally:
        # Always restore the real streams, even for KeyboardInterrupt.
        sys.stdout, sys.stderr = old_stdout, old_stderr

    return REPLResult(
        stdout=stdout_capture.getvalue(),
        stderr=stderr_out,
        locals=ns,
        execution_time=time.perf_counter() - start
    )

In [None]:
# Test exec_code with explicit namespace
test_ns = {}
result = exec_code("x = 2 + 2\nprint(x)", test_ns)
assert test_ns['x'] == 4
assert '4' in result.stdout
# A trivial snippet can finish within one clock tick on coarse timers,
# so only require a non-negative duration.
assert result.execution_time >= 0
print("✓ exec_code works")

# Test error handling: runtime errors are reported in stderr, not raised
test_ns = {}
result = exec_code("raise ValueError('test error')", test_ns)
assert 'ValueError: test error' in result.stderr

# Compilation errors are reported the same way
result = exec_code("def (", {})
assert 'SyntaxError' in result.stderr
print("✓ exec_code error handling works")

## RLM Loop

The main RLM iteration loop. Follows the rlmpaper protocol:
1. Build system prompt with metadata
2. Add first-iteration safeguard
3. Execute REPL code blocks
4. Check for FINAL/FINAL_VAR
5. Repeat until answer or max iterations

In [None]:
#| export
def rlm_run(query: str, context, ns: dict = None, 
            model: str = 'claude-sonnet-4-5', 
            max_iters: int = 10) -> tuple:
    """Drive the RLM loop until a final answer appears or iterations run out.

    On each iteration the root LLM is prompted, every code block found in
    its response is executed in `ns` (which exposes `context`, `llm_query`
    and `llm_query_batched`), and the response is scanned for a final
    answer via `find_final_answer`. The loop stops at the first iteration
    that yields one.

    Args:
        query: User's question to answer.
        context: Context data made available to the REPL as `context`.
        ns: Namespace dict to run in; a fresh dict is created when None.
            An existing dict is mutated in place, and its `context`,
            `llm_query` and `llm_query_batched` entries are overwritten.
        model: Claude model used for the root chat and the sub-queries.
        max_iters: Upper bound on the number of iterations.

    Returns:
        `(answer, iterations, namespace)` where
        - answer: the final answer, or None if none was produced in time
        - iterations: list of RLMIteration records, one per iteration run
        - namespace: the dict holding all REPL variables
    """
    ns = {} if ns is None else ns

    # REPL environment: the context plus sub-LLM helpers pre-bound to this
    # namespace and model, so REPL code only passes the prompt(s).
    meta = QueryMetadata(context)
    ns['context'] = context
    ns['llm_query'] = partial(llm_query, ns=ns, model=model)
    ns['llm_query_batched'] = partial(llm_query_batched, ns=ns, model=model)

    # First message carries the system prompt; a second one, when present,
    # is seeded into the chat history.
    seed_msgs = build_rlm_system_prompt(query_metadata=meta)
    chat = Chat(model, sp=seed_msgs[0]['content'])
    if len(seed_msgs) > 1:
        chat.h.append(seed_msgs[1])

    history = []
    for step in range(max_iters):
        t0 = time.time()

        # The prompt builder receives the iteration index.
        prompt_text = build_user_prompt(root_prompt=query, iteration=step)['content']
        response = contents(chat(prompt_text))

        # Run each extracted code block in order, pairing code with result.
        executed = [
            CodeBlock(code=src, result=exec_code(src, ns))
            for src in find_code_blocks(response)
        ]

        # Checked after execution so the namespace is current for the lookup.
        answer = find_final_answer(response, ns=ns)

        record = RLMIteration(
            prompt=prompt_text,
            response=response,
            code_blocks=executed,
            final_answer=answer,
            iteration_time=time.time() - t0
        )
        history.append(record)

        if answer is not None:
            return answer, history, ns

        # Feed this iteration back into the chat history for the next round.
        # NOTE(review): if `chat(...)` already records the prompt/response in
        # `chat.h`, the messages from format_iteration may duplicate the
        # assistant turn - TODO confirm against both helpers.
        for msg in format_iteration(record):
            chat.h.append(msg)

    # Iteration budget exhausted without a final answer.
    return None, history, ns

## Tests

In [None]:
# QueryMetadata (from _rlmpaper_compat) should describe a list-of-strings context
_chunks = ["chunk1", "chunk2", "chunk3"]
meta = QueryMetadata(_chunks)
assert meta.context_type == "list"
assert len(meta.context_lengths) == 3
assert meta.context_total_length == sum(map(len, _chunks))
print("✓ QueryMetadata works")

In [None]:
# find_code_blocks (from _rlmpaper_compat) should pull out the single
# ```repl fenced block embedded in surrounding prose
text = """Here's some code:
```repl
x = 1 + 1
print(x)
```
And more text."""
blocks = find_code_blocks(text)
n_blocks = len(blocks)
assert n_blocks == 1
first_block = blocks[0]
assert 'x = 1 + 1' in first_block
print("✓ find_code_blocks works")

In [None]:
# find_final_answer (from _rlmpaper_compat): FINAL(...) payloads come back verbatim
for _response, _expected in (
    ("FINAL(42)", "42"),
    ("FINAL(The answer is 42)", "The answer is 42"),
):
    assert find_final_answer(_response) == _expected

# FINAL_VAR(name) is resolved against the supplied namespace
test_ns = {'result': 'hello world'}
assert find_final_answer("FINAL_VAR(result)", ns=test_ns) == "hello world"

# Text without a FINAL marker yields None
assert find_final_answer("Just some text") is None
print("✓ find_final_answer works")

In [None]:
# Namespace-setup check for rlm_run (no LLM API calls).
# rlm_run itself is not invoked here, since that would need an API key; the
# cell rebuilds by hand the same bindings rlm_run installs and verifies them.
context = ["The capital of France is Paris."]
test_ns = {}

meta = QueryMetadata(context)
test_ns['context'] = context
for _fn in (llm_query, llm_query_batched):
    test_ns[_fn.__name__] = partial(_fn, ns=test_ns, model='claude-sonnet-4-5')

for _key in ('context', 'llm_query', 'llm_query_batched'):
    assert _key in test_ns
for _key in ('llm_query', 'llm_query_batched'):
    assert callable(test_ns[_key])
print("✓ rlm_run namespace setup works")

## Usage Examples

In [None]:
#| eval: false
# Minimal end-to-end call (requires API key)
context = ["The speed of light is 299,792,458 m/s."]
answer, iterations, ns = rlm_run("What is the speed of light?", context)
print(f"Answer: {answer}")
print(f"Iterations: {len(iterations)}")
# Hide underscore-prefixed entries such as `__builtins__`
public_vars = [k for k in ns if not k.startswith('_')]
print(f"Variables in namespace: {public_vars}")

In [None]:
#| eval: false
# Persistent namespace across runs (requires API key)
# Placeholder contexts so the example is self-contained; substitute your own data.
context1 = ["X is the first quantity described in this document."]
context2 = ["Y is the second quantity described in this document."]
ns = {}
answer1, iters1, ns = rlm_run("What is X?", context1, ns=ns)
answer2, iters2, ns = rlm_run("What about Y?", context2, ns=ns)
# ns now contains variables from both runs. Each call rebinds ns['context'],
# ns['llm_query'] and ns['llm_query_batched'], so after the second run
# ns['context'] is context2.