## 1. Setting Up the Environment
We will start with a few imports needed for this demo only.

In [15]:
import uuid
import logging

from llama_stack_client import LlamaStackClient
from pydantic import NonNegativeFloat
from pydantic_settings import BaseSettings, SettingsConfigDict

# pretty print of the results returned from the model/agent
from rich.console import Console
from rich.pretty import pprint

# Shared rich console; the query loops below use it to print each user prompt in color.
console = Console()

# Notebook-local logger that prints bare messages (no level or timestamp prefix).
logger = logging.getLogger(__name__)
# Inspect this logger's own handler list instead of `hasHandlers()`: the latter
# also reports handlers attached to ancestor loggers (e.g. the root logger), so
# the setup below would be skipped -- and INFO messages could be filtered out by
# the root logger's level -- whenever something had already configured the root
# logger. The guard still prevents stacking duplicate handlers when the cell is
# executed more than once.
if not logger.handlers:
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(message)s')
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)
    # Messages are emitted through the handler above; do not hand them to
    # ancestor loggers as well, otherwise a configured root logger would print
    # every message a second time.
    logger.propagate = False

In [16]:
class Settings(BaseSettings):
    """Configuration values shared by the cells of this notebook.

    `model_config` points pydantic-settings at a UTF-8 encoded `.env` file as an
    additional source for these fields.
    """

    # URL of the Llama Stack server the client is created for
    base_url: str

    # provider id used when registering a vectordb collection
    vdb_provider: str
    # embedding model used when registering a vectordb collection
    vdb_embedding: str
    # embedding dimension used when registering a vectordb collection
    vdb_embedding_dimension: int
    # not referenced by the visible cells -- presumably the chunk/window size used for embedding; TODO confirm
    vdb_embedding_window: int

    # model id handed to the agents
    inference_model_id: str
    # forwarded as `max_tokens` in `sampling_params`
    max_tokens: int
    # a value > 0 selects the "top_p" strategy, 0.0 selects "greedy"
    temperature: NonNegativeFloat
    # forwarded in the "top_p" strategy (only used when temperature > 0)
    top_p: float
    # whether agent turns are requested with streaming enabled
    stream: bool

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

In [17]:
# Demo configuration, passed explicitly and grouped in the same order as the
# fields are declared on `Settings`.
settings = Settings(
    # Llama Stack server
    base_url="http://localhost:8321",
    # vectordb / embeddings
    vdb_provider="milvus",
    vdb_embedding="all-MiniLM-L6-v2",
    vdb_embedding_dimension=384,
    vdb_embedding_window=256,
    # inference
    inference_model_id="meta-llama/Llama-3.2-3B-Instruct",
    max_tokens=4096,
    temperature=0.0,
    top_p=0.95,
    stream=True,
)
print(settings)

base_url='http://localhost:8321' vdb_provider='milvus' vdb_embedding='all-MiniLM-L6-v2' vdb_embedding_dimension=384 vdb_embedding_window=256 inference_model_id='meta-llama/Llama-3.2-3B-Instruct' max_tokens=4096 temperature=0.0 top_p=0.95 stream=True


In [18]:
# Choose the decoding strategy from the configured temperature: a positive
# temperature selects top-p sampling, otherwise decoding is greedy.
strategy = (
    {
        "type": "top_p",
        "temperature": settings.temperature,
        "top_p": settings.top_p,
    }
    if settings.temperature > 0.0
    else {"type": "greedy"}
)

# sampling_params is handed to the agents created in the cells below
sampling_params = dict(strategy=strategy, max_tokens=settings.max_tokens)

print(sampling_params)

{'strategy': {'type': 'greedy'}, 'max_tokens': 4096}


In [19]:
# Client for the Llama Stack server configured in `settings.base_url`.
# NOTE(review): the message below is printed right after the client object is built;
# no request is visible in this cell -- confirm whether connectivity is actually verified here.
client = LlamaStackClient(base_url=settings.base_url)
print(f"Connected to Llama Stack server @ {client.base_url}")

Connected to Llama Stack server @ http://localhost:8321


## Validate tools are available in our llama-stack instance

When an instance of llama-stack is redeployed, your tools need to be re-registered. Also, if a toolgroup is already registered with a llama-stack instance and you try to register another one with the same `toolgroup_id`, llama-stack will throw an error.

For this reason it is recommended to include some code to validate your tools and toolgroups. This is where the `mcp_url` comes into play. The following code checks whether the `mcp::docling-llamastack` toolgroup is registered and, if it is not, registers it directly from the MCP URL.

If you are running the MCP server from source, the default value for this is: `http://localhost:8000/sse`.

If you are running the MCP server from a container, the default value for this is: `http://host.containers.internal:8000/sse`.

Make sure to pass the corresponding MCP URL for the server you are trying to register/validate tools for.

In [20]:
# SSE endpoint of the Docling MCP server (see the notes above for the right value)
docling_mcp_url = "http://localhost:8000/sse"

# collect the toolgroup id of every tool the server currently lists
registered_tools = client.tools.list()
registered_toolgroups = [tool.toolgroup_id for tool in registered_tools]

# register the Docling toolgroup only when none of the listed tools belongs to it
if not any(group == "mcp::docling-llamastack" for group in registered_toolgroups):
    client.toolgroups.register(
        toolgroup_id="mcp::docling-llamastack",
        provider_id="model-context-protocol",
        mcp_endpoint=dict(uri=docling_mcp_url),
    )

# list again so the log line reflects the state after a possible registration
registered_tools = client.tools.list()
registered_toolgroups = [tool.toolgroup_id for tool in registered_tools]
logger.info(
    f"Your Llama Stack server is already registered with the following tool groups @ {set(registered_toolgroups)} \n"
)

Your Llama Stack server is already registered with the following tool groups @ {'mcp::docling-llamastack', 'builtin::rag', 'builtin::websearch'} 



## 2. Setup the Ingest + RAG-aware agent
- Initialize the collection in the vectordb
- Initialize the agent with the required tools:
    - Docling Ingest is responsible for handling instructions like "Ingest the document https://arxiv.org/pdf/2503.11576".
    - RAG/Knowledge search will respond to user queries by running RAG on the documents ingested in the vectordb.

In [21]:
from llama_stack_client import Agent, AgentEventLogger
from llama_stack_client.lib.agents.event_logger import EventLogger


In [22]:
# define the name of the vectordb collection to use
vector_db_id = f"test_vector_db_{uuid.uuid4()}"

# define and register the document collection to be used
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=settings.vdb_embedding,
    embedding_dimension=settings.vdb_embedding_dimension,
    provider_id=settings.vdb_provider,
)


agent = Agent(
    client,
    model=settings.inference_model_id,
    instructions="You are a helpful assistant.",
    sampling_params=sampling_params,
    tools=[
        dict(
            name="mcp::docling-llamastack",
            args={
                "vector_db_id": vector_db_id,
            },
        ),
        dict(
            name="builtin::rag/knowledge_search",
            args={
                "vector_db_ids": [vector_db_id],  # list of IDs of document collections to consider during retrieval
            },
        )
    ],

)

## 3. Executing ingest and RAG queries
- For each prompt, initialize a new agent session, execute a turn during which a retrieval call may be requested, and output the reply received from the agent.

In [23]:
# first prompt ingests a paper, second one asks a question about it
queries = [
    "Ingest the document https://arxiv.org/pdf/2503.11576",
    "Lookup the documents to answer the question: How does the system compare to humans when analyzing the layout?",
]

for prompt in queries:
    console.print(f"\n[cyan]User> {prompt}[/cyan]")

    # every prompt runs as a single turn in its own, freshly created session
    response = agent.create_turn(
        messages=[dict(role="user", content=prompt)],
        session_id=agent.create_session(f"rag-session_{uuid.uuid4()}"),
        stream=settings.stream,
    )

    # show the reply, tool calls included
    if not settings.stream:
        pprint(response.steps)
        continue
    for log in EventLogger().log(response):
        log.print()

[33minference> [0m[33m[[0m[33ming[0m[33mest[0m[33m_document[0m[33m_to[0m[33m_vect[0m[33mord[0m[33mb[0m[33m(source[0m[33m='[0m[33mhttps[0m[33m://[0m[33mar[0m[33mxiv[0m[33m.org[0m[33m/pdf[0m[33m/[0m[33m250[0m[33m3[0m[33m.[0m[33m115[0m[33m76[0m[33m',[0m[33m vector[0m[33m_db[0m[33m_id[0m[33m='[0m[33myour[0m[33m_vector[0m[33m_db[0m[33m_id[0m[33m')][0m[97m[0m
[32mtool_execution> Tool:ingest_document_to_vectordb Args:{'source': 'https://arxiv.org/pdf/2503.11576', 'vector_db_id': 'your_vector_db_id'}[0m
[32mtool_execution> Tool:ingest_document_to_vectordb Response:{"type":"text","text":"2503.11576v1.pdf","annotations":null}[0m
[33minference> [0m[33mWhat[0m[33m is[0m[33m the[0m[33m metadata[0m[33m of[0m[33m the[0m[33m ing[0m[33mested[0m[33m document[0m[33m?[0m[97m[0m
[30m[0m

[33minference> [0m[33m[k[0m[33mnowledge[0m[33m_search[0m[33m(query[0m[33m="[0m[33msystem[0m[33m vs[0m[33m human[0m[33m analysis[0m[33m of[0m[33m layout[0m[33m")][0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'system vs human analysis of layout'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:17402\nContent: C.2 Layout analysis samples\nTo illustrate and compare the visual grounding capabilities of SmolDocling, a set of sample predictions with SmolDocling and Qwen2.5-VL from DocLayNet [70] is shown in Table 7. Note that the element location results are independent from the correct reproduction of document content and structure.\nTable 7: Visualizations of layout output from SmolDocling and QwenVL2.5 compared to the DocLayNet ground truth. Examples are chosen to be re

## 4. Choose the documents and answer the question

In this section we add one more tool, which retrieves the documents matching some metadata, e.g. the publication year.


In [24]:
# define the name of the vectordb collection to use
# fresh, uniquely named vectordb collection for this section
vector_db_id = f"test_vector_db_{uuid.uuid4()}"

# register it using the provider and embedding configuration from `settings`
client.vector_dbs.register(
    provider_id=settings.vdb_provider,
    embedding_model=settings.vdb_embedding,
    embedding_dimension=settings.vdb_embedding_dimension,
    vector_db_id=vector_db_id,
)

# same agent definition as in the previous section, bound to the new collection
docling_tool = None  # placeholder removed below; keeps the tool list readable
del docling_tool
agent = Agent(
    client,
    model=settings.inference_model_id,
    instructions="You are a helpful assistant.",
    sampling_params=sampling_params,
    tools=[
        # NOTE(review): the recorded output shows the ingest tool being called with
        # vector_db_id='your_vector_db_id' -- confirm that the args given here
        # take precedence over the value generated by the model.
        {
            "name": "mcp::docling-llamastack",
            "args": {"vector_db_id": vector_db_id},
        },
        {
            "name": "builtin::rag/knowledge_search",
            # list of IDs of document collections to consider during retrieval
            "args": {"vector_db_ids": [vector_db_id]},
        },
    ],
)

In [25]:
# three-step conversation: find documents by metadata, ingest them, then ask a question
queries = [
    "Find publications of 2022",
    "Ingest the selected documents",
    "Lookup the documents to answer the question: How does the system compare to humans when analyzing the layout?",
]

# a single session is created once and shared by all prompts, so that later
# prompts (e.g. "the selected documents") can refer to earlier turns
session_id = agent.create_session(f"rag-session_{uuid.uuid4()}")
for prompt in queries:
    console.print(f"\n[cyan]User> {prompt}[/cyan]")

    # run the prompt as the next turn of the shared session
    response = agent.create_turn(
        messages=[dict(role="user", content=prompt)],
        session_id=session_id,
        stream=settings.stream,
    )

    # show the reply, tool calls included
    if not settings.stream:
        pprint(response.steps)
        continue
    for log in EventLogger().log(response):
        log.print()

[33minference> [0m[33m[[0m[33msearch[0m[33m_files[0m[33m_metadata[0m[33m(year[0m[33m='[0m[33m202[0m[33m2[0m[33m')][0m[97m[0m
[32mtool_execution> Tool:search_files_metadata Args:{'year': '2022'}[0m
[32mtool_execution> Tool:search_files_metadata Response:{"type":"text","text":"file:///Users/dol/projects/aimasterclass-workshop/ai-masterclass-2025/unit2/data/2022/IBM/2206.01062.pdf","annotations":null}[0m
[33minference> [0m[33mIt[0m[33m seems[0m[33m like[0m[33m the[0m[33m search[0m[33m_files[0m[33m_metadata[0m[33m function[0m[33m returned[0m[33m a[0m[33m single[0m[33m file[0m[33m.[0m[33m If[0m[33m you[0m[33m want[0m[33m to[0m[33m find[0m[33m all[0m[33m publications[0m[33m from[0m[33m [0m[33m202[0m[33m2[0m[33m,[0m[33m you[0m[33m can[0m[33m use[0m[33m the[0m[33m following[0m[33m command[0m[33m:

[0m[33m[[0m[33msearch[0m[33m_files[0m[33m_metadata[0m[33m(year[0m[33m="[0m[33m202[0m[33m2[0

[33minference> [0m[33m[[0m[33ming[0m[33mest[0m[33m_document[0m[33m_to[0m[33m_vect[0m[33mord[0m[33mb[0m[33m(source[0m[33m="[0m[33mfile[0m[33m:///[0m[33mUsers[0m[33m/d[0m[33mol[0m[33m/projects[0m[33m/[0m[33maim[0m[33master[0m[33mclass[0m[33m-work[0m[33mshop[0m[33m/[0m[33mai[0m[33m-master[0m[33mclass[0m[33m-[0m[33m202[0m[33m5[0m[33m/unit[0m[33m2[0m[33m/data[0m[33m/[0m[33m202[0m[33m2[0m[33m/[0m[33mIBM[0m[33m/[0m[33m220[0m[33m6[0m[33m.[0m[33m010[0m[33m62[0m[33m.pdf[0m[33m",[0m[33m vector[0m[33m_db[0m[33m_id[0m[33m="[0m[33myour[0m[33m_vector[0m[33m_db[0m[33m_id[0m[33m")][0m[97m[0m
[32mtool_execution> Tool:ingest_document_to_vectordb Args:{'source': 'file:///Users/dol/projects/aimasterclass-workshop/ai-masterclass-2025/unit2/data/2022/IBM/2206.01062.pdf', 'vector_db_id': 'your_vector_db_id'}[0m
[32mtool_execution> Tool:ingest_document_to_vectordb Response:{"type":"text","text":"2

[33minference> [0m[33m[k[0m[33mnowledge[0m[33m_search[0m[33m(query[0m[33m="[0m[33msystem[0m[33m vs[0m[33m human[0m[33m analysis[0m[33m of[0m[33m layout[0m[33m")][0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'system vs human analysis of layout'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text='Result 1:\nDocument_id:71562\nContent: DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\nBirgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com\nChristoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com\nMichele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com\nAhmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com\nPeter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com\n', type='text'), TextContentIt

---

## ReAct Agent

In the following section we use the reasoning agent `ReActAgent`. In this scenario, the model orchestrating the tool execution reasons about the sequence of tools to be executed in order to perform the task.

This allows a single user query to trigger multiple independent steps, e.g.

1. Search for the files matching the metadata (e.g. which publication year)
2. Ingest the documents
3. Run a search query on the documents


In [None]:
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.react.tool_parser import ReActOutput


In [None]:
# unique name for the vectordb collection used by the ReAct agent below
vector_db_id = f"test_vector_db_{uuid.uuid4()}"

# define and register the document collection to be used,
# with the provider and embedding configuration taken from `settings`
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=settings.vdb_embedding,
    embedding_dimension=settings.vdb_embedding_dimension,
    provider_id=settings.vdb_provider,
)



In [None]:

# ReAct agent with the same two tools as before; its replies are constrained to
# the JSON schema generated from `ReActOutput`
agent = ReActAgent(
    client=client,
    model=settings.inference_model_id,
    tools=[
        {
            "name": "mcp::docling-llamastack",
            "args": {"vector_db_id": vector_db_id},
        },
        {
            "name": "builtin::rag/knowledge_search",
            # list of IDs of document collections to consider during retrieval
            "args": {"vector_db_ids": [vector_db_id]},
        },
    ],
    response_format=dict(
        type="json_schema",
        json_schema=ReActOutput.model_json_schema(),
    ),
    sampling_params=sampling_params,
)

# a single request that needs several tool calls to be answered
user_prompts = [
    "I would like to ingest publications of 2025 and understand how does SmolDocling system compare to humans when analyzing the layout?"
]

for prompt in user_prompts:
    # banner around the prompt being processed
    print("\n" + "=" * 50)
    console.print(f"[cyan]Processing user query: {prompt}[/cyan]")
    print("=" * 50)

    # each prompt runs as a single turn in its own session
    response = agent.create_turn(
        messages=[dict(role="user", content=prompt)],
        session_id=agent.create_session(f"rag-session_{uuid.uuid4()}"),
        stream=settings.stream,
    )

    if not settings.stream:
        # print the steps of the agent's response in a formatted way
        pprint(response.steps)
        continue
    for log in EventLogger().log(response):
        log.print()