# RAG from scratch

- Adapted from https://github.com/opendatahub-io/llama-stack-demos/
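- Requires a running Llama Stack server started with the configuration in distributions/masterclass-agents/run.yaml (the client below connects to http://localhost:8321)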

In [8]:
import rich
from llama_stack_client import LlamaStackClient, RAGDocument


In [9]:
# Client for the Llama Stack server; every later cell talks to the server
# through this object.
client = LlamaStackClient(base_url="http://localhost:8321")

In [10]:
# List the models registered with the server. Left as the last expression of
# the cell so the notebook displays the result (one embedding model and one
# LLM in the recorded output).
client.models.list()

[Model(identifier='all-MiniLM-L6-v2', metadata={'embedding_dimension': 384.0}, api_model_type='embedding', provider_id='ollama', provider_resource_id='all-minilm:latest', type='model', model_type='embedding'),
 Model(identifier='meta-llama/Llama-3.2-3B-Instruct', metadata={}, api_model_type='llm', provider_id='ollama', provider_resource_id='llama3.2:3b-instruct-fp16', type='model', model_type='llm')]

In [11]:
# Identifier reused by the later cells when inserting into and querying
# this vector DB.
vector_db_id = "my-vector-db"
# Register the vector DB with the embedding model reported by
# `client.models.list()`; 384 matches that model's `embedding_dimension`
# metadata. Kept as the last expression so the response is displayed.
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
)

VectorDBRegisterResponse(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='my-vector-db', provider_id='milvus', provider_resource_id='my-vector-db', type='vector_db', access_attributes=None)

Ingesting documents into a vector database:

In [12]:
# File names of the torchtune tutorial pages to ingest. Despite the variable
# name these are only the trailing path component; the full URL is assembled
# below.
urls = [
    "chat.rst",
    "llama3.rst",
    "memory_optimizations.rst",
    "lora_finetune.rst",
]

# One RAGDocument per page, with ids "num-0", "num-1", ... in list order.
# `content` carries the page URL rather than the page text.
documents = [
    RAGDocument(
        document_id=f"num-{position}",
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{page}",
        mime_type="text/plain",
        metadata={},
    )
    for position, page in enumerate(urls)
]


In [13]:
# Print the document list with rich to check the generated ids and URLs
# before ingesting.
rich.print(documents)

In [14]:

# Ingest the documents into the vector DB through the RAG tool runtime,
# requesting a chunk size of 512 tokens.
client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=512,
)

In [25]:
# The user question. It is used as the retrieval query in the next cells and
# is embedded again in the final prompt sent to the model.
prompt = "What are the top 5 topics that were explained? Only list succinct bullet points."

In [16]:
# The RAG tool is the higher-level entry point: it returns a packaged result
# and takes a list of vector DB ids, so one query can span several DBs.
tool_response = client.tool_runtime.rag_tool.query(
    content=prompt,
    vector_db_ids=[vector_db_id],
)
rich.print(tool_response)

In [17]:
# Lower-level alternative: query the vector DB directly and inspect what it
# returns.
db_response = client.vector_io.query(query=prompt, vector_db_id=vector_db_id)
rich.print(db_response)

In [18]:
# Build the context from the chunks returned by the direct vector DB query,
# one chunk per line.
# Alternative: use the packaged RAG tool result instead, i.e.
#   prompt_context = tool_response.content
prompt_context = "\n".join(chunk.content for chunk in db_response.chunks)

In [26]:
# Augmented user prompt: the original question followed by the retrieved
# context.
extended_prompt = f"""
Please answer the given query using the context below.

QUERY:
{prompt}

CONTEXT:
{prompt_context}
"""

# Conversation sent to the model: a fixed system message plus the augmented
# user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": extended_prompt},
]
rich.print(messages)

In [27]:
# Run the chat completion with the LLM listed by the server and print the
# full response object.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=messages,
)
rich.print(response)

In [34]:
# Left as an extra exercise for the reader: an agent configured with the
# built-in RAG knowledge-search tool, pointed at the vector DB identified by
# `vector_db_id`.
from llama_stack_client import Agent, AgentEventLogger

knowledge_search_tool = {
    "name": "builtin::rag/knowledge_search",
    "args": {"vector_db_ids": [vector_db_id]},
}

rag_agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions="You are a helpful assistant",
    tools=[knowledge_search_tool],
)