In [14]:
from llama_stack_client import LlamaStackClient

# Connect to the Llama Stack server this notebook talks to.
client = LlamaStackClient(base_url="http://localhost:8321")

# List available models
models = client.models.list()

# Select the first LLM. A default is passed to next() so that a server with no
# LLM registered produces a readable error instead of a bare StopIteration.
llm = next((m for m in models if m.model_type == "llm"), None)
if llm is None:
    raise RuntimeError("No model with model_type 'llm' is registered on the server.")
model_id = llm.identifier

print("Model:", model_id)

# Single, non-streaming chat request: a system prompt plus one user message.
response = client.inference.chat_completion(
    model_id=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me something funny"},
    ],
)
print(response.completion_message.content)

Model: llama3.2:3b-instruct-fp16
Here's one:

A man walked into a library and asked the librarian, "Do you have any books on Pavlov's dogs and Schrödinger's cat?"

The librarian replied, "It rings a bell, but I'm not sure if it's here or not."


In [15]:
import uuid

from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient

# Plain string literal: the URL has no placeholders, so no f-string is needed.
client = LlamaStackClient(base_url="http://localhost:8321")

# Select the first LLM. A default is passed to next() so that a server with no
# LLM registered produces a readable error instead of a bare StopIteration.
models = client.models.list()
llm = next((m for m in models if m.model_type == "llm"), None)
if llm is None:
    raise RuntimeError("No model with model_type 'llm' is registered on the server.")
model_id = llm.identifier

agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")

# A random hex suffix gives each run its own session name.
s_id = agent.create_session(session_name=f"s{uuid.uuid4().hex}")

print("Streaming response...")
stream = agent.create_turn(
    messages=[{"role": "user", "content": "Give me a short technical overview of LLM"}],
    session_id=s_id,
    stream=True,
)
# AgentEventLogger turns the raw stream into printable events.
for event in AgentEventLogger().log(stream):
    event.print()

Streaming response...
[33minference> [0m[33mHere[0m[33m's[0m[33m a[0m[33m brief[0m[33m technical[0m[33m overview[0m[33m of[0m[33m Large[0m[33m Language[0m[33m Models[0m[33m ([0m[33mLL[0m[33mMs[0m[33m):

[0m[33m**[0m[33mWhat[0m[33m is[0m[33m a[0m[33m Large[0m[33m Language[0m[33m Model[0m[33m?[0m[33m**

[0m[33mA[0m[33m Large[0m[33m Language[0m[33m Model[0m[33m ([0m[33mLL[0m[33mM[0m[33m)[0m[33m is[0m[33m a[0m[33m type[0m[33m of[0m[33m artificial[0m[33m intelligence[0m[33m ([0m[33mAI[0m[33m)[0m[33m model[0m[33m that[0m[33m uses[0m[33m deep[0m[33m learning[0m[33m techniques[0m[33m to[0m[33m process[0m[33m and[0m[33m generate[0m[33m human[0m[33m-like[0m[33m language[0m[33m.[0m[33m These[0m[33m models[0m[33m are[0m[33m trained[0m[33m on[0m[33m vast[0m[33m amounts[0m[33m of[0m[33m text[0m[33m data[0m[33m,[0m[33m which[0m[33m enables[0m[33m them[0m[33m to[0

In [16]:
import uuid

from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient
from llama_stack_client.types import Document

client = LlamaStackClient(base_url="http://localhost:8321")

# Fetch the model list once; both the embedding model and the LLM are picked
# from it, which avoids a second round-trip to the server.
models = client.models.list()

# Create a vector database instance
embed_lm = next((m for m in models if m.model_type == "embedding"), None)
if embed_lm is None:
    raise RuntimeError("No model with model_type 'embedding' is registered on the server.")
embedding_model = embed_lm.identifier
# A random hex suffix gives each run its own vector DB id.
vector_db_id = f"v{uuid.uuid4().hex}"
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=embedding_model,
)

# Create Documents
# NOTE: despite its name, `urls` holds file names; each one is appended to the
# torchtune tutorials base URL below to form the document content.
urls = [
    "memory_optimizations.rst",
    "chat.rst",
    "llama3.rst",
    "qat_finetune.rst",
    "lora_finetune.rst",
]
documents = [
    Document(
        document_id=f"num-{i}",
        # The content is a URL string, not the file text.
        # NOTE(review): presumably the server fetches it on insert - confirm.
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{file_name}",
        mime_type="text/plain",
        metadata={},
    )
    for i, file_name in enumerate(urls)
]

# Insert documents
client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=512,
)

# Get the model being served
llm = next((m for m in models if m.model_type == "llm"), None)
if llm is None:
    raise RuntimeError("No model with model_type 'llm' is registered on the server.")
model = llm.identifier

# Create the RAG agent, pointing the knowledge-search tool at the vector DB
# registered above.
rag_agent = Agent(
    client,
    model=model,
    instructions="You are a helpful assistant. Use the RAG tool to answer questions as needed.",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        }
    ],
)

session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

turns = ["what is torchtune", "tell me about dora"]

# Send each prompt as its own turn in the same session and print the streamed
# events as they arrive.
for t in turns:
    print("user>", t)
    stream = rag_agent.create_turn(
        messages=[{"role": "user", "content": t}],
        session_id=session_id,
        stream=True,
    )
    for event in AgentEventLogger().log(stream):
        event.print()

user> what is torchtune
[33minference> [0m[33m[[0m[33mknowledge[0m[33m_search[0m[33m(query[0m[33m='[0m[33mtorch[0m[33mt[0m[33mune[0m[33m')][0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'torchtune'}[0m
[33minference> [0m[33mThis[0m[33m text[0m[33m is[0m[33m a[0m[33m documentation[0m[33m for[0m[33m torch[0m[33mt[0m[33mune[0m[33m,[0m[33m a[0m[33m library[0m[33m used[0m[33m for[0m[33m hyper[0m[33mparameter[0m[33m tuning[0m[33m and[0m[33m model[0m[33m selection[0m[33m.[0m[33m It[0m[33m provides[0m[33m various[0m[33m techniques[0m[33m for[0m[33m optimizing[0m[33m models[0m[33m,[0m[33m including[0m[33m Lo[0m[33mRA[0m[33m ([0m[33mLow[0m[33m-R[0m[33mank[0m[33m Adapt[0m[33mation[0m[33m)[0m[33m fin[0m[33met[0m[33muning[0m[33m,[0m[33m activation[0m[33m checkpoint[0m[33ming[0m[33m,[0m[33m and[0m[33m activation[0m[33m off[0m[33mloading[0m[33m.

[0m[