# Basic Use Cases

## Requirements

```bash
# Build the "ollama" template into a venv image named "venv" and, because of --run,
# start the server right after the build, with INFERENCE_MODEL set for that process.
INFERENCE_MODEL=llama3.2:3b-instruct-fp16 llama stack build --template ollama --image-type venv --image-name venv --run
```

* You can use an existing virtual environment of your own, but we recommend using the one that Llama Stack creates for you with the command above.

You should see output like the following:

```
INFO:     Application startup complete.
INFO:     Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
```


### (Optional) Use a Container Instead of a Local Process

Set environment variables:

```bash
# Model name and server port; both are passed to the container run command below.
export INFERENCE_MODEL="llama3.2:3b"
export LLAMA_STACK_PORT=8321
# Make sure the host directory that gets mounted at /root/.llama exists.
mkdir -p ~/.llama
```

Start the server using Docker or Podman:

```bash
# Run the llamastack/distribution-ollama image interactively, mounting ~/.llama at
# /root/.llama and passing the port, inference model, and Ollama URL as arguments.
# NOTE(review): with --network=host the -p port mapping is likely redundant — confirm.
podman run --privileged -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  --network=host \
  llamastack/distribution-ollama \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://localhost:11434

## Getting Started

Work through the use cases below, executing the Jupyter cell for each one.

### Chat Completion

In [None]:
from llama_stack_client import LlamaStackClient

# Connect to the Llama Stack server listening on localhost:8321.
client = LlamaStackClient(base_url="http://localhost:8321")

# Fetch the registered models and keep the first one whose type is "llm".
models = client.models.list()
llm = next(filter(lambda m: m.model_type == "llm", models))
model_id = llm.identifier

print("Model:", model_id)

# One system prompt followed by a single user request.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me something funny"},
]

# Run the chat completion and print the text of the returned message.
response = client.inference.chat_completion(model_id=model_id, messages=chat_messages)
print(response.completion_message.content)

### Basic Agent usage

In [None]:
from llama_stack_client import LlamaStackClient
from llama_stack_client import Agent, AgentEventLogger
import uuid

# Connect to the Llama Stack server listening on localhost:8321.
# (Plain string literal: the URL has no placeholders, so no f-string is needed.)
client = LlamaStackClient(base_url="http://localhost:8321")

# Use the first model the server reports with model_type "llm".
models = client.models.list()
llm = next(m for m in models if m.model_type == "llm")
model_id = llm.identifier

agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")

# The session name is made unique with a random UUID.
s_id = agent.create_session(session_name=f"s{uuid.uuid4().hex}")

print("Streaming response...")
stream = agent.create_turn(
    messages=[{"role": "user", "content": "Give me a short technical overview of LLM"}],
    session_id=s_id,
    stream=True,
)
# Print every entry the event logger yields for the streamed turn.
for event in AgentEventLogger().log(stream):
    event.print()

### Basic RAG usage

In [None]:
from llama_stack_client import LlamaStackClient
from llama_stack_client import Agent, AgentEventLogger
from llama_stack_client.types import Document
import uuid

# Connect to the Llama Stack server listening on localhost:8321.
client = LlamaStackClient(base_url="http://localhost:8321")

# Register a vector database backed by the first embedding model on the server.
# The database id is made unique with a random UUID.
embed_lm = next(filter(lambda m: m.model_type == "embedding", client.models.list()))
embedding_model = embed_lm.identifier
vector_db_id = f"v{uuid.uuid4().hex}"
client.vector_dbs.register(vector_db_id=vector_db_id, embedding_model=embedding_model)

# File names of the torchtune tutorial pages to index.
urls = [
    "memory_optimizations.rst",
    "chat.rst",
    "llama3.rst",
    "qat_finetune.rst",
    "lora_finetune.rst",
]


def _tutorial_document(i, url):
    """Build the Document for tutorial file *url*, numbered *i*."""
    return Document(
        document_id=f"num-{i}",
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
        mime_type="text/plain",
        metadata={},
    )


documents = [_tutorial_document(i, url) for i, url in enumerate(urls)]

# Insert the documents into the vector database in 512-token chunks.
client.tool_runtime.rag_tool.insert(
    documents=documents, vector_db_id=vector_db_id, chunk_size_in_tokens=512
)

# Pick the first model the server reports with model_type "llm".
llm = next(filter(lambda m: m.model_type == "llm", client.models.list()))
model = llm.identifier

# Agent with the knowledge-search tool pointed at the vector database created above.
knowledge_search_tool = {
    "name": "builtin::rag/knowledge_search",
    "args": {"vector_db_ids": [vector_db_id]},
}
rag_agent = Agent(
    client,
    model=model,
    instructions="You are a helpful assistant. Use the RAG tool to answer questions as needed.",
    tools=[knowledge_search_tool],
)

session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

turns = ["what is torchtune", "tell me about dora"]

# Ask each question in the same session and print the streamed events.
for prompt in turns:
    print("user>", prompt)
    user_message = {"role": "user", "content": prompt}
    stream = rag_agent.create_turn(messages=[user_message], session_id=session_id, stream=True)
    for entry in AgentEventLogger().log(stream):
        entry.print()

### Web search tool

In [None]:
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client import LlamaStackClient

# Connect to the Llama Stack server listening on localhost:8321.
# (Plain string literal: the URL has no placeholders, so no f-string is needed.)
client = LlamaStackClient(
    base_url="http://localhost:8321",
    provider_data={
        "tavily_search_api_key": "xxxx"
    },  # Set this from the client side. No need to provide it if it has already been configured on the Llama Stack server.
)

# Use the first LLM registered on the server instead of a hard-coded model id,
# so the cell works with whichever inference model the server was started with.
llm = next(m for m in client.models.list() if m.model_type == "llm")
model_id = llm.identifier

agent = Agent(
    client,
    model=model_id,
    instructions=(
        "You are a web search assistant, must use websearch tool to look up the most current and precise information available. "
    ),
    tools=["builtin::websearch"],
)

session_id = agent.create_session("websearch-session")

response = agent.create_turn(
    messages=[
        {"role": "user", "content": "How did the USA perform in the last Olympics?"}
    ],
    session_id=session_id,
)
# Print every entry the event logger yields for the turn.
for log in EventLogger().log(response):
    log.print()

### Code interpreter tool

In [None]:
# Import everything this cell uses so it also runs on its own, without relying on
# names left behind by earlier cells.
from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient

# Connect to the Llama Stack server listening on localhost:8321.
client = LlamaStackClient(base_url="http://localhost:8321")

# Use the first LLM registered on the server instead of a hard-coded model id,
# so the cell works with whichever inference model the server was started with.
llm = next(m for m in client.models.list() if m.model_type == "llm")
model_id = llm.identifier

agent = Agent(
    client,
    instructions="""
    You are a highly reliable, concise, and precise assistant.
    Always show the generated code, never generate your own code, and never anticipate results.
    """,
    model=model_id,
    tools=["builtin::code_interpreter"],
)

session_id = agent.create_session("tool_session")

response = agent.create_turn(
    messages=[{"role": "user", "content": "Can you generate code to say Hello World in C++ and then execute?"}],
    session_id=session_id,
)

# Print every entry the event logger yields for the turn.
for log in AgentEventLogger().log(response):
    log.print()