# Monitoring and Evaluating your Agent

## Setup Tracing

In [1]:
# Phoenix project name: passed to `register` so traces are grouped under it,
# and reused later in the notebook when fetching the recorded spans.
PROJECT_NAME = "Customer-Success"

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

# Load variables from a local .env file (if any) before reading the environment,
# so this cell also works when DLAI_LOCAL_URL is only defined there.
load_dotenv()

# DLAI_LOCAL_URL is a URL template with a `{port}` placeholder; the Phoenix
# collector used here listens on port 6006.
phoenix_url_template = os.getenv('DLAI_LOCAL_URL')
if phoenix_url_template is None:
    # Fail with an explicit message instead of an AttributeError on None.format.
    raise RuntimeError(
        "DLAI_LOCAL_URL is not set; cannot build the Phoenix collector endpoint."
    )

tracer_provider = register(
    project_name=PROJECT_NAME,
    endpoint=phoenix_url_template.format(port='6006') + "v1/traces",
)
# Instrument smolagents so agent, LLM and tool calls are exported as spans.
SmolagentsInstrumentor().instrument(tracer_provider=tracer_provider)

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: Customer-Success
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: https://s172-29-81-78p6006.lab-aws-production.deeplearning.ai/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [3]:
from dotenv import load_dotenv, find_dotenv
load_dotenv() # load variables from local .env file

from huggingface_hub import login

# Log in to the Hugging Face Hub with the value of the HF_API_KEY environment
# variable (`os` is imported in an earlier cell).
login(os.getenv('HF_API_KEY'))

In [4]:
from smolagents import HfApiModel

# Hosted Qwen coder model, served through the "together" inference provider.
model = HfApiModel(
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    provider="together",
)

# Smoke test: send one chat message and display the returned ChatMessage.
model([{"role": "user", "content": "Hello!"}])

ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Hello! How can I assist you today?', tool_calls=[], raw=ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='Hello! How can I assist you today?', tool_call_id=None, tool_calls=[]), logprobs=None, seed=13618980564928662000)], created=1745329835, id='nr7pchw-4yUbBN-9345974f3e93058b', model='Qwen/Qwen2.5-Coder-32B-Instruct', system_fingerprint=None, usage=ChatCompletionOutputUsage(completion_tokens=10, prompt_tokens=31, total_tokens=41), object='chat.completion', prompt=[]))

In [5]:
# URL of the Phoenix UI (port 6006) where the recorded traces can be browsed:
print(os.getenv('DLAI_LOCAL_URL').format(port='6006'))

https://s172-29-81-78p6006.lab-aws-production.deeplearning.ai/


## Trace an agent run

In [6]:
from smolagents import HfApiModel, CodeAgent

# Minimal agent with no extra tools, backed by the model created above.
agent = CodeAgent(tools=[], model=model)

> **Note:** the following cell will sometimes time out while sending traces to the tracing backend over the network. If this happens, run the cell again.


In [7]:
# Run the tool-less agent on a single task; because tracing was registered
# earlier, this run is recorded under the Phoenix project.
agent.run("What is the 100th Fibonacci number?")

354224848179261915075

In [8]:
# URL of the Phoenix UI (port 6006) where the trace of the run above can be inspected:
print(os.getenv('DLAI_LOCAL_URL').format(port='6006'))

https://s172-29-81-78p6006.lab-aws-production.deeplearning.ai/


## Setup ice cream production system

In [9]:
from smolagents import tool
from typing import Dict

# Unit price, in dollars, of every item on the menu.
menu_prices = {
    "crepe nutella": 1.50,
    "vanilla ice cream": 2,
    "maple pancake": 1.0,
}

# Pre-orders recorded so far, keyed by client session id.
ORDER_BOOK = dict()

@tool
def place_order(quantities: Dict[str, int], session_id: int) -> None:
    """Places a pre-order of snacks.

    Args:
        quantities: a dictionary with names as keys and quantities as values
        session_id: the id for the client session
    """
    # NOTE: the docstring above is kept short and unchanged on purpose, since
    # the @tool decorator uses it as the tool description shown to the model.
    #
    # Stores `quantities` in the module-level ORDER_BOOK under `session_id`,
    # replacing any earlier order for that session. Raises AssertionError if
    # `quantities` is not a dict or names an item that is not on the menu.
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    # all(...) is required here: a non-empty list is always truthy, so asserting
    # on the list itself would never reject an unknown item.
    assert all(key in menu_prices for key in quantities), f"All food names should be within {menu_prices.keys()}"
    # Item assignment mutates the shared dict in place; no `global` is needed.
    ORDER_BOOK[session_id] = quantities

@tool
def get_prices(quantities: Dict[str, int]) -> str:
    """Gets price for certain quantities of ice cream.

    Args:
        quantities: a dictionary with names as keys and quantities as values
    """
    # NOTE: the docstring above is kept short and unchanged on purpose, since
    # the @tool decorator uses it as the tool description shown to the model.
    #
    # Returns a message containing the full menu and the total price of the
    # requested quantities. Raises AssertionError if `quantities` is not a dict
    # or names an item that is not on the menu.
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    # all(...) is required here: a non-empty list is always truthy, so asserting
    # on the list itself would let unknown items through to a bare KeyError below.
    assert all(key in menu_prices for key in quantities), f"All food names should be within {menu_prices.keys()}"
    total_price = sum(menu_prices[key] * value for key, value in quantities.items())
    return (
        f"Given the current menu prices:\n{menu_prices}\nThe total price for your order would be: ${total_price}"
    )

In [10]:
# Agent that can price and place snack orders through the two tools defined above.
order_agent = CodeAgent(
    model=HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct", provider="together"),
    tools=[place_order, get_prices],
)

In [11]:
# Single order request; the session id is handed to the agent as an extra argument.
order_agent.run("Could I come and collect one crepe nutella?", additional_args={"session_id": 192})

'Order placed for one crepe nutella.'

### Try multiple orders

In [12]:
# Sample client requests as (request text, expected tool) pairs.
# An expected tool of None means no ordering/pricing tool should be called.
client_requests = [
    (
        "Could I come and collect one crepe nutella?",
        "place_order",
    ),
    (
        "What would be the price for 1 crêpe nutella + 2 pancakes?",
        "get_prices",
    ),
    (
        "How did you start your ice-cream business?",
        None,
    ),
    (
        "What's the weather at the Louvre right now?",
        None,
    ),
    (
        "I'm not sure if I should order. I want a vanilla ice cream. but if it's more expensive than $1, I don't want it. If it's below, I'll order it, please.",
        "place_order",
    ),
]

In [13]:
# Replay every sample request through the order agent so each run is traced.
# Only the request text is needed here; the expected tool is used later for scoring.
for prompt, _expected_tool in client_requests:
    shared_args = {"session_id": 0, "menu_prices": menu_prices}
    order_agent.run(prompt, additional_args=shared_args)

In [14]:
import phoenix as px

# Pull every span recorded for this project into a DataFrame and preview it.
phoenix_client = px.Client()
spans = phoenix_client.get_spans_dataframe(project_name=PROJECT_NAME)
spans.head(20)



Unnamed: 0_level_0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.llm.model_name,attributes.output.value,attributes.output.mime_type,attributes.llm.output_messages,attributes.llm.invocation_parameters,attributes.llm.input_messages,attributes.tool.description,attributes.tool.parameters,attributes.tool.name,attributes.smolagents
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4e2e21f105e5a175,HfApiModel.__call__,LLM,,2025-04-27 10:03:11.158077+00:00,2025-04-27 10:03:11.325836+00:00,OK,,[],4e2e21f105e5a175,ad134d54f5641c2135a16024f54a629b,...,Qwen/Qwen2.5-Coder-32B-Instruct,"{""role"": ""assistant"", ""content"": ""Hello! How c...",application/json,[{'message.content': 'Hello! How can I assist ...,{},,,,,
c76a9be6b9d7e651,HfApiModel.__call__,LLM,283b8456d5b86e6d,2025-04-27 10:03:11.418932+00:00,2025-04-27 10:03:11.440486+00:00,OK,,[],c76a9be6b9d7e651,da4551fb9f28c89e7347c12a72229854,...,Qwen/Qwen2.5-Coder-32B-Instruct,"{""role"": ""assistant"", ""content"": ""Thought: To ...",application/json,[{'message.content': 'Thought: To find the 100...,{},[{'message.content': 'You are an expert assist...,,,,
884d48aade1cecb7,FinalAnswerTool,TOOL,283b8456d5b86e6d,2025-04-27 10:03:11.487711+00:00,2025-04-27 10:03:11.487769+00:00,OK,,[],884d48aade1cecb7,da4551fb9f28c89e7347c12a72229854,...,,,,,,,Provides a final answer to the given problem.,"{'answer': {'type': 'any', 'description': 'The...",final_answer,
283b8456d5b86e6d,Step 1,CHAIN,0ecd1b2882a3d67b,2025-04-27 10:03:11.418740+00:00,2025-04-27 10:03:11.511657+00:00,OK,,[],283b8456d5b86e6d,da4551fb9f28c89e7347c12a72229854,...,,Execution logs:\nLast output from code snippet...,,,,,,,,
0ecd1b2882a3d67b,CodeAgent.run,AGENT,,2025-04-27 10:03:11.411504+00:00,2025-04-27 10:03:11.534789+00:00,OK,,[],0ecd1b2882a3d67b,da4551fb9f28c89e7347c12a72229854,...,,354224848179261915075,,,,,,,,"{'tools_names': ['final_answer'], 'max_steps':..."
c8983320a9ca117d,HfApiModel.__call__,LLM,ce4debfe5f4067f6,2025-04-27 10:03:11.588591+00:00,2025-04-27 10:03:11.603971+00:00,OK,,[],c8983320a9ca117d,70ce37126f7e3f3d081eeb31fec01f2b,...,Qwen/Qwen2.5-Coder-32B-Instruct,"{""role"": ""assistant"", ""content"": ""Thought: I n...",application/json,[{'message.content': 'Thought: I need to place...,{},[{'message.content': 'You are an expert assist...,,,,
0da1715dd4902e3c,SimpleTool,TOOL,ce4debfe5f4067f6,2025-04-27 10:03:11.629933+00:00,2025-04-27 10:03:11.630004+00:00,OK,,[],0da1715dd4902e3c,70ce37126f7e3f3d081eeb31fec01f2b,...,,,,,,,Places a pre-order of snacks.,"{'quantities': {'type': 'object', 'additionalP...",place_order,
e7a7b40806e149d7,FinalAnswerTool,TOOL,ce4debfe5f4067f6,2025-04-27 10:03:11.649454+00:00,2025-04-27 10:03:11.649504+00:00,OK,,[],e7a7b40806e149d7,70ce37126f7e3f3d081eeb31fec01f2b,...,,,,,,,Provides a final answer to the given problem.,"{'answer': {'type': 'any', 'description': 'The...",final_answer,
ce4debfe5f4067f6,Step 1,CHAIN,470b68336d203649,2025-04-27 10:03:11.588354+00:00,2025-04-27 10:03:11.671062+00:00,OK,,[],ce4debfe5f4067f6,70ce37126f7e3f3d081eeb31fec01f2b,...,,Execution logs:\nLast output from code snippet...,,,,,,,,
470b68336d203649,CodeAgent.run,AGENT,,2025-04-27 10:03:11.582242+00:00,2025-04-27 10:03:11.693177+00:00,OK,,[],470b68336d203649,70ce37126f7e3f3d081eeb31fec01f2b,...,,Order placed for one crepe nutella.,,,,,,,,"{'tools_names': ['place_order', 'get_prices', ..."


### Add processing to extract desired information

In [15]:
import pandas as pd
import json

# Agent spans: one row per agent run; the task text is read from the
# JSON-encoded span input (left as None when the input is not a string).
agent_rows = spans['span_kind'] == 'AGENT'
agents = spans.loc[agent_rows].copy()
agents['task'] = agents['attributes.input.value'].apply(
    lambda raw: json.loads(raw).get('task') if isinstance(raw, str) else None
)

# Tool spans: keep only the tool name, its input and the trace it belongs to.
tool_columns = ["attributes.tool.name", "attributes.input.value", "context.trace_id"]
tools = spans[spans['span_kind'] == 'TOOL'][tool_columns].copy()

# Attach to every agent run the tool calls that share its trace id.
# A left join keeps runs for which no tool span was recorded.
agent_columns = ["name", "start_time", "task", "context.trace_id"]
tools_per_task = pd.merge(
    agents[agent_columns],
    tools,
    how="left",
    on="context.trace_id",
)
tools_per_task.head()

Unnamed: 0,name,start_time,task,context.trace_id,attributes.tool.name,attributes.input.value
0,CodeAgent.run,2025-04-27 10:03:11.411504+00:00,What is the 100th Fibonacci number?,da4551fb9f28c89e7347c12a72229854,final_answer,"{""args"": [354224848179261915075], ""sanitize_in..."
1,CodeAgent.run,2025-04-27 10:03:11.582242+00:00,Could I come and collect one crepe nutella?,70ce37126f7e3f3d081eeb31fec01f2b,place_order,"{""args"": [], ""sanitize_inputs_outputs"": false,..."
2,CodeAgent.run,2025-04-27 10:03:11.582242+00:00,Could I come and collect one crepe nutella?,70ce37126f7e3f3d081eeb31fec01f2b,final_answer,"{""args"": [""Order placed for one crepe nutella...."
3,CodeAgent.run,2025-04-27 10:03:11.726704+00:00,Could I come and collect one crepe nutella?,353a316742f4431cd588b35f40dc0221,place_order,"{""args"": [], ""sanitize_inputs_outputs"": false,..."
4,CodeAgent.run,2025-04-27 10:03:11.726704+00:00,Could I come and collect one crepe nutella?,353a316742f4431cd588b35f40dc0221,final_answer,"{""args"": [""Order placed for one crepe nutella...."


### Now, compare tool calls with expected tool calls

In [16]:
from typing import Iterable, Optional


def score_request(expected_tool: Optional[str], tool_calls: Iterable[str]) -> bool:
    """Check whether the tools called for a request match the expectation.

    Args:
        expected_tool: name of the tool that should have been called, or
            ``None`` when nothing but ``final_answer`` should have been called.
        tool_calls: names of the tools that were called for the request. Any
            iterable (set, list, ...) is accepted; non-string entries such as
            ``None`` or NaN placeholders are ignored.

    Returns:
        ``True`` if the calls match the expectation, ``False`` otherwise.
    """
    # Normalise to a set of names so that lists, duplicates and missing-value
    # placeholders do not affect the comparison below.
    called = {name for name in tool_calls if isinstance(name, str)}
    if expected_tool is None:
        return called == {"final_answer"}
    return expected_tool in called

# Score every sample request: gather the names of all tools called for that
# task text and compare them with the expected tool.
results = []
for prompt, expected in client_requests:
    rows_for_prompt = tools_per_task["task"] == prompt
    called_tools = set(
        tools_per_task.loc[rows_for_prompt, "attributes.tool.name"].tolist()
    )
    verdict = score_request(expected, called_tools)
    results.append(
        dict(request=prompt, tool_calls_performed=called_tools, is_correct=verdict)
    )
pd.DataFrame(results)

Unnamed: 0,request,tool_calls_performed,is_correct
0,Could I come and collect one crepe nutella?,"{place_order, final_answer}",True
1,What would be the price for 1 crêpe nutella + ...,"{final_answer, get_prices}",True
2,How did you start your ice-cream business?,"{final_answer, get_prices}",False
3,What's the weather at the Louvre right now?,{final_answer},True
4,I'm not sure if I should order. I want a vanil...,{final_answer},False
