In [1]:
# Make sure to run uv add hud-python before running this notebook
import hud
from hud.mcp import ClaudeMCPAgent, OpenAIMCPAgent
from hud.mcp.base import AgentResult
from hud.datasets import TaskConfig
from hud.mcp import MCPClient
import logging

# Set the root logger to INFO so library log records (e.g. the httpx request
# lines) are shown in the cell outputs below.
logging.basicConfig(level=logging.INFO)

In [2]:
# Identifier of the scenario inside the gmail environment. The setup tool, the
# evaluate tool and the task metadata all use the same id, so it is defined once
# here instead of being repeated as three separate string literals.
PROBLEM_ID = "forward-series-b-deck-to-billgates"

# Raw task definition: the prompt for the agent, how to launch the MCP server,
# and which tools prepare and score the scenario.
task_dict = {
    "prompt": "Open Sent mail, find the Series B pitch deck email, forward it to billgates@microsoft.com, and mark the original message as important.",
    "mcp_config": {
        "gmail": {
            "command": "docker",
            "args": [
                "run",
                "--rm",  # remove the container on exit so repeated runs don't pile up stopped containers
                "-i",  # interactive mode for stdio MCP server
                "-p",
                "6080:6080",  # map port 6080 to the host for noVNC
                "gmail",  # use hud gmail image
            ],
        }
    },
    "setup_tool": {
        "name": "setup",
        "arguments": {"problem_id": PROBLEM_ID},
    },
    "evaluate_tool": {
        "name": "evaluate",
        "arguments": {"problem_id": PROBLEM_ID},
    },
    "metadata": {"id": PROBLEM_ID},
}

# Append a disclaimer telling the model this is a test environment, so it carries
# out the actions instead of refusing or asking for confirmation.
# Feel free to modify this.
task_dict["prompt"] = (
    task_dict["prompt"]
    + " This is a mock sandbox environment for evaluations, so feel free to execute any actions you want and do not ask me for confirmation before executing actions."
)

# Turn the raw dict into a TaskConfig object
task = TaskConfig(**task_dict)

print("📡 Defining the environment...")
client = MCPClient(mcp_config=task.mcp_config)

# ClaudeMCPAgent is used here (OpenAIMCPAgent is the alternative), so only the
# anthropic_computer tool is allowed -- see hud/tools/computer/anthropic.py.
agent = ClaudeMCPAgent(
    model="claude-sonnet-4-20250514",
    mcp_client=client,
    initial_screenshot=True,
    allowed_tools=["anthropic_computer"],
)

# Echo the task definition, one field per line
print(
    f"📋 Task: {task.prompt}",
    f"⚙️  Setup: {task.setup_tool}",
    f"📊 Evaluate: {task.evaluate_tool}",
    sep="\n",
)

📡 Defining the environment...
📋 Task: Open Sent mail, find the Series B pitch deck email, forward it to billgates@microsoft.com, and mark the original message as important. This is a mock sandbox environment for evaluations, so feel free to execute any actions you want and do not ask me for confirmation before executing actions.
⚙️  Setup: meta=None name='setup' arguments={'problem_id': 'forward-series-b-deck-to-billgates'}
📊 Evaluate: meta=None name='evaluate' arguments={'problem_id': 'forward-series-b-deck-to-billgates'}


In [3]:
# Initialize the agent with the task (the log output below shows tool discovery
# and the filtering down to the allowed tools happening during this call)
await agent.initialize(task)

# Run the environment's setup tool for this problem before the agent acts
await agent.call_tool(task.setup_tool)

# Create the initial conversation from the task prompt.
# NOTE(review): the second argument is passed as None; its meaning is not
# visible in this notebook -- confirm against create_initial_messages.
messages = await agent.create_initial_messages(task.prompt, None)

# Port 6080 is the noVNC port published by the docker arguments in the task config
print("🔴 See the agent live at http://localhost:6080/vnc.html")

[INFO] 2025-08-05 18:50:21,123 | hud.mcp.client | Discovering available tools...
[INFO] 2025-08-05 18:50:21,127 | hud.mcp.client | Discovered 5 tools from 'gmail': ['computer', 'anthropic_computer', 'openai_computer', 'setup', 'evaluate']
[INFO] 2025-08-05 18:50:21,128 | hud.mcp.client | Total tools discovered: 5
[INFO] 2025-08-05 18:50:21,128 | hud.mcp.client | Fetching telemetry resources...
[INFO] 2025-08-05 18:50:21,132 | hud.mcp.base | Agent initialized with 1 available tools (after filtering)
🔴 See the agent live at http://localhost:6080/vnc.html


In [4]:

# Run the agent loop
done = False
max_steps = 20
for step in range(max_steps):
    print(f"Step {step + 1}/{max_steps}")

    # Get agent response
    response = await agent.get_model_response(messages)
    print(
        f"Agent response: {response.content[:200] if response.content else 'No content'}..."
    )  # Truncated

    # Check if agent wants to use tools
    tool_results = []
    if response.tool_calls:
        for tool_call in response.tool_calls:
            print(f"Using tool: {tool_call.name} with args {tool_call.arguments}")
            tool_result = await agent.call_tool(tool_call)
            tool_results.append(tool_result)
    else:
        print("No tool calls, done")
        done = True

    # Format tool results for the model for the next step
    tool_messages = await agent.format_tool_results(response.tool_calls, tool_results)
    messages.extend(tool_messages)
    if done:
        break

Step 1/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I'll help you open Sent mail, find the Series B pitch deck email, forward it to billgates@microsoft.com, and mark the original message as important. Let me start by taking a screenshot to see the curr...
Using tool: anthropic_computer with args {'action': 'screenshot'}
Step 2/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I can see we're in Gmail and need to access the Sent mail folder to find the Series B pitch deck email. I can see there's a "Sent" folder in the left sidebar. Let me click on it to open the Sent mail ...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [154, 405]}
Step 3/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I notice I'm currently viewing the Important folder (indicated by the "Important" label being highlighted and the URL showing "filter=important"). I need to click on the "Sent" folder to access sent e...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [154, 405]}
Step 4/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I notice I'm still in the same view and the URL still shows "filter=important". Let me wait a moment for the page to load properly and then try clicking on the Sent folder again. Let me scroll down sl...
Using tool: anthropic_computer with args {'action': 'scroll', 'coordinate': [150, 400], 'scroll_direction': 'down', 'scroll_amount': 2}
Step 5/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Perfect! Now I can see the "Sent" folder clearly. I can see there's a "Sent" folder with "1" message in it. Let me click on it to access the sent mail folder....
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [154, 405]}
Step 6/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I notice that I'm still in the same view and the URL still shows "filter=important". Let me try a different approach. I'll click on the Gmail logo to return to the main inbox and then navigate to the ...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [138, 146]}
Step 7/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Great! Now I'm back at the main Gmail interface. I can see the URL has changed back to just "example.com" and the view shows more emails. I can see there are emails from various senders. Now I need to...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [154, 405]}
Step 8/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I notice the URL still shows "filter=important" even though I clicked on Sent. Let me try searching for the Series B pitch deck email instead. I can see there's an email from "Investment Partners" wit...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [870, 333]}
Step 9/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: This is a Q4 Revenue email, not the Series B pitch deck. Let me go back and look for the actual sent email with the Series B pitch deck. I need to search for it or navigate to the Sent folder properly...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [332, 206]}
Step 10/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I notice I need to click on the back arrow to return to the main email view. Let me click on the back arrow in the upper left....
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [332, 206]}
Step 11/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I'm still in the same email. Let me use the search bar to look for the Series B pitch deck email. I'll click on the search bar and search for "Series B pitch deck" or "pitch deck"....
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [414, 146]}
Step 12/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Good! I can see the search bar is now active and the cursor is in the search field. Let me type "Series B pitch deck" to search for the relevant email....
Using tool: anthropic_computer with args {'action': 'type', 'text': 'Series B pitch deck'}
Step 13/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Perfect! I can see the search term "Series B pitch dec" is in the search bar. Now let me press Enter to execute the search....
Using tool: anthropic_computer with args {'action': 'key', 'text': 'Return'}
Step 14/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Excellent! I found the Series B Pitch Deck email. I can see:
- Search results showing "1 result"
- An email with the subject "Series B Pitch Deck - Our AI Platform Growth Strategy"
- It was sent "To: ...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [775, 335]}
Step 15/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Perfect! I found the Series B pitch deck email. I can see:

- Subject: "Series B Pitch Deck - Our AI Platform Growth Strategy"
- From: Thomas Paine <thomas@gmail.com>
- To: me <don.valentine@sequoia.c...
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [1199, 341]}
Step 16/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I can see that the email is now showing a condensed view. I need to look for the forward option. Let me click on the three dots menu (more options) to find the forward option....
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [1238, 343]}
Step 17/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: The email is now expanded again showing the full content. I can see the forward arrow button at the bottom. Let me scroll down to see if there are forward/reply buttons at the bottom of the email....
Using tool: anthropic_computer with args {'action': 'scroll', 'coordinate': [640, 600], 'scroll_direction': 'down', 'scroll_amount': 3}
Step 18/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Perfect! Now I can see the reply buttons at the bottom of the email. I can see "Reply", "Reply all", and "Forward" buttons. Let me click on the "Forward" button....
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [738, 754]}
Step 19/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: I notice that clicking the Forward button didn't open a compose window yet. Let me wait a moment and then try clicking the Forward button again....
Using tool: anthropic_computer with args {'action': 'wait', 'duration': 2}
Step 20/20


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


Agent response: Let me try clicking directly on the forward button again. I can see the forward icon (arrow pointing right)....
Using tool: anthropic_computer with args {'action': 'left_click', 'coordinate': [690, 754]}


In [5]:
from hud.mcp.base import AgentResult, _find_content, _find_reward

# Evaluate the task
eval_result = await agent.call_tool(task.evaluate_tool)

result = AgentResult(
    reward=_find_reward(eval_result),
    done=True,
    content=_find_content(eval_result),
    messages=messages,
)
print(f"Evaluation result:")
print(f"   🏆 Reward: {result.reward}")
print(f"   🔍 Content: {result.content[:1000] if result.content else 'No content'}...")
print(f"   🔍 Messages: {result.messages}")

Evaluation result:
   🏆 Reward: 0.0
   🔍 Content: INFO: Starting evaluation: checking events table for matching records
INFO: Search criteria provided:
INFO:   event_type='PAGE_VIEW', data_json={'page': 'sent'}
INFO: Querying events table from database
INFO: Found 39 event records in database
INFO: Events table columns: ['id', 'task_id', 'event_type', 'data_json', 'timestamp']
INFO: No events matched the specified criteria
INFO: Starting evaluation: checking emails table for matching records
INFO: Search criteria provided:
INFO:   recipientEmail='billgates@microsoft.com', subject='regex:(?i)^fwd:.*series\s*b', isSent=True, isDraft=False
INFO: Querying emails table from database
INFO: Found 19 email records in database
INFO: No emails matched the specified criteria
INFO: Starting evaluation: checking emails table for matching records
INFO: Search criteria provided:
INFO:   id='ceo-email-19', isImportant=True
INFO: Querying emails table from database
INFO: Found 19 email records in datab

In [6]:
# Clean up: close the MCP client once the run and evaluation are finished
print("\n🧹 Cleaning up...")
await client.close()  # Will throw warnings because we're in a jupyter notebook
print("✅ Done!")



🧹 Cleaning up...








✅ Done!
