In [1]:
# Prerequisite: install the SDK first by running `uv add hud-python` in your project.
import hud
from hud.mcp import ClaudeMCPAgent, OpenAIMCPAgent
from hud.datasets import TaskConfig
from hud.mcp import MCPClient
import json
import logging

# Configure the root logger at INFO level so that library log records
# (for example the httpx request lines) are shown in the cell output while the agent runs.
logging.basicConfig(level=logging.INFO)


In [2]:
# --- Task definition ---------------------------------------------------------
# One scenario id is shared by the setup tool, the evaluate tool and the task
# metadata, so it is named once here.
problem_id = "forward-series-b-deck-to-billgates"

# How the "gmail" MCP server is launched: a docker container spoken to over stdio.
gmail_server = {
    "command": "docker",
    "args": [
        "run",
        "-i",  # interactive mode, required for a stdio MCP server
        "-p",
        "6080:6080",  # publish port 6080 on the host for noVNC
        "gmail",  # the hud gmail image
    ],
}

# Appended to the prompt to state that this is a testing environment
# (the Anthropic model sometimes declines or asks for confirmation otherwise).
# Feel free to modify this.
sandbox_disclaimer = "\n\nThis is a mock sandbox environment for evaluations, so feel free to execute any actions you want and do not ask me for confirmation before executing actions."

# The task, written out by hand.
task_dict = {
    "prompt": "Open Sent mail, find the Series B pitch deck email, forward it to billgates@microsoft.com, and mark the original message as important.",
    "mcp_config": {"gmail": gmail_server},
    "setup_tool": {
        "name": "setup",
        "arguments": {"problem_id": problem_id},
    },
    "evaluate_tool": {
        "name": "evaluate",
        "arguments": {"problem_id": problem_id},
    },
    "metadata": {"id": problem_id},
}

# Alternative: load the first task from tasks.json instead.
# task_dict = json.load(open("tasks.json"))[0]

task_dict["prompt"] += sandbox_disclaimer

# Build the TaskConfig from the plain dict.
task = TaskConfig(**task_dict)

# --- Environment and agent ---------------------------------------------------
print("📡 Defining the environment...")
client = MCPClient(mcp_config=task.mcp_config)

# ClaudeMCPAgent can be swapped for OpenAIMCPAgent.
# Only the anthropic_computer tool is allowed because the agent is Claude-based;
# see hud/tools/computer/anthropic.py.
agent = ClaudeMCPAgent(
    mcp_client=client,
    model="claude-sonnet-4-20250514",
    allowed_tools=["anthropic_computer"],
    initial_screenshot=True,
)

# --- Summary of what will be run ---------------------------------------------
print(f"📋 Task: {task.prompt}")
print(f"⚙️  Setup: {task.setup_tool}")
print(f"📊 Evaluate: {task.evaluate_tool}")

📡 Defining the environment...
📋 Task: Open Sent mail, find the Series B pitch deck email, forward it to billgates@microsoft.com, and mark the original message as important.

This is a mock sandbox environment for evaluations, so feel free to execute any actions you want and do not ask me for confirmation before executing actions.
⚙️  Setup: meta=None name='setup' arguments={'problem_id': 'forward-series-b-deck-to-billgates'}
📊 Evaluate: meta=None name='evaluate' arguments={'problem_id': 'forward-series-b-deck-to-billgates'}


In [3]:
from hud.mcp.base import AgentResult

# Run the task
print("🚀 Running the task...")
print("🔴 See the agent live at http://localhost:6080/vnc.html")
eval_result: AgentResult = await agent.run(task, max_steps=10)
print(f"🎉 Task Result: {eval_result}")

# Show formatted results
reward = eval_result.reward
print(f"   🏆 Reward: {reward}")
print(f"   🔍 Content: {eval_result.content[:1000] if eval_result.content else 'No content'}...")

🚀 Running the task...
🔴 See the agent live at http://localhost:6080/vnc.html
[INFO] 2025-08-05 20:34:46,773 | hud.mcp.client | Discovering available tools...
[INFO] 2025-08-05 20:34:46,777 | hud.mcp.client | Discovered 5 tools from 'gmail': ['computer', 'anthropic_computer', 'openai_computer', 'setup', 'evaluate']
[INFO] 2025-08-05 20:34:46,777 | hud.mcp.client | Total tools discovered: 5
[INFO] 2025-08-05 20:34:46,777 | hud.mcp.client | Fetching telemetry resources...
[INFO] 2025-08-05 20:34:46,782 | hud.mcp.base | Agent initialized with 1 available tools (after filtering)
[INFO] 2025-08-05 20:35:07,483 | hud.mcp.base | step 1/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:09,394 | hud.mcp.base | Model response - Content: I'll help you open Sent mail, find the Series B pitch deck email, forward it to billgates@microsoft.com, and mark the original message as important. Let me start by taking a screenshot to see the current state of the screen.
[INFO] 2025-08-05 20:35:09,394 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:09,394 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:09,448 | hud.mcp.base | step 2/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:12,750 | hud.mcp.base | Model response - Content: I can see Gmail is open and showing the inbox. I need to navigate to the "Sent" folder to find the Series B pitch deck email. I can see the "Sent" folder in the left sidebar with "1" indicating there's one item in it.
[INFO] 2025-08-05 20:35:12,750 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:12,750 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:13,413 | hud.mcp.base | step 3/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:16,078 | hud.mcp.base | Model response - Content: I notice I clicked on "Important" folder instead of "Sent". I can see the "Sent" folder is right below it. Let me click on the "Sent" folder now.
[INFO] 2025-08-05 20:35:16,078 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:16,078 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:16,784 | hud.mcp.base | step 4/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:19,733 | hud.mcp.base | Model response - Content: I'm still in the Important folder. I need to click specifically on the "Sent" folder which is below. Let me look more carefully at the coordinates.
[INFO] 2025-08-05 20:35:19,734 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:19,734 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:20,406 | hud.mcp.base | step 5/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:23,742 | hud.mcp.base | Model response - Content: Let me try clicking on the Gmail logo to go back to the main view, and then navigate to the Sent folder properly.
[INFO] 2025-08-05 20:35:23,742 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:23,742 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:24,412 | hud.mcp.base | step 6/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:27,495 | hud.mcp.base | Model response - Content: Good, now I'm back to the main Gmail view. I can see the "Sent" folder in the left sidebar with "1" next to it, indicating there's one sent email. Let me click on the "Sent" folder now.
[INFO] 2025-08-05 20:35:27,495 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:27,495 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:28,155 | hud.mcp.base | step 7/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:31,372 | hud.mcp.base | Model response - Content: I'm still seeing the Important folder. Let me try a different approach. I notice the URL shows "/filter=important", so I need to navigate directly to the Sent folder. Let me try using the text "Sent" directly:
[INFO] 2025-08-05 20:35:31,372 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:31,372 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:32,038 | hud.mcp.base | step 8/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:34,495 | hud.mcp.base | Model response - Content: Let me try clicking on the arrow next to "Sent" to see if it opens the sent emails:
[INFO] 2025-08-05 20:35:34,496 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:34,496 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:35,175 | hud.mcp.base | step 9/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:38,354 | hud.mcp.base | Model response - Content: I see that the arrow next to the "Sent" has collapsed (the arrow is now pointing right instead of down). Let me try clicking directly on the word "Sent":
[INFO] 2025-08-05 20:35:38,355 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:38,355 | hud.mcp.base | Model response - Done: False
[INFO] 2025-08-05 20:35:39,034 | hud.mcp.base | step 10/10


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"


[INFO] 2025-08-05 20:35:42,236 | hud.mcp.base | Model response - Content: Let me try using the search function to look for "Series B pitch deck" instead:
[INFO] 2025-08-05 20:35:42,237 | hud.mcp.base | Model response - Tool calls: ['anthropic_computer']
[INFO] 2025-08-05 20:35:42,237 | hud.mcp.base | Model response - Done: False
🎉 Task Result: done=True reward=0.0 info={} content="INFO: Starting evaluation: checking events table for matching records\nINFO: Search criteria provided:\nINFO:   event_type='PAGE_VIEW', data_json={'page': 'sent'}\nINFO: Querying events table from database\nINFO: Found 25 event records in database\nINFO: Events table columns: ['id', 'task_id', 'event_type', 'data_json', 'timestamp']\nINFO: No events matched the specified criteria\nINFO: Starting evaluation: checking emails table for matching records\nINFO: Search criteria provided:\nINFO:   recipientEmail='billgates@microsoft.com', subject='regex:(?i)^fwd:.*series\\s*b', isSent=True, isDraft=False\nINFO: Quer

In [4]:
# Clean up
print("\n🧹 Cleaning up...")
await client.close()
print("✅ Done!")


🧹 Cleaning up...








✅ Done!
