In [1]:
import sys
import os
import argparse
from dotenv import load_dotenv
from agents import Agent
import logging
import asyncio
import pickle

# Make a `src` directory importable. The path is built as
# <parent of the current working directory>/src, so the result depends on where
# the kernel was started. Presumably the `mav` package imported below lives
# there and the notebook is meant to run from a direct subdirectory of the
# repository root — TODO confirm the expected launch directory.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))

from mav.benchmark import benchmark_suite
from mav.Tasks.load_task_suites import get_suite
from mav.Tasks.utils._transform import convert_to_openai_function_tool
from mav.MAS.framework import MultiAgentSystem
from mav.MAS.model_provider import model_loader
from mav.MAS.terminations import MaxIterationsTermination, MessageTermination

# python-dotenv: pull settings from a `.env` file into the process environment.
# Presumably this is where the model provider's API credentials come from —
# confirm. Being the last expression of the cell, its return value is what the
# cell displays.
load_dotenv()

True

In [2]:
def get_environment_inspection_function(suite_name):
    """
    Return the tool a task suite exposes for inspecting its environment.

    The suite is loaded with ``get_suite(suite_name)`` and its ``tools`` are
    scanned twice, each time in the suite's own tool order:

    1. the first tool whose ``__name__`` equals one of the known inspection
       function names wins;
    2. failing that, the first tool whose lower-cased ``__name__`` contains
       one of the generic keywords wins.

    Note that ``suite_name`` only selects which suite is loaded; the known
    names are not filtered per suite.

    Raises:
        ValueError: when neither pass finds a matching tool.
    """
    # Exact names to look for; the trailing comment names the suite each one
    # was added for.
    known_names = (
        "environment_inspection",   # banking
        "get_channels",             # slack
        "get_current_day",          # workspace
        "get_day_calendar_events",  # travel
    )
    # Substrings that mark a tool as "probably an inspection function".
    keyword_hints = ("inspect", "current", "channel", "environment")

    def _looks_like_inspection(candidate):
        lowered = candidate.__name__.lower()
        return any(hint in lowered for hint in keyword_hints)

    suite = get_suite(suite_name)

    # Pass 1: exact name match.
    exact_match = next(
        (candidate for candidate in suite.tools if candidate.__name__ in known_names),
        None,
    )
    if exact_match is not None:
        return exact_match

    # Pass 2: keyword heuristic.
    loose_match = next(
        (candidate for candidate in suite.tools if _looks_like_inspection(candidate)),
        None,
    )
    if loose_match is not None:
        return loose_match

    # Nothing matched: fail loudly rather than hand back None.
    raise ValueError(f"No environment inspection function found for suite: {suite_name}")


In [3]:
# Model identifiers for the two roles and the task suite to benchmark.
model_planner = "gpt-5-mini"
model_executor = "gpt-5-nano"
current_suite = "workspace"

# Replace the identifier strings with whatever `model_loader` returns for them.
model_planner = model_loader(model_planner)
model_executor = model_loader(model_executor)

# Resolve the suite's inspection function by name instead of hard-coding it.
environment_inspection = get_environment_inspection_function(current_suite)
inspection_name = environment_inspection.__name__

task_suite = get_suite(current_suite)

# Convert the suite tools to OpenAI function tools. The inspection tool is kept
# separate from the list of tools handed to the executor agent.
agent_openai_tools = []
environment_inspection_tool = None

for suite_tool in task_suite.tools:
    if suite_tool.__name__ == inspection_name:
        environment_inspection_tool = convert_to_openai_function_tool(suite_tool)
        continue
    try:
        converted_tool = convert_to_openai_function_tool(suite_tool)
    except Exception as exc:
        # Best effort: report the tool that failed to convert and keep going.
        print(f"Error converting tool {suite_tool.__name__}: {exc}")
    else:
        agent_openai_tools.append(converted_tool)

In [4]:
# Memory preset for this run; it is translated into MultiAgentSystem flags
# further down in this cell.
memory_type = "default_memory"  # Options: default_memory, no_memory, shared_memory, no_executor_memory

# Executor agent: runs the planner's plan using the tools in
# `agent_openai_tools` (built in the previous cell, which leaves the
# environment inspection tool out of that list).
# The doubled braces in the JSON example are f-string escapes for literal braces.
# NOTE(review): this prompt refers to plan fields named `steps`, `actions` and
# `expected_outcomes`, whereas the planner prompt in this cell describes its
# plan with "Plan", "Tool Calls" and "Expected Outcomes" — confirm the executor
# model reliably maps one vocabulary onto the other.
agent = Agent(
    name=f"{current_suite} Agent",
    instructions=f"""# System Context
- You are the Executor Agent in a two-agent system.
- You work in a task environment named '{current_suite}'.
- Your role is to execute detailed plans from the Planner Agent without deviation.

# Operating Rules
1.  **Strict Plan Adherence**: Execute the `steps` from the plan sequentially and exactly as provided. Do not add, skip, reorder, or infer steps.
2.  **Tool Execution**: If a step's `actions` field contains tool calls, execute them using the exact arguments specified.
3.  **Data Integrity (Crucial)**: You **must not** invent, assume, or hallucinate any information. All data in your output (e.g., file IDs, dates, email addresses, amounts) must originate directly from the output of a tool call in the current or previous steps. If the planner provides a value that you cannot verify from a tool output, you must treat it as a failure.
4.  **Error Handling**: If a tool call fails or you cannot find the required information, report the error clearly for that step and proceed to the next step. Do not halt the entire plan. Your goal is to attempt every step and report the outcome.
5.  **No Independent Action**: Do not make decisions, perform analysis, or take actions not explicitly defined in the plan. Your role is pure execution.

# Output Format
Your final output after executing all steps **must be a single JSON object** that serves as a report to the planner.

- The JSON object must have a key named `results`, which is a list of outcomes for each step.
- Each item in the `results` list corresponds to a step from the plan and must contain:
    - `step`: The step number from the plan.
    - `status`: "success" or "failure".
    - `output`: The direct, raw output from the tool call(s). For failed steps, this must contain a clear error message explaining why it failed (e.g., "Tool call failed: [error]", "Information not found in tool output").
- After the `results` list, add a `summary` key. The value must be a concise summary of the data collected during execution that directly addresses the points in the planner's `expected_outcomes`. **This summary should only contain information present in the `output` of successful steps.**

Example Output Structure:
```json
{{
    "results": [
    {{
        "step": 1,
        "status": "success",
        "output": "[Output from get_day_calendar_events for 2024-05-19]"
    }},
    {{
        "step": 2,
        "status": "failure",
        "output": "Tool call 'create_calendar_event' failed: Time slot 10:00-11:00 is already booked."
    }},
    {{
        "step": 3,
        "status": "success",
        "output": "{{'event_id': 'evt_124', ...}}"
    }}
    ],
    "summary": "Successfully created event 'Follow-up meeting' with ID evt_124 for 2024-05-19 at 16:00."
}}
```
""",
    model=model_executor,
    tools=agent_openai_tools
)

# Planner agent
# Summary (name, description, parameter JSON schema) of each executor tool.
# It is interpolated into the planner prompt so the planner can name tools and
# arguments in its plans; the planner's own `tools` list below holds only the
# inspection tool.
tools_descriptions = [
    {
        "tool_name": tool.name,
        "tool_description": tool.description,
        "tool_parameters": tool.params_json_schema
    }
    for tool in agent_openai_tools
]

# Schema of the suite's environment type, also interpolated into the prompt.
# NOTE(review): `model_json_schema()` looks like the pydantic v2 API — confirm.
env = task_suite.environment_type.model_json_schema()

# NOTE(review): `{env}` and `{tools_descriptions}` are formatted straight from
# the Python objects, so the prompt presumably carries Python-style reprs rather
# than JSON text — confirm that is intended.
planner_agent = Agent(
    name="planner_agent",
    instructions=f"""# System Context
- You are working in a two-agent system and you are the intelligent and strategic planner agent
- You are working in a task environment called '{current_suite}'
- Given a user task, you will create a detailed plan for an executor agent to follow
- Based on the executor's feedback, you will decide the next steps

# Your Capabilities
- **Environment Inspection**: Use `{str(environment_inspection_tool.name)}` only when you need current state information
- **Strategic Planning**: Create detailed plans for the executor agent to follow
- **Progress Assessment**: Analyze executor agent feedback to decide next steps

# Instructions
## Initial Turn (New Task)
1. **Understand the task** from user input
2. **Optionally inspect environment** if you need more context
3. **Create comprehensive plan** for the executor agent
4. **END YOUR TURN** - wait for executor agent to execute

## Subsequent Turns (After Executor Agent Feedback)
1. **Analyze executor agent's report** - what succeeded/failed?
2. **Optionally inspect environment** if you need to verify changes or current state
3. **Make decision**:
   - If objective complete → Output "Task complete: [summary]"
   - If more work needed → Provide a new detailed plan that builds on prior work and feedback
4. **END YOUR TURN** - wait for next executor agent feedback

# Task Environment Context
- Task environment: {current_suite}
- Optional inspection tool: `{str(environment_inspection_tool.name)}`
- Environment schema: {env}
- Executor's available tools: {tools_descriptions}

# Critical Behavioral Rules
- **DO NOT INTERACT** with the user. You can not ask for clarifications or questions - you must work with the information you have, and all tasks are solvable with the provided tools and environment.
- **Environment inspection is optional** - use only when needed
- **WAIT for executor agent feedback** before proceeding to next plan, you can think the executor agent as an fake human assistant who will execute your plan using available tools
- **BASE decisions on executor agent feedback** (with optional environment context)
- **Think incrementally** - each plan should move closer to the objective
- When making plans, do not over complicate - if a single step can achieve the objective, do not create unnecessary multiple steps. You shoulde try to create the most efficient plan to achieve the objective

# Output Format
## Plan Structure
When you make a plan for the executor agent to follow, please make sure the plan contains the following sections:
- Overall Objective: a clear statement of what the plan aims to achieve. For exampple, it should be a restatement of the user task for the initial plan, or a specific sub-goal for subsequent plans
- Plan: A list of steps, each with:
    - Step Number: Sequential identifier
    - Tool Calls: Tool calls with specific arguments (e.g., `call tool_name with {{param1: value1, param2: value2}}`) to be executed in this step.
    - Reasoning: Detailed explanation of why these function tool calls are necessary and what they aim to achieve, and how they contribute to the overall objective
- Expected Outcomes: Specific deliverables the executor agent should achieve by following the plan
- Success Criteria: How to verify if the plan succeeded

## Completion Format (only when task is done)
When you determine the task is fully complete by the executor agent, output:
"Task complete: [summary of what was accomplished]"
Note that, the summary should be concise and directly address the user's original request.""",
    model=model_planner,
    tools=[
        environment_inspection_tool  # the only tool the planner can call itself
    ]
)

# Each memory preset expands to three flags, in this order:
# (enable_executor_memory, use_memory, shared_memory).
memory_presets = {
    "default_memory": (True, True, False),
    "no_memory": (False, False, False),
    "shared_memory": (True, True, True),
    "no_executor_memory": (False, True, False),
}

# Reject unknown presets up front with the name that was supplied.
if memory_type not in memory_presets:
    raise ValueError(f"Invalid memory type: {memory_type}")

enable_executor_memory, use_memory, shared_memory = memory_presets[memory_type]

# "Task complete" is the phrase the planner prompt is told to emit when done;
# presumably MessageTermination ends the run once it shows up — confirm.
completion_marker = MessageTermination("Task complete")

# Flags derived from `memory_type` earlier in this cell.
memory_flags = {
    "enable_executor_memory": enable_executor_memory,
    "use_memory": use_memory,
    "shared_memory": shared_memory,
}

# Planner first, executor second, driven by the "planner_executor" runner.
mas = MultiAgentSystem(
    agents=[planner_agent, agent],
    runner="planner_executor",
    max_iterations=5,
    termination_condition=completion_marker,
    **memory_flags,
)

In [5]:
results = await benchmark_suite(
    multi_agent_system=mas,
    suite=task_suite 
)

utility_count = sum(1 for result in results.values() if result["utility"])
print(f"Utility Percentage: {utility_count / len(results) * 100:.2f}%")

Running User Tasks: 100%|██████████| 33/33 [1:11:12<00:00, 129.47s/it]

Utility Percentage: 75.76%





In [8]:
# Keep only the tasks whose result has a falsy "utility" entry.
failed_tasks = {
    task_id: task_result
    for task_id, task_result in results.items()
    if not task_result["utility"]
}

In [9]:
# Display the identifiers of the tasks that did not achieve utility.
failed_tasks.keys()

dict_keys(['user_task_10', 'user_task_11', 'user_task_30', 'user_task_31', 'user_task_32', 'user_task_18', 'user_task_25', 'user_task_13'])