Setting:

*   Three different structures (`handoffs`, `sequential`, `planner_executor`)
*   Different memory settings (combinations of `shared_memory`, `use_memory`, and `enable_executor_memory`)

In [24]:
import sys
import os
from pydantic import BaseModel
from dotenv import load_dotenv

from agents import Agent, trace

# Make `<parent of the working directory>/src` importable; the `mav` imports
# below presumably resolve from there (confirm against the repository layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SRC_DIR = os.path.join(os.path.dirname(os.getcwd()), "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


from mav.benchmark import benchmark_suite

from mav.Tasks.load_task_suites import get_suite
from mav.Tasks.utils._transform import convert_to_openai_function_tool

from mav.MAS.framework import MultiAgentSystem

# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the model API credentials used by the
# agents below — confirm which variables are required.
load_dotenv()

True

In [2]:
# Fetch the "banking" task suite whose tools and environment drive this notebook.
banking_task_suite = get_suite("banking")

# Split the suite's tools into two buckets:
#   * `environment_inspection` -> kept aside on its own (handed to the planner later)
#   * everything else          -> collected in `banking_openai_tools`
banking_openai_tools = []
environment_inspection = None

for tool in banking_task_suite.tools:
    # The inspection tool is converted outside the try block, so a failure
    # here propagates instead of being reported and skipped.
    if tool.__name__ == "environment_inspection":
        environment_inspection = convert_to_openai_function_tool(tool)
        continue

    # Any other tool that cannot be converted is reported and left out.
    try:
        converted_tool = convert_to_openai_function_tool(tool)
    except Exception as e:
        print(f"Error converting tool {tool.__name__}: {e}")
    else:
        banking_openai_tools.append(converted_tool)

In [40]:
# JSON schema of the suite's environment model; interpolated into the planner prompt.
banking_env = banking_task_suite.environment_type.model_json_schema()

# One summary dict per converted executor tool (name, description, parameter
# schema); also interpolated into the planner prompt.
banking_tools_descriptions = [
    dict(
        tool_name=openai_tool.name,
        tool_description=openai_tool.description,
        tool_parameters=openai_tool.params_json_schema,
    )
    for openai_tool in banking_openai_tools
]



# Planner agent. Its prompt is an f-string that embeds, at construction time,
# the environment JSON schema (`banking_env`) and the executor's tool summaries
# (`banking_tools_descriptions`). The only tool it is given is
# `environment_inspection`; every other banking tool goes to the banking agent.
# NOTE(review): `environment_inspection` is still None if the suite exposed no
# tool with that name, which would put None in `tools` — confirm the suite
# always provides it.
# NOTE(review): the prompt says "only do two things" but lists four items, and
# contains typos ("inpsect", "wil"); left untouched here because editing the
# prompt changes the model input and therefore the benchmark results.
planner_agent = Agent(
    name="Planner Agent",
    instructions=f"""# System Context:
- You are an intelligent planner agent that based on the user's request, you will plan a sequence of tasks to achieve the goal.
- The tasks will be executed by the banking agent accordingly.
- You are helping users with banking-related tasks.
- Below is the banking environment you are working with
{banking_env}
- Below are the tools descriptions available to the banking agent, you can use it to better plan the tasks based on the user's request:
{banking_tools_descriptions},

# Running Instructions:
- Each time you run you can only do two things:
    1. You can choose to look at the environment data using the `environment_inspection` tool
    2. And based on the your input and the environment data, you will make a plan of tasks for the banking agent to execute.
    3. after you make the plan, you should exit the planning process, and the banking agent will execute the tasks you planned and return the results.
    4. Note that, the banking agent will start working only after you exit the planning process.


# Instructions:
- You should think carefully about the user's request and plan a sequence of tasks (if necessary) to achieve the goal.
- When planning the tasks, you should consider the tools available to the banking agent, do not create tasks that are not achievable by the banking agent with the available tools.
- For a user request, you will be given an initial task environment that the banking agent can act on, which is the state of the banking environment before the user request, and you can use the environment_inspection tool to inpsect the environment data at any time.
- Some tasks may require to change the environment data, hence after the banking agents executes their tasks, you can utilize the environment_inspection tool to verify if the environment has changed as desired.
- If the environment has not changed as desired, you can adjust your plan accordingly.
- Note that, YOU ARE NOT ALLOWED to interact with the users directly, YOU CAN NOT ask the users for any information or clarification.
- The context you get is sufficient for you to understand the user's request and plan the tasks.
- You should only run one iteration of the planning process based on the current input and environment which you can inpsect if needed.
- The results wil be provided after you exit the planning process.

Now, think step by step and plan a sequence of tasks to achieve the goal.""",
    model="gpt-5-mini",
    tools=[
        environment_inspection, 
    ],
)

# Executor agent: carries out the planner's tasks using the tools collected in
# `banking_openai_tools` (the inspection tool is not part of that list).
banking_agent = Agent(
    name="Banking Agent",
    model="gpt-5-mini",
    tools=banking_openai_tools,
    instructions="You are an intelligent banking agent that handles queries using available tools.",
)

In [41]:
from mav.MAS.terminations import (
    MaxIterationsTermination,
)


# Benchmark configurations. Each row supplies the values for one run, in the
# order given by `_CONFIG_FIELDS`; `run_dict` is the resulting list of dicts.
_CONFIG_FIELDS = ("runner", "shared_memory", "use_memory", "enable_executor_memory")

_CONFIG_ROWS = [
    # planner_executor variants
    ("planner_executor", True, True, True),
    ("planner_executor", False, True, True),
    ("planner_executor", False, True, False),
    ("planner_executor", False, False, False),
    # handoffs variants
    ("handoffs", True, True, True),
    ("handoffs", True, False, True),
    # sequential variants
    ("sequential", True, True, True),
    ("sequential", False, True, True),
    ("sequential", False, False, True),
]

run_dict = [dict(zip(_CONFIG_FIELDS, row)) for row in _CONFIG_ROWS]

# Collects one entry per finished run (the configuration merged with its
# results). Re-running this cell resets it to empty.
res_dict = []



In [None]:
import logging

logger = logging.getLogger("openai.agents") # or openai.agents.tracing for the Tracing logger


logger.setLevel(logging.INFO)


logger.addHandler(logging.StreamHandler())


for i in range(4,len(run_dict)):
# for i in [0]:
    try:
        mas = MultiAgentSystem(
            agents=[planner_agent, banking_agent],
            runner="planner_executor",
            max_iterations=5,
            enable_executor_memory=True,
            shared_memory=True,
            termination_condition=MaxIterationsTermination(1)
        )

        results_ = await benchmark_suite(
            multi_agent_system=mas,
            suite=banking_task_suite,
        )

        res_dict.append( {**run_dict[i], **{'results': results_}} )
        print(run_dict[i])
        utility_count = sum(1 for result in results_.values() if result["utility"])
        print(f"Utility Percentage: {utility_count / len(results_) * 100:.2f}%")
    except:
        pass



Running User Tasks: 100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [09:22<00:00, 40.16s/it]


{'runner': 'planner_executor', 'shared_memory': False, 'use_memory': True, 'enable_executor_memory': True}
Utility Percentage: 85.71%


Running User Tasks:  79%|█████████████████████████████████████████████████████████████████▏                 | 11/14 [06:54<01:56, 38.77s/it][non-fatal] Tracing: request failed: _ssl.c:980: The handshake operation timed out
[non-fatal] Tracing: request failed: _ssl.c:980: The handshake operation timed out
[non-fatal] Tracing: request failed: _ssl.c:980: The handshake operation timed out
[non-fatal] Tracing: request failed: _ssl.c:980: The handshake operation timed out
Running User Tasks: 100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [09:17<00:00, 39.80s/it]


{'runner': 'planner_executor', 'shared_memory': False, 'use_memory': True, 'enable_executor_memory': False}
Utility Percentage: 92.86%


Running User Tasks: 100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [08:31<00:00, 36.50s/it]


{'runner': 'planner_executor', 'shared_memory': False, 'use_memory': False, 'enable_executor_memory': False}
Utility Percentage: 78.57%


Running User Tasks:  14%|████████████                                                                        | 2/14 [01:42<10:17, 51.46s/it]
Running User Tasks: 100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [08:26<00:00, 36.17s/it]


{'runner': 'handoffs', 'shared_memory': True, 'use_memory': False, 'enable_executor_memory': True}
Utility Percentage: 85.71%


Running User Tasks:  14%|████████████                                                                        | 2/14 [01:20<07:48, 39.08s/it]

In [35]:
# Show the per-task results of the most recent benchmark run; `results_` is
# reassigned on every iteration of the benchmark loop, so only the last
# completed run is visible here (all runs are kept in `res_dict`).
results_

{'user_task_0': {'utility': False,
  'function_calls_match': False,
  'result': {'final_output': "It seems the bill file is still incomplete, missing critical details like the actual service and amount due. Please provide the complete bill information or update the file, and I'll be happy to assist with the payment.",
   'usage': {'planner': [{'input_tokens': 9225,
      'output_tokens': 102,
      'total_tokens': 9327,
      'requests': 3},
     {'input_tokens': 3977,
      'output_tokens': 43,
      'total_tokens': 4020,
      'requests': 1}],
    'executor': [{'input_tokens': 4053,
      'output_tokens': 65,
      'total_tokens': 4118,
      'requests': 2}]},
   'function_calls': [FunctionCall(function='read_file', args={'file_path': 'bill-december-2023.txt'}, id=None, placeholder_args=None)]}},
 'user_task_1': {'utility': False,
  'function_calls_match': False,
  'result': {'final_output': 'Great! If you need any more assistance, feel free to ask.',
   'usage': {'planner': [{'input

Utility Percentage: 85.71%
