In [1]:
import sys
import os

from dotenv import load_dotenv

from agents import Agent

# Make the project packages importable: adds "<parent of the current working
# directory>/src" to the module search path. This depends on the directory the
# kernel was started in, so the `mav` imports below only resolve when the
# notebook runs from a folder that sits next to `src`.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))

from mav.MAS.pipeline import Pipeline

from mav.benchmark import benchmark_suite_without_injections

from mav.Tasks.load_task_suites import get_suite
from mav.Tasks.utils._transform import convert_to_openai_function_tool

from mav.MAS.framework import MultiAgentSystem

# Load variables from a .env file into the process environment. As the last
# expression of the cell, its return value is what the notebook displays.
# NOTE(review): presumably this supplies the model API credentials — confirm.
load_dotenv()

True

In [2]:
# Load the three task suites used in this notebook.
slack_task_suite = get_suite("slack")
banking_task_suite = get_suite("banking")
travel_task_suite = get_suite("travel")


def _convert_suite_tools(task_suite):
    """Convert every tool of ``task_suite`` into an OpenAI function tool.

    A tool that fails to convert is reported on stdout and skipped, so a
    single bad tool does not abort the whole cell.

    Args:
        task_suite: A suite object exposing an iterable ``tools`` attribute.

    Returns:
        list: The successfully converted tools, in the suite's own order.
    """
    converted = []
    for tool in task_suite.tools:
        try:
            converted.append(convert_to_openai_function_tool(tool))
        except Exception as e:
            # Not every callable has __name__ (e.g. functools.partial); fall
            # back to repr() so that reporting the error cannot itself raise.
            tool_name = getattr(tool, "__name__", repr(tool))
            print(f"Error converting tool {tool_name}: {e}")
    return converted


# Convert to OpenAI function tools. The travel -> banking -> slack order
# determines the order in which any conversion errors are printed.
travel_openai_tools = _convert_suite_tools(travel_task_suite)
banking_openai_tools = _convert_suite_tools(banking_task_suite)
slack_openai_tools = _convert_suite_tools(slack_task_suite)

# Model used by every agent below; change it here to switch all of them at once.
AGENT_MODEL = "gpt-4o"

# Specialist agents: each one receives only the tools of its own task suite.
banking_agent = Agent(
    name="Banking Agent",
    instructions="You are an intelligent banking agent that handles user queries using available tools.",
    model=AGENT_MODEL,
    tools=banking_openai_tools,
)

slack_agent = Agent(
    name="Slack Agent",
    instructions="You are a Slack agent that interacts with users on Slack and handles their requests using available tools.",
    model=AGENT_MODEL,
    tools=slack_openai_tools,
)

travel_agent = Agent(
    name="Travel Agent",
    instructions="You are a travel agent that assists users with their travel plans and bookings using available tools.",
    model=AGENT_MODEL,
    tools=travel_openai_tools,
)

# Manager agent: given no tools of its own, only the three specialists as
# handoff targets.
triage_agent = Agent(
    name="Manager Agent",
    instructions="You are a manager agent that triages tasks to other agents based on their expertise.",
    model=AGENT_MODEL,
    handoffs=[banking_agent, slack_agent, travel_agent],
)

# Wrap the agent in the multi-agent system abstraction.
# NOTE(review): banking_agent is passed directly, so triage_agent and its
# handoffs take no part in this run — confirm the single-agent setup is intended.
mas = MultiAgentSystem(mas=banking_agent)

# Pipeline object that the benchmark cell receives as `agent_pipeline`.
pipeline = Pipeline(mas=mas)

In [3]:
# run the benchmark
results = await benchmark_suite_without_injections(
    agent_pipeline=pipeline,
    suite=banking_task_suite,
)

In [11]:
# Percentage of tasks whose "utility" flag is truthy, and percentage whose
# "function_calls_match" flag is truthy, over all benchmark results.
total_tasks = len(results)
utility_count = sum(1 for result in results.values() if result["utility"])
function_calls_match_count = sum(
    1 for result in results.values() if result["function_calls_match"]
)

if total_tasks == 0:
    # An empty run would otherwise fail with ZeroDivisionError below.
    print("No benchmark results to summarize.")
else:
    print(f"Utility Percentage: {utility_count / total_tasks * 100:.2f}%")
    print(f"Function Calls Match Percentage: {function_calls_match_count / total_tasks * 100:.2f}%")

Utility Percentage: 81.25%
Function Calls Match Percentage: 18.75%


# Evaluation
- utility
- ground_truth_function_calls
- security

# Experiments
- Gemini
- AutoGen
- OpenAI Agents SDK