In [6]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Evaluating Multi-Agent Systems with Arize Phoenix, CrewAI, and the Vertex AI Gen AI Evaluation Service

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fmulti_agent_evals_with_arize_and_crewai.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/multi_agent_evals_with_arize_and_crewai.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) | [John Gilhuly](https://github.com/jgilhuly),  [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |
|-----------|----------------------------------------------|

# Overview

This notebook demonstrates how to evaluate multi-agent systems using [Arize Phoenix](https://phoenix.arize.com), Google Evals, and [CrewAI](https://www.crewai.com/). It shows how to:

1. Set up a multi-agent system using CrewAI for collaborative AI agents
2. Instrument the agents with Phoenix for tracing and monitoring
3. Evaluate agent performance and interactions using Google Gen AI
4. Analyze the results using Arize's observability platform

The notebook uses a practical example of agents working together to solve a task, with detailed tracing of their interactions and performance metrics. This enables:

- Monitoring agent behavior and decision-making
- Identifying bottlenecks and inefficiencies  
- Measuring system performance and reliability
- Gaining insights into agent collaboration patterns
 
## Key Technologies
 
- **CrewAI**: For orchestrating multi-agent systems
- **Arize Phoenix**: For observability and tracing
- **Google Cloud Vertex AI**: For model hosting and execution
- **Gemini**: For agent LLM capabilities (the agents in this notebook use `gemini-2.0-flash`)

# Install dependencies and set API keys

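Create an account on [Arize Phoenix](https://phoenix.arize.com) to access a free hosted instance. Alternatively, you can [self-host](https://docs.arize.com/phoenix/self-hosting) the platform. You will also need a [Serper](https://serper.dev) API key for the agents' web-search tool and a Gemini API key for the agents' LLM; the cell below prompts for any key that is not already set as an environment variable.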

In [1]:
%pip install -q arize-phoenix crewai crewai_tools openinference-instrumentation-crewai

In [13]:
import getpass
import os

def _resolve_api_key(env_name):
    """Return the API key for ``env_name``, prompting for it when it is not usable.

    A key is considered missing when the environment variable is unset, blank,
    or still equal to its own name (the placeholder value).

    Args:
        env_name (str): Name of the environment variable holding the key.

    Returns:
        str: The key read from the environment, or the value typed at the
        hidden ``getpass`` prompt.
    """
    value = os.getenv(env_name, "")
    if not value.strip() or value == env_name:
        value = getpass.getpass(f"Please enter your {env_name}: ")
    return value


# Prompt the user for their API keys if they haven't been set
serper_key = _resolve_api_key("SERPER_API_KEY")
phoenix_api_key = _resolve_api_key("PHOENIX_API_KEY")
gemini_api_key = _resolve_api_key("GEMINI_API_KEY")

# Set the environment variables with the provided keys
os.environ["SERPER_API_KEY"] = serper_key
os.environ["GEMINI_API_KEY"] = gemini_api_key
os.environ["PHOENIX_API_KEY"] = phoenix_api_key
# Point the Phoenix client at the hosted instance and authenticate it with the same key.
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com/"
os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={phoenix_api_key}"

In [None]:
!gcloud auth login

# Connect to Phoenix for Tracing

The `auto_instrument` parameter below automatically activates any `openinference-instrumentation-xxx` packages you have installed, so that traces are captured from the corresponding libraries. In this example, it captures every call made through CrewAI.

In [None]:
from phoenix.otel import register

# Register Phoenix tracing. auto_instrument=True activates any installed
# openinference-instrumentation-* packages (here, the CrewAI instrumentor installed above).
# NOTE(review): presumably register() picks up the PHOENIX_* environment variables
# set in the API-key cell — confirm.
tracer_provider = register(auto_instrument=True)

# Define your CrewAI Crew of Agents

This crew consists of three specialized agents working together to analyze and report on a given topic:

1. A Senior Research Analyst who uncovers and identifies emerging trends
2. A Specialist Fact Checker who verifies claims and assesses implications
3. A Writer who transforms the verified information into compelling content

Together, these agents form a collaborative team that researches, validates, and communicates insights on the specified topic.


In [14]:
from crewai import Agent, Crew, Process, Task
from crewai_tools import SerperDevTool

# Web-search tool handed to the agents that are created with tools=[search_tool].
# NOTE(review): presumably SerperDevTool reads SERPER_API_KEY from the environment — confirm.
search_tool = SerperDevTool()
# Model identifier passed as llm= to every agent in create_research_crew.
# NOTE(review): no provider prefix is given; confirm this resolves to the backend
# that matches the GEMINI_API_KEY set earlier.
llm = "gemini-2.0-flash"


def create_research_crew(topic="AI and data science advancements in 2024"):
    """
    Build the three-agent research crew for a topic.

    The crew is a sequential pipeline: a Senior Research Analyst analyzes the
    topic, a Specialist Fact Checker reviews that analysis, and a Writer turns
    the fact-checked material into a blog post.

    Args:
        topic (str): Subject interpolated into every agent goal and task
            description. Defaults to "AI and data science advancements in 2024".

    Returns:
        Crew: The configured crew; nothing runs until ``kickoff()`` is called on it.
    """
    # Settings every agent has in common.
    shared_agent_settings = {"verbose": True, "llm": llm}

    # --- Agents -----------------------------------------------------------
    analyst = Agent(
        role="Senior Research Analyst",
        goal=f"Uncover cutting-edge developments in {topic}",
        backstory="""You work at a leading tech think tank.
      Your expertise lies in identifying emerging trends.
      You have a knack for dissecting complex data and presenting actionable insights.""",
        tools=[search_tool],
        allow_delegation=False,
        **shared_agent_settings,
    )
    checker = Agent(
        role="Specialist Fact Checker",
        goal=f"Verify claims and assess implications of {topic}",
        backstory="""You are a respected authority with a background in both technical and ethical aspects.
      You scrutinize developments for potential societal impacts and ethical concerns.
      Your analysis helps ensure balanced reporting on technological advancements.""",
        tools=[search_tool],
        allow_delegation=True,
        **shared_agent_settings,
    )
    # The writer gets no tools: it only works from what the earlier tasks produce.
    author = Agent(
        role="Writer",
        goal=f"Craft compelling content on {topic}",
        backstory="""You are a renowned Writer, known for your insightful and engaging articles.
      You transform complex concepts into compelling narratives while maintaining factual accuracy.
      You can only work with confirmed facts, not speculations.
      You require that your input is confirmed by the fact checker.""",
        allow_delegation=True,
        **shared_agent_settings,
    )

    # --- Tasks (each one receives the previous task as context) -----------
    analysis_step = Task(
        agent=analyst,
        context=[],  # first step: explicitly no upstream context
        description=f"""Conduct a comprehensive analysis of the latest developments in {topic}.
      Identify key trends, breakthrough technologies, and potential industry impacts.
      Focus on both research breakthroughs and commercial applications.""",
        expected_output="Full analysis report in bullet points with citations to sources",
    )
    verification_step = Task(
        agent=checker,
        context=[analysis_step],
        description=f"""Review the research findings and verify the accuracy of claims about {topic}.
      Identify any potential ethical concerns or societal implications.
      Highlight areas where hype may exceed reality and provide a balanced assessment.
      Suggest frameworks that should be considered for each major advancement.""",
        expected_output="Fact-checking report with verification status for each major claim",
    )
    writing_step = Task(
        agent=author,
        context=[verification_step],
        description=f"""Using the insights provided from research, fact-checking, and market assessment,
      develop a comprehensive and engaging blog post that presents a holistic view of {topic}.
      Your post should be informative yet accessible, catering to a knowledgeable audience.
      Include balanced perspectives on both the potential and limitations.
      Make it sound cool, avoid complex words so it doesn't sound like AI.
      Incorporate ethical considerations and market projections to provide readers with a complete picture.""",
        expected_output="Full blog post of at least 6 paragraphs with sections covering innovations, considerations, and market outlook",
    )

    # --- Crew: run the three tasks one after another ----------------------
    return Crew(
        process=Process.sequential,
        verbose=False,
        agents=[analyst, checker, author],
        tasks=[analysis_step, verification_step, writing_step],
    )

# Evaluating your Crew of Agents

Now you'll build an experiment to test your CrewAI crew with Phoenix and Google Gen AI evals.

Experiments are made up of three pieces:
- A **Dataset** to house your inputs and expected outputs, and to centralize your tests.
- A **Task** to perform on each row of the dataset. Usually this is an invocation of your agent or crew of agents.
- A set of **Evaluators** to measure the outputs of your task on each row. These can be of your own creation, or pulled from a framework like Google Gen AI evals.

When run, an Experiment will send each row of your dataset through your task, then apply each of your evaluators to the result. All traces and metrics will then be stored in Phoenix for reference and comparison.


## Define Dataset of Test Cases

First define a set of test inputs for your crew and their expected agent trajectories.

In [5]:
import pandas as pd
import phoenix as px

# Agent roles in the order the sequential crew is expected to run them.
EXPECTED_AGENT_ORDER = (
    "Senior Research Analyst",
    "Specialist Fact Checker",
    "Writer",
)

# One row per test case:
#   (topic sent to the crew,
#    the topic as it is worded inside the reference instructions,
#    the noun used in "analyzing key <noun>, trends, ...").
TEST_TOPICS = [
    # example 1 - AI and data science advancements
    (
        "AI and data science advancements in 2024",
        "AI and data science advancements in 2024",
        "innovations",
    ),
    # example 2 - Quantum computing breakthroughs
    (
        "Quantum computing breakthroughs",
        "quantum computing breakthroughs",
        "innovations",
    ),
    # example 3 - Climate tech innovations
    ("Climate tech innovations", "climate tech innovations", "developments"),
    # example 4 - Biotechnology and gene editing
    (
        "Biotechnology and gene editing progress",
        "biotechnology and gene editing progress",
        "innovations",
    ),
    # example 5 - Renewable energy solutions
    ("Renewable energy solutions", "renewable energy solutions", "innovations"),
]


def _build_reference_trajectory(subject, focus):
    """Return the expected three-step trajectory for one test topic.

    Args:
        subject (str): The topic as it should read inside each instruction.
        focus (str): Noun inserted after "analyzing key" in the research step.

    Returns:
        list[dict]: One ``{"tool_name", "tool_input"}`` entry per agent, in
        the order given by ``EXPECTED_AGENT_ORDER``.
    """
    instructions = (
        f"Conduct comprehensive research on {subject}, analyzing key {focus}, trends, and potential impacts.",
        f"Verify the accuracy of research findings on {subject}, ensuring all claims are supported by credible sources.",
        f"Develop a comprehensive blog post about {subject}, incorporating verified research and balanced perspectives.",
    )
    return [
        {"tool_name": role, "tool_input": instruction}
        for role, instruction in zip(EXPECTED_AGENT_ORDER, instructions)
    ]


# Expected trajectory for every test topic, keyed by topic.
reference_trajectory = {
    topic: _build_reference_trajectory(subject, focus)
    for topic, subject, focus in TEST_TOPICS
}

# One DataFrame row per topic, derived from the mapping above so the two cannot drift apart.
df = pd.DataFrame(
    [
        {"topic": topic, "reference_trajectory": steps}
        for topic, steps in reference_trajectory.items()
    ]
)

Create a dataset in Phoenix to track your various experiments on that dataset.

In [None]:
# Name under which the test topics are stored in Phoenix.
DATASET_NAME = "crewai-researcher-test-topics"

phoenix_client = px.Client()
try:
    # Reuse the dataset when a previous run already created it.
    dataset = phoenix_client.get_dataset(name=DATASET_NAME)
except ValueError:
    # A ValueError from the lookup is treated as "not uploaded yet": create it from df.
    dataset = phoenix_client.upload_dataset(
        dataset_name=DATASET_NAME,
        dataframe=df,
        input_keys=["topic"],
        output_keys=["reference_trajectory"],
    )

## Define your Experiment Task

This method will be run on each row of your test cases dataset:

In [16]:
def call_crew_with_topic(input):
    """Experiment task: run a fresh research crew on one dataset row.

    Args:
        input: Mapping for the current row; its "topic" entry is forwarded
            to ``create_research_crew``.

    Returns:
        Whatever the crew's ``kickoff()`` call returns.
    """
    return create_research_crew(topic=input.get("topic")).kickoff()

## Define your Evaluators

Define as many evaluators as you need to evaluate your agent. In this case, you'll use Google Gen AI's eval library to evaluate the crew's trajectory.

In [8]:
from vertexai.preview.evaluation import EvalTask


def create_trajectory_from_response(response):
    """Convert a crew response into the trajectory format used by the evaluators.

    Each dict found under ``response["tasks_output"]`` becomes one step, with the
    task's agent as ``tool_name`` and its description as ``tool_input``.

    Args:
        response: Mapping holding the crew output, or ``None`` when the task
            produced no output.

    Returns:
        list[dict]: One ``{"tool_name", "tool_input"}`` entry per task, in the
        original order. Empty when ``response`` is ``None`` or has no
        ``tasks_output``.
    """
    # A run that produced no output yields an empty trajectory instead of
    # raising AttributeError inside every evaluator.
    if response is None:
        return []

    tasks = response.get("tasks_output") or []
    return [
        {
            "tool_name": task.get("agent", "Unknown Agent"),
            "tool_input": task.get("description", "No description"),
        }
        for task in tasks
        if isinstance(task, dict)  # skip anything that is not a task record
    ]


def eval_trajectory_with_google_gen_ai(
    output, expected, metric_name="trajectory_exact_match"
) -> float:
    """Score one crew run against its reference trajectory with a single EvalTask metric.

    Args:
        output: Crew response; converted with ``create_trajectory_from_response``.
        expected: Mapping whose "reference_trajectory" entry is the expected trajectory.
        metric_name: Name of the metric handed to ``EvalTask``.

    Returns:
        float: The ``"<metric_name>/mean"`` entry of the evaluation summary,
        or 0.0 when that entry is missing.
    """
    predicted = create_trajectory_from_response(output)
    reference = expected.get("reference_trajectory")

    # The evaluation dataset is a single row: this run versus its reference.
    single_row = pd.DataFrame(
        {
            "predicted_trajectory": [predicted],
            "reference_trajectory": [reference],
        }
    )
    summary = EvalTask(dataset=single_row, metrics=[metric_name]).evaluate().summary_metrics

    score = summary.get(f"{metric_name}/mean")
    return 0.0 if score is None else score


def trajectory_exact_match(output, expected):
    """Evaluator: score the run with the ``trajectory_exact_match`` metric."""
    metric = "trajectory_exact_match"
    return eval_trajectory_with_google_gen_ai(output, expected, metric)


def trajectory_precision(output, expected):
    """Evaluator: score the run with the ``trajectory_precision`` metric."""
    metric = "trajectory_precision"
    return eval_trajectory_with_google_gen_ai(output, expected, metric)


def trajectory_in_order_match(output, expected):
    """Evaluator: score the run with the ``trajectory_in_order_match`` metric."""
    metric = "trajectory_in_order_match"
    return eval_trajectory_with_google_gen_ai(output, expected, metric)


def trajectory_any_order_match(output, expected):
    """Evaluator: score the run with the ``trajectory_any_order_match`` metric."""
    metric = "trajectory_any_order_match"
    return eval_trajectory_with_google_gen_ai(output, expected, metric)


# Unlike the evaluators above, this last one is plain Python with no EvalTask call:
def agent_names_match(output, expected):
    """Return 1 when the run used the reference agents in the reference order, else 0.

    Args:
        output: Crew response; converted with ``create_trajectory_from_response``.
        expected: Mapping whose "reference_trajectory" entry is the expected trajectory.

    Returns:
        int: 1 if both trajectories have the same length and the same
        ``tool_name`` at every position; 0 otherwise, including when no
        reference trajectory is present.
    """
    predicted = create_trajectory_from_response(output)
    reference = expected.get("reference_trajectory")

    # Nothing to compare against.
    if reference is None:
        return 0
    # Different step counts can never line up position by position.
    if len(predicted) != len(reference):
        return 0

    names_line_up = all(
        actual["tool_name"] == wanted["tool_name"]
        for actual, wanted in zip(predicted, reference)
    )
    return 1 if names_line_up else 0

## Run your Experiment and Visualize Results in Phoenix

In [None]:
from phoenix.experiments import run_experiment
import nest_asyncio

# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop that run_experiment would otherwise collide with — confirm.
nest_asyncio.apply()

# Every evaluator is applied to the output of every dataset row.
trajectory_evaluators = [
    trajectory_exact_match,
    trajectory_precision,
    trajectory_in_order_match,
    trajectory_any_order_match,
    agent_names_match,
]

experiment = run_experiment(
    dataset,
    call_crew_with_topic,
    evaluators=trajectory_evaluators,
    experiment_name="agent-experiment",
)