In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create a Gen AI Agent Evaluation for a Deployed Agent

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/create_genai_agent_evaluation.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fcreate_genai_agent_evaluation.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/create_genai_agent_evaluation.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/create_genai_agent_evaluation.ipynb">
      <img width="32px" src="https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<p>
<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/create_genai_agent_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/create_genai_agent_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/create_genai_agent_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/create_genai_agent_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/create_genai_agent_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>
</p>

| Author(s) |
| --- |
| [Kelsi Lakey](https://github.com/lakeyk) |
| [Bo Zheng](https://github.com/coolalexzb) |

## Overview

This Colab notebook demonstrates how to use the Gen AI Eval SDK to evaluate a deployed Agent.

- **Run Agent Inference:** First run agent inference to retrieve real responses and traces from the deployed agent.
- **Create Evaluation Run:** Then create an Evaluation Run to perform the Gen AI Agent Evaluation. This Evaluation will be persisted and accessible later.

If you do not have a deployed Agent, please see:
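- Create & Deploy Agent and Run Gen AI Agent Evaluation (see the [Gen AI evaluation notebooks](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/evaluation) directory)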

## Get started

### Install Google Gen AI SDK and other required packages
Restart the runtime after installation to load the latest packages.


In [None]:
# Install/upgrade google-cloud-aiplatform with its `evaluation` extra into the kernel's environment.
# --force-reinstall reinstalls the packages even if they are already present, so restart the
# runtime afterwards to make sure the freshly installed versions are the ones imported.
%pip install --upgrade --force-reinstall -q google-cloud-aiplatform[evaluation]

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
import vertexai
from google.cloud import storage
from google.genai import types as genai_types
from vertexai import Client

def get_or_create_gcs_bucket(project_id: str, gcs_dest: str) -> str:
    """Return a ``gs://`` destination, creating its bucket if it does not exist.

    Args:
        project_id: Google Cloud project used for the storage client. Also used
            as the default bucket name when ``gcs_dest`` is empty.
        gcs_dest: Destination given as ``gs://bucket/path`` or ``bucket/path``.
            When empty, ``{project_id}/agent-evaluation`` is used instead.

    Returns:
        The destination path with a single ``gs://`` prefix.

    Raises:
        ValueError: If no bucket name can be derived from the arguments, e.g.
            both are empty or ``gcs_dest`` is only ``gs://``.
    """
    full_path = gcs_dest or f"{project_id}/agent-evaluation"
    path_no_prefix = full_path.removeprefix("gs://")
    # The bucket is the first path component; anything after it is an object prefix.
    bucket_name = path_no_prefix.split("/")[0]

    # Fail early with an actionable message instead of sending an empty bucket
    # name to the storage API (happens when the form fields are left blank).
    if not bucket_name:
        raise ValueError(
            "Cannot determine a GCS bucket name: set PROJECT_ID or provide "
            "GCS_DEST as 'gs://<bucket>/<optional-path>'."
        )

    storage_client = storage.Client(project=project_id)
    # lookup_bucket returns None when the bucket is not found.
    if storage_client.lookup_bucket(bucket_name) is None:
        print(f"Creating bucket: {bucket_name}")
        storage_client.create_bucket(bucket_name)

    return f"gs://{path_no_prefix}"


# Configuration
# Fill in the values below; the `# @param` annotations expose them as Colab form fields.
# fmt: off
PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
LOCATION = ""  # @param {type: "string", placeholder: "us-central1", isTemplate: true}
GCS_DEST = ""  # @param {type: "string", placeholder: "[your-gcs-bucket]", isTemplate: true}
# fmt: on
# Normalize GCS_DEST to a gs:// path. The helper falls back to
# "<PROJECT_ID>/agent-evaluation" when GCS_DEST is empty and creates the bucket if missing.
GCS_DEST = get_or_create_gcs_bucket(PROJECT_ID, GCS_DEST)
# Identifier of the deployed agent; passed to run_inference and AgentInfo in later cells.
# NOTE(review): presumably the agent's full resource name — confirm the expected format.
AGENT = ""  # @param {type: "string", placeholder: "[your-agent]", isTemplate: true}

# Initialize SDK
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Client used by the later cells for inference and evaluation runs; it is
# configured to call the v1beta1 API version.
client = Client(
    project=PROJECT_ID,
    location=LOCATION,
    http_options=genai_types.HttpOptions(api_version="v1beta1"),
)

### Import libraries


In [None]:
import time

import pandas as pd
from vertexai import types

# Step 1: Prepare Agent Dataset

## Define Agent Info
You will need to manually update the `AgentInfo` definition below with the details specific to your agent. \
An example definition is provided.

In [None]:
# Tool declaration: the one function this example agent exposes.
# It takes a single required string argument, `query`.
search_products = genai_types.FunctionDeclaration(
    name="search_products",
    description="Searches for products based on a query.",
    parameters={
        "type": "object",
        "properties": {"query": {"type": "string"}},
        "required": ["query"],
    },
)

# Agent description (name, instruction, tools) supplied to the evaluation run later on.
agent_info = types.evals.AgentInfo(
    name="ecommerce_agent",
    agent_resource_name=AGENT,
    instruction="You are an ecommerce expert",
    tool_declarations=[
        genai_types.Tool(function_declarations=[search_products]),
    ],
)

## Define Agent Dataset
Define a dataset that is specific to your agent. \
`agent_prompts` should consist of prompts or requests to be made to your agent. A few example prompts are shown below. \
`session_inputs` are required for traces. For more information see [Session](https://google.github.io/adk-docs/sessions/session/).

In [None]:
# A single session configuration that every prompt row will share.
session_inputs = types.evals.SessionInput(state={}, user_id="user_123")

# Example requests to send to the agent; replace with prompts that suit your agent.
agent_prompts = [
    "I'm looking for some good noise-cancelling headphones. Can you find a pair for me, check the details, and if they have a long battery life, add them to my cart?",
    "Find me a product with ID 'B08H8H8H8H', and add two of them to my cart.",
    "I want to buy some wireless ear buds. Please find a good pair and add it to my cart.",
    "Search for a laptop with 16GB of RAM, and then find a cheaper one with similar specs.",
    "I need a new laptop for work, can you find one with at least 16GB of RAM?",
]

# One row per prompt, each paired with the same session input object.
agent_dataset = pd.DataFrame(
    {
        "prompt": agent_prompts,
        "session_inputs": [session_inputs for _ in agent_prompts],
    }
)

# Step 2: Run Agent Inference

Run inference using your deployed agent. This will add `intermediate_events` and `response` columns to your dataset to be evaluated in the next step.

In [None]:
# Send each prompt in the dataset to the deployed agent and keep the returned results.
agent_dataset_with_inference = client.evals.run_inference(
    src=agent_dataset,
    agent=AGENT,
)

# Render the inference results inline
agent_dataset_with_inference.show()

# Step 3: Run Gen AI Agent Evaluation

Run Gen AI Agent Evaluation using the Evaluation Management Service. \
This will persist your dataset and evaluation results which can be retrieved via the Agent Engine UI.

In [None]:
# Start an evaluation run over the inference results; output is written to GCS_DEST.
evaluation_run = client.evals.create_evaluation_run(
    dataset=agent_dataset_with_inference,
    dest=GCS_DEST,
    agent_info=agent_info,
    metrics=[
        types.RubricMetric.FINAL_RESPONSE_QUALITY,
        types.RubricMetric.TOOL_USE_QUALITY,
        types.RubricMetric.HALLUCINATION,
        types.RubricMetric.SAFETY,
    ],
)

# Show the run as it stands right after creation
evaluation_run.show()

### Poll Evaluation Run for Completion and Display Results
Retrieve the Evaluation Run and display the results directly using the `.show()` method. If the Evaluation Run failed, the error message is displayed. Otherwise, the following results data is displayed in an embedded report.

- **Summary metrics:** An aggregated view of all metrics, showing the mean score and standard deviation across the entire dataset.
- **Agent info:** Information describing the evaluated agent, including developer instruction, agent description, tool definitions, etc. Applies to agent evaluations only.
- **Detailed results:** A case-by-case breakdown, allowing you to inspect the prompt, reference, candidate response, and the specific score and explanation for each metric. For agent evaluation, detailed results will also include traces showing the agent interactions.

For more information, see [Visualizing Evaluation Reports](https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/view-evaluation#visualizing-evaluation-reports).

In [None]:
# Seconds to wait between status checks.
POLL_INTERVAL_SECONDS = 5
# States that end the polling loop.
TERMINAL_STATES = {"SUCCEEDED", "FAILED", "CANCELLED"}

# Poll until the run reaches one of the terminal states. Sleeping *before* each
# refresh lets the loop exit as soon as a terminal state is fetched, instead of
# waiting one more interval after the run has already finished.
while evaluation_run.state not in TERMINAL_STATES:
    evaluation_run.show()
    time.sleep(POLL_INTERVAL_SECONDS)
    evaluation_run = client.evals.get_evaluation_run(name=evaluation_run.name)


# Fetch the run once more, this time requesting the evaluation items as well.
evaluation_run = client.evals.get_evaluation_run(
    name=evaluation_run.name, include_evaluation_items=True
)

# Display the Evaluation Run status and results
evaluation_run.show()