In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Placeholder of Gen AI Agent Eval SDK (Colab 1 for UI)

**WARNING:**

This colab contains features under development. Please don't use it for production work.

**Goals:**

This Colab notebook demonstrates how to use the Gen AI Eval SDK to evaluate your agents. It covers one primary use case:

1. Run Agent + Create Evaluation Run: The SDK first runs the agent and then creates an evaluation run to perform the evaluation.

# Set up

## Authenticate

In [None]:
# Authenticate the current user through the google.colab auth helper so that
# later cells can call Google Cloud APIs.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import auth
auth.authenticate_user()

## Install Vertex AI SDK for Gen AI Evaluation Service

In [None]:
# Install (or upgrade) google-cloud-aiplatform together with its "evaluation"
# extra, quietly (-q) and reinstalling even when already present.
# NOTE(review): the version is unpinned, and a runtime restart may be needed
# after a forced reinstall — confirm.
%pip install --upgrade --force-reinstall -q google-cloud-aiplatform[evaluation]

## Initialize Variables

In [None]:
import vertexai
import pandas as pd
from vertexai.preview import reasoning_engines
from vertexai import agent_engines
import os

# Google Cloud project to run the evaluation in. The Colab form value wins;
# when it is blank (or still the placeholder) fall back to the environment.
PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Default to "" instead of wrapping the lookup in str(): str(None) would
    # produce the literal project id "None" when the variable is missing.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
LOCATION = ""  # @param {type: "string", placeholder: "us-central1", isTemplate: true}
# GOOGLE_CLOUD_REGION, when set, still takes precedence over the form value.
# "us-central1" (the form's placeholder) is the last-resort default so that
# LOCATION is never left empty.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", LOCATION) or "us-central1"
# e.g. gs://my-bucket/my-folder
GCS_DEST = ""  # @param {type: "string", placeholder: "[your-gcs-bucket]", isTemplate: true}
# e.g. projects/{project_id}/locations/us-central1/reasoningEngines/{reasoning_engine_id}
AGENT = ""  # @param {type: "string", placeholder: "[your-agent]", isTemplate: true}


from vertexai import Client, types
from google.genai import types as genai_types

# Use the same project/location pair for the global SDK initialisation and
# for the Gen AI client created right after it.
_sdk_settings = dict(project=PROJECT_ID, location=LOCATION)
vertexai.init(**_sdk_settings)
client = Client(**_sdk_settings)

## Define Dataset and Agent Info

In [None]:
# Session configuration attached to each dataset row: the user id and the
# initial session state passed along with the prompt.
session_inputs1 = types.SessionInput(
    user_id="user_123",
    state={"my_key": "my_value"},
)
# Give every row its own independent copy. model_copy(deep=True) replaces the
# deprecated .copy(), which copied shallowly and therefore left all rows
# sharing the same `state` dict.
session_inputs2 = session_inputs1.model_copy(deep=True)
session_inputs3 = session_inputs1.model_copy(deep=True)
session_inputs4 = session_inputs1.model_copy(deep=True)

# Column values for the evaluation dataset; position i of each list is row i.
eval_prompts = [
    "Write a four-line poem about a lonely robot, where every line must be a question and the word 'and' cannot be used.",
    "Write a Python function to find the nth Fibonacci number using recursion with memoization, but without using any imports.",
    "Check if 20 is prime or not",
    "You are an agent, Take 2 steps \n 1. Check if 20 is prime or not \n 2. Check if 10 is prime or not",
]
# Placeholder responses: "response 1" .. "response 4", one per prompt.
eval_responses = [f"response {row_number}" for row_number in range(1, len(eval_prompts) + 1)]
eval_session_inputs = [
    session_inputs1,
    session_inputs2,
    session_inputs3,
    session_inputs4,
]

# One row per prompt, with columns in the order prompt, response, session_inputs.
agent_eval_dataset = pd.DataFrame(
    {
        "prompt": eval_prompts,
        "response": eval_responses,
        "session_inputs": eval_session_inputs,
    }
)

# Schema of the single `nums` argument accepted by the tool declared below.
nums_parameter = {
    "type": "string",
    "description": "List of numbers to be verified.",
}

# Declaration of the one function the agent exposes as a tool.
# NOTE(review): the prompts ask about primes, so "check_chime" may be a typo
# for "check_prime" — confirm against the deployed agent before renaming.
check_chime_declaration = {
    "name": "check_chime",
    "description": "Check chime.",
    "parameters": {
        "type": "object",
        "properties": {"nums": nums_parameter},
        "required": ["nums"],
    },
}

# Agent metadata handed to the evaluation calls: the agent name (AGENT), its
# developer instruction and description, and its tool declarations.
agent_info = {
    "name": AGENT,
    "instruction": "example agent developer instruction",
    "description": "example agent description",
    "tool_declarations": [
        {"function_declarations": [check_chime_declaration]},
    ],
}

# Run Gen AI Agent Evaluation

## Option 1: Run Gen AI Evaluation with Evaluation Management Service

In [None]:
# Run Gen AI Agent Evaluation using the Evaluation Management Service.
# This will persist your dataset and evaluation results.
evaluation_run = client.evals.create_evaluation_run(
    name="test",
    dataset=types.EvaluationDataset(
        eval_dataset_df=agent_eval_dataset,
        # agent_info is a plain dict, so the name must be read by key;
        # attribute access (agent_info.name) raises AttributeError.
        candidate_name=agent_info["name"],
    ),
    agent_info=agent_info,
    metrics=[
        types.RubricMetric.SAFETY,
    ],
    # Destination for the run's output (GCS_DEST, e.g. gs://my-bucket/my-folder).
    dest=GCS_DEST,
)
# Display the newly created run.
evaluation_run.show()

### Poll Evaluation Run for Completion

In [None]:
import time

# Run states after which polling stops.
completed_states = {"SUCCEEDED", "FAILED", "CANCELLED"}
# Seconds to wait after each refresh of the run.
poll_interval_seconds = 5

# Display the run, re-fetch it and wait, until it reports a completed state.
while True:
    if evaluation_run.state in completed_states:
        break
    evaluation_run.show()
    evaluation_run = client.evals.get_evaluation_run(name=evaluation_run.name)
    time.sleep(poll_interval_seconds)

# Fetch the run once more, this time requesting its evaluation items as well.
evaluation_run = client.evals.get_evaluation_run(
    name=evaluation_run.name,
    include_evaluation_items=True,
)

evaluation_run.show()

## [Optional] Option 2: Run Gen AI Evaluation Locally

In [None]:
# Evaluate the same dataset through client.evals.evaluate with the SAFETY
# rubric metric, without creating an evaluation run.
eval_result = client.evals.evaluate(
    dataset=types.EvaluationDataset(
        eval_dataset_df=agent_eval_dataset,
        # agent_info is a plain dict, so the name must be read by key;
        # attribute access (agent_info.name) raises AttributeError.
        candidate_name=agent_info["name"],
    ),
    agent_info=agent_info,
    metrics=[
        types.RubricMetric.SAFETY,
    ],
)

# Display the evaluation result.
eval_result.show()