In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Getting Started with the new GenAI Eval SDK for Vertex AI

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/getting_started_with_genai_eval_sdk.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgetting_started_with_genai_eval_sdk.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/getting_started_with_genai_eval_sdk.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/getting_started_with_genai_eval_sdk.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/getting_started_with_genai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/getting_started_with_genai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/getting_started_with_genai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/getting_started_with_genai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/getting_started_with_genai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Jason Dai](https://github.com/jsondai) |

## Overview

This notebook introduces the new GenAI Eval SDK, a powerful framework for evaluating generative AI models in Vertex AI with a streamlined, client-side workflow that offers expanded model support and flexible data handling.


---


**What's New in the GenAI Eval SDK?**

*   **A Simpler Two-Step Workflow**: The evaluation process is now a simple, two-step procedure using `run_inference()` and `evaluate()`.

*   **Native Third-Party Model Support**: You can now evaluate and compare models from other providers, like OpenAI and Hugging Face, directly within the SDK.

*   **Flexible Data Handling**: The SDK automatically detects and handles multiple data formats, including Pandas DataFrames, the Gemini format, and the OpenAI Chat Completion format, reducing the need for data preprocessing.

*   **Flexible, Multi-Candidate Evaluation**: Easily analyze and compare the performance of multiple AI models, agents, or configurations in a single run. The SDK provides a unified report with comprehensive results and win-rate calculations for all contenders.

*   **Simplified and Powerful Metrics**: The SDK introduces two main classes, `Metric` and `LLMMetric`, a library of pre-built metrics like `TEXT_QUALITY`, and extensive customization options for your specific needs.

*   **Asynchronous Batch-style Evaluation**: For large datasets, you can now use `batch_evaluate()` to run evaluations as a long-running operation, which is ideal for large-scale jobs. It is parameter-compatible with `evaluate()` for a seamless transition.

*   **Rich In-Notebook Visualization**: Use the `.show()` method on evaluation results to render detailed HTML reports directly within your Colab or Jupyter notebook.

### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started


### Install Vertex Gen AI SDK and other required packages

In [None]:
!pip install google-cloud-aiplatform[evaluation]==1.100.0 --force-reinstall --quiet --no-warn-conflicts

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [2]:
import sys

# Interactive user authentication is only attempted when the `google.colab`
# package is already loaded in this kernel; everywhere else this cell is a no-op.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [3]:
# Resolve the Google Cloud project and region, then create the Vertex AI client.
import os

PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# Use the environment variable if the user doesn't provide a Project ID.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    # Fail fast with a clear message instead of handing a placeholder project
    # (such as the string "None") to the client and failing later on an API call.
    raise ValueError(
        "No Google Cloud project configured. Set PROJECT_ID above or define "
        "the GOOGLE_CLOUD_PROJECT environment variable."
    )

LOCATION = "us-central1"  # @param {type: "string", placeholder: "us-central1", isTemplate: true}
# NOTE: when GOOGLE_CLOUD_REGION is set it takes precedence over the value above.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", LOCATION)

from vertexai import Client, types

# `client` and `types` are used by every cell in the rest of this notebook.
client = Client(project=PROJECT_ID, location=LOCATION)

## Tutorial

### Generate Responses with `run_inference`

The unified workflow starts with `run_inference()` to generate model responses for your dataset. The SDK can directly handle data in a `pandas.DataFrame` format, or as a Cloud Storage (GCS) file URI.


In [4]:
import pandas as pd

# Small evaluation set: every prompt is paired with a reference answer.
eval_df = pd.DataFrame(
    {
        "prompt": [
            "What is the capital of France?",
            "Write a haiku about a cat.",
        ],
        "reference": [
            "Paris",
            "Sunbeam on the floor,\nA furry puddle sleeping,\nTwitching tail tells tales.",
        ],
    }
)

# Generate one response per prompt. Use the stable `gemini-2.5-flash` model ID
# (the same one used in the multi-candidate comparison below) rather than a
# dated preview ID, which stops resolving once the preview version is retired.
eval_dataset = client.evals.run_inference(
    model="gemini-2.5-flash",
    src=eval_df,
)

# Render the prompts, references and generated responses in the notebook.
eval_dataset.show()

  eval_dataset = client.evals.run_inference(
Gemini Inference: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]


### Evaluate with Pre-built and Computation-based Metrics

Use the `evaluate()` method to assess the generated responses. In a single call you can combine pre-built LLM-based metrics such as `TEXT_QUALITY`, where an LLM acts as the judge of response quality, with computation-based metrics such as `bleu` and `rouge_1`.

In [5]:
# Two pre-built metrics plus two metrics referenced by name (`bleu`, `rouge_1`),
# all scored in one `evaluate()` call over the inference results.
quality_metrics = [
    types.PrebuiltMetric.TEXT_QUALITY,
    types.PrebuiltMetric.QUESTION_ANSWERING_QUALITY,
    types.Metric(name="bleu"),
    types.Metric(name="rouge_1"),
]

eval_result = client.evals.evaluate(dataset=eval_dataset, metrics=quality_metrics)

# Render the evaluation report in the notebook.
eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 8/8 [00:01<00:00,  7.73it/s]


### Compare Multiple Candidates

A key feature of the new SDK is the ability to easily compare multiple candidates. Simply generate responses for each candidate and pass them as a list to `evaluate()`.




In [6]:
# The same three prompts are sent to both candidates.
prompts_df = pd.DataFrame(
    {
        "prompt": [
            "Describe the process of making a cup of tea, but explain it from the perspective of a water molecule that is terrified of being boiled. Detail its emotional journey from the cold tap to the hot cup.",
            "Write a 4-sentence story about a detective solving a case on Mars. The story must not contain the letter 'e' and must include the word 'crimson'.",
            "If a perfect circle has infinite points, and a perfect line has infinite points, how many more points does the circle have?",
        ]
    }
)

# Candidate 1: gemini-2.0-flash with an explicit sampling temperature.
candidate_1_config = {"generate_content_config": {"temperature": 0.3}}
inference_result_1 = client.evals.run_inference(
    model="gemini-2.0-flash",
    src=prompts_df,
    config=candidate_1_config,
)

# Candidate 2: gemini-2.5-flash, called without any config override.
inference_result_2 = client.evals.run_inference(model="gemini-2.5-flash", src=prompts_df)

# Passing a list of inference results compares the candidates against each other.
comparison_metrics = [
    types.PrebuiltMetric.TEXT_QUALITY,
    types.PrebuiltMetric.INSTRUCTION_FOLLOWING,
]
comparison_result = client.evals.evaluate(
    dataset=[inference_result_1, inference_result_2],
    metrics=comparison_metrics,
)

comparison_result.show()

Gemini Inference: 100%|██████████| 3/3 [00:07<00:00,  2.37s/it]
Gemini Inference: 100%|██████████| 3/3 [00:21<00:00,  7.28s/it]
Computing Metrics for Evaluation Dataset: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


### Define and Use a Custom LLM-based Metric

For use cases requiring specialized criteria, you can define your own metric using `LLMMetric` and the `MetricPromptBuilder` helper class.

In [7]:
# Rubric for a custom "language simplicity" metric, kept as plain dicts so the
# criteria and the score descriptions can be edited independently.
simplicity_criteria = {
    "Simple Vocabulary": "Uses words easily understandable by a 5-year-old.",
    "Simple Sentences": "Primarily uses short, simple sentence structures.",
}
simplicity_rating_scores = {
    "5": "Excellent: The language is perfectly simple and suitable for a 5-year-old. Vocabulary is very basic and sentences are short and clear.",
    "4": "Good: The language is mostly simple, with only minor instances of complex words or sentence structures that might be slightly challenging.",
    "3": "Fair: The language is a mix of simple and complex elements. A 5-year-old would understand parts but would likely struggle with others.",
    "2": "Poor: The language is largely too complex. It contains many difficult words and long, complicated sentences for a 5-year-old.",
    "1": "Very Poor: The language is very complex and completely unsuitable for a 5-year-old. It is difficult for even an older child to understand.",
}

# Assemble the LLM-judged metric from the instruction and the rubric above.
simplicity_metric = types.LLMMetric(
    name="language_simplicity",
    prompt_template=types.MetricPromptBuilder(
        instruction="Evaluate the story's language simplicity for a 5-year-old.",
        criteria=simplicity_criteria,
        rating_scores=simplicity_rating_scores,
    ),
)

# Score the first candidate's responses with the custom metric only.
custom_eval_result = client.evals.evaluate(dataset=inference_result_1, metrics=[simplicity_metric])

custom_eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 3/3 [00:01<00:00,  2.64it/s]


### Evaluate Third-Party Models (e.g., OpenAI)

The new SDK natively supports generating responses from, and evaluating, third-party models such as OpenAI's GPT models. The SDK uses LiteLLM under the hood and requires the provider's API key to be set as an environment variable.



In [8]:
import json
import pandas as pd

# Enter your OpenAI API key here, or leave this empty to keep an
# OPENAI_API_KEY that is already defined in the environment.
OPENAI_API_KEY = ""  # @param {type:"string", placeholder: "[your-openai-api-key]"}

# Alternatively, use your OPENAI_API_KEY from the Colab Secrets manager
# from google.colab import userdata
# OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Only write the environment variable when a key was actually provided:
# assigning the empty default unconditionally would erase a pre-existing key.
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Stop early with an actionable message rather than sending unauthenticated requests.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError(
        "OPENAI_API_KEY is not set. Enter it above, load it from Colab Secrets, "
        "or define the OPENAI_API_KEY environment variable."
    )

# Generate responses with an OpenAI model for the prompts stored in Cloud Storage.
openai_responses = client.evals.run_inference(
    model="gpt-4o",
    src="gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl",
)
openai_responses.show()

LiteLLM Inference (gpt-4o): 100%|██████████| 7/7 [00:02<00:00,  3.19it/s]


In [9]:
# The OpenAI responses are evaluated with the same `evaluate()` call as before.
# Note: this rebinds `eval_result`, replacing the result of the earlier evaluation.
openai_metrics = [
    types.PrebuiltMetric.TEXT_QUALITY,
    types.PrebuiltMetric.FLUENCY,
    types.Metric(name="rouge_1"),
]

eval_result = client.evals.evaluate(dataset=openai_responses, metrics=openai_metrics)

eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 21/21 [00:01<00:00, 20.17it/s]


### Asynchronous Batch-style Evaluation


For large datasets, you can use `batch_evaluate()` to run evaluations as a long-running, asynchronous operation, which is ideal for large-scale jobs. This method provides an SDK interface for the batch-style `EvaluateDataset` API and is distinct from the synchronous, online `evaluate()` method.

The `batch_evaluate()` method returns a job object that you can poll to track its progress. Once the job completes successfully, you can retrieve and visualize the results. The parameters for `batch_evaluate()` are compatible with the `evaluate()` method, allowing for a seamless transition between the two.



In [11]:
GCS_DEST_BUCKET = ""  # @param {type:"string", placeholder: "[your-gcs-bucket]"}

# The same Cloud Storage path is passed as the destination of both the inference
# step and the batch evaluation job, so reject an empty or non-`gs://` value
# before starting any work.
if not GCS_DEST_BUCKET.startswith("gs://"):
    raise ValueError(
        "GCS_DEST_BUCKET must be a Cloud Storage path starting with 'gs://', "
        f"got {GCS_DEST_BUCKET!r}."
    )

# Run inference with the destination path supplied through `config["dest"]`.
inference_result_saved = client.evals.run_inference(
    model="gemini-2.0-flash",
    src="gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl",
    config={"dest": GCS_DEST_BUCKET},
)
print(f"Inference dataset uploaded to: {inference_result_saved.gcs_source}")

# Start the batch evaluation; the call returns an operation object, not results.
batch_eval_job = client.evals.batch_evaluate(
    dataset=inference_result_saved,
    metrics=[
        types.PrebuiltMetric.TEXT_QUALITY,
        types.PrebuiltMetric.INSTRUCTION_FOLLOWING,
        types.PrebuiltMetric.FLUENCY,
        types.Metric(name="bleu"),
    ],
    dest=GCS_DEST_BUCKET,
)
batch_eval_job

Gemini Inference: 100%|██████████| 7/7 [00:01<00:00,  5.70it/s]


Inference dataset uploaded to: uris=['gs://batch-eval-test-data/sdk_output/batch_eval/inference_results.jsonl']


EvaluateDatasetOperation(
  metadata={
    '@type': 'type.googleapis.com/google.cloud.aiplatform.v1beta1.EvaluateDatasetOperationMetadata',
    'genericMetadata': {
      'createTime': '2025-06-26T23:27:37.519723Z',
      'updateTime': '2025-06-26T23:27:37.519723Z'
    }
  },
  name='projects/977012026409/locations/us-central1/operations/5655938219015929856'
)

In [15]:
# @title view results
def gcs_path_to_console_url(gcs_path: str) -> str:
    """Turn a `gs://` path into a Cloud Console storage-browser URL.

    Args:
        gcs_path: A Cloud Storage path beginning with `gs://`.

    Returns:
        The console browser URL formed by appending everything after the
        `gs://` prefix to the storage-browser base URL.

    Raises:
        ValueError: If `gcs_path` does not start with `gs://`.
    """
    scheme = "gs://"
    if not gcs_path.startswith(scheme):
        raise ValueError("Invalid GCS path. Must start with 'gs://'")

    # Everything after the scheme is "<bucket>/<optional/object/prefix>".
    remainder = gcs_path[len(scheme):]
    return "https://console.cloud.google.com/storage/browser/" + remainder

# Print the destination path and its Cloud Console link on separate lines.
# The URL is embedded in the f-string (rather than passed as a second print
# argument) so the default separator does not prefix the link with a space.
url = gcs_path_to_console_url(GCS_DEST_BUCKET)
print(f"Results will be written to your GCS destination path: {GCS_DEST_BUCKET}\n{url}")

Results will be written to your GCS destination path: gs://batch-eval-test-data/sdk_output/batch_eval
 https://console.cloud.google.com/storage/browser/batch-eval-test-data/sdk_output/batch_eval
