In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Migrating Foundation Models: A Practical Guide with Gen AI Evaluation Service


 <table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fmodel_migration_with_gen_ai_eval.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/model_migration_with_gen_ai_eval.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>


| Author(s) |
| --- |
| [Jason Dai](https://github.com/jsondai) |

## Overview

This notebook demonstrates how to use the Vertex AI SDK for Gen AI Evaluation Service to compare two first-party models, for example, when considering a migration (e.g., `Gemini 2.0 Flash` to `Gemini 2.5 Flash`). We will use various predefined adaptive rubric-based metrics. Additionally, we'll touch upon how evaluation results can guide prompt optimization.

---

Key features highlighted in this notebook include:


*   **A Complete Evaluation Workflow**: The SDK provides a seamless experience from generating model responses with `run_inference()` to detailed assessment with `evaluate()`.

*   **Flexible, Multi-Candidate Evaluation**: Easily analyze and compare the performance of multiple AI models, agents, or configurations in a single run. The SDK provides a unified report with comprehensive results and win-rate calculations for all contenders.

*   **Rich In-Notebook Visualization**: The `.show()` method, available on both `EvaluationDataset` and `EvaluationResult` objects, renders an interactive HTML report for analysis directly within your Colab and Jupyter notebooks.

*   **Broad Model and Data Support**: Natively evaluate models from Google, OpenAI, and other providers supported by LiteLLM, and handle various data formats automatically.

*   **Asynchronous Batch-style Evaluation**: For large datasets, you can now use `batch_evaluate()` to run evaluations as a long-running operation, which is ideal for large-scale jobs.


*   **Integrated Prompt Optimization**: Iteratively improve your prompts using the built-in `prompt_optimizer` module and immediately re-evaluate to quantify the impact of your changes.


### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started



In [None]:
# @title ### Install Vertex AI SDK for Gen AI Evaluation Service

# Install google-cloud-aiplatform (>= 1.111.0) with the "evaluation" extra
# through the %pip magic.
# --upgrade / --force-reinstall: upgrade and reinstall the packages even when
#   a copy is already installed.
# --quiet / --no-warn-conflicts: reduce pip's console output.
%pip install --upgrade "google-cloud-aiplatform[evaluation]>=1.111.0" --force-reinstall --quiet --no-warn-conflicts

In [1]:
# @title ### Authenticate your notebook environment (Colab only)
# @markdown If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

import sys

# Only touch `google.colab` when it is already loaded in this interpreter;
# in any other environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

In [1]:
# @title ### Set Google Cloud project information
# @markdown To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).
# @markdown Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

# @markdown ---

import os

from vertexai import Client, types

PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. Default to "" instead of wrapping the
    # lookup in str(): str(None) would produce the literal project ID "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project configured. Set PROJECT_ID in this cell or "
        "set the GOOGLE_CLOUD_PROJECT environment variable."
    )

LOCATION = "us-central1"  # @param {type: "string", placeholder: "us-central1", isTemplate: true}
# A GOOGLE_CLOUD_REGION environment variable, when set, overrides the form value.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", LOCATION)

# Shared client used by every evaluation and prompt-optimizer call below.
client = Client(project=PROJECT_ID, location=LOCATION)

In [45]:
# @title ### Prepare Dataset and Generate Rubrics
# @markdown Rubrics would be saved in a group named `general_quality_rubrics`.

import pandas as pd

# Prompts to evaluate; each entry becomes one row of the "prompt" column.
eval_prompts = [
    "Explain the difference between 'emergent behavior' in AI systems and 'unintended consequences' in software engineering, with examples.",
    "Write a short story (under 150 words) that includes a paradox, irony, and an emotional twist—without explicitly naming any of them.",
    "Summarize the philosophical implications of Gödel’s incompleteness theorems for modern AI research in 3 clear bullet points.",
    "Given a fictional planet where time flows backward for 50% of its inhabitants, design a basic legal system that works for all citizens.",
    "Critically compare Nietzsche’s idea of 'eternal recurrence' with the concept of simulation theory in modern tech culture.",
]
prompts_df = pd.DataFrame({"prompt": eval_prompts})

# Request rubrics for the prompts using the predefined GENERAL_QUALITY spec,
# tagged with the rubric group name that the evaluation step refers to.
data_with_rubrics = client.evals.generate_rubrics(
    src=prompts_df,
    predefined_spec_name=types.RubricMetric.GENERAL_QUALITY,
    rubric_group_name="general_quality_rubrics",
)

In [46]:
# @title ### Run Inference for Both Models

# @markdown ---
# @markdown **Models to Compare**
MODEL_1_ID = "gemini-2.0-flash"  # @param {type: "string"}
MODEL_2_ID = "gemini-2.5-flash"  # @param {type: "string"}
# @markdown ---

# Candidate 1 runs with an explicit temperature override.
# NOTE(review): candidate 2 below is called without a config, so the two
# candidates are not sampled with the same settings — confirm this is intended.
print(f"Generating responses for {MODEL_1_ID}...")
candidate_1_config = {"generate_content_config": {"temperature": 1.6}}
candidate_1 = client.evals.run_inference(
    src=data_with_rubrics,
    model=MODEL_1_ID,
    config=candidate_1_config,
)

# Candidate 2 runs on the same rubric-annotated dataset.
print(f"Generating responses for {MODEL_2_ID}...")
candidate_2 = client.evals.run_inference(src=data_with_rubrics, model=MODEL_2_ID)
candidate_2.show()

Generating responses for gemini-2.0-flash...


Gemini Inference: 100%|██████████| 5/5 [00:12<00:00,  2.48s/it]


Generating responses for gemini-2.5-flash...


Gemini Inference: 100%|██████████| 5/5 [00:27<00:00,  5.46s/it]


In [47]:
# @title ### Evaluate and Compare
# @markdown Use a list of datasets to compare the candidates.

# GENERAL_QUALITY metric bound to the "general_quality_rubrics" rubric group.
general_quality_metric = types.RubricMetric.GENERAL_QUALITY(
    rubric_group_name="general_quality_rubrics",
)

# Passing both inference results as a list evaluates them side by side.
candidates = [candidate_1, candidate_2]
comparison_result = client.evals.evaluate(
    metrics=[general_quality_metric],
    dataset=candidates,
)
comparison_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 10/10 [00:12<00:00,  1.25s/it]
  PydanticSerializationUnexpectedValue(Expected `WinRateStats` - serialized value may not be as expected [input_value={'win_rates': [0.0, 0.0], 'tie_rate': 1.0}, input_type=dict])
  return self.__pydantic_serializer__.to_python(


### Evaluate Third-Party Models (e.g., OpenAI)

You can use the Gen AI evaluation service to evaluate and compare models from providers such as OpenAI by passing the model name string to the `run_inference` method. The Gen AI evaluation service uses the `litellm` library to call the model API.

Make sure to set the required API key as an environment variable (such as `OPENAI_API_KEY`):

In [2]:
import json
import os
import sys

import pandas as pd

# Optionally paste a key into the form field below. Leave it empty to use an
# OPENAI_API_KEY environment variable that is already set or, on Colab, the
# OPENAI_API_KEY entry in the Colab Secrets manager.
# WARNING: Setting API keys directly in code is insecure. Use environment variables or secure storage.
OPENAI_API_KEY = ""  # @param {type:"string", placeholder: "[your-openai-api-key]"}

# Resolution order: form value, then an existing environment variable, then
# Colab Secrets. An empty form value never overwrites a key that is already set.
openai_api_key = OPENAI_API_KEY or os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key and "google.colab" in sys.modules:
    # google.colab is only importable inside Colab, so guard the import.
    from google.colab import userdata

    openai_api_key = userdata.get("OPENAI_API_KEY")

if not openai_api_key:
    raise ValueError(
        "OPENAI_API_KEY is not set. Provide it in the form field, as an "
        "environment variable, or in the Colab Secrets manager."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# Passing a third-party model name to run_inference generates the responses
# for the prompts stored at the given GCS URI.
openai_responses = client.evals.run_inference(
    model="gpt-4o",
    src="gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl",
)
openai_responses.show()

LiteLLM Inference (gpt-4o): 100%|██████████| 7/7 [00:02<00:00,  2.52it/s]


In [4]:
# Score the OpenAI responses with two rubric-based metrics plus two metrics
# selected by name.
openai_metrics = [
    types.RubricMetric.COHERENCE,
    types.RubricMetric.FLUENCY,
    types.Metric(name='rouge_1'),
    types.Metric(name='bleu'),
]
eval_result = client.evals.evaluate(metrics=openai_metrics, dataset=openai_responses)
eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 28/28 [00:01<00:00, 26.27it/s]


### Asynchronous and large-scale evaluation

For large datasets, the Gen AI evaluation service provides an asynchronous, long-running batch evaluation method. This is ideal for scenarios where you don't need immediate results and want to offload the computation.
The `batch_evaluate()` method returns an operation object that you can poll to track its progress. The parameters are compatible with the `evaluate()` method.

In [None]:
GCS_DEST_BUCKET = ""  # @param {type:"string", placeholder: "[your-gcs-bucket]"}

# Run inference and pass the destination path through the `dest` config key.
inference_config = {'dest': GCS_DEST_BUCKET}
inference_result_saved = client.evals.run_inference(
    src="gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl",
    model="gemini-2.5-flash",
    config=inference_config,
)
print(f"Eval Dataset uploaded to: {inference_result_saved.gcs_source}")

# Submit the batch evaluation; the returned object is left as the cell's last
# expression so the notebook displays it.
batch_metrics = [
    types.RubricMetric.FLUENCY,
    types.Metric(name='bleu'),
]
batch_eval_job = client.evals.batch_evaluate(
    dataset=inference_result_saved,
    metrics=batch_metrics,
    dest=GCS_DEST_BUCKET,
)
batch_eval_job

In [None]:
# @title #### View results
def gcs_path_to_console_url(gcs_path: str) -> str:
    """Convert a ``gs://`` path into a Cloud Console storage-browser URL.

    Args:
        gcs_path: A Cloud Storage path beginning with ``gs://``.

    Returns:
        The console browser URL for the bucket/path that follows the scheme.

    Raises:
        ValueError: If ``gcs_path`` does not start with ``gs://``.
    """
    scheme = "gs://"
    if gcs_path[: len(scheme)] != scheme:
        raise ValueError("Invalid GCS path. Must start with 'gs://'")

    # Everything after the scheme is appended to the console base URL unchanged.
    return "https://console.cloud.google.com/storage/browser/" + gcs_path[len(scheme):]

# Show both the raw destination path and its Cloud Console browser link.
console_url = gcs_path_to_console_url(GCS_DEST_BUCKET)
print(
    f"Results will be written to your GCS destination path: {GCS_DEST_BUCKET}\n",
    console_url,
)

### Prompt Optimization


The Vertex AI SDK includes a `prompt_optimizer` module designed to enhance your prompts. The typical workflow involves utilizing the optimizer to generate improved prompts and then re-evaluating your models using these optimized prompts.

In [None]:
# Instruction prompt for the prompt-optimization example: it asks for named
# entities to be extracted from bank-statement text as JSON, with each value
# carrying its XX|YY line position tag. Kept as a single literal so the text
# stays exactly as written.
prompt_template = "The below texts are extracted from a page of bank statement with position tags at the end of each line. The position tags are in the format XX|YY. The origin 00|00 is at the top left of the page. For example, 10|20 is directly above 10|24, and 10|20 is directly to the left of 20|20.\nFrom the bank statement, extract the named entities in json format.\n{\"account_number\": [], \"account_type\": [], \"bank_address\": \"\", \"bank_name\": \"\", \"client_address\": \"\", \"client_name\": [], \"currency\": [], \"ending_balance\": [], \"page_number\": [], \"starting_balance\": [], \"statement_date\": \"\", \"statement_end_date\": \"\", \"statement_start_date\": \"\"}\n- The extraction must respect the JSON schema.\n- The values must only include text strings found in the document and their respective line tags.\n- The line tags appear at the end of each line. They are the line's X and Y location on the document page, in the format XX|YY. The origin 00|00 is at the top left of the page. For example, 10|20 is directly above 10|24, and 10|20 is directly to the left of 20|20.\n- Examples of valid string value format: \"$ 1234.50 40|12\", \"John Do 55|03\", \"Jane Johan 53|89\nDoe 54|91\", null.\n- Examples of invalid string value format: \"$ 1234.50\", \"John Do\".\n- Examples of valid list value format: [\"1 Imaginary St 34|61\nMA02140 35|62\", \"7 Heaven Rd 44|03\nNY10011 44|06\"], [].\n- Do not normalize any entity value.\n- Do not generate \"0\" or \"0.00\" for missing numerical values.\n"
# Print the template so the escaped newlines render as real line breaks.
print(prompt_template)

The below texts are extracted from a page of bank statement with position tags at the end of each line. The position tags are in the format XX|YY. The origin 00|00 is at the top left of the page. For example, 10|20 is directly above 10|24, and 10|20 is directly to the left of 20|20.
From the bank statement, extract the named entities in json format.
{"account_number": [], "account_type": [], "bank_address": "", "bank_name": "", "client_address": "", "client_name": [], "currency": [], "ending_balance": [], "page_number": [], "starting_balance": [], "statement_date": "", "statement_end_date": "", "statement_start_date": ""}
- The extraction must respect the JSON schema.
- The values must only include text strings found in the document and their respective line tags.
- The line tags appear at the end of each line. They are the line's X and Y location on the document page, in the format XX|YY. The origin 00|00 is at the top left of the page. For example, 10|20 is directly above 10|24, and 

In [None]:
import json
import logging

import gcsfs
import pandas as pd

PROJECT_NUMBER = ""  # @param {type: "string"}
INPUT_GCS_SOURCE = "gs://vertex-ai-generative-ai-eval-sdk-resources/test_data/prompt_optimization_input_data.jsonl"  # @param {type: "string"}
GCS_DEST_BUCKET = ""  # @param {type:"string", placeholder: "[your-gcs-bucket]"}

# Fail early with a clear message: an empty project number would produce the
# invalid service account "-compute@developer.gserviceaccount.com", and an
# empty destination would make the config path "/config.json".
if not PROJECT_NUMBER:
    raise ValueError("PROJECT_NUMBER is required to build the service account email.")
if not GCS_DEST_BUCKET:
    raise ValueError("GCS_DEST_BUCKET is required as the prompt optimizer output path.")

SERVICE_ACCOUNT = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

# Preview the input examples. display() is the notebook built-in; it is needed
# here because a bare expression in the middle of a cell is not rendered.
df = pd.read_json(INPUT_GCS_SOURCE, lines=True)
display(df.head())

# Prompt optimizer job configuration, serialized to JSON below.
vapo_config = {
    "project": PROJECT_ID,
    "eval_metric": "bleu",
    "target_model": "gemini-1.5-flash-002",
    "target_model_qps": 5,
    "target_model_location":  LOCATION,
    "optimizer_model": "gemini-1.5-pro-002",
    "optimizer_model_qps": 3,
    "optimizer_model_location": LOCATION,
    "optimization_mode": "instruction",
    "instruction_optimization_method": "reflect",
    "input_data_path": INPUT_GCS_SOURCE,
    "data_limit": 3,
    "output_path": GCS_DEST_BUCKET,
    "prompt_template": prompt_template,
    "demo_and_query_template": "Texts and Position Tags: {{document}}\nEntities: {{target}}",
    "num_steps": 2,
    "num_template_eval_per_step": 2,
    "num_demo_set_candidates": 10,
    "demo_set_size": 3,
    "test_split_ratio": 0.5,
    "eval_qps": 5
}

# Write the vapo config to output gcs path.
gcs_file_system = gcsfs.GCSFileSystem(project=PROJECT_ID)
vapo_config_json_path = f"{GCS_DEST_BUCKET}/config.json"
with gcs_file_system.open(vapo_config_json_path, 'w') as f:
    json.dump(vapo_config, f)

# Reconfigure root logging at INFO level; force=True replaces any handlers
# that were installed earlier in the session.
logging.basicConfig(encoding='utf-8', level=logging.INFO, force=True)

In [None]:
# Launch the "vapo" prompt optimization using the config file written to GCS
# and the service account built from the project number.
optimizer_job_config = {
    "config_path": vapo_config_json_path,
    "service_account": SERVICE_ACCOUNT,
    "wait_for_completion": True,
}
prompt_optimizer_result = client.prompt_optimizer.optimize(
    config=optimizer_job_config,
    method="vapo",
)

In [None]:
# @title #### Results in output path

# GCS_DEST_BUCKET is used throughout this notebook as a full "gs://..." URI
# (it is passed as-is as a destination and to gcs_path_to_console_url, which
# requires the scheme). Only add "gs://" when it is missing, so the path never
# becomes "gs://gs://...".
if GCS_DEST_BUCKET.startswith("gs://"):
    results_root = GCS_DEST_BUCKET
else:
    results_root = f"gs://{GCS_DEST_BUCKET}"

eval_results_file_path = f"{results_root}/instruction/eval_results.json"
print(f"Reading results from: {eval_results_file_path}\n")
results_df = pd.read_json(eval_results_file_path)
print("Successfully loaded data into a DataFrame:")
display(results_df.head())