In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Vertex Prompt Optimizer - Custom metric

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fprompts%2Fprompt_optimizer%2Fget_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer_custom_metric.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            


| Author(s) |
| --- |
| [Ivan Nardini](https://github.com/inardini) |

## Overview

When developing with large language models, crafting the perfect prompt—a process known as prompt engineering—is both an art and a science. It can be time-consuming and challenging to write prompts that consistently produce the desired results. Furthermore, as new and improved models are released, prompts that worked well before may need to be updated.

To address these challenges, Vertex AI offers the **Prompt Optimizer**, a prompt optimization tool to help you refine and enhance your prompts automatically. This notebook serves as a comprehensive guide to both of its approaches: the **Zero-Shot Optimizer** and the **Data-Driven Optimizer**.

### The two approaches to prompt optimization

#### 1\. Zero-Shot Optimizer

This is your go-to tool for rapid prompt refinement and generation *without* needing an evaluation dataset.

  * **Generate from Scratch**: Simply describe a task in plain language, and it will generate a complete, well-structured system instruction for you.
  * **Refine Existing Prompts**: Provide an existing prompt, and it will rewrite it based on established best practices for clarity, structure, and effectiveness.

#### 2\. Data-Driven Optimizer

This tool performs a deep, performance-based optimization that uses your data to measure success.

  * **Tune for Performance**: You provide a dataset of sample inputs and expected outputs, and it systematically tests and rewrites your system instructions to find the version that scores highest on the evaluation metrics you define.
  * **Task-Specific**: It's the ideal choice when you want to fine-tune a prompt for a specific task and have data to prove what "better" looks like.

In this tutorial, we'll show you how to optimize a simple prompt for a Gemini model using your own metric. The goal is to use Vertex AI Prompt Optimizer to find a new prompt template that generates better responses based on your own optimization metric.

## Get started

Before we can start optimizing, we need to set up our Python environment and configure our Google Cloud project.


### Install required packages

This command installs the necessary Python libraries.


In [None]:
# Install the Vertex AI SDK and helper libraries into the kernel's environment.
# protobuf is pinned to 4.25.3; --force-reinstall makes pip reinstall the listed
# packages even when they are already present, and --quiet trims pip's output.
%pip install "google-cloud-aiplatform>=1.108.0" "pydantic" "etils" "protobuf==4.25.3" --force-reinstall --quiet

### Authenticate your notebook environment (Colab only)

If you are running this notebook in Google Colab, this cell handles authentication, allowing the notebook to securely access your Google Cloud resources.

In [None]:
import sys

# "google.colab" appears in sys.modules only when that package has already been
# imported; this cell uses that as the signal that it is running inside Colab.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate the current user for this notebook session.
    auth.authenticate_user()

### Set Google Cloud project information

Here, we define essential variables for our Google Cloud project. The Prompt Optimizer job will run within a Google Cloud project. You need to [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com) and use the specified Cloud Storage bucket to read input data and write results.

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

PROJECT_NUMBER = !gcloud projects describe {PROJECT_ID} --format="get(projectNumber)"[0]
PROJECT_NUMBER = PROJECT_NUMBER[0]

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"

! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

import vertexai

client = vertexai.Client(project=PROJECT_ID, location=LOCATION)

### Service account and permissions

The Prompt Optimizer runs as a backend job that needs permission to perform actions on your behalf. We grant the necessary IAM roles to the default Compute Engine service account, which the job uses to operate.

  * `Vertex AI User`: Allows the job to call Vertex AI models.
  * `Storage Object Admin`: Allows the job to read your dataset from and write results to your GCS bucket.
  * `Artifact Registry Reader`: Allows the job to download necessary components.
  * `Cloud Run Developer`: Allows the job to deploy the function on Cloud Run.
  * `Cloud Run Invoker`: Allows the job to invoke the deployed function.

[Check out the documentation](https://cloud.google.com/iam/docs/manage-access-service-accounts#iam-view-access-sa-gcloud) to learn how to grant those permissions to a single service account.

In [None]:
# Email address of the default Compute Engine service account, built from the project number.
SERVICE_ACCOUNT = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

# Add one project-level IAM binding per role; each iteration runs a separate gcloud command.
for role in ['aiplatform.user', 'storage.objectAdmin', 'artifactregistry.reader', 'run.developer', 'run.invoker']:

    ! gcloud projects add-iam-policy-binding {PROJECT_ID} \
      --member=serviceAccount:{SERVICE_ACCOUNT} \
      --role=roles/{role} --condition=None

### Import libraries

In [None]:
import json
import logging
import subprocess
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from pathlib import Path
from etils import epath
import requests
from google.cloud import storage
from pydantic import BaseModel, Field
logging.basicConfig(level=logging.INFO, force=True)

### Helpers

In [None]:
def get_auth_token() -> str:
    """Return an access token obtained from the gcloud CLI.

    Runs ``gcloud auth application-default print-access-token -q`` and returns
    its standard output with surrounding whitespace removed.

    Raises:
        subprocess.CalledProcessError: If gcloud exits with a non-zero status.
    """
    command = ["gcloud", "auth", "application-default", "print-access-token", "-q"]
    completed = subprocess.run(command, capture_output=True, text=True, check=True)
    token = completed.stdout.strip()
    return token

def format_demonstrations(demos: Any) -> List[str]:
    """Render a collection of demonstrations as display strings.

    Args:
        demos: Either a list of demonstrations or a JSON string encoding one.

    Returns:
        One string per demonstration. Dict entries become ``"key: value"``
        lines joined by newlines; any other entry is passed through ``str``.
        An empty list is returned when the input is unparseable JSON or does
        not resolve to a list.
    """
    parsed = demos
    if isinstance(parsed, str):
        try:
            parsed = json.loads(parsed)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass, so this covers both.
            return []

    if not isinstance(parsed, list):
        return []

    return [
        "\n".join(f"{key}: {value}" for key, value in entry.items())
        if isinstance(entry, dict)
        else str(entry)
        for entry in parsed
    ]


def split_gcs_path(gcs_path: str) -> Tuple[str, str]:
    """Break a ``gs://bucket/prefix`` URI into ``(bucket, prefix)``.

    The prefix is everything after the first ``/`` that follows the bucket
    name, or ``""`` when there is none.

    Raises:
        ValueError: If ``gcs_path`` does not begin with ``gs://``.
    """
    scheme = "gs://"
    if gcs_path.startswith(scheme):
        bucket, _, prefix = gcs_path[len(scheme):].partition("/")
        return bucket, prefix
    raise ValueError(f"Invalid GCS path. Must start with gs://: {gcs_path}")


def list_gcs_objects(gcs_path: str) -> List[str]:
    """Return the names of all objects whose name starts with the path's prefix.

    Args:
        gcs_path: A ``gs://bucket/prefix`` URI.

    Returns:
        Object names as reported by the storage client (without the
        ``gs://bucket/`` part).
    """
    bucket_name, prefix = parse_gcs_path(gcs_path)

    storage_client = storage.Client()
    blob_iterator = storage_client.bucket(bucket_name).list_blobs(prefix=prefix)

    object_names = []
    for blob in blob_iterator:
        object_names.append(blob.name)
    return object_names


def find_directories_with_files(
    base_path: str, required_files: List[str]
) -> List[str]:
    """Find directories under ``base_path`` that contain every required file.

    Args:
        base_path: A ``gs://bucket/prefix`` URI to search under.
        required_files: File names that must all be present directly inside
            a directory for it to be returned.

    Returns:
        ``gs://`` URIs of the matching directories, without a trailing slash,
        in the order the directories were first seen in the object listing.

    Raises:
        ValueError: If ``base_path`` is not a ``gs://`` URI.
    """
    bucket_name, _ = split_gcs_path(base_path)
    required = set(required_files)

    # Group object file names by the directory part of their object name.
    files_by_dir: Dict[str, set] = {}
    for object_name in list_gcs_objects(base_path):
        dir_path, _, filename = object_name.rpartition("/")
        files_by_dir.setdefault(dir_path, set()).add(filename)

    matching_dirs = []
    for dir_path, files in files_by_dir.items():
        if required.issubset(files):
            # Objects at the bucket root have an empty dir_path; omit the
            # trailing slash so callers appending "/<file>" don't build "//".
            matching_dirs.append(
                f"gs://{bucket_name}/{dir_path}" if dir_path else f"gs://{bucket_name}"
            )

    return matching_dirs

def parse_gcs_path(gcs_path: str) -> Tuple[str, str]:
    """Return ``(bucket_name, prefix)`` for a ``gs://`` URI.

    ``prefix`` is the text after the first ``/`` following the bucket name and
    is ``""`` when the URI names only a bucket.

    Raises:
        ValueError: If ``gcs_path`` does not start with ``gs://``.
    """
    scheme = "gs://"
    if not gcs_path.startswith(scheme):
        raise ValueError("Invalid GCS path. Must start with gs://")

    bucket_name, _, prefix = gcs_path[len(scheme):].partition("/")
    return bucket_name, prefix

def _load_templates(path: str, optional: bool = False) -> Any:
    """Read and parse one templates.json file; return None if optional and missing/malformed."""
    try:
        with epath.Path(path).open("r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        if optional:
            return None
        raise


def _pick_metric_column(columns: Any, metric_name: Optional[str]) -> Optional[str]:
    """Choose the mean-metric column to rank templates by, or None if there is none."""
    metric_columns = [col for col in columns if "metric" in col and "mean" in col]
    if not metric_columns:
        return None
    if metric_name:
        return next((col for col in metric_columns if metric_name in col), None)
    # Without an explicit metric, prefer the composite metric when available.
    composite_cols = [col for col in metric_columns if "composite_metric" in col]
    return composite_cols[0] if composite_cols else metric_columns[0]


def _best_template_row(
    templates_data: Any, metric_name: Optional[str]
) -> Optional[Tuple[float, Any]]:
    """Return ``(score, row)`` for the highest-scoring template, or None if not rankable."""
    if not templates_data:
        return None
    df = pd.json_normalize(templates_data)
    metric_col = _pick_metric_column(df.columns, metric_name)
    if not metric_col:
        return None
    row = df.iloc[df[metric_col].argmax()]
    return float(row[metric_col]), row


def _row_instruction(row: Any) -> str:
    """Return the row's instruction text with the llm placeholder normalized, or ""."""
    for key in ("prompt", "instruction"):
        value = row.get(key)
        # json_normalize fills cells absent from a record with NaN (a float), so
        # only non-empty strings count as an instruction.
        if isinstance(value, str) and value:
            return value.replace("store('answer', llm())", "{{llm()}}")
    return ""


def _row_demonstrations(row: Any) -> List[str]:
    """Return the row's demonstrations formatted through ``format_demonstrations``."""
    return format_demonstrations(
        row.get("demonstrations", row.get("demo_set", []))
    )


def get_best_vapo_results(
    base_path: str, metric_name: Optional[str] = None
) -> Tuple[str, List[str]]:
    """Get the best system instruction and demonstrations across all VAPO runs.

    Every directory under ``base_path`` that contains both ``eval_results.json``
    and ``templates.json`` is treated as a run. For each run, three files are
    examined in order: ``templates.json``, ``instruction/templates.json`` and
    ``demonstration/templates.json``. Whenever the top-scoring row of a file
    beats the best score seen so far, the instruction and/or demonstrations
    are updated from that row.

    Args:
        base_path: ``gs://`` URI under which the run directories live.
        metric_name: Substring identifying the mean-metric column to rank by.
            When omitted, a ``composite_metric`` column is preferred, falling
            back to the first mean-metric column.

    Returns:
        A ``(instruction, demonstrations)`` tuple; the instruction has
        ``store('answer', llm())`` replaced by ``{{llm()}}``.

    Raises:
        ValueError: If no run directory is found, or no file yields a score.
    """
    required_files = ["eval_results.json", "templates.json"]
    runs = find_directories_with_files(base_path, required_files)

    if not runs:
        raise ValueError(f"No valid runs found in {base_path}")

    best_score = float("-inf")
    best_instruction = ""
    best_demonstrations: List[str] = []

    for run_path in runs:
        try:
            # 1) Main results: a failure to read this file skips the whole run.
            candidate = _best_template_row(
                _load_templates(f"{run_path}/templates.json"), metric_name
            )
            if candidate is not None and candidate[0] > best_score:
                best_score, best_row = candidate
                instruction = _row_instruction(best_row)
                if instruction:
                    best_instruction = instruction
                if "demonstrations" in best_row or "demo_set" in best_row:
                    best_demonstrations = _row_demonstrations(best_row)

            # 2) Instruction-specific results: optional, so a missing or
            #    malformed file must not prevent step 3 from running.
            candidate = _best_template_row(
                _load_templates(
                    f"{run_path}/instruction/templates.json", optional=True
                ),
                metric_name,
            )
            if candidate is not None and candidate[0] > best_score:
                best_score, best_row = candidate
                instruction = _row_instruction(best_row)
                if instruction:
                    best_instruction = instruction
                # In instruction-only mode, there might not be demonstrations
                if "demonstrations" not in best_row and "demo_set" not in best_row:
                    best_demonstrations = []

            # 3) Demonstration-specific results: optional as well.
            candidate = _best_template_row(
                _load_templates(
                    f"{run_path}/demonstration/templates.json", optional=True
                ),
                metric_name,
            )
            if candidate is not None and candidate[0] > best_score:
                best_score, best_row = candidate
                best_demonstrations = _row_demonstrations(best_row)
                # In demo-only mode, there might not be an instruction
                if "prompt" not in best_row and "instruction" not in best_row:
                    best_instruction = ""
                else:
                    instruction = _row_instruction(best_row)
                    if instruction:
                        best_instruction = instruction

        except Exception as e:
            # Best effort: one unreadable run should not hide results from the others.
            logging.warning(f"Error processing run {run_path}: {e}")
            continue

    if best_score == float("-inf"):
        raise ValueError("Could not find any valid results")

    return best_instruction, best_demonstrations

## Using the Data-Driven Optimizer approach with your own custom metrics

The following sections will guide you through setting up your environment, preparing your data, deploying the custom metric, and running an optimization job to find a better prompt using the data-driven optimizer.

### Preparing the Data and Running the Job

#### The prompt template to optimize

A prompt consists of two key parts:

* **System Instruction Template**, which is the fixed part of the prompt that controls or alters the model's behavior across all queries for a given task.

* **Prompt Template** which is a dynamic part of the prompt that changes based on the task. Prompt template includes context, task and more. To learn more, see [components of a prompt](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-design-strategies#components-of-a-prompt) in the official documentation.

In this scenario, you use Vertex AI prompt optimizer to optimize a simple system instruction template. And you use some examples in the remaining prompt template for evaluating different instruction templates along the optimization process.

> Important: Having the `target` placeholder in the prompt template is optional. It represents the prompt's ground truth response in your prompt optimization dataset that you aim to optimize for your templates. If you don't have the prompt's ground truth response, remember to set the `source_model` parameter in your prompt optimizer configuration (see below) instead of adding ground truth responses. Vertex AI prompt optimizer then runs your sample prompts on the source model to generate the ground truth responses for you.

In [None]:
# System instruction text that the optimizer is asked to improve.
system_instruction = """
Given a question with context, provide the correct answer to the question.
"""

# Prompt template with {question}, {ctx} and {target} placeholders.
prompt_template = """
Some examples of correct answer to a question are:
Question: {question}
Context: {ctx}
Answer: {target}
"""

#### The optimization dataset

The optimizer's performance depends heavily on the quality of your sample data.

For this example, we use a question-answering dataset where each row contains a `question`, context (`ctx`), and a ground-truth `target` answer. The `{target}` variable is crucial for computation-based evaluation metrics like `question_answering_correctness`.

> Important: For effective **prompt optimization**, provide a dataset of examples on which your model performs poorly with the current system instruction template. For reliable results, use 50-100 distinct samples. In the case of **prompt migration**, consider using the source model to label examples that the target model struggles with, helping to identify areas for improvement.


In [None]:
# Cloud Storage location of the sample dataset (JSON Lines: one JSON object per line).
input_data_path = "gs://github-repo/prompts/prompt_optimizer/rag_qa_dataset.jsonl"
# lines=True makes pandas parse each line of the file as a separate record.
prompt_optimization_df = pd.read_json(input_data_path, lines=True)
# Preview the first rows.
prompt_optimization_df.head()

#### Define and deploy your own custom optimization metric on Cloud function

To optimize your prompt template using a custom optimization metric, you need to deploy your own metric code as a Cloud Function. To do so, complete the following steps:

1.   Define requirements
2.   Write your own custom metric function code
3.   Deploy the custom code as Cloud function


##### Define requirements

Set the custom metric dependencies.

In [None]:
# Local directory that collects the files to deploy; created if it does not exist yet.
build_path = Path("build")
build_path.mkdir(exist_ok=True)

# Python dependencies of the deployed function, one requirement per line.
requirements = """
functions-framework==3.*
google-cloud-aiplatform>=1.108.0
"""

# Write the dependency list to build/requirements.txt (overwrites any existing file).
with open(build_path / "requirements.txt", "w") as f:
    f.write(requirements)

##### Write your own custom metric function

Define the module which contains your own custom metric function definition.

In this case, you have a custom evaluation metric to evaluate the user engagement and personalization. The custom evaluation metric is defined using the `evaluate_engagement_personalization_fn`.

The function leverages Gemini 2.5 Flash to act as an LLM-as-Judge. It sends a prompt to the auto-rater, receives a score (1-5), and an explanation, then returns these as a dictionary containing two fields: the custom metric's score (as you defined it) and an explanation of how this metric helps optimize the prompt template.

You use the `main` function to deploy the `evaluate_engagement_personalization_fn` function as a Cloud Function, receiving a question, response, and a target response as input and returning the auto-rater's evaluation.  


In [None]:
custom_metric_function_code = '''
"""
This module contains the custom evaluation metric definition to optimize a prompt template with Vertex AI prompt optimizer
"""

from typing import Dict
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
)

import json
import functions_framework

def get_autorater_response(metric_prompt: str) -> dict:
    """This function is to generate the evaluation response from the autorater."""

    metric_response_schema = {
        "type": "OBJECT",
        "properties": {
            "score": {"type": "NUMBER"},
            "explanation": {"type": "STRING"},
        },
        "required": ["score", "explanation"],
    }

    autorater = GenerativeModel(
        "gemini-2.5-flash",
        generation_config=GenerationConfig(
            response_mime_type="application/json",
            response_schema=metric_response_schema,
        ),
        safety_settings={
            HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        },
    )

    response = autorater.generate_content(metric_prompt)

    response_json = {}
    response_json = json.loads(response.text)
    return response_json


# Define custom evaluation criteria
def evaluate_engagement_personalization_fn(question: str, response:str, target: str) -> Dict[str, str]:
    """Evaluates an AI-generated response for User Engagement and Personalization."""

    custom_metric_prompt_template = """

    # Instruction
    You are an expert evaluator. Your task is to evaluate the quality of the LLM-generated responses against a reference target response.
    You should first read the Question carefully, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
    You will assign the response a rating following the Rating Rubric only and a step-by-step explanation for your rating.

    # Evaluation

    ## Criteria
    Relevance and Customization: The response should directly address the user's query and demonstrate an understanding of their specific needs or preferences, such as dietary restrictions, skill level, or taste preferences.
    Interactivity and Proactiveness: The response should go beyond simply answering the question by actively encouraging further interaction through follow-up questions, suggestions for additional exploration, or prompts for more information to provide a tailored experience.
    Tone and Empathy: The response should adopt an appropriate and empathetic tone that fosters a positive and supportive user experience, making the user feel heard and understood.

    ## Rating rubric
    1 - Minimal: The response lacks personalization and demonstrates minimal engagement with the user. The tone may be impersonal or generic.
    2 - Basic: The response shows some basic personalization but lacks depth or specificity. Engagement is limited, possibly with generic prompts or suggestions. The tone is generally neutral but may lack warmth or empathy.
    3 - Moderate: The response demonstrates clear personalization and attempts to engage the user with relevant follow-up questions or prompts based on their query. The tone is friendly and supportive, fostering a positive user experience.
    4 - High: The response demonstrates a high degree of personalization and actively engages the user with relevant follow-up questions or prompts. The tone is empathetic and understanding, creating a strong connection with the user.
    5 - Exceptional: The response goes above and beyond to personalize the experience, anticipating user needs, and fostering a genuine connection. The tone is warm, encouraging, and inspiring, leaving the user feeling empowered and motivated.

    ## Evaluation steps
    Step 1: Carefully read both the question and the generated response. Ensure a clear understanding of the user's intent, needs, and any specific context provided.
    Step 2: Evaluate how well the response directly addresses the user's query and demonstrates an understanding of their specific needs or preferences.
    Step 3: Determine the extent to which the response actively encourages further interaction and provides a tailored experience.
    Step 4: Evaluate Tone & Empathy: Analyze the tone of the response, ensuring it fosters a positive and supportive user experience, making the user feel heard and understood.
    Step 5: Based on the three criteria above, assign a score from 1 to 5 according to the score rubric.
    Step 6: Justify the assigned score with a clear and concise explanation, highlighting the strengths and weaknesses of the response with respect to each criterion.

    # Question : {question}
    # Generated response: {response}
    # Reference response: {target}
    """

    custom_metric_prompt = custom_metric_prompt_template.format(question=question, response=response, target=target)
    response_dict = get_autorater_response(custom_metric_prompt)

    return {
        "custom_engagement_personalization_score": response_dict["score"],
        "explanation": response_dict["explanation"],
    }

# Register an HTTP function with the Functions Framework
@functions_framework.http
def main(request):
  request_json = request.get_json(silent=True)

  if not request_json:
    raise ValueError('Cannot find request json.')

  question = request_json['question']
  response = request_json['response']
  reference = request_json['target']

  get_evaluation_result = evaluate_engagement_personalization_fn(question, response, reference)
  return json.dumps(get_evaluation_result)
'''

# Save the custom metric source code as main.py inside the build directory.
main_py_path = build_path / "main.py"
with open(main_py_path, "w") as main_py_file:
    main_py_file.write(custom_metric_function_code)

##### Deploy the custom metric as a Cloud Function

Use the gcloud command line to deploy the custom metric as a Cloud Function. To learn more, check out the [Deploy a Python service to Cloud Run](https://cloud.google.com/run/docs/quickstarts/build-and-deploy/deploy-python-service) quickstart.


In [None]:
# Deploy the source in `build_path` as a 2nd-gen, HTTP-triggered function named
# 'custom_engagement_personalization_metric', with `main` as its entry point.
# `build_path`, PROJECT_ID and LOCATION are interpolated by IPython from the notebook namespace.
# NOTE(review): --min-instances=6 presumably keeps six instances provisioned (and billed)
# until the function is deleted in the cleanup cell — confirm against the gcloud reference.
!gcloud functions deploy 'custom_engagement_personalization_metric' \
 --gen2 \
 --runtime="python310" \
 --source={str(build_path)} \
 --entry-point=main \
 --trigger-http \
 --timeout=3600 \
 --memory=2Gb \
 --concurrency=6 \
 --min-instances=6 \
 --project {PROJECT_ID} \
 --region={LOCATION} \
 --quiet

##### Test your custom evaluation metric

After you deploy your custom evaluation metric as a Cloud Function, submit a request to validate the output of the custom evaluation function.

In [None]:
custom_evaluator_function_uri = ! gcloud functions describe 'custom_engagement_personalization_metric' --gen2 --region {LOCATION} --format="value(url)"
custom_evaluator_function_uri = custom_evaluator_function_uri[0].strip()

In [None]:
# Authenticate the call with a bearer token obtained from the notebook's get_auth_token() helper.
headers = {
    "Authorization": f"Bearer {get_auth_token()}",
    "Content-Type": "application/json",
}

# Sample payload carrying the three keys the function's `main` handler reads from the
# request JSON: "question", "response" and "target".
json_data = {
    "question": """
      What are some techniques for cooking red meat and pork that maximize flavor and tenderness while minimizing the formation of unhealthy compounds?
      """,
    "response": """
      * Marinating in acidic ingredients like lemon juice or vinegar to tenderize the meat \n * Cooking to an internal temperature of 145°F (63°C) for safety \n * Using high-heat cooking methods like grilling and pan-searing for browning and caramelization \n * Avoiding charring to minimize the formation of unhealthy compounds
      """,
    "target": """
      Here's how to tackle those delicious red meats and pork while keeping things healthy:
      **Prioritize Low and Slow:**
      * **Braising and Stewing:** These techniques involve gently simmering meat in liquid over low heat for an extended period. This breaks down tough collagen, resulting in incredibly tender and flavorful meat. Plus, since the cooking temperature is lower, it minimizes the formation of potentially harmful compounds associated with high-heat cooking.
      * **Sous Vide:** This method involves sealing meat in a vacuum bag and immersing it in a precisely temperature-controlled water bath.  It allows for even cooking to the exact desired doneness, resulting in incredibly juicy and tender meat.  Because the temperature is controlled and lower than traditional methods, it can be a healthier option.
      **High Heat Tips:**
      * **Marinades are Your Friend:** As you mentioned, acidic marinades tenderize meat.  They also add flavor!
      * **Temperature Control is Key:**  Use a meat thermometer to ensure you reach the safe internal temperature of 145°F (63°C) without overcooking.
      * **Don't Burn It!**  While some browning is desirable, charring creates those unhealthy compounds.  Pat meat dry before cooking to minimize steaming and promote browning.  Let the pan heat up properly before adding the meat to achieve a good sear.

      **Remember:**  Trim visible fat before cooking to reduce saturated fat content.  Let meat rest after cooking; this allows juices to redistribute, resulting in a more tender and flavorful final product.
      """,
}

http_response = requests.post(
    custom_evaluator_function_uri, headers=headers, json=json_data, timeout=70
)
# Surface a failing deployment as an explicit HTTP error (status code and URL) instead of an
# opaque JSON decoding failure on a non-JSON error page.
http_response.raise_for_status()

# `response` holds the decoded JSON body returned by the function.
response = http_response.json()
print(response)

#### Set optimization configuration

Now, we'll create a dictionary with our specific settings and use it to instantiate our `OptimizationConfig` class.

The `OptimizationConfig` class, built using `pydantic`, acts as a structured and validated blueprint for our optimization task. It ensures all necessary parameters are defined before we submit the job.

In this scenario, you have two additional parameters:

* `custom_metric_name` parameter, which lets you pass your own custom metric to optimize the prompt template.

* `custom_metric_cloud_function_name` parameter, which indicates the Cloud Function to call to collect the custom evaluation metric output.

For more advanced control, you can learn and explore more about all the parameters and how to best use them in the [detailed documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/data-driven-optimizer).


In [None]:
class OptimizationConfig(BaseModel):
    """
    Validated settings for a Vertex AI prompt optimization job.

    Required fields (declared with `...`): `system_instruction`, `prompt_template`,
    `project`, `eval_metrics_types`, `eval_metrics_weights`, `input_data_path` and
    `output_path`. Every other field has a default. Numeric fields carry `ge`/`le`
    bounds that pydantic enforces at construction time, and `model_post_init` checks
    that each evaluation metric has exactly one weight.
    """

    # Basic Configuration
    system_instruction: str = Field(
        ...,
        description="System instructions for the target model. String. This field is required.",
    )
    prompt_template: str = Field(
        ..., description="Template for prompts. String. This field is required."
    )
    target_model: str = Field(
        "gemini-2.5-flash",
        description='Target model for optimization. Supported models: "gemini-2.5-flash", "gemini-2.5-pro"',
    )
    # NOTE(review): the description reads "-1 means auto/no thinking"; confirm which of
    # the two behaviours the service actually applies for -1.
    thinking_budget: int = Field(
        -1,
        description="Thinking budget for thinking models. -1 means auto/no thinking. Integer.",
    )
    optimization_mode: str = Field(
        "instruction",
        description='Optimization mode. Supported modes: "instruction", "demonstration", "instruction_and_demo".',
    )
    project: str = Field(
        ..., description="Google Cloud project ID. This field is required."
    )

    # Evaluation Settings
    # Both lists are required; `...` makes that explicit, matching the other required fields.
    eval_metrics_types: List[str] = Field(
        ...,
        description='List of evaluation metrics. E.g., "bleu", "rouge_l", "safety".',
    )
    eval_metrics_weights: List[float] = Field(
        ...,
        description="Weights for evaluation metrics. Length must match eval_metrics_types and should sum to 1.",
    )
    aggregation_type: str = Field(
        "weighted_sum",
        description='Aggregation type for metrics. Supported: "weighted_sum", "weighted_average".',
    )
    # The two custom-metric fields default to empty strings, i.e. no custom metric configured.
    custom_metric_name: str = Field(
        "",
        description="Metric name, as defined by the key that corresponds in the dictionary returned from Cloud function. String.",
    )
    custom_metric_cloud_function_name: str = Field(
        "",
        description="Cloud Run function name you previously deployed. String.",
    )

    # Data and I/O Paths
    input_data_path: str = Field(
        ...,
        description="Cloud Storage URI to input optimization data. This field is required.",
    )
    output_path: str = Field(
        ...,
        description="Cloud Storage URI to save optimization results. This field is required.",
    )

    # (Optional) Advanced Configuration
    num_steps: int = Field(
        10,
        ge=10,
        le=20,
        description="Number of iterations in instruction optimization mode. Integer between 10 and 20.",
    )
    num_demo_set_candidates: int = Field(
        10,
        ge=10,
        le=30,
        description="Number of demonstrations evaluated. Integer between 10 and 30.",
    )
    demo_set_size: int = Field(
        3,
        ge=3,
        le=6,
        description="Number of demonstrations generated per prompt. Integer between 3 and 6.",
    )

    # (Optional) Model Locations and QPS
    target_model_location: str = Field(
        "us-central1", description="Location of the target model. Default us-central1."
    )
    target_model_qps: int = Field(
        1,
        ge=1,
        description="QPS for the target model. Integer >= 1, based on your quota.",
    )
    optimizer_model_location: str = Field(
        "us-central1",
        description="Location of the optimizer model. Default us-central1.",
    )
    optimizer_model_qps: int = Field(
        1,
        ge=1,
        description="QPS for the optimization model. Integer >= 1, based on your quota.",
    )
    source_model: str = Field(
        "",
        description="Google model previously used with these prompts. Not needed if providing a target column.",
    )
    source_model_location: str = Field(
        "us-central1", description="Location of the source model. Default us-central1."
    )
    # The only QPS field that may be left unset (None).
    source_model_qps: Optional[int] = Field(
        None, ge=1, description="Optional QPS for the source model. Integer >= 1."
    )
    eval_qps: int = Field(
        1,
        ge=1,
        description="QPS for the eval model. Integer >= 1, based on your quota.",
    )

    # (Optional) Response, Language, and Data Handling
    response_mime_type: str = Field(
        "text/plain",
        description="MIME response type from the target model. E.g., 'text/plain', 'application/json'.",
    )
    response_schema: str = Field(
        "", description="The Vertex AI Controlled Generation response schema."
    )
    language: str = Field(
        "English",
        description='Language of the system instructions. E.g., "English", "Japanese".',
    )
    placeholder_to_content: Dict[str, Any] = Field(
        {},
        description="Dictionary of placeholders to replace parameters in the system instruction.",
    )
    data_limit: int = Field(
        10,
        ge=5,
        le=100,
        description="Amount of data used for validation. Integer between 5 and 100.",
    )
    translation_source_field_name: str = Field(
        "",
        description="Field name for source text if using translation metrics (Comet, MetricX).",
    )
    has_multimodal_inputs: bool = Field(
        False, description="Whether the input data is multimodal."
    )

    def model_post_init(self, context: Any) -> None:
        """Check that every evaluation metric has exactly one weight.

        Runs after field validation, so a mismatched configuration fails when the
        model is constructed rather than after the config has been uploaded.

        Args:
            context: Validation context passed in by pydantic; unused here.

        Raises:
            ValueError: If `eval_metrics_types` and `eval_metrics_weights` differ in
                length (pydantic may surface it as a `ValidationError`).
        """
        metric_count = len(self.eval_metrics_types)
        weight_count = len(self.eval_metrics_weights)
        if metric_count != weight_count:
            raise ValueError(
                "eval_metrics_weights must contain one weight per entry in "
                f"eval_metrics_types (got {metric_count} metric(s) and "
                f"{weight_count} weight(s))."
            )

Set the optimization configuration.


In [None]:
# Optimization results are written under this prefix of the bucket.
output_path = f"{BUCKET_URI}/optimization_results/"

vapo_data_settings = dict(
    # What to optimize and for which model
    system_instruction=system_instruction,
    prompt_template=prompt_template,
    target_model="gemini-2.5-flash",
    thinking_budget=-1,
    optimization_mode="instruction",
    # Custom metric: the key returned by the Cloud Function and the function's name
    custom_metric_name="custom_engagement_personalization_score",
    custom_metric_cloud_function_name="custom_engagement_personalization_metric",
    # Metrics are paired with weights by position
    eval_metrics_types=["question_answering_correctness", "custom_metric"],
    eval_metrics_weights=[0.8, 0.2],
    aggregation_type="weighted_sum",
    # Data locations and project
    input_data_path=input_data_path,
    output_path=output_path,
    project=PROJECT_ID,
)

# Validate the settings through the pydantic model, then dump them back to a plain dict.
vapo_data_config = OptimizationConfig(**vapo_data_settings)
vapo_data_config_json = vapo_data_config.model_dump()

#### Upload configuration to Cloud Storage

Write the Prompt Optimizer configuration to a file in your Cloud Storage bucket.


In [None]:
# Destination of the serialized optimizer configuration inside the bucket.
config_path = f"{BUCKET_URI}/config.json"

# The context manager closes the file when the block exits, so no explicit close() is needed.
with epath.Path(config_path).open("w") as config_file:
    json.dump(vapo_data_config_json, config_file)

#### Run the prompt optimization job

This is the final step. We pass the path to our configuration file and the service account to the Vertex AI client. The `optimize` method starts the custom job on the Vertex AI backend. We set `wait_for_completion` to `True` so the script will pause until the job is finished.


In [None]:
# Run settings: where the uploaded config lives, whether to wait for completion,
# and the service account passed along with the job.
vapo_data_run_config = dict(
    config_path=config_path,
    wait_for_completion=True,
    service_account=SERVICE_ACCOUNT,
)

# Start the optimization with the "vapo" method and keep the call's return value.
result = client.prompt_optimizer.optimize(
    config=vapo_data_run_config,
    method="vapo",
)

### Get and use the best prompt programmatically

For use in an application, you can programmatically retrieve the top-performing instruction from the output files stored in GCS.


In [None]:
# Only the first element returned by get_best_vapo_results is needed; the second is discarded.
best_instruction, _ = get_best_vapo_results(output_path)

# Use a newline separator so the instruction starts at column 0; passing "...\n" as one of two
# print() arguments would put the default space separator in front of the instruction.
print("The optimized instruction is:", best_instruction, sep="\n")

## Cleaning up

In [None]:
# Cleanup toggles: set any of these to False to keep the corresponding resource.
delete_job = True
delete_bucket = True
delete_function = True

if delete_job:
    from google.cloud import aiplatform
    aiplatform.init(project=PROJECT_ID, location=LOCATION)
    custom_job_list = aiplatform.CustomJob.list()
    if custom_job_list:
        # NOTE(review): this deletes whichever custom job is first in the listing for the
        # project/location; presumably that is the most recent job — confirm the ordering and
        # that it is the job started by this notebook before running.
        latest_job = custom_job_list[0]
        latest_job.delete()

if delete_function:
    # Remove the 2nd-gen function deployed for the custom metric, without prompting.
    !gcloud functions delete 'custom_engagement_personalization_metric' --gen2 --region {LOCATION} --quiet

if delete_bucket:
    # Recursively remove BUCKET_URI, including the uploaded config and optimization results.
    ! gsutil -m rm -r $BUCKET_URI