In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get Started with Vertex AI Prompt Optimizer

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fprompts%2Fprompt_optimizer%2Fget_started_with_vertex_ai_prompt_optimizer.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/get_started_with_vertex_ai_prompt_optimizer.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Ivan Nardini](https://github.com/inardini) |

## Overview

When developing with large language models, crafting the perfect prompt—a process known as prompt engineering—is both an art and a science. It can be time-consuming and challenging to write prompts that consistently produce the desired results. Furthermore, as new and improved models are released, prompts that worked well before may need to be updated.

To address these challenges, Vertex AI offers the **Prompt Optimizer**, a prompt optimization tool to help you refine and enhance your prompts automatically. This notebook serves as a comprehensive guide to both of its  approaches: the **Zero-Shot Optimizer** and the **Data-Driven Optimizer**.

### The two approaches to prompt optimization

#### 1\. Zero-Shot Optimizer

This is your go-to tool for rapid prompt refinement and generation *without* needing an evaluation dataset.

  * **Generate from Scratch**: Simply describe a task in plain language, and it will generate a complete, well-structured system instruction for you.
  * **Refine Existing Prompts**: Provide an existing prompt, and it will rewrite it based on established best practices for clarity, structure, and effectiveness.

#### 2\. Data-Driven Optimizer

This tool performs a deep, performance-based optimization that uses your data to measure success.

  * **Tune for Performance**: You provide a dataset of sample inputs and expected outputs, and it systematically tests and rewrites your system instructions to find the version that scores highest on the evaluation metrics you define.
  * **Task-Specific**: It's the ideal choice when you want to fine-tune a prompt for a specific task and have data to prove what "better" looks like.

In this tutorial, we will walk through both methods. First, we'll explore the **Zero-Shot Optimizer** for quick, data-free improvements. Then, we'll dive deep into the **Data-Driven Optimizer**, learning how to leverage a dataset to achieve the best possible performance for a specific task.


## Get started

Before we can start optimizing, we need to set up our Python environment and configure our Google Cloud project.


### Install required packages

This command installs the necessary Python libraries.


In [None]:
%pip install "google-cloud-aiplatform>=1.108.0" "pydantic" "etils" "protobuf==4.25.3" "gradio" --force-reinstall --quiet

### Authenticate your notebook environment (Colab only)

If you are running this notebook in Google Colab, this cell handles authentication, allowing the notebook to securely access your Google Cloud resources.

In [None]:
import sys

# Run the interactive authentication flow only when the google.colab package
# is loaded, i.e. when the notebook is executing inside Colab.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

Here, we define essential variables for our Google Cloud project. The Prompt Optimizer job will run within a Google Cloud project. You need to [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com) and use the specified Cloud Storage bucket to read input data and write results.

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

PROJECT_NUMBER = !gcloud projects describe {PROJECT_ID} --format="get(projectNumber)"[0]
PROJECT_NUMBER = PROJECT_NUMBER[0]

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"

! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

import vertexai

client = vertexai.Client(project=PROJECT_ID, location=LOCATION)

### Service account and permissions

The Prompt Optimizer runs as a backend job that needs permission to perform actions on your behalf. We grant the necessary IAM roles to the default Compute Engine service account, which the job uses to operate.

  * `Vertex AI User`: Allows the job to call Vertex AI models.
  * `Storage Object Admin`: Allows the job to read your dataset from and write results to your GCS bucket.
  * `Artifact Registry Reader`: Allows the job to download necessary components.

[Check out the documentation](https://cloud.google.com/iam/docs/manage-access-service-accounts#iam-view-access-sa-gcloud) to learn how to grant those permissions to a single service account.

In [None]:
# Default Compute Engine service account of the project, derived from the
# project number; it is passed to the optimization job as its identity later on.
SERVICE_ACCOUNT = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

# Add one project-level IAM binding per role for that service account.
for role in ['aiplatform.user', 'storage.objectAdmin', 'artifactregistry.reader']:

    ! gcloud projects add-iam-policy-binding {PROJECT_ID} \
      --member=serviceAccount:{SERVICE_ACCOUNT} \
      --role=roles/{role} --condition=None

### Import libraries

In [None]:
import io
import json
import logging
import re
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import pandas as pd
from etils import epath
from google.cloud import storage
from pydantic import BaseModel, Field

logging.basicConfig(level=logging.INFO, force=True)
from IPython.display import Markdown, display

### Helpers

In [None]:
def format_demonstrations(demos: Any) -> List[str]:
    """Render demonstration records as display strings.

    Args:
        demos: A list of demonstrations, or a JSON string encoding such a list.

    Returns:
        One string per demonstration. Dict entries become "key: value" lines
        joined by newlines; any other entry is converted with ``str``. An empty
        list is returned when the string is not valid JSON or the (decoded)
        value is not a list.
    """
    parsed = demos
    if isinstance(parsed, str):
        try:
            parsed = json.loads(parsed)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return []

    if not isinstance(parsed, list):
        return []

    def _render(entry: Any) -> str:
        if not isinstance(entry, dict):
            return str(entry)
        return "\n".join(f"{key}: {value}" for key, value in entry.items())

    return [_render(entry) for entry in parsed]


def split_gcs_path(gcs_path: str) -> Tuple[str, str]:
    """Break a ``gs://bucket/prefix`` URI into ``(bucket, prefix)``.

    The prefix is everything after the first "/" following the bucket name and
    is "" when the URI names only a bucket.

    Raises:
        ValueError: If ``gcs_path`` does not start with ``gs://``.
    """
    scheme = "gs://"
    if gcs_path.startswith(scheme):
        bucket_name, _, prefix = gcs_path[len(scheme):].partition("/")
        return bucket_name, prefix

    raise ValueError(f"Invalid GCS path. Must start with gs://: {gcs_path}")


def list_gcs_objects(gcs_path: str) -> List[str]:
    """Return the names of all objects whose name starts with the path's prefix.

    Args:
        gcs_path: A ``gs://bucket/prefix`` URI.

    Returns:
        Object names as reported by the bucket listing (no ``gs://bucket/`` part).
    """
    bucket_name, prefix = parse_gcs_path(gcs_path)

    storage_client = storage.Client()
    blob_iterator = storage_client.bucket(bucket_name).list_blobs(prefix=prefix)

    object_names = []
    for blob in blob_iterator:
        object_names.append(blob.name)
    return object_names


def find_directories_with_files(
    base_path: str, required_files: List[str]
) -> List[str]:
    """Return the GCS directories under ``base_path`` that hold every required file.

    Args:
        base_path: A ``gs://bucket/prefix`` URI to search under.
        required_files: File names that must all sit directly in a directory
            for it to be reported.

    Returns:
        ``gs://bucket/dir`` URIs, in the order the directories were first seen
        in the object listing.
    """
    bucket_name, _ = split_gcs_path(base_path)

    # Map each directory to the set of file names found directly inside it.
    files_by_dir: Dict[str, set] = {}
    for object_name in list_gcs_objects(base_path):
        parent, _, leaf = object_name.rpartition("/")
        files_by_dir.setdefault(parent, set()).add(leaf)

    return [
        f"gs://{bucket_name}/{parent}"
        for parent, leaves in files_by_dir.items()
        if all(name in leaves for name in required_files)
    ]

def parse_gcs_path(gcs_path: str) -> Tuple[str, str]:
    """Return ``(bucket_name, prefix)`` for a ``gs://bucket/prefix`` URI.

    The prefix is "" when nothing follows the bucket name.

    Raises:
        ValueError: If ``gcs_path`` does not start with ``gs://``.
    """
    if not gcs_path.startswith("gs://"):
        raise ValueError("Invalid GCS path. Must start with gs://")

    # Split once so that any further "/" stays inside the prefix.
    segments = gcs_path[len("gs://"):].split("/", 1)
    if len(segments) == 1:
        return segments[0], ""
    return segments[0], segments[1]

def _pick_metric_column(
    columns: List[str], metric_name: Optional[str]
) -> Optional[str]:
    """Choose the column used to rank templates, or None if none qualifies."""
    metric_columns = [col for col in columns if "metric" in col and "mean" in col]
    if not metric_columns:
        return None

    if metric_name:
        return next((col for col in metric_columns if metric_name in col), None)

    # Without an explicit metric, prefer the composite score when it exists.
    composite_cols = [col for col in metric_columns if "composite_metric" in col]
    return composite_cols[0] if composite_cols else metric_columns[0]


def _load_best_template(
    templates_path: str, metric_name: Optional[str]
) -> Optional[Tuple[float, Any]]:
    """Read one templates.json and return (score, row) of its top-scoring entry.

    Returns None when the file is empty, has no usable metric column, or has
    no numeric score. I/O and JSON errors propagate to the caller.
    """
    with epath.Path(templates_path).open("r") as f:
        templates_data = json.load(f)

    if not templates_data:
        return None

    df = pd.json_normalize(templates_data)
    metric_col = _pick_metric_column(list(df.columns), metric_name)
    if metric_col is None:
        return None

    # Non-numeric or missing scores become NaN and are ignored by argmax.
    scores = pd.to_numeric(df[metric_col], errors="coerce")
    if scores.isna().all():
        return None

    best_idx = int(scores.argmax())
    return float(scores.iloc[best_idx]), df.iloc[best_idx]


def _row_instruction(row: Any) -> str:
    """Return the instruction text stored in a template row, or "" if absent."""
    for key in ("prompt", "instruction"):
        value = row[key] if key in row else None
        # Rows that lack the field hold NaN after normalization, so require a str.
        if isinstance(value, str) and value:
            return value.replace("store('answer', llm())", "{{llm()}}")
    return ""


def _row_demonstrations(row: Any) -> List[str]:
    """Return the formatted demonstrations stored in a template row, or []."""
    for key in ("demonstrations", "demo_set"):
        if key in row:
            formatted = format_demonstrations(row[key])
            if formatted:
                return formatted
    return []


def get_best_vapo_results(
    base_path: str, metric_name: Optional[str] = None
) -> Tuple[str, List[str]]:
    """Get the best system instruction and demonstrations across all VAPO runs.

    Every run directory under ``base_path`` (one containing both
    ``eval_results.json`` and ``templates.json``) is inspected, together with
    its optional ``instruction/`` and ``demonstration/`` sub-results. The
    template with the highest score wins, and both returned values are taken
    from that single winning entry, so results from different candidates are
    never mixed.

    Args:
        base_path: ``gs://`` URI under which the optimizer wrote its results.
        metric_name: Substring identifying the metric column to rank by. When
            omitted, the composite metric is used if present, otherwise the
            first metric column.

    Returns:
        ``(instruction, demonstrations)``; the instruction is "" and the list
        empty when the winning entry does not contain them.

    Raises:
        ValueError: If no run directory is found or no run yields a score.
    """
    required_files = ["eval_results.json", "templates.json"]
    runs = find_directories_with_files(base_path, required_files)

    if not runs:
        raise ValueError(f"No valid runs found in {base_path}")

    best_score = float("-inf")
    best_instruction = ""
    best_demonstrations: List[str] = []

    # (path relative to the run, whether a read failure invalidates the run)
    candidate_files = (
        ("templates.json", True),
        ("instruction/templates.json", False),
        ("demonstration/templates.json", False),
    )

    for run_path in runs:
        for relative_path, is_main in candidate_files:
            templates_path = f"{run_path}/{relative_path}"
            try:
                candidate = _load_best_template(templates_path, metric_name)
            except Exception as e:  # storage backends raise varied error types
                if is_main:
                    logging.warning(f"Error processing run {run_path}: {e}")
                    break
                # A missing optional file is expected; anything else is reported.
                if not isinstance(e, FileNotFoundError):
                    logging.warning(f"Skipping unreadable {templates_path}: {e}")
                continue

            if candidate is None:
                continue

            score, row = candidate
            if score > best_score:
                best_score = score
                best_instruction = _row_instruction(row)
                best_demonstrations = _row_demonstrations(row)

    if best_score == float("-inf"):
        raise ValueError("Could not find any valid results")

    return best_instruction, best_demonstrations

## **Part 1: Zero-Shot Optimizer**

We'll begin with the zero-shot approach. The following section will guide you through the process of optimizing your prompt without providing additional examples.

### Run a Zero-shot optimization job

To run a `Zero-shot optimization job`, you can use the `optimize_prompt` method. The service will use a research-based metaprompt to optimize your initial prompt.

**Notice:** The response also provides a more detailed output with `.applicable_guidelines`. This information is useful for understanding how your prompt was improved and can help you write better prompts.


In [None]:
# Starting prompt to improve; the zero-shot optimizer needs no dataset.
prompt = "You are a helpful assistant. Given a question with context, provide the correct answer to the question."
# Ask the prompt optimizer service for a rewritten version of the prompt.
response = client.prompt_optimizer.optimize_prompt(prompt=prompt)

In [None]:
# Render the suggested prompt returned by the optimizer as Markdown.
display(Markdown(response.suggested_prompt))

## **Part 2: The Data-Driven Optimizer**

The following sections will guide you through setting up your environment, preparing your data, and running an optimization job to find a better prompt using the data-driven optimizer.

### Preparing the Data and Running the Job

#### The prompt template to optimize

A prompt consists of two key parts:

* **System Instruction Template**, which is a fixed part of the prompt that controls or alters the model's behavior across all queries for a given task.

* **Prompt Template** which is a dynamic part of the prompt that changes based on the task. Prompt template includes context, task and more. To learn more, see [components of a prompt](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-design-strategies#components-of-a-prompt) in the official documentation.

In this scenario, you use Vertex AI prompt optimizer to optimize a simple system instruction template. And you use some examples in the remaining prompt template for evaluating different instruction templates along the optimization process.

> Important: Having the `target` placeholder in the prompt template is optional. It represents the prompt's ground truth response in your prompt optimization dataset that you aim to optimize your templates for. If you don't have the prompt's ground truth response, remember to set the `source_model` parameter in your prompt optimizer configuration (see below) instead of adding ground truth responses. Vertex AI prompt optimizer will then run your sample prompts on the source model to generate the ground truth responses for you.

In [None]:
# Initial system instruction; this is the text the data-driven optimizer rewrites.
system_instruction = """
Given a question with context, provide the correct answer to the question.
"""

# Per-example template. The {question}, {ctx} and {target} placeholders
# correspond to the fields of each row in the optimization dataset.
prompt_template = """
Some examples of correct answer to a question are:
Question: {question}
Context: {ctx}
Answer: {target}
"""

#### The optimization dataset

The optimizer's performance depends heavily on the quality of your sample data.

For this example, we use a question-answering dataset where each row contains a `question`, context (`ctx`), and a ground-truth `target` answer. The `{target}` variable is crucial for computation-based evaluation metrics like `question_answering_correctness`.

> Important: For effective **prompt optimization**, provide a dataset of examples on which your model performs poorly with the current system instruction template. For reliable results, use 50-100 distinct samples. In the case of **prompt migration**, consider using the source model to label examples that the target model struggles with, helping to identify areas for improvement.


In [None]:
# JSON Lines dataset (one record per line) used to evaluate candidate prompts.
input_data_path = "gs://github-repo/prompts/prompt_optimizer/rag_qa_dataset.jsonl"
prompt_optimization_df = pd.read_json(input_data_path, lines=True)
# Preview the first rows to check the available fields.
prompt_optimization_df.head()

#### Set optimization configuration

Now, we'll create a dictionary with our specific settings and use it to instantiate our `OptimizationConfig` class. 

The `OptimizationConfig` class, built using `pydantic`, acts as a structured and validated blueprint for our optimization task. It ensures all necessary parameters are defined before we submit the job.

In [None]:
class OptimizationConfig(BaseModel):
    """
    A comprehensive prompt optimization configuration model.

    Instances are validated on creation: required fields must be provided,
    numeric fields must respect their declared bounds, and the evaluation
    metric and weight lists must have the same length. The validated model is
    dumped to a dictionary and uploaded as the job's JSON configuration.
    """

    # Basic Configuration
    system_instruction: str = Field(
        ...,
        description="System instructions for the target model. String. This field is required.",
    )
    prompt_template: str = Field(
        ..., description="Template for prompts. String. This field is required."
    )
    target_model: str = Field(
        "gemini-2.5-flash",
        description='Target model for optimization. Supported models: "gemini-2.5-flash", "gemini-2.5-pro"',
    )
    thinking_budget: int = Field(
        -1,
        description="Thinking budget for thinking models. -1 means auto/no thinking. Integer.",
    )
    optimization_mode: str = Field(
        "instruction",
        description='Optimization mode. Supported modes: "instruction", "demonstration", "instruction_and_demo".',
    )
    project: str = Field(
        ..., description="Google Cloud project ID. This field is required."
    )

    # Evaluation Settings
    eval_metrics_types: List[str] = Field(
        description='List of evaluation metrics. E.g., "bleu", "rouge_l", "safety".'
    )
    eval_metrics_weights: List[float] = Field(
        description="Weights for evaluation metrics. Length must match eval_metrics_types and should sum to 1."
    )
    aggregation_type: str = Field(
        "weighted_sum",
        description='Aggregation type for metrics. Supported: "weighted_sum", "weighted_average".',
    )
    custom_metric_name: str = Field(
        "",
        description="Metric name, as defined by the key that corresponds in the dictionary returned from Cloud function. String.",
    )
    custom_metric_cloud_function_name: str = Field(
        "",
        description="Cloud Run function name you previously deployed. String.",
    )

    # Data and I/O Paths
    input_data_path: str = Field(
        ...,
        description="Cloud Storage URI to input optimization data. This field is required.",
    )
    output_path: str = Field(
        ...,
        description="Cloud Storage URI to save optimization results. This field is required.",
    )

    # (Optional) Advanced Configuration
    num_steps: int = Field(
        10,
        ge=10,
        le=20,
        description="Number of iterations in instruction optimization mode. Integer between 10 and 20.",
    )
    num_demo_set_candidates: int = Field(
        10,
        ge=10,
        le=30,
        description="Number of demonstrations evaluated. Integer between 10 and 30.",
    )
    demo_set_size: int = Field(
        3,
        ge=3,
        le=6,
        description="Number of demonstrations generated per prompt. Integer between 3 and 6.",
    )

    # (Optional) Model Locations and QPS
    target_model_location: str = Field(
        "us-central1", description="Location of the target model. Default us-central1."
    )
    target_model_qps: int = Field(
        1,
        ge=1,
        description="QPS for the target model. Integer >= 1, based on your quota.",
    )
    optimizer_model_location: str = Field(
        "us-central1",
        description="Location of the optimizer model. Default us-central1.",
    )
    optimizer_model_qps: int = Field(
        1,
        ge=1,
        description="QPS for the optimization model. Integer >= 1, based on your quota.",
    )
    source_model: str = Field(
        "",
        description="Google model previously used with these prompts. Not needed if providing a target column.",
    )
    source_model_location: str = Field(
        "us-central1", description="Location of the source model. Default us-central1."
    )
    source_model_qps: Optional[int] = Field(
        None, ge=1, description="Optional QPS for the source model. Integer >= 1."
    )
    eval_qps: int = Field(
        1,
        ge=1,
        description="QPS for the eval model. Integer >= 1, based on your quota.",
    )

    # (Optional) Response, Language, and Data Handling
    response_mime_type: str = Field(
        "text/plain",
        description="MIME response type from the target model. E.g., 'text/plain', 'application/json'.",
    )
    response_schema: str = Field(
        "", description="The Vertex AI Controlled Generation response schema."
    )
    language: str = Field(
        "English",
        description='Language of the system instructions. E.g., "English", "Japanese".',
    )
    placeholder_to_content: Dict[str, Any] = Field(
        {},
        description="Dictionary of placeholders to replace parameters in the system instruction.",
    )
    data_limit: int = Field(
        10,
        ge=5,
        le=100,
        description="Amount of data used for validation. Integer between 5 and 100.",
    )
    translation_source_field_name: str = Field(
        "",
        description="Field name for source text if using translation metrics (Comet, MetricX).",
    )
    has_multimodal_inputs: bool = Field(
        False, description="Whether the input data is multimodal."
    )

    def model_post_init(self, __context: Any) -> None:
        """Check cross-field constraints once all fields have been validated.

        Raises:
            ValueError: If ``eval_metrics_types`` and ``eval_metrics_weights``
                do not contain the same number of entries.
        """
        # Each metric needs exactly one weight; catch a mismatch here instead
        # of after a long-running optimization job has been submitted.
        if len(self.eval_metrics_types) != len(self.eval_metrics_weights):
            raise ValueError(
                "eval_metrics_weights must have the same length as "
                f"eval_metrics_types (got {len(self.eval_metrics_weights)} weights "
                f"for {len(self.eval_metrics_types)} metrics)"
            )

Set the optimization configuration.

In [None]:
# Results of the optimization job are written under this bucket folder.
output_path = BUCKET_URI + "/optimization_results/"

# Settings for this run; anything not listed keeps the model's default.
vapo_data_settings = dict(
    system_instruction=system_instruction,
    prompt_template=prompt_template,
    target_model="gemini-2.5-flash",
    thinking_budget=-1,
    optimization_mode="instruction",
    eval_metrics_types=["question_answering_correctness", "fluency"],
    eval_metrics_weights=[0.8, 0.2],
    aggregation_type="weighted_sum",
    input_data_path=input_data_path,
    output_path=output_path,
    project=PROJECT_ID,
)

# Validate the settings, then turn them into a plain dict for JSON serialization.
vapo_data_config = OptimizationConfig(**vapo_data_settings)
vapo_data_config_json = vapo_data_config.model_dump()

#### Upload configuration to Cloud Storage

Write the Prompt Optimizer configuration to the file in your GCS bucket.


In [None]:
config_path = f"{BUCKET_URI}/config.json"

# The `with` block closes the file when it exits, so no explicit close() is needed.
with epath.Path(config_path).open("w") as config_file:
    json.dump(vapo_data_config_json, config_file)

#### Run the prompt optimization job

This is the final step. We pass the path to our configuration file and the service account to the Vertex AI client. The `optimize` method starts the custom job on the Vertex AI backend. We set `wait_for_completion` to `True` so the script will pause until the job is finished.


In [None]:
# Job arguments: the uploaded configuration file, whether to block until the
# job finishes, and the service account the job runs as.
vapo_data_run_config = {
    "config_path": config_path,
    "wait_for_completion": True,
    "service_account": SERVICE_ACCOUNT,
}

# Start the data-driven ("vapo") optimization job.
result = client.prompt_optimizer.optimize(method="vapo", config=vapo_data_run_config)

### Get and use the best prompt programmatically

For use in an application, you can programmatically retrieve the top-performing instruction from the output files stored in GCS.


In [None]:
# get_best_vapo_results returns (instruction, demonstrations); only the
# instruction is used here, so the demonstrations are discarded.
best_instruction, _ = get_best_vapo_results(output_path)
print("The optimized instruction is:\n", best_instruction)

## (Optionally) Visualize results with the interactive app

The tutorial includes a helper function to launch a Gradio-based web interface. This is a great way to visually explore all the different results the optimizer generated with both prompt optimization approaches.

### Build the Gradio application

In [None]:
def find_vapo_runs(base_path: str) -> List[str]:
    """Return the GCS directories under ``base_path`` that hold a complete run.

    A directory qualifies when it directly contains both
    ``eval_results.json`` and ``templates.json``.

    Args:
        base_path: GCS location to search; it is passed unchanged to
            ``parse_gcs_path`` and ``list_gcs_objects``.

    Returns:
        One ``gs://<bucket>/<directory>`` URI per qualifying directory, in the
        order the directories were first seen in the object listing.
    """
    required = {"eval_results.json", "templates.json"}

    bucket_name, _ = parse_gcs_path(base_path)

    # Map each directory to the set of file names found directly inside it.
    files_by_dir = {}
    for object_name in list_gcs_objects(base_path):
        parent, _, leaf = object_name.rpartition("/")
        files_by_dir.setdefault(parent, set()).add(leaf)

    return [
        f"gs://{bucket_name}/{parent}"
        for parent, leaves in files_by_dir.items()
        if required <= leaves
    ]


# ============================================================================
# Data Processing
# ============================================================================


def extract_metric_name(column_name: str) -> str:
    """Extract the bare metric name from a column label.

    Args:
        column_name: Column label such as ``"metrics.fluency/mean"``.

    Returns:
        The word between a ``.`` and the following ``/`` when the label
        contains that pattern. Otherwise the text after the last ``.`` up to
        the first ``/``; for a label with neither separator this is the label
        itself.
    """
    # Preferred form: ".metric_name/".
    match = re.search(r"\.(\w+)/", column_name)
    if match:
        return match.group(1)

    # str.split always returns at least one element, so this fallback covers
    # every remaining input and no further default is required.
    return column_name.split(".")[-1].split("/")[0]


def is_metric_column(col: str) -> bool:
    """Tell whether ``col`` names a metric score, confidence or explanation.

    A column qualifies when it ends with ``score``, ``confidence`` or
    ``explanation`` directly preceded by ``/`` or ``.``.
    """
    suffix_match = re.search(r"[/\.](score|confidence|explanation)$", col)
    return suffix_match is not None


def is_core_field(col: str) -> bool:
    """Tell whether ``col`` is one of the plain (non-metric) core fields.

    The comparison is case-insensitive and requires an exact name match;
    anything recognised by ``is_metric_column`` is rejected first.
    """
    normalized = col.lower()

    # Metric columns are never treated as core fields.
    if is_metric_column(col):
        return False

    core_names = (
        "question",
        "target",
        "ctx",
        "context",
        "prompt",
        "response",
        "reference",
    )
    return normalized in core_names


def is_duplicate_text_field(col: str, all_columns: List[str]) -> bool:
    """Decide whether ``col`` is a redundant text column that should be hidden.

    Args:
        col: Column name under inspection.
        all_columns: Every column of the table; used to look for a real
            metric column that matches a metric-like name.

    Returns:
        ``False`` for metric columns. ``True`` when the lower-cased name
        starts with ``<core>_`` or contains ``_<core>`` for a core field, or
        when it equals / ends with ``_<metric>`` for one of the common metric
        names and some metric column mentions that metric. ``False``
        otherwise.
    """
    # Real metric columns (score / confidence / explanation) are always kept.
    if is_metric_column(col):
        return False

    lowered = col.lower()

    core_names = (
        "question",
        "target",
        "ctx",
        "context",
        "prompt",
        "response",
        "reference",
    )
    # Names such as question_fluency or question_answering_correctness are
    # variants of a core field rather than the field itself.
    for name in core_names:
        if lowered.startswith(name + "_") or ("_" + name) in lowered:
            return lowered != name

    # Metric-like names without a /score, /confidence or /explanation suffix
    # count as duplicates only when a real metric column exists for them.
    for metric in ("fluency", "correctness", "relevance", "coherence"):
        if lowered != metric and not lowered.endswith("_" + metric):
            continue
        if any(
            metric in other.lower() and is_metric_column(other)
            for other in all_columns
        ):
            return True

    return False


def filter_columns_for_display(columns: List[str]) -> List[str]:
    """Keep core fields and metric columns, dropping duplicate text columns.

    ``ctx`` and ``context`` are treated as the same information: ``ctx`` wins
    whenever both are present, regardless of their order.

    Args:
        columns: All column names of an evaluation table.

    Returns:
        The retained column names in their original relative order.
    """
    kept = []
    context_slot_taken = False

    for column in columns:
        if is_duplicate_text_field(column, columns):
            continue

        if not is_core_field(column):
            # Non-core columns survive only if they are real metric columns.
            if is_metric_column(column):
                kept.append(column)
            continue

        name = column.lower()
        if name == "ctx":
            # Prefer ctx: evict a previously accepted context column.
            kept = [c for c in kept if c.lower() != "context"]
            kept.append(column)
            context_slot_taken = True
        elif name == "context":
            # Accept context only while neither it nor ctx has been taken.
            if not context_slot_taken and not any(
                c.lower() == "ctx" for c in kept
            ):
                kept.append(column)
                context_slot_taken = True
        else:
            kept.append(column)

    return kept


def process_evaluation_result(result: Dict[str, Any]) -> pd.DataFrame:
    """Parse the JSON text stored under ``result["metrics_table"]``.

    Args:
        result: Evaluation entry holding a JSON-encoded table in its
            ``metrics_table`` key.

    Returns:
        The table as a DataFrame, exactly as ``pandas.read_json`` decodes it.
    """
    metrics_json = result["metrics_table"]
    return pd.read_json(io.StringIO(metrics_json))


def categorize_columns(columns: List[str]) -> Dict[str, List[str]]:
    """Group column names into display categories.

    Args:
        columns: Column names to classify.

    Returns:
        Mapping from category label to its columns, in the fixed order
        Core Fields, Metric Scores, Metric Confidence, Metric Explanations.
        Categories without any column are omitted, and columns that match no
        category are left out.
    """
    suffix_groups = (
        ("Metric Scores", ("/score", ".score")),
        ("Metric Confidence", ("/confidence", ".confidence")),
        ("Metric Explanations", ("/explanation", ".explanation")),
    )

    grouped = {"Core Fields": []}
    for label, _ in suffix_groups:
        grouped[label] = []

    for column in columns:
        if is_core_field(column):
            grouped["Core Fields"].append(column)
            continue
        for label, suffixes in suffix_groups:
            if column.endswith(suffixes):
                grouped[label].append(column)
                break

    # Drop the categories that ended up empty.
    return {label: members for label, members in grouped.items() if members}


def get_default_columns(all_columns: List[str]) -> List[str]:
    """Pick the columns shown by default, in a fixed preference order.

    Args:
        all_columns: Candidate column names.

    Returns:
        For each of question, target, ctx, prompt, response and reference,
        the first column whose lower-cased name matches. When no ``ctx``
        column was selected, the first ``context`` column is appended instead.
    """
    preferred = ("question", "target", "ctx", "prompt", "response", "reference")

    def first_named(name, taken):
        # First column matching `name` case-insensitively and not yet chosen.
        return next(
            (c for c in all_columns if c.lower() == name and c not in taken),
            None,
        )

    chosen = []
    for name in preferred:
        hit = first_named(name, chosen)
        if hit is not None:
            chosen.append(hit)

    # Fall back to 'context' only when 'ctx' is absent, never showing both.
    if all(c.lower() != "ctx" for c in chosen):
        fallback = first_named("context", chosen)
        if fallback is not None:
            chosen.append(fallback)

    return chosen


def simplify_dataframe_for_metrics(
    df: pd.DataFrame, selected_columns: List[str]
) -> pd.DataFrame:
    """Project ``df`` onto the selected columns and tidy values for display.

    Args:
        df: Full evaluation table.
        selected_columns: Columns requested for display; names missing from
            ``df`` are ignored.

    Returns:
        ``df`` itself when it is empty or nothing is selected; an empty
        DataFrame when none of the selected columns exist; otherwise a copy
        limited to the existing selected columns, with float32/float64 score
        and confidence columns rounded to 3 decimals and explanation strings
        longer than 200 characters cut to 200 characters plus ``"..."``.
    """
    if df.empty or not selected_columns:
        return df

    present = [name for name in selected_columns if name in df.columns]
    if not present:
        return pd.DataFrame()

    view = df[present].copy()

    numeric_markers = ("/score", ".score", "/confidence", ".confidence")

    def shorten(cell):
        # Only long strings are truncated; every other value passes through.
        if isinstance(cell, str) and len(cell) > 200:
            return cell[:200] + "..."
        return cell

    for name in view.columns:
        is_float = view[name].dtype in ("float64", "float32")
        if is_float and any(marker in name for marker in numeric_markers):
            view[name] = view[name].round(3)

        if "/explanation" in name or ".explanation" in name:
            view[name] = view[name].apply(shorten)

    return view


# ============================================================================
# Zero-Shot Optimization
# ============================================================================


class ZeroShotOptimizer:
    """Wrapper around the Vertex AI client for zero-shot prompt optimization.

    ``client`` stays ``None`` until ``initialize`` succeeds; ``optimize``
    returns an error payload instead of calling the API while it is unset.
    """

    def __init__(self):
        self.client = None
        self.project_name = None
        self.location = None

    def initialize(self, project_name: str, location: str = "us-central1") -> str:
        """Create the Vertex AI client for ``project_name`` in ``location``.

        Args:
            project_name: Google Cloud project to use.
            location: Region passed to ``vertexai.init`` and ``vertexai.Client``.

        Returns:
            A status message for the UI: a success line, or an error line
            carrying the exception text. Exceptions are not propagated.
        """
        try:
            vertexai.init(project=project_name, location=location)
            client = vertexai.Client(project=project_name, location=location)
        except Exception as e:
            return f"❌ Error: {str(e)}"

        # Record the settings only once the client exists, so the stored
        # project/location always describe the client actually in use.
        self.client = client
        self.project_name = project_name
        self.location = location
        return f"✅ Successfully initialized for project: {project_name} in location: {location}"

    @staticmethod
    def _failure(message: str, prompt: str) -> Dict[str, Any]:
        """Build the payload returned when no optimized prompt is available."""
        return {
            "error": message,
            "original_prompt": prompt,
            "suggested_prompt": "",
            "guidelines": [],
        }

    def optimize(self, prompt: str) -> Dict[str, Any]:
        """Run zero-shot optimization on ``prompt``.

        Args:
            prompt: Prompt text to optimize.

        Returns:
            On success, a dict with ``original_prompt``, ``suggested_prompt``,
            ``optimization_type``, ``guidelines`` (a list of dicts with
            ``guideline``, ``improvement``, ``before`` and ``after``) and
            ``error`` set to ``None``. When the client is missing or the call
            raises, a dict whose ``error`` holds the message, with an empty
            suggestion and no guidelines.
        """
        if not self.client:
            return self._failure(
                "Client not initialized. Please set up the project first.", prompt
            )

        try:
            output = self.client.prompt_optimizer.optimize_prompt(prompt=prompt)

            guidelines = [
                {
                    "guideline": g.applicable_guideline,
                    "improvement": g.suggested_improvement,
                    "before": g.text_before_change,
                    "after": g.text_after_change,
                }
                for g in output.applicable_guidelines
            ]

            return {
                "original_prompt": output.original_prompt,
                "suggested_prompt": output.suggested_prompt,
                "optimization_type": output.optimization_type,
                "guidelines": guidelines,
                "error": None,
            }
        except Exception as e:
            # Report the failure to the UI instead of raising.
            return self._failure(f"Optimization failed: {str(e)}", prompt)


# ============================================================================
# UI Components
# ============================================================================


class VAPOResultsViewer:
    """Main application class for VAPO results viewer."""

    def __init__(self) -> None:
        """Create the zero-shot optimizer wrapper and start with empty state."""
        # Wrapper holding the Vertex AI client used for zero-shot optimization.
        self.optimizer = ZeroShotOptimizer()
        # Defines the attributes that track runs, templates and columns.
        self.reset_state()

    def reset_state(self):
        """Reset all application state."""
        self.base_path = None
        self.runs = []
        self.templates = []
        self.eval_results = []
        self.current_run = None
        self.current_eval_full = None  # Store full evaluation data
        self.available_columns = []  # Store available columns
        self.filtered_columns = []  # Store filtered columns

    # ========================================================================
    # Zero-Shot Optimization Methods
    # ========================================================================

    def setup_vertex_ai(self, project_name: str, location: str) -> str:
        """Setup Vertex AI client."""
        if not project_name:
            return "‚ö†Ô∏è Please enter a project name"
        if not location:
            location = "us-central1"  # Default location
        return self.optimizer.initialize(project_name, location)

    def optimize_prompt(self, prompt: str) -> Tuple:
        """Run zero-shot optimization and format results."""
        if not prompt:
            return (
                "‚ö†Ô∏è Please enter a prompt to optimize",
                pd.DataFrame(),
                "",
                pd.DataFrame(),
            )

        results = self.optimizer.optimize(prompt)
        return self._format_optimization_results(results)

    def _format_optimization_results(
        self, results: Dict[str, Any]
    ) -> Tuple[str, pd.DataFrame, str, pd.DataFrame]:
        """Format optimization results for display."""
        if results.get("error"):
            return results["error"], pd.DataFrame(), "", pd.DataFrame()

        # Create summary
        summary = self._create_optimization_summary(results)

        # Format guidelines table
        guidelines_df = self._create_guidelines_dataframe(results["guidelines"])

        # Format changes table
        changes_df = self._create_changes_dataframe(results["guidelines"])

        return summary, guidelines_df, results["suggested_prompt"], changes_df

    def _create_optimization_summary(self, results: Dict[str, Any]) -> str:
        """Create formatted summary of optimization results."""
        original_len = len(results["original_prompt"])
        optimized_len = len(results["suggested_prompt"])
        change_percent = (
            (optimized_len / original_len * 100 - 100) if original_len > 0 else 0
        )

        guidelines_list = "\n".join(
            f"{i}. **{g['guideline']}**" for i, g in enumerate(results["guidelines"], 1)
        )

        return f"""
## üìä Optimization Results

### Summary
- **Type**: `{results.get('optimization_type', 'zero_shot_prompt_optimization')}`
- **Guidelines Applied**: **{len(results['guidelines'])}**
- **Original Length**: {original_len} characters
- **Optimized Length**: {optimized_len} characters
- **Change**: {optimized_len - original_len:+d} chars ({change_percent:+.1f}%)

### Applied Guidelines
{guidelines_list if guidelines_list else "No guidelines applied."}
        """

    def _create_guidelines_dataframe(self, guidelines: List[Dict]) -> pd.DataFrame:
        """Create DataFrame for guidelines display."""
        if not guidelines:
            return pd.DataFrame()

        data = [
            {
                "#": i,
                "Guideline": g["guideline"],
                "Improvement": g["improvement"],
            }
            for i, g in enumerate(guidelines, 1)
        ]

        return pd.DataFrame(data)

    def _create_changes_dataframe(self, guidelines: List[Dict]) -> pd.DataFrame:
        """Create DataFrame for before/after changes."""
        if not guidelines:
            return pd.DataFrame()

        data = [
            {"#": i, "Before": g["before"] or "N/A", "After": g["after"] or "N/A"}
            for i, g in enumerate(guidelines, 1)
        ]

        return pd.DataFrame(data)

    # ========================================================================
    # Data-Driven Optimization Methods
    # ========================================================================

    def load_runs(self, base_path: str) -> gr.Dropdown:
        """Look up the runs below ``base_path`` and build the run dropdown.

        Args:
            base_path: GCS path entered by the user.

        Returns:
            A dropdown listing the discovered runs with the first one
            selected, or an empty dropdown when the path is blank, nothing is
            found, or the lookup raises (the error is printed).
        """

        def empty_dropdown():
            return gr.Dropdown(choices=[], value=None)

        if not base_path:
            return empty_dropdown()

        try:
            self.base_path = base_path
            self.runs = find_vapo_runs(base_path)

            return (
                gr.Dropdown(choices=self.runs, value=self.runs[0])
                if self.runs
                else empty_dropdown()
            )

        except Exception as e:
            print(f"Error loading runs: {e}")
            return empty_dropdown()

    def load_run_data(
        self, run_path: str
    ) -> Tuple[gr.Dropdown, pd.DataFrame, pd.DataFrame, gr.CheckboxGroup]:
        """Load templates and evaluation results of one run and show the first.

        Args:
            run_path: Directory holding ``templates.json`` and
                ``eval_results.json``.

        Returns:
            ``(template dropdown, template table, evaluation table, column
            selector)``. All four are empty when no path is given, when no
            template option can be built, or when loading fails (the error is
            printed).
        """

        def blank_outputs():
            return (
                gr.Dropdown(choices=[], value=None),
                pd.DataFrame(),
                pd.DataFrame(),
                gr.CheckboxGroup(choices=[], value=[]),
            )

        if not run_path:
            return blank_outputs()

        try:
            self.current_run = run_path

            raw_templates = self._load_json(f"{run_path}/templates.json")
            raw_results = self._load_json(f"{run_path}/eval_results.json")

            self.templates = [pd.json_normalize(item) for item in raw_templates]
            self.eval_results = [
                process_evaluation_result(item) for item in raw_results
            ]

            # Exactly one surplus template is tolerated by dropping the first
            # one; any other size difference is treated as an error.
            template_count = len(self.templates)
            result_count = len(self.eval_results)
            if template_count == result_count + 1:
                self.templates = self.templates[1:]
            elif template_count != result_count:
                raise ValueError(
                    f"Mismatch: {template_count} templates vs "
                    f"{result_count} results"
                )

            labels = self._create_template_options()
            if not labels:
                return blank_outputs()

            template_df, eval_df, column_selector = self._get_template_data(0)
            return (
                gr.Dropdown(choices=labels, value=labels[0]),
                template_df,
                eval_df,
                column_selector,
            )

        except Exception as e:
            print(f"Error loading run data: {e}")
            return blank_outputs()

    def _load_json(self, path: str) -> Any:
        """Open ``path`` through ``epath`` and return the decoded JSON content."""
        with epath.Path(path).open("r") as json_file:
            parsed = json.load(json_file)
        return parsed

    def _create_template_options(self) -> List[str]:
        """Create dropdown options for templates."""
        options = []

        for i, template_df in enumerate(self.templates):
            metrics = []

            for col in template_df.columns:
                if "metric" in col and "mean" in col:
                    value = template_df[col].iloc[0]
                    metric_name = extract_metric_name(col)
                    metrics.append(f"{metric_name}: {value:.3f}")

            metrics_str = " | ".join(metrics) if metrics else "No metrics"
            options.append(f"Template {i} - {metrics_str}")

        return options

    def _get_template_data(
        self, index: int
    ) -> Tuple[pd.DataFrame, pd.DataFrame, gr.CheckboxGroup]:
        """Prepare the display data for the template at ``index``.

        Also stores the selected evaluation table and its column lists on the
        instance (``current_eval_full``, ``available_columns``,
        ``filtered_columns``) so later column-selection callbacks can use them.

        Args:
            index: Position in ``self.templates`` / ``self.eval_results``.

        Returns:
            ``(template table, evaluation table, column selector)``; all
            empty when ``index`` is out of range.
        """
        if not (0 <= index < len(self.templates)):
            return (
                pd.DataFrame(),
                pd.DataFrame(),
                gr.CheckboxGroup(choices=[], value=[]),
            )

        # Transpose so that every template field becomes its own row.
        template_df = self.templates[index].T.reset_index()
        template_df.columns = ["Field", "Value"]

        # Keep the untouched evaluation table for later re-filtering.
        self.current_eval_full = self.eval_results[index]
        self.available_columns = list(self.current_eval_full.columns)

        # Only core fields and metric score/confidence/explanation columns
        # are offered in the selector.
        self.filtered_columns = filter_columns_for_display(self.available_columns)

        # Columns ticked by default (the core fields only).
        default_columns = get_default_columns(self.filtered_columns)

        # One "<category>: <count> columns" fragment per non-empty category.
        categorized = categorize_columns(self.filtered_columns)
        info_text = " | ".join(
            f"📁 {category}: {len(cols)} columns"
            for category, cols in categorized.items()
        )

        eval_df = simplify_dataframe_for_metrics(
            self.current_eval_full, default_columns
        )

        column_selector = gr.CheckboxGroup(
            choices=self.filtered_columns,
            value=default_columns,
            label="Select Columns to Display",
            info=info_text,
        )

        return template_df, eval_df, column_selector

    def display_template(
        self, template_selection: str
    ) -> Tuple[pd.DataFrame, pd.DataFrame, gr.CheckboxGroup]:
        """Show the template named by a dropdown label.

        Args:
            template_selection: Label whose second whitespace-separated token
                is the template index (``Template <index> - ...``).

        Returns:
            The output of ``_get_template_data`` for that index, or three
            empty values when the label is blank or handling it raises
            ``ValueError`` / ``IndexError``.
        """

        def nothing_to_show():
            return (
                pd.DataFrame(),
                pd.DataFrame(),
                gr.CheckboxGroup(choices=[], value=[]),
            )

        if not template_selection:
            return nothing_to_show()

        try:
            label_tokens = template_selection.split()
            return self._get_template_data(int(label_tokens[1]))
        except (ValueError, IndexError):
            return nothing_to_show()

    def update_evaluation_display(self, selected_columns: List[str]) -> pd.DataFrame:
        """Update evaluation display based on selected columns."""
        if not selected_columns or self.current_eval_full is None:
            return pd.DataFrame()

        return simplify_dataframe_for_metrics(self.current_eval_full, selected_columns)

    def add_metric_scores(
        self, current_columns: List[str]
    ) -> Tuple[List[str], pd.DataFrame]:
        """Add all metric score columns to current selection."""
        if not self.filtered_columns:
            return current_columns, pd.DataFrame()

        # Get all metric score columns only
        metric_columns = []
        for col in self.filtered_columns:
            if col.endswith("/score") or col.endswith(".score"):
                metric_columns.append(col)

        # Combine with current selection
        updated = list(set(current_columns + metric_columns))

        # Sort to maintain consistent order
        updated.sort(
            key=lambda x: self.filtered_columns.index(x)
            if x in self.filtered_columns
            else len(self.filtered_columns)
        )

        # Update display
        df = self.update_evaluation_display(updated)

        return updated, df

    def add_metric_confidence(
        self, current_columns: List[str]
    ) -> Tuple[List[str], pd.DataFrame]:
        """Add all metric confidence columns to current selection."""
        if not self.filtered_columns:
            return current_columns, pd.DataFrame()

        # Get all metric confidence columns
        confidence_columns = []
        for col in self.filtered_columns:
            if col.endswith("/confidence") or col.endswith(".confidence"):
                confidence_columns.append(col)

        # Combine with current selection
        updated = list(set(current_columns + confidence_columns))

        # Sort to maintain consistent order
        updated.sort(
            key=lambda x: self.filtered_columns.index(x)
            if x in self.filtered_columns
            else len(self.filtered_columns)
        )

        # Update display
        df = self.update_evaluation_display(updated)

        return updated, df

    def add_metric_explanations(
        self, current_columns: List[str]
    ) -> Tuple[List[str], pd.DataFrame]:
        """Add all metric explanation columns to current selection."""
        if not self.filtered_columns:
            return current_columns, pd.DataFrame()

        # Get all metric explanation columns
        explanation_columns = []
        for col in self.filtered_columns:
            if col.endswith("/explanation") or col.endswith(".explanation"):
                explanation_columns.append(col)

        # Combine with current selection
        updated = list(set(current_columns + explanation_columns))

        # Sort to maintain consistent order
        updated.sort(
            key=lambda x: self.filtered_columns.index(x)
            if x in self.filtered_columns
            else len(self.filtered_columns)
        )

        # Update display
        df = self.update_evaluation_display(updated)

        return updated, df

    def reset_to_default_columns(self) -> Tuple[List[str], pd.DataFrame]:
        """Reset column selection to defaults."""
        if not self.filtered_columns:
            return [], pd.DataFrame()

        default_cols = get_default_columns(self.filtered_columns)
        df = self.update_evaluation_display(default_cols)

        return default_cols, df

    def select_only_scores(self) -> Tuple[List[str], pd.DataFrame]:
        """Select only core fields and metric scores."""
        if not self.filtered_columns:
            return [], pd.DataFrame()

        # Get core fields
        core_fields = []
        for col in self.filtered_columns:
            if is_core_field(col):
                core_fields.append(col)

        # Get metric scores only
        score_columns = []
        for col in self.filtered_columns:
            if col.endswith("/score") or col.endswith(".score"):
                score_columns.append(col)

        selected = core_fields + score_columns
        df = self.update_evaluation_display(selected)

        return selected, df

    def clear_all(self) -> Tuple:
        """Reset the stored state and return blank values for 13 UI outputs.

        The first six values belong to the data-driven view, the remaining
        seven to the setup section and the zero-shot view; the trailing
        comments name the component each value is meant for.
        """
        self.reset_state()

        blank_text = ""

        data_driven_outputs = (
            blank_text,  # base_path_input
            gr.Dropdown(choices=[], value=None),  # run_dropdown
            gr.Dropdown(choices=[], value=None),  # template_dropdown
            pd.DataFrame(),  # template_display
            pd.DataFrame(),  # eval_display
            gr.CheckboxGroup(choices=[], value=[]),  # column_selector
        )

        zero_shot_outputs = (
            blank_text,  # project_input
            blank_text,  # location_input
            blank_text,  # prompt_input
            blank_text,  # optimization_summary
            pd.DataFrame(),  # guidelines_table
            blank_text,  # suggested_prompt_display
            pd.DataFrame(),  # changes_table
        )

        return data_driven_outputs + zero_shot_outputs

    # ========================================================================
    # UI Creation
    # ========================================================================

    def create_interface(self) -> gr.Blocks:
        """Create the Gradio interface.

        Builds a ``gr.Blocks`` app titled "VAPO Results Viewer" with the Soft
        theme and custom CSS, containing the header, the Vertex AI setup
        section and the zero-shot and data-driven tabs.

        Returns:
            The assembled ``gr.Blocks`` object; it is not launched here.
        """
        # Styles for the CSS classes used by this app's components.
        css = """
        .markdown-text {
            font-size: 14px;
            line-height: 1.6;
        }
        .prompt-text {
            font-family: 'Courier New', monospace;
            white-space: pre-wrap;
            word-wrap: break-word;
            background-color: #f5f5f5;
            padding: 10px;
            border-radius: 5px;
        }
        .column-selector {
            max-height: 300px;
            overflow-y: auto;
            border: 1px solid #ddd;
            padding: 10px;
            border-radius: 5px;
        }
        .metric-info {
            background-color: #e3f2fd;
            padding: 8px;
            border-radius: 4px;
            margin: 5px 0;
        }
        """

        # Components are created inside the Blocks context in display order.
        with gr.Blocks(
            title="VAPO Results Viewer", theme=gr.themes.Soft(), css=css
        ) as interface:

            self._create_header()
            self._create_setup_section()

            # Horizontal rule between the setup section and the tabs.
            gr.Markdown("---")

            with gr.Tabs():
                self._create_zero_shot_tab()
                self._create_data_driven_tab()

        return interface

    def _create_header(self) -> None:
        """Add the application title and a one-line description."""
        gr.Markdown("# 🚀 VAPO Results Viewer")
        gr.Markdown(
            "Analyze prompts using Zero-Shot and Data-Driven optimization "
            "with Vertex AI Prompt Optimizer"
        )

    def _create_setup_section(self) -> None:
        """Add the collapsible Vertex AI setup section.

        Creates the project and location inputs, an initialize button and a
        read-only status box, stores the inputs and the status box on the
        instance, and wires the button to ``setup_vertex_ai``.
        """
        with gr.Accordion("⚙️ Vertex AI Setup", open=False):
            with gr.Row():
                with gr.Column(scale=2):
                    self.project_input = gr.Textbox(
                        label="GCP Project Name",
                        placeholder="your-project-name",
                        info="Required for zero-shot optimization",
                    )
                with gr.Column(scale=2):
                    self.location_input = gr.Textbox(
                        label="Location",
                        placeholder="us-central1",
                        value="us-central1",
                        info="GCP region (e.g., us-central1, europe-west4)",
                    )
                with gr.Column(scale=1):
                    setup_btn = gr.Button("Initialize Vertex AI", variant="secondary")

            self.setup_status = gr.Textbox(label="Status", interactive=False)

            # The status box shows the message returned by setup_vertex_ai.
            setup_btn.click(
                fn=self.setup_vertex_ai,
                inputs=[self.project_input, self.location_input],
                outputs=[self.setup_status],
            )

    def _create_zero_shot_tab(self) -> None:
        """Add the zero-shot optimization tab and wire its buttons.

        The tab holds the prompt input, an optimize and a clear button, a
        Markdown summary, and three result sub-tabs (optimized prompt,
        applied guidelines, before/after changes). The components that event
        handlers read or write are stored on the instance.
        """
        with gr.Tab("🎯 Zero-Shot Optimization"):
            gr.Markdown("### Optimize prompts using Vertex AI's zero-shot optimization")

            with gr.Row():
                with gr.Column():
                    self.prompt_input = gr.Textbox(
                        label="Original Prompt",
                        placeholder=(
                            "Enter the prompt you want to optimize...\n\n"
                            "Example:\n"
                            "You are a professional chef. Your goal is teaching "
                            "how to cook healthy cooking recipes to your apprentice.\n\n"
                            "Given a question from your apprentice and some context, "
                            "provide the correct answer to the question."
                        ),
                        lines=8,
                        max_lines=20,
                    )

                    with gr.Row():
                        optimize_btn = gr.Button(
                            "🔄 Run Zero-Shot Optimization", variant="primary", scale=2
                        )
                        clear_btn = gr.Button("Clear", variant="secondary", scale=1)

            # Results section
            with gr.Row():
                self.optimization_summary = gr.Markdown(
                    label="Optimization Summary", elem_classes="markdown-text"
                )

            with gr.Tabs():
                with gr.Tab("📝 Optimized Prompt"):
                    self.suggested_prompt = gr.Textbox(
                        label="Suggested Prompt (Zero-Shot Optimized)",
                        lines=15,
                        max_lines=30,
                        interactive=False,
                        elem_classes="prompt-text",
                    )

                with gr.Tab("📋 Applied Guidelines"):
                    self.guidelines_table = gr.DataFrame(
                        label="Guidelines and Improvements",
                        wrap=True,
                        interactive=False,
                        row_count=10,
                    )

                with gr.Tab("🔄 Before/After Changes"):
                    self.changes_table = gr.DataFrame(
                        label="Text Changes", wrap=True, interactive=False, row_count=10
                    )

            # Event handlers
            # optimize_prompt returns (summary, guidelines, suggestion,
            # changes), matching the order of the outputs below.
            optimize_btn.click(
                fn=self.optimize_prompt,
                inputs=[self.prompt_input],
                outputs=[
                    self.optimization_summary,
                    self.guidelines_table,
                    self.suggested_prompt,
                    self.changes_table,
                ],
            )

            # Clearing blanks the prompt input as well as all four result
            # components, hence five values.
            clear_btn.click(
                fn=lambda: ("", "", pd.DataFrame(), "", pd.DataFrame()),
                outputs=[
                    self.prompt_input,
                    self.optimization_summary,
                    self.guidelines_table,
                    self.suggested_prompt,
                    self.changes_table,
                ],
            )

    def _create_data_driven_tab(self):
        """Build the "Data-Driven Optimization (VAPO)" tab and wire its events.

        Creates the GCS base-path input, the run and template dropdowns, the
        template details table, and the evaluation results view with its
        column selector and quick-action buttons, then registers the event
        handlers that connect those components to this viewer's callbacks.

        Components that callbacks need to target are stored on ``self``.
        The "Clear All" button also resets components this method does not
        create (``project_input``, ``location_input``, ``prompt_input``,
        ``optimization_summary``, ``guidelines_table``, ``suggested_prompt``
        and ``changes_table``), so those attributes must already exist on
        ``self`` when this method runs.
        """
        with gr.Tab("📊 Data-Driven Optimization (VAPO)"):
            # Top row: GCS path on the left, load/clear actions on the right.
            with gr.Row():
                with gr.Column(scale=3):
                    self.base_path_input = gr.Textbox(
                        label="GCS Base Path",
                        placeholder="gs://your-bucket/vapo-results",
                        info="Enter the base GCS path containing VAPO runs",
                    )
                with gr.Column(scale=1):
                    with gr.Row():
                        load_btn = gr.Button("Load Runs", variant="primary")
                        clear_btn = gr.Button("Clear All", variant="secondary")

            # Run / template pickers; choices start empty and are filled by callbacks.
            with gr.Row():
                self.run_dropdown = gr.Dropdown(
                    label="Select Run", choices=[], interactive=True
                )
                self.template_dropdown = gr.Dropdown(
                    label="Select Template", choices=[], interactive=True
                )

            with gr.Tabs():
                with gr.Tab("Template Details"):
                    self.template_display = gr.DataFrame(
                        label="Template Information", wrap=True, interactive=False
                    )

                with gr.Tab("Evaluation Results"):
                    # Info box
                    gr.Markdown(
                        """
                        <div class="metric-info">
                        💡 <b>Tip:</b> Only score, confidence, and explanation columns are shown for all metrics.
                        Default view shows: question, target, ctx, prompt, response, and reference.
                        </div>
                        """,
                        elem_classes="metric-info",
                    )

                    # Column selector section
                    with gr.Accordion("📊 Column Selection", open=True):
                        with gr.Row():
                            self.column_selector = gr.CheckboxGroup(
                                choices=[],
                                value=[],
                                label="Select Columns to Display",
                                elem_classes="column-selector",
                            )

                        with gr.Row():
                            with gr.Column(scale=1):
                                gr.Markdown("**Quick Actions:**")
                                with gr.Row():
                                    scores_only_btn = gr.Button(
                                        "📊 Scores Only", size="sm", variant="secondary"
                                    )
                                    add_scores_btn = gr.Button(
                                        "➕ Add Scores", size="sm", variant="secondary"
                                    )
                                    add_confidence_btn = gr.Button(
                                        "➕ Add Confidence",
                                        size="sm",
                                        variant="secondary",
                                    )
                                    add_explanations_btn = gr.Button(
                                        "➕ Add Explanations",
                                        size="sm",
                                        variant="secondary",
                                    )

                            with gr.Column(scale=1):
                                gr.Markdown("**Selection:**")
                                with gr.Row():
                                    reset_columns_btn = gr.Button(
                                        "🔄 Reset", size="sm", variant="secondary"
                                    )
                                    select_all_btn = gr.Button(
                                        "✅ All", size="sm", variant="secondary"
                                    )
                                    clear_selection_btn = gr.Button(
                                        "❌ Clear", size="sm", variant="secondary"
                                    )

                    # Evaluation display
                    self.eval_display = gr.DataFrame(
                        label="Evaluation Metrics (Filtered View)",
                        wrap=True,
                        interactive=False,
                        row_count=20,
                    )

            # Event handlers
            load_btn.click(
                fn=self.load_runs,
                inputs=[self.base_path_input],
                outputs=[self.run_dropdown],
            )

            # "Clear All" resets this tab and also components owned by other
            # tabs; self.clear_all must return one value per output, in this order.
            clear_btn.click(
                fn=self.clear_all,
                outputs=[
                    self.base_path_input,
                    self.run_dropdown,
                    self.template_dropdown,
                    self.template_display,
                    self.eval_display,
                    self.column_selector,
                    self.project_input,
                    self.location_input,
                    self.prompt_input,
                    self.optimization_summary,
                    self.guidelines_table,
                    self.suggested_prompt,
                    self.changes_table,
                ],
            )

            # Choosing a run refreshes the template list and both tables.
            self.run_dropdown.change(
                fn=self.load_run_data,
                inputs=[self.run_dropdown],
                outputs=[
                    self.template_dropdown,
                    self.template_display,
                    self.eval_display,
                    self.column_selector,
                ],
            )

            # Choosing a template refreshes the tables and the column selector.
            self.template_dropdown.change(
                fn=self.display_template,
                inputs=[self.template_dropdown],
                outputs=[
                    self.template_display,
                    self.eval_display,
                    self.column_selector,
                ],
            )

            # Column selector handlers
            self.column_selector.change(
                fn=self.update_evaluation_display,
                inputs=[self.column_selector],
                outputs=[self.eval_display],
            )

            scores_only_btn.click(
                fn=self.select_only_scores,
                outputs=[self.column_selector, self.eval_display],
            )

            # The "Add ..." actions receive the current selection as input.
            add_scores_btn.click(
                fn=self.add_metric_scores,
                inputs=[self.column_selector],
                outputs=[self.column_selector, self.eval_display],
            )

            add_confidence_btn.click(
                fn=self.add_metric_confidence,
                inputs=[self.column_selector],
                outputs=[self.column_selector, self.eval_display],
            )

            add_explanations_btn.click(
                fn=self.add_metric_explanations,
                inputs=[self.column_selector],
                outputs=[self.column_selector, self.eval_display],
            )

            reset_columns_btn.click(
                fn=self.reset_to_default_columns,
                outputs=[self.column_selector, self.eval_display],
            )

            # Select every entry of self.filtered_columns and re-render the table.
            # self.filtered_columns is read when the button is clicked, not now.
            select_all_btn.click(
                fn=lambda: (
                    self.filtered_columns,
                    self.update_evaluation_display(self.filtered_columns),
                ),
                outputs=[self.column_selector, self.eval_display],
            )

            # Empty selection paired with an empty table.
            clear_selection_btn.click(
                fn=lambda: ([], pd.DataFrame()),
                outputs=[self.column_selector, self.eval_display],
            )


# ============================================================================
# Application Entry Point
# ============================================================================


def launch_app(
    share: bool = False,
    server_port: int = 7861,
    server_name: str = "0.0.0.0",
    debug: bool = False,
):
    """Build the VAPO results viewer UI and start serving it.

    Args:
        share: Forwarded to the interface's ``launch`` call as ``share``.
        server_port: Forwarded to ``launch`` as ``server_port``.
        server_name: Forwarded to ``launch`` as ``server_name``.
        debug: Forwarded to ``launch`` as ``debug``.

    Returns:
        The interface object produced by
        ``VAPOResultsViewer().create_interface()``, after ``launch`` has
        been called on it.
    """
    # Collect the launch settings once so they are forwarded as keywords.
    launch_options = {
        "share": share,
        "server_port": server_port,
        "server_name": server_name,
        "debug": debug,
    }

    app = VAPOResultsViewer().create_interface()
    app.launch(**launch_options)
    return app

### Run the Gradio VAPO Results Viewer app

In [None]:
# Start the viewer and keep the returned handle in `interface`.
interface = launch_app(
    share=True,
    server_port=7861,
    server_name="0.0.0.0",
    debug=False,
)

### Cleaning up

In [None]:
delete_app = True
delete_job = True
delete_bucket = True

if delete_app:
    interface.close()

if delete_job:
    from google.cloud import aiplatform
    aiplatform.init(project=PROJECT_ID, location=LOCATION)
    custom_job_list = aiplatform.CustomJob.list()
    latest_job = custom_job_list[0]
    latest_job.delete()

if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI