In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate agent final answer with custom parsing

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaltask_approach%2Fevaluate_agent_final_answer_with_custom_parsing.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_agent_final_answer_with_custom_parsing.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

 | | |
 |-|-|
 |Author(s): | [Greg Breard](https://github.com/gregbreard) |

## Overview

This Colab shows how to assess the quality of agent final answers from generative models, evaluating their ability to generate responses that are valid compared to the golden response (human answer) expected from the agent. Evaluating the validity of final agent answers with custom parsing, as demonstrated here, provides a more advanced assessment than standard [metric prompt templates](https://cloud.google.com/vertex-ai/generative-ai/docs/models/metrics-templates), enabling detailed analysis of correctness and expected output adherence.

Agents often interface with Retrieval-Augmented Generation (RAG) APIs to respond to a query. In the following example, the autorater checks whether the answer the agent model gave is valid (or invalid) by checking it against the human response. If the response is invalid, the autorater will rewrite the response so that it is true.

This is accomplished using the Vertex Gen AI Evaluation SDK, which supports custom output parsing. The prompt instructs the autorater model to return a structured (JSON) output, which is then parsed seamlessly by providing the parsing method in the metric definition. The parsed output is appended to the evaluation result data frame.

## Objective

1. Generate structured output from an autorater.
2. Use custom parsing for advanced evaluation.

## Steps

1. Set up the environment.
2. Define helper functions, prompt templates, and metric.
3. Prepare the dataset for evaluation.
4. Run the evaluation (including model inference).

## Costs
This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Get started

## Install Vertex AI SDK for Python and other required packages


In [None]:
%pip install --upgrade --quiet google-cloud-aiplatform

## Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only act when the google.colab module is loaded (i.e. the notebook is
# running on Google Colab); elsewhere this cell does nothing.
if "google.colab" in sys.modules:

    import IPython

    # Shut the current kernel down with the restart flag set so the freshly
    # installed packages are picked up by the cells that follow.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


## Authenticate your notebook environment (Colab only)


Authenticate your environment on Google Colab.

In [None]:
import sys

# Authentication through google.colab is only attempted when that module is
# loaded (i.e. the notebook is running on Google Colab).
if "google.colab" in sys.modules:

    from google.colab import auth

    # Authenticate the current Colab user for the Google Cloud calls made
    # later in this notebook.
    auth.authenticate_user()

## Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Replace the placeholder with your own Google Cloud project ID before running.
PROJECT_ID = "your-project-id"  # @param {type:"string"}
# Location handed to vertexai.init below.
LOCATION = "us-central1"  # @param {type:"string"}


import vertexai

# Initialize the Vertex AI SDK with the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

## Import libraries

In [None]:
import pandas as pd
from vertexai.preview.evaluation import (
    AutoraterConfig,
    CustomOutputConfig,
    EvalTask,
    PointwiseMetric,
)

# Set up evaluation

## Helper functions

The following functions provide support for extracting JSON objects from the results returned by the autorater and computing the model response score. Additionally, there are pretty printing methods to improve the readability of the evaluation results.

In [None]:
import html
import json
import re
from typing import Any

_TABLE_STYLE = [
    {
        "selector": "th",
        "props": [
            ("background-color", "#f2f2f2"),
            ("border", "1px solid gray"),
            ("color", "black"),
            ("font-size", "11pt"),
            ("text-align", "center"),
            ("word-break", "break-all"),
        ],
    },
    {
        "selector": "td",
        "props": [
            ("border", "1px solid gray"),
            ("color", "black"),
            ("min-width", "100px"),
            ("text-align", "center"),
        ],
    },
    {"selector": "tr:nth-child(even)", "props": [("background-color", "#f9f9f9")]},
    {"selector": "tr:nth-child(odd)", "props": [("background-color", "white")]},
    {"selector": "tr:hover", "props": [("background-color", "#94e6ff")]},
    {"selector": "td:hover", "props": [("background-color", "#ffffb3")]},
]


def compute_metric_score(df: "pd.DataFrame") -> float:
    """Return the fraction of rows whose autorater verdict is "valid".

    Args:
        df: Evaluation results table containing an
            ``agent_final_answer/result`` column. Each cell is expected to hold
            the parsed autorater output (a dict); any other value, such as
            ``None`` from a failed parse, counts as not valid.

    Returns:
        Number of valid rows divided by the total number of rows, or ``0.0``
        when the table has no rows.
    """
    total = len(df)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    def _is_valid(result: object) -> bool:
        if not isinstance(result, dict):
            return False
        # .get(): a parsed dict that lacks the verdict key is simply not valid.
        verdict = result.get("is_the_agent_response_valid")
        # Tolerate casing/whitespace differences in the autorater's verdict.
        return isinstance(verdict, str) and verdict.strip().lower() == "valid"

    valid = sum(1 for result in df["agent_final_answer/result"] if _is_valid(result))
    return valid / total


def parse_response_to_json(responses: list[str]) -> dict[str, Any]:
    """Extract the JSON payload from the first autorater response.

    The response may be bare JSON, JSON wrapped in a Markdown code fence
    (with or without a ``json`` tag), or JSON surrounded by extra prose.

    Args:
        responses: Raw autorater responses; only the first entry is parsed.

    Returns:
        ``{"result": <parsed JSON value>}`` on success, or ``{"result": None}``
        when there is no response or no candidate could be decoded. Failures
        are reported with a printed message rather than raised.
    """
    if not responses or not isinstance(responses[0], str):
        print("Failed to parse JSON response: no autorater response to parse")
        return {"result": None}

    text = responses[0].strip()

    # Candidates are tried from most to least specific.
    candidates = []
    # DOTALL lets the fenced payload span several lines, and prose before or
    # after the fence (even on other lines) is ignored.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1).strip())
    candidates.append(text)
    # Last resort: the outermost {...} span, for unfenced JSON wrapped in prose.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start : end + 1])

    first_error = None
    for candidate in candidates:
        try:
            return {"result": json.loads(candidate)}
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            if first_error is None:
                first_error = e

    print(f"Failed to parse JSON response: {str(first_error)}")
    return {"result": None}


def pretty_print_df(df: "pd.DataFrame") -> "pd.Styler":
    """Return a styled copy of ``df`` with dict cells rendered as HTML tables.

    A column is rendered when any of its cells holds a dict, so a column whose
    first row failed to parse (``None``) is still formatted. Within such a
    column, dict and ``None`` cells go through ``_dict_to_html_table`` and all
    other values are left as they are. The input frame is not modified.

    Args:
        df: Evaluation results table to display.

    Returns:
        A pandas Styler with the index hidden and ``_TABLE_STYLE`` applied.
    """

    def _render_cell(value: Any) -> Any:
        # None is forwarded so it shows the renderer's "no data" placeholder.
        if isinstance(value, dict) or value is None:
            return _dict_to_html_table(value)
        return value

    styled_df = df.copy()
    for col in df.columns:
        # Inspect every cell rather than label 0 only: the frame may be empty,
        # may not have a 0 label, or may start with a failed parse.
        has_dicts = df[col].apply(lambda value: isinstance(value, dict)).any()
        if has_dicts:
            styled_df[col] = df[col].apply(_render_cell)
    return styled_df.style.hide(axis="index").set_table_styles(_TABLE_STYLE)


def _dict_to_html_table(data: dict[str, Any]) -> str:
    if not data:
        return "<i>No data to display.</i>"
    html_table = "<table style='border-collapse: collapse'>"
    for key, value in data.items():
        html_table += (
            f"<tr><td style='font-weight: bold'>{key}</td><td>{value}</td></tr>"
        )
    html_table += "</tbody></table>"
    return html_table

## Prompt Templates

In [None]:
# Metric prompt template given to the autorater.
# - {prompt}, {response} and {golden} are placeholders whose names match the
#   columns of the evaluation dataset built later in this notebook.
#   NOTE(review): presumably substituted per row by the evaluation SDK — confirm.
# - The autorater is asked to answer with JSON only, using the keys
#   "is_the_agent_response_valid", "reasoning" and "rewritten response";
#   compute_metric_score reads the first of these keys.
# - The text is sent to the model verbatim, so any wording change alters the
#   autorater's behavior.
AGENT_FINAL_ANSWER_PROMPT = """
You are an expert rater for an AI agent. The AI agent is going to call an API to
answer the user query and generate API tool use code based for the choice of the
API and API arguments. The ideal model response should be a function call that
fulfills user query, or a natural language response hedges or asks users for
further clarification if a function call does not apply.
The primary focus of this rating task is to check correctness of the model
responses. If the model response is not correct, you need to write a corrected
response.

The data consists of:
- A set of python function definitions available to the agent.
- A user query.
- A model generated response for the prompt. The responses can consist of:
  - Natural language, when the model is asking for clarification, or tells the
    user it does not possess the requested functionality / option.
  - Code, in the form of one or multiple python function calls, and additional
    code as needed, for when the model is fulfilling the user request.
You can use the help from a reference response annotated by a human rater. This
reference response is of high quality. You can compare the agent's response with
the reference response and decide if the agent's response is valid.

You should follow the constitutions below to rate the model response:
- Allow flexibility of format even when reference code only uses one of the
  possible format, unless API spec or user prompt has explicit format
  requirement
  - e.g. For state name, allow both abbreviation and full name unless API spec
    has explicit requirement. e.g. both 'tx' and 'Texas' should be allowed in
    the agent response even when reference code only uses one of them.
  - e.g. If a reference response list outputs in a list format, the agent
    response is allowed to use sentence format unless user prompt explicitly
    asks for a list.
- The model shouldn't assume that it doesn't have access to according data or
  incapable of answering the question if reference response is able to find a
  legit answer.
- If the model response contains the correct final answer, rate it as valid even
  when the model response contains more information than the reference response.
- If the user prompt has csv or other table format data, don't read it yourself.
  Trust the reference response final answer instead.
- Be mindful about unit of numbers. For example, if the reference response says
  100 miles, but the model response says 100 km, it is invalid.

Below are the inputs:
{
  "User prompt": {prompt},
  "Agent response": {response},
  "Reference response": {golden},
}

The answer should be a json alone which follows the json structure below:
{
  "is_the_agent_response_valid": "[valid or invalid]",
  "reasoning":
  "rewritten response":
}
Answer with assertiveness:
"""

## Define the metric

In [None]:
# Pointwise metric that rates each row with AGENT_FINAL_ANSWER_PROMPT and passes
# the autorater's reply to parse_response_to_json (custom parsing).
# The helpers in this notebook read the parsed value from the
# "agent_final_answer/result" column, i.e. the metric name joined with the
# "result" key returned by the parsing function.
# NOTE(review): return_raw_output=True presumably also keeps the unparsed
# autorater text in the results — confirm against the SDK documentation.
agent_final_answer_metric = PointwiseMetric(
    metric="agent_final_answer",
    metric_prompt_template=AGENT_FINAL_ANSWER_PROMPT,
    custom_output_config=CustomOutputConfig(
        return_raw_output=True,
        parsing_fn=parse_response_to_json,
    ),
)

# Prepare the dataset

In [None]:
# User queries posed to the agent. The three lists below are index-aligned:
# entry i of each list describes the same example.
prompts = [
    "Considering countries population in 2022, which country ranked 10th in population for this year? Respond in the format 'Country_Name: Population'",
    "Calculate the difference between the average highest and lowest temperatures recorded in the USA between January and December 2023. Express your answer as a single number, which represents degrees Fahrenheit, rounded to 1 decimal place.",
    "How many total medals did the USA win in the Rio Olympics and what percentage of these were gold medals?",
    "In F1 2021, Lewis Hamilton lost the World Championship to Max Verstappen in the last race. How many times did both of the drivers retire during the same race that year?",
    "What is the volume in milliliters of a system comprised of 0.312 kg Freon-12 refrigerant when placed at the bottom of the Mariana Trench and allowed to stabilize at the Trench's peak temperature, rounded to the nearest mL? Provide your answer as just an integer value.",
]
# Agent final answers to be rated. Note that the second answer ("40.5")
# differs from its reference ("23.5").
responses = [
    "Mexico: 129,150,971",
    "40.5",
    "The USA won a total of 121 medals at the Rio Olympics.  38.02% of these medals were gold.",
    "Lewis Hamilton and Max Verstappen both retired from the same race once in the 2021 F1 season, at the Italian Grand Prix.",
    "55",
]
# Reference ("golden") answers that the agent answers are compared against.
goldens = [
    "Mexico: 129,150,971",
    "23.5",
    "121, 38.02",
    "1",
    "55",
]

# Column names match the {prompt}, {response} and {golden} placeholders used in
# AGENT_FINAL_ANSWER_PROMPT.
eval_dataset = pd.DataFrame(
    {
        "prompt": prompts,
        "response": responses,
        "golden": goldens,
    }
)

# Run evaluation

In [None]:
# Evaluate the prepared dataset with the single custom-parsed metric.
# NOTE(review): sampling_count=1 presumably requests one autorater sample per
# row — confirm against the SDK documentation.
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[agent_final_answer_metric],
    autorater_config=AutoraterConfig(sampling_count=1),
)
# Calls Vertex AI (billable); the per-row results are exposed as
# eval_result.metrics_table.
eval_result = eval_task.evaluate()

# Calculate overall score for metric: the fraction of rows rated "valid".
# As the last expression in the cell, the value is displayed as the cell output.
compute_metric_score(eval_result.metrics_table)

In [None]:
# Show the per-row results as a styled table, with dict-valued cells rendered
# as nested HTML tables.
pretty_print_df(eval_result.metrics_table)