In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Intro to Batch Evaluations with the Gemini API

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/intro_batch_evaluation.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fintro_batch_evaluation.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/intro_batch_evaluation.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/intro_batch_evaluation.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/intro_batch_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/intro_batch_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/intro_batch_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/intro_batch_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/intro_batch_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| Jessica Wang, [Ivan Nardini](https://github.com/inardini) |

## Overview

Unlike online (synchronous) requests, where you are limited to one input request at a time, batch evaluation in Vertex AI lets you send a large number of evaluation requests to a Gemini model in a single batch request. The evaluation results are then written asynchronously to your output location in [Cloud Storage](https://cloud.google.com/storage/docs/introduction).

### Objectives

In this tutorial, you learn how to run batch evaluation with the Gemini API in Vertex AI. This tutorial shows how to use **Cloud Storage** as the input source. The Vertex AI Gen AI evaluation service supports **BigQuery** as well. Refer to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/run-evaluation#batch-eval) to learn more.

You will complete the following tasks:

- Preparing batch inputs and an output location
- Submitting a batch evaluation long running operation
- Retrieving batch evaluation results


## Get started

### Install Google Vertex AI SDK and other required packages

In [None]:
# Install the Vertex AI SDK with its `evaluation` extra, plus gcsfs
# (presumably what lets pandas read/write the gs:// paths used later — confirm).
%pip install google-cloud-aiplatform[evaluation] gcsfs --force-reinstall --quiet

### Authenticate your notebook environment

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
# from google.colab import auth
# auth.authenticate_user()

In [None]:
# ! gcloud auth login

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"

!gsutil mb -l {LOCATION} {BUCKET_URI}

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [None]:
import json
import subprocess
import time
from pprint import pprint

import pandas as pd
from IPython.display import display

### Helper functions

This section defines the helper functions used in the rest of the notebook:

- `send_request` and `get_operation`: These handle the mechanics of making authenticated API calls using curl and gcloud. While the Vertex AI Python SDK is great for many tasks, in this case we use curl to call the REST batch evaluation endpoint (`evaluateDataset`) and to poll the resulting long-running operation.

- `expand_json_columns_in_df_simplified`, `extract_metric_score`, `style_df_for_slide_corrected`: These are our data wrangling and presentation helpers. The API returns results in a nested JSON format. These functions parse that JSON, extract scores and explanations, and format the final DataFrame into an easy-to-read table.

In [None]:
def send_request(request_file):
    """
    Makes an authenticated POST request to the evaluateDataset endpoint using gcloud authentication.

    Args:
        request_file: Path to a local JSON file containing the request body;
            it is handed to curl as ``-d @<request_file>``.

    Returns:
        The ``name`` field of the JSON response (the value later passed to
        ``get_operation``). If the response has no ``name``, cannot be parsed
        as JSON, or gcloud/curl exits with a non-zero status, a descriptive
        string is returned instead of raising.
    """
    # The regional host and the `locations/...` path segment must agree, so
    # both are derived from LOCATION rather than hard-coding one region.
    address = (
        f"https://{LOCATION}-aiplatform.googleapis.com/v1beta1/"
        f"projects/{PROJECT_ID}/locations/{LOCATION}:evaluateDataset"
    )
    try:
        # Get the access token
        token_result = subprocess.run(
            ["gcloud", "auth", "print-access-token"],
            capture_output=True,
            text=True,
            check=True,
        )
        access_token = token_result.stdout.strip()

        # Construct the curl command (-i keeps the HTTP headers in stdout)
        curl_command = [
            "curl",
            "-i",
            "-X",
            "POST",
            "-H",
            "Content-Type: application/json",
            "-H",
            f"Authorization: Bearer {access_token}",
            address,
            "-d",
            f"@{request_file}",
        ]

        # Execute the curl command
        response = subprocess.run(
            curl_command, capture_output=True, text=True, check=True
        )

        # Extract JSON from the response (ignoring HTTP headers): the body is
        # whatever follows the last blank line.
        json_part = response.stdout.split("\n\n")[-1]

        # Try parsing the response as JSON
        try:
            response_json = json.loads(json_part)
            return response_json.get("name", "No 'name' field found in response")
        except json.JSONDecodeError:
            return "Failed to parse response as JSON:\n" + response.stdout

    except subprocess.CalledProcessError as e:
        return f"Error executing request: {e}"


def get_operation(operation):
    """
    Makes an authenticated GET request for the given operation using gcloud authentication.

    Args:
        operation: Resource path appended to the regional ``v1beta1`` base URL
            (the value returned by ``send_request``).

    Returns:
        The parsed JSON response when the body is valid JSON. Otherwise the
        raw response text is returned, and if gcloud/curl exits with a
        non-zero status an ``"Error: ..."`` string is returned, so callers
        should check the type of the result.
    """
    address = f"https://{LOCATION}-aiplatform.googleapis.com/v1beta1/" + operation
    try:
        # Get the access token
        token_result = subprocess.run(
            ["gcloud", "auth", "print-access-token"],
            capture_output=True,
            text=True,
            check=True,
        )
        access_token = token_result.stdout.strip()

        # Construct the curl command. The HTTP method belongs to -X; passing
        # it to -H would add a malformed header instead of setting the method.
        curl_command = [
            "curl",
            "-X",
            "GET",
            "-H",
            "Content-Type: application/json",
            "-H",
            f"Authorization: Bearer {access_token}",
            address,
        ]

        # Execute the curl command
        response = subprocess.run(
            curl_command, capture_output=True, text=True, check=True
        )

        # Try parsing the response as JSON
        try:
            return json.loads(response.stdout)
        except json.JSONDecodeError:
            print("raw response")
            return response.stdout  # Return raw response if not JSON

    except subprocess.CalledProcessError as e:
        return f"Error: {e}"


def expand_json_columns_in_df_simplified(
    df: pd.DataFrame,
    json_instance_col: str = "jsonInstance",
    eval_results_col: str = "evaluationResults",
) -> pd.DataFrame:
    """
    Expands JSON data stored in specified columns of a Pandas DataFrame (Simplified).

    Args:
        df: Frame holding one evaluated instance per row.
        json_instance_col: Column whose cells are a JSON string (or an
            already-decoded dict) with optional ``prompt``, ``reference`` and
            ``response`` keys.
        eval_results_col: Column whose cells are a list of result dicts; only
            the first entry's ``pointwiseMetricResult`` is read.

    Returns:
        A new DataFrame, indexed like ``df``, with exactly the columns
        ``prompt``, ``reference``, ``response``, ``score`` and ``explanation``.
        Anything that is missing or malformed becomes a null value. The same
        five columns are returned for an empty input.

    Raises:
        ValueError: If either column is absent from ``df``.
    """

    # Input validation
    for required_col in (json_instance_col, eval_results_col):
        if required_col not in df.columns:
            raise ValueError(f"Column '{required_col}' not found in DataFrame.")

    output_columns = ["prompt", "reference", "response", "score", "explanation"]

    def _parse_instance(raw):
        """Return (prompt, reference, response) for one jsonInstance cell."""
        if isinstance(raw, str) and raw:
            try:
                raw = json.loads(raw)
            except ValueError:
                # Best effort: a malformed instance yields empty fields
                # instead of aborting the whole expansion.
                return None, None, None
        if isinstance(raw, dict):
            return raw.get("prompt"), raw.get("reference"), raw.get("response")
        return None, None, None

    def _parse_results(results):
        """Return (score, explanation) from the first pointwise result, if any."""
        if isinstance(results, list) and results:
            first_result = results[0]
            if isinstance(first_result, dict):
                pointwise_result = first_result.get("pointwiseMetricResult")
                if isinstance(pointwise_result, dict):
                    return (
                        pointwise_result.get("score"),
                        pointwise_result.get("explanation"),
                    )
        return None, None

    # Build the rows explicitly rather than with df.apply(axis=1): apply on an
    # empty frame hands back the input columns instead of the expected ones.
    rows = [
        (*_parse_instance(instance), *_parse_results(results))
        for instance, results in zip(df[json_instance_col], df[eval_results_col])
    ]
    return pd.DataFrame(rows, columns=output_columns, index=df.index)


def extract_metric_score(
    df: pd.DataFrame,
    metric_col: str = "pointwiseMetricResult",
    score_key: str = "score",
) -> pd.DataFrame:
    """
    Extracts a numeric score from a dictionary stored in a DataFrame column.

    Args:
        df: Frame whose ``metric_col`` cells are dicts (other values are
            treated as missing).
        metric_col: Name of the column holding the metric dicts.
        score_key: Key to read from each dict.

    Returns:
        A copy of ``df`` in which ``metric_col`` holds the extracted score
        converted to a numeric dtype. Missing or non-numeric scores become
        NaN. The input frame is left untouched, so the call can be repeated
        on the same data.

    Raises:
        ValueError: If ``metric_col`` is not a column of ``df``.
    """

    # Input validation
    if metric_col not in df.columns:
        raise ValueError(f"Column '{metric_col}' not found in DataFrame.")

    # Extract function
    def _get_score(metric_dict):
        """Helper function to safely extract the score."""
        if isinstance(metric_dict, dict):
            return metric_dict.get(score_key)
        return None

    # Pull the score out of each dict, coercing anything non-numeric to NaN
    extracted_scores = pd.to_numeric(
        df[metric_col].apply(_get_score), errors="coerce"
    )

    # Work on a copy so the caller's frame keeps its original dict column
    result = df.copy()
    result[metric_col] = extracted_scores
    return result


def style_df_for_slide_corrected(
    df: pd.DataFrame,
    n_rows: int = 10,
    text_col_width: int = 200,
    cols_to_show: list = None,
    score_precision: int = 2,
    font_size: str = "10pt",
    caption: str = "Model Evaluation Results",
) -> "pd.io.formats.style.Styler":
    """
    Builds a presentation-ready Styler from the first rows of a DataFrame.

    Args:
        df: Frame to display.
        n_rows: Number of leading rows to keep.
        text_col_width: Maximum characters kept in the text columns
            (``prompt``, ``reference``, ``response``, ``explanation``);
            longer values are cut and suffixed with ``...``.
        cols_to_show: Columns to display. When ``None``, the well-known
            columns present in ``df`` are used, or every column if none of
            them is present.
        score_precision: Decimal places used to render the ``score`` column.
        font_size: CSS font size applied to header and body cells.
        caption: Table caption; a falsy value omits the caption.

    Returns:
        A ``Styler`` over a truncated copy of the selected rows and columns,
        with the index hidden.

    Raises:
        TypeError: If ``df`` is not a DataFrame.
        ValueError: If any requested column is missing from ``df``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a Pandas DataFrame.")

    # Resolve which columns to display
    if cols_to_show is None:
        preferred = ("prompt", "reference", "response", "score", "explanation")
        cols_to_show = [name for name in preferred if name in df.columns] or list(
            df.columns
        )

    absent = [name for name in cols_to_show if name not in df.columns]
    if absent:
        raise ValueError(f"Columns not found in DataFrame: {absent}")

    preview = df[cols_to_show].head(n_rows).copy()
    has_score = "score" in preview.columns

    def _clip(text):
        """Cut text to text_col_width characters, marking the cut with an ellipsis."""
        if len(text) > text_col_width:
            return text[:text_col_width] + "..."
        return text

    # Normalise the free-text columns to strings and truncate them
    for name in ("prompt", "reference", "response", "explanation"):
        if name in preview.columns:
            preview[name] = preview[name].fillna("").astype(str).apply(_clip)

    styler = preview.style

    # Fixed-precision scores, with a dash for missing values
    if has_score:
        styler = styler.format({"score": f"{{:.{score_precision}f}}"}, na_rep="-")

    # Table-wide CSS, listed as (selector, properties) pairs
    css_rules = [
        (
            "th",
            [
                ("font-size", font_size),
                ("text-align", "center"),
                ("font-weight", "bold"),
                ("background-color", "#f2f2f2"),
            ],
        ),
        (
            "td",
            [
                ("font-size", font_size),
                ("text-align", "left"),
                ("padding", "5px"),
            ],
        ),
        ("tr:nth-child(even)", [("background-color", "#f9f9f9")]),
        (
            "table",
            [
                ("border-collapse", "collapse"),
                ("border", "1px solid #ccc"),
                ("width", "100%"),
            ],
        ),
        ("th, td", [("border", "1px solid #ddd")]),
        (
            "caption",
            [
                ("caption-side", "top"),
                ("font-size", "1.2em"),
                ("font-weight", "bold"),
                ("margin", "10px"),
            ],
        ),
    ]
    styler = styler.set_table_styles(
        [{"selector": selector, "props": props} for selector, props in css_rules]
    )

    # The row index carries no information for the reader
    styler = styler.hide(axis="index")

    if caption:
        styler = styler.set_caption(caption)

    # Scores read better centred than left-aligned
    if has_score:
        styler = styler.set_properties(subset=["score"], **{"text-align": "center"})

    return styler

## Prepare evaluation metrics

This is where you define how you want to evaluate your model's responses. The batch evaluation service is powerful and flexible. You can use:

- [Model-based metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise-pairwise): Use another powerful model (the "autorater") to judge the quality of your target model's output. You can provide a custom prompt template, like we're doing here, to guide the autorater. This is incredibly powerful for assessing subjective qualities like "fluency," "style," or "safety." The autorater model defaults to `gemini-2.0-flash` if not specified in the request, and both pointwise and pairwise evaluation are supported.

- [Computation based metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#computation-based-metrics): These are traditional, objective metrics like Exact Match, ROUGE (for summarization) and BLEU (for translation) that compare the generated text to a reference text.

Batch evaluation also supports aggregation over the successfully evaluated instances.
When you specify one or more of the following aggregation metrics, it generates a high-level summary of the scores across the entire dataset.

  - AVERAGE
  - MODE
  - STANDARD_DEVIATION
  - VARIANCE
  - MINIMUM
  - MAXIMUM
  - MEDIAN
  - PERCENTILE_P90
  - PERCENTILE_P95
  - PERCENTILE_P99

In [None]:
# One pointwise, model-based metric: the autorater scores the fluency of each
# `response` from 0 to 1, and the scores are aggregated as AVERAGE and MEDIAN.
metrics = [
    dict(
        pointwise_metric_spec=dict(
            metric_prompt_template=(
                "Evaluate the fluency of this sentence: {response}. "
                "Give score from 0 to 1. 0 - not fluent at all. "
                "1 - very fluent."
            ),
        ),
        aggregation_metrics=["AVERAGE", "MEDIAN"],
    ),
]

## Prepare evaluation dataset

Now, let's create our evaluation dataset. The batch service expects each item in your dataset to be a JSON object. For a pointwise, model-based evaluation like ours, each JSON object needs a prompt, a response (the model output you want to evaluate), and optionally a reference (a ground-truth answer).

Here, we're creating a pandas DataFrame first because it's a familiar and easy way to structure data. We have three columns:

- `prompt`: The input given to the model (in this case, a text to be summarized).

- `reference`: A "golden" summary, which we could use for other metrics but won't be used by our specific "fluency" metric. It's still good practice to include it.

- `response`: The actual summary generated by the model we're testing.

The input for batch requests specifies the items to send to the autorater model for evaluation. Batch evaluation supports both Cloud Storage JSONL files and BigQuery tables. In this tutorial, we use a Cloud Storage JSONL file.

In [None]:
# Toy summarization dataset: ten source texts, ten reference summaries and ten
# candidate responses, aligned by position across the three lists.
eval_dict = {
    # Source texts to be summarized (wrapped in an instruction below).
    "prompt": [
        "Researchers at the Institute for Advanced Studies have developed a new type of solar panel that boasts a 5% increase in efficiency compared to current market leaders. The innovation lies in a novel perovskite crystal structure that is both more stable and better at capturing a wider spectrum of light. Commercial production is expected within three years.",
        "Introducing the 'SilentStep' treadmill. Engineered with advanced noise-reduction technology, it allows for near-silent operation, perfect for apartment living or early morning workouts. It features 12 pre-set programs, a heart rate monitor, and folds easily for storage. Maximum user weight is 250 lbs.",
        "This study investigated the effects of intermittent fasting (IF) versus daily caloric restriction (DCR) on metabolic markers in overweight adults over 12 weeks. Both groups achieved similar weight loss. However, the IF group showed significantly better improvements in insulin sensitivity and reduction in visceral fat compared to the DCR group, suggesting potential unique metabolic benefits beyond weight loss alone.",
        "The old lighthouse stood sentinel on the cliff, its beam cutting through the thick fog rolling in from the sea. For generations, its light had guided ships safely to the harbor below. Elias, the keeper, felt the weight of that tradition as he climbed the winding stairs for his nightly duty, the rhythmic groan of the turning lens a familiar comfort.",
        "The project planning meeting concluded with action items assigned. Marketing (Jane) to finalize competitor analysis by Friday. Engineering (Tom) to provide a prototype schematic by next Wednesday. Budget approval pending confirmation from Finance (Mr. Davies). Next sync meeting scheduled for Thursday, 10 AM.",
        "To prepare the marinade, combine 1/4 cup soy sauce, 2 tablespoons honey, 1 tablespoon sesame oil, 2 minced garlic cloves, and 1 teaspoon grated ginger in a bowl. Whisk well. Add your protein (chicken, beef, or tofu) and ensure it's fully coated. Marinate for at least 30 minutes, or preferably 2 hours in the refrigerator.",
        "The Library of Alexandria, in Egypt, was one of the largest and most significant libraries of the ancient world. Flourishing under the Ptolemaic dynasty, it was dedicated to the Muses, the nine goddesses of the arts. It functioned more as a research institution, attracting scholars from across the Hellenistic world, but its eventual destruction remains a subject of debate among historians.",
        "A blockchain is a distributed, immutable ledger. Transactions are grouped into blocks, each cryptographically linked to the previous one using a hash. This chain structure, combined with decentralization across many computers, makes it extremely difficult to tamper with recorded data.",
        "Deforestation in the Amazon rainforest continues to be a major environmental concern, primarily driven by cattle ranching and agriculture. This loss of forest cover contributes significantly to global carbon emissions and biodiversity loss. Recent satellite data indicates a slight decrease in the rate of deforestation compared to the previous year, but levels remain alarmingly high.",
        "While the novel's premise was intriguing - a world where memories can be traded - the execution felt uneven. Character development was shallow, particularly for the protagonist, and the pacing dragged significantly in the middle third. However, the world-building details were imaginative and offered glimpses of a truly fascinating concept.",
    ],
    # Reference ("golden") summaries, one per source text.
    "reference": [
        "A new solar panel developed by institute researchers shows a 5% efficiency gain over current leaders due to a novel, stable perovskite structure capturing more light. Commercialization is expected in three years.",
        "The 'SilentStep' treadmill offers near-silent operation suitable for shared spaces. It includes 12 programs, a heart rate monitor, easy folding for storage, and supports up to 250 lbs.",
        "A 12-week study comparing intermittent fasting (IF) and daily caloric restriction (DCR) in overweight adults found similar weight loss, but IF led to significantly better insulin sensitivity and visceral fat reduction, indicating unique metabolic advantages.",
        "An old lighthouse keeper, Elias, feels the weight of tradition as he tends the light that has guided ships through fog for generations, finding comfort in the familiar sounds of the lighthouse.",
        "Meeting takeaways: Jane (Marketing) to complete competitor analysis by Friday; Tom (Engineering) to deliver prototype schematic by next Wednesday. Budget approval awaits Finance confirmation. Next meeting: Thursday, 10 AM.",
        "Whisk together soy sauce, honey, sesame oil, minced garlic, and grated ginger for the marinade. Coat protein (chicken, beef, tofu) and marinate for at least 30 minutes (2 hours refrigerated recommended).",
        "The ancient Library of Alexandria in Egypt, significant under the Ptolemaic dynasty and dedicated to the Muses, served as a major research hub attracting scholars. Its destruction is historically debated.",
        "A blockchain is a distributed, unchangeable ledger where transactions are grouped into blocks cryptographically linked in a chain, making tampering difficult due to decentralization.",
        "Amazon deforestation, mainly from cattle ranching and agriculture, remains a critical environmental issue contributing to carbon emissions and biodiversity loss. While recent data shows a slight decrease, the rate is still alarmingly high.",
        "The novel's intriguing memory-trading premise suffered from uneven execution, with shallow character development and slow pacing, though its imaginative world-building was a highlight.",
    ],
    # Candidate summaries to be scored by the autorater.
    "response": [
        "Researchers developed a new solar panel. It is 5% more efficient. It uses a new perovskite crystal structure. This structure is stable and captures more light. It should be commercially produced in three years.",
        "This is the 'SilentStep' treadmill. It is quiet because of noise-reduction tech. Good for apartments. Has 12 programs. Has heart monitor. Folds. Max weight 250 lbs.",
        "Overweight adults were studied for 12 weeks comparing intermittent fasting (IF) and daily caloric restriction (DCR). Weight loss was similar. IF group had better insulin sensitivity and less visceral fat than DCR group. IF might have unique benefits.",
        "A lighthouse on a cliff guides ships in fog. Elias is the keeper. He climbs the stairs for his duty. He feels the tradition. The lens groans.",
        "The meeting ended. Jane will do competitor analysis by Friday. Tom will provide a schematic next Wednesday. Finance needs to approve the budget. The next meeting is Thursday at 10 AM.",
        "Make the marinade: mix 1/4c soy sauce, 2tbsp honey, 1tbsp sesame oil, 2 cloves garlic (minced), 1tsp ginger (grated). Whisk it. Put protein in. Cover it. Marinate 30+ minutes, better for 2 hours in fridge.",
        "The Library of Alexandria was a big ancient library in Egypt. It was important during the Ptolemaic rule and dedicated to the Muses. Scholars came there to research. How it was destroyed is debated by historians.",
        "Blockchain is like a shared digital book that cannot be changed easily. Information (transactions) goes into blocks. Blocks are linked using crypto hashes. Because it's spread out on many computers, changing data is very hard.",
        "Deforestation in the Amazon is a big worry. Cattle and farming are main causes. It increases carbon emissions and hurts biodiversity. Satellites show the rate decreased slightly last year, but it's still very high.",
        "The book had a cool idea about trading memories. But it wasn't done perfectly. Characters weren't deep, especially the main one. The middle part was slow. The world details were creative and showed a good concept.",
    ],
}

# Prefix every source text with the summarization instruction so that
# `prompt` holds the full input given to the model.
eval_dict["prompt"] = [
    f"Summarize the following text:\n{p}" for p in eval_dict["prompt"]
]

In [None]:
# One row per example, with the columns prompt, reference and response.
eval_df = pd.DataFrame(eval_dict)
eval_df.head()

## Load the eval dataset in Cloud storage

The batch evaluation service reads its input from Google Cloud Storage (or BigQuery). Here, we take our pandas DataFrame and save it to GCS in the required JSONL (JSON Lines) format. Each line in the file is a separate, complete JSON object.

In [None]:
# Write the dataset as JSON Lines (one record per line) straight to the bucket.
evaluation_file_uri = BUCKET_URI + "/pairwise_data.jsonl"
eval_df.to_json(evaluation_file_uri, orient="records", lines=True)

## Send a batch evaluation request

It's time to assemble the final request. This is the heart of our tutorial. We're creating a JSON object that specifies:

- `dataset`: Points to the `pairwise_data.jsonl` file we just uploaded to GCS.

- `metrics`: Includes the metrics configuration we defined earlier (the model-based fluency check).

- `output_config`: Tells the service where to save the results—in the root of our GCS bucket.

We save this request to a local file and then use our `send_request` helper function to kick off the job.


In [None]:
# Assemble the request body: input dataset, metric configuration, and the
# Cloud Storage prefix under which results are written.
request = dict(
    dataset={"gcs_source": {"uris": evaluation_file_uri}},
    metrics=metrics,
    output_config={"gcs_destination": {"output_uri_prefix": BUCKET_URI}},
)

# Save the body to a local file so it can be sent with curl's `-d @file`
with open("pairwise_fluency_request.json", "w") as json_file:
    json_file.write(json.dumps(request, indent=2))

In [None]:
# Submit the request. `operation` is the operation name on success, or a
# diagnostic string if the call failed or the response had no `name` field.
operation = send_request("pairwise_fluency_request.json")

## Wait for the batch evaluation job to complete

Batch jobs are asynchronous, meaning they run in the background. This while loop is a simple poller. It checks the status of the job every 30 seconds using our get_operation helper.

Once the 'done' field in the response is True, we know the job has finished, and we can move on to the fun part: seeing the results.

For a real-world application, you might use a more sophisticated notification system like Pub/Sub or Cloud Functions instead of a while loop.

In [None]:
# Refresh the job until complete. Each iteration fetches the operation exactly
# once, so the payload that ends the loop is the one inspected afterwards.
while True:
    response_json = get_operation(operation)

    # get_operation returns a plain string when curl fails or the body is not
    # JSON; polling on it would never terminate, so stop with a clear error.
    if not isinstance(response_json, dict):
        raise RuntimeError(
            f"Could not poll operation {operation!r}: {response_json}"
        )

    # Check if the job succeeds: an `error` field means the request was
    # rejected or the job failed, so there are no results to wait for.
    if "error" in response_json:
        raise RuntimeError(f"Batch evaluation job failed: {response_json['error']}")

    if response_json.get("done"):
        break

    print("Batch evaluation job is running...")
    time.sleep(30)

print("Operation complete. Please see path of the results in outputInfo")

In [None]:
# Inspect the final operation payload.
pprint(response_json)

## Get the evaluation results

Success! The job is done. The API response contains the GCS path where the results are stored. We'll grab that path and use pandas to read the output JSONL files directly into DataFrames.

- `evaluation_results.jsonl` contains the detailed, row-by-row evaluation for each item in our dataset.
- `aggregation_results.jsonl` contains the overall AVERAGE and MEDIAN scores we requested.

Finally, we use our handy helper functions to parse the nested JSON and display the results in a beautifully styled table. You should be able to see the fluency score and the autorater's explanation for each response, allowing you to quickly diagnose your model's performance.

In [None]:
# Output directory reported by the finished operation (its `gcsOutputDirectory`).
output_uri = response_json["response"]["outputInfo"]["gcsOutputDirectory"]

In [None]:
# Load the per-row results and flatten them into
# prompt / reference / response / score / explanation columns.
evaluation_results = expand_json_columns_in_df_simplified(
    pd.read_json(
        output_uri + "/evaluation_results.jsonl",
        lines=True,
    )
)

In [None]:
# Show the first five rows, truncating long text to 300 characters and
# leaving out the `reference` column.
styled_table = style_df_for_slide_corrected(
    evaluation_results,
    n_rows=5,
    text_col_width=300,
    cols_to_show=["prompt", "response", "score", "explanation"],
    caption="Evaluation Summary",
)

display(styled_table)

In [None]:
# Load the aggregated results and reduce each `pointwiseMetricResult` entry
# to its numeric score.
evaluation_results_agg = extract_metric_score(
    pd.read_json(
        output_uri + "/aggregation_results.jsonl",
        lines=True,
    )
)

In [None]:
# Display the aggregated scores.
evaluation_results_agg