In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate Gemini Structured Output

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2F%2Fgemini%2Fevaluation%2Fevaluate_gemini_structured_output.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main//gemini/evaluation/evaluate_gemini_structured_output.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Steve Phillips](https://github.com/stevie-p) |

## Overview

This notebook uses the [*GenAI Evaluation Service*](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/evaluation) to evaluate and compare the performance of Gemini models for an extraction task.

The task is to accurately extract information from scanned, handwritten order forms for "Acme Corporation".

Within this notebook, we:

* Use Gemini models with [*structured output*](https://ai.google.dev/gemini-api/docs/structured-output) to ensure well-structured JSON output
* Extract the data using Gemini models
* Use the *GenAI Evaluation service* to run the evaluation experiments with a custom *accuracy* metric

The [models](https://ai.google.dev/gemini-api/docs/models) under test are:
* Gemini 2.0 Flash
* Gemini 2.5 Flash
* Gemini 2.5 Pro

## Get started

### Install Google Gen AI SDK and other required packages


In [None]:
# `google-cloud-aiplatform[evaluation]` provides the `vertexai.evaluation` module imported below.
%pip install --upgrade --quiet google-genai "google-cloud-aiplatform[evaluation]" jsonschema IPython==7.34.0

Restart the runtime to use the newly installed packages.

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# `google.colab` being among the loaded modules is the signal used to detect Colab;
# in any other environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Create your own project and insert the project ID here ---->

# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Keep the raw lookup result: wrapping it in str() would turn an unset variable
    # into the literal string "None", which then gets passed around as if it were a
    # real project ID. With None the clients receive "no project specified" instead.
    # NOTE(review): presumably the Google Cloud clients then fall back to their own
    # default project discovery; confirm for your environment.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region for Vertex AI calls; defaults to us-central1 when GOOGLE_CLOUD_REGION is unset.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Experiment name passed to the EvalTask later in the notebook.
EXPERIMENT_NAME = "eval-gemini-structured"


### Import libraries

In [None]:
import pandas as pd
from vertexai.evaluation import EvalTask, CustomMetric, notebook_utils
from google import genai
from google.cloud import storage  # type: ignore
from google.genai.types import (
    GenerateContentConfig,
    Part,
)
import json
import io
from jsonschema import validate
from IPython.display import display, Image, Markdown
from datetime import datetime
import vertexai

# Initialise the Vertex AI SDK with the project and location set earlier in the notebook.
vertexai.init(project=PROJECT_ID, location=LOCATION)


## View the images

Let's have a look at the images we want to extract data from.

In [None]:
# Each entry describes one scanned form: where it lives in Cloud Storage, its MIME type,
# a display name, and the hand-verified values ("reference") the models are scored against.
images = [
    {
        "image_uri": "gs://eval-extraction-examples/Acme Order Form.jpg",
        "image_type": "image/jpeg",
        "image_name": "Acme Order Form.jpg",
        # The Ground Truth
        "reference": {
            "order_number": "98-X42-77A",
            "order_date": "2025-09-01",
            "customer_name": "WILE E. COYOTE (ESQ., PH.D, S.G.)",
            "customer_address": "HIGH MESA, CORNER OF X-MARK AND DETONATION CANYON, ANVIL FALLS, AZ",
            "line_items": [
                {"item_description": "Jet Propelled Unicycle", "quantity": 1, "unit_price": 99.99, "delivery_option": "Next Day"},
                {"item_description": "Instant Hole Kit", "quantity": 3, "unit_price": 45.00, "delivery_option": "Standard"},
                {"item_description": "TNT High Explosives x24", "quantity": 1, "unit_price": 120.00, "delivery_option": "Fast"},
                {"item_description": "Super Magnet (XL)", "quantity": 1, "unit_price": 150.00, "delivery_option": "Fast"},
                {"item_description": "Rocket-Powered Roller skates", "quantity": 2, "unit_price": 79.99, "delivery_option": "Next Day"},
            ],
        },
    },
    {
        "image_uri": "gs://eval-extraction-examples/EF0004.jpg",
        "image_type": "image/jpeg",
        "image_name": "EF0004.jpg",
        # The Ground Truth
        "reference": {
            "order_number": "EF0004",
            "order_date": "2025-10-26",
            "customer_name": "Elmer J. Fudd",
            "customer_address": "Happy Hunter's Hollow, Looney Tune Forest, CA",
            "line_items": [
                {"item_description": "Silent Sneak Shoes", "quantity": 1, "unit_price": 35.99, "delivery_option": "Standard"},
                {"item_description": "Invisible Rabbit Trap", "quantity": 2, "unit_price": 75.00, "delivery_option": "Standard"},
                {"item_description": "Giant Butterfly Net", "quantity": 1, "unit_price": 49.50, "delivery_option": "Fast"},
                {"item_description": "Instant Camouflage Kit", "quantity": 3, "unit_price": 65.00, "delivery_option": "Next Day"},
                {"item_description": "Repellent Spray", "quantity": 4, "unit_price": 29.99, "delivery_option": "Next Day"},
            ],
        },
    },
]

In [None]:
# Fetch each image from Cloud Storage and display it inline.
# (`storage` is already imported in the "Import libraries" cell.)
storage_client = storage.Client(project=PROJECT_ID)

for image_info in images:
    # A GCS URI has the form gs://<bucket>/<object path>. Splitting on the first "/"
    # after the scheme keeps object paths that themselves contain "/" intact.
    bucket_name, _, blob_path = (
        image_info["image_uri"].removeprefix("gs://").partition("/")
    )

    # Read the object straight into memory rather than writing a copy under /tmp,
    # so nothing is left on disk and the cell does not depend on /tmp existing.
    image_bytes = (
        storage_client.bucket(bucket_name).blob(blob_path).download_as_bytes()
    )

    # Show the image name, then the image itself with its height set to 800
    print(image_info["image_name"])
    display(Image(data=image_bytes, height=800))

These are mock order forms for *Acme Corporation*, for customers to order various products, and select a delivery option for each; either "Standard", "Fast" or "Next Day".

We will use these forms to evaluate the performance of Gemini.

## Extract the data using Gemini


### Select the models

In [None]:
# Gen AI SDK client, created with vertexai=True and the project/location configured earlier
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

# Model IDs to compare; every model is run against every image
models = [
    "gemini-2.0-flash",  # Gemini 2.0 family
    "gemini-2.5-flash",  # Gemini 2.5 family
    "gemini-2.5-pro",  # Gemini 2.5 family
]

### Define the prompt and schema

In [None]:
# Extraction instructions sent, together with the image, to every model under test.
# NOTE(review): the prompt asks for null on blank/illegible fields, but no property in
# `schema` below is marked as nullable - confirm that null values are actually permitted.
prompt = """
    Analyze the attached scanned form and extract the information in the table in accordance with the schema.

    Provide the output in a clean JSON format.

    If any date field is formatted ambiguously, assume the dates are in dd/mm/yyyy format.

    If a field is blank, illegible, or cannot be found, return null for its value.

    If there are blank rows, do not include them in the output.

    If there is no image attached, return null for all fields.

"""

# Structured-output schema: used to request well formatted, consistent JSON from the models,
# and reused by the `valid_schema` metric to validate the responses.
schema = {
    "type": "object",
    "properties": {
        "order_number": {"type": "string"},
        # Note: "format": "date" enforces a full date output in the RFC 3339 format ("YYYY-MM-DD")
        "order_date": {"type": "string", "format": "date"},
        "customer_name": {"type": "string"},
        "customer_address": {"type": "string"},
        "line_items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "item_description": {"type": "string"},
                    "quantity": {"type": "integer"},
                    "unit_price": {"type": "number"},
                    # Note: We do not tell Gemini how to interpret the checkboxes
                    # as "Standard", "Fast" or "Next Day"
                    "delivery_option": {"type": "string"},
                },
            },
        },
    },
}

# Ask for a JSON response that follows `schema`
generate_content_config = GenerateContentConfig(
    response_schema=schema,
    response_mime_type="application/json",
)

### Run the prompt

In [None]:
# Run the prompt for each model in `models` and each image in `images`, storing the output in `gemini_response`
# as gemini_response[image_name][model] -> response text (pretty-printed JSON when it parses).

gemini_response = {}
# Short random ID used to label this run's console output and, later, the experiment run name
run_id = notebook_utils.generate_uuid(8)

for image_info in images:
    image = Part.from_uri(file_uri=image_info["image_uri"], mime_type=image_info["image_type"])
    image_name = image_info["image_name"]

    gemini_response[image_name] = {}

    for model in models:
        run_name = f"{run_id}-{model}-{image_name}"

        response = client.models.generate_content(
            model=model,
            contents=[prompt, image],
            config=generate_content_config,
        )

        # `response.text` may be None (for example when the response carries no text parts).
        # Normalise it to "" so that json.loads raises JSONDecodeError - handled just below and
        # by the metric functions - instead of an unhandled TypeError that would abort the loop.
        response_text = response.text or ""

        # Parse the response text as JSON, and then pretty-print
        try:
            response_json_string = json.dumps(json.loads(response_text), indent=4)
        except json.JSONDecodeError:
            # Keep the raw text so the evaluation can still score (and penalise) this response
            response_json_string = response_text
            print(f"Warning: Response for {run_name} is not valid JSON.")

        print("----------------------------------")
        print(f"{run_name}: {response_json_string}")
        gemini_response[image_name][model] = response_json_string

## Perform the Evaluation

### Prepare the evaluation dataset

Now that we have the outputs from the Gemini models, we can run the evaluation.

In [None]:
# Create the Evaluation Dataset: one row per (image, model) combination

eval_dataset_rows = []
for image_info in images:
    image_name = image_info["image_name"]
    # Convert the reference (ground truth) to pretty-printed JSON, the same formatting
    # (indent=4) that was applied to the model responses
    reference_str = json.dumps(image_info["reference"], indent=4)

    # Images with no stored responses simply contribute no rows
    for model_name, response_text in gemini_response.get(image_name, {}).items():
        eval_dataset_rows.append(
            {
                "model": model_name,
                "prompt": prompt,  # Assuming the same prompt is used for all Gemini calls
                "image": image_name,
                "reference": reference_str,
                "response": response_text,
            }
        )

eval_dataset = pd.DataFrame(eval_dataset_rows)

This evaluation data set now contains the reference (ground truth) and response for each combination of model and image.

### Define custom metrics for JSON schema validation and accuracy

In [None]:
# Define a custom evaluation metric to assess whether the response complies with the schema

def is_valid_schema(instance: dict[str, str]) -> dict[str, float]:
    """Score whether a response is JSON that validates against `schema`.

    Args:
        instance: Evaluation row; only its "response" entry is read.

    Returns:
        {"valid_schema": 1} when the response parses as JSON and passes
        validation, {"valid_schema": 0} when parsing or validation raises.
    """
    response = instance["response"]

    try:
        validate(instance=json.loads(response), schema=schema)
    except Exception:
        # Any failure - unparseable JSON or a validation error - counts as non-compliant
        is_valid = 0
    else:
        is_valid = 1

    return {"valid_schema": is_valid}


valid_schema = CustomMetric(name="valid_schema", metric_function=is_valid_schema)

In [None]:
# Define a custom evaluation metric to assess the accuracy of the response compared with the reference (ground truth)



def string_similarity(str1, str2):
    """Score two strings by position-wise character agreement.

    Both strings are stripped, upper-cased and have newlines replaced by spaces;
    characters are then compared index by index and the number of agreeing
    positions is divided by the length of the longer normalised string.

    Returns:
        A float between 0.0 and 1.0. Two empty (falsy) inputs score 1.0; exactly
        one empty input scores 0.0.
    """
    # Emptiness is judged on the raw inputs, before any normalisation
    if not str1:
        return 0.0 if str2 else 1.0
    if not str2:
        return 0.0

    left, right = (text.strip().upper().replace('\n', ' ') for text in (str1, str2))

    longest = max(len(left), len(right))
    if longest == 0:
        # Both inputs were whitespace only
        return 1.0

    agreeing_positions = 0
    for left_char, right_char in zip(left, right):
        if left_char == right_char:
            agreeing_positions += 1

    return agreeing_positions / longest


def compare_values_recursive(ref_value, resp_value):
    """Compare a reference value with a response value, recursing into dicts and lists.

    Args:
        ref_value: Value from the reference (ground truth) JSON.
        resp_value: Value from the model response JSON.

    Returns:
        A tuple (total_score, comparison_count). Each leaf comparison contributes
        a score between 0.0 and 1.0 to total_score and 1 to comparison_count.
    """
    # Handle None values
    if ref_value is None and resp_value is None:
        return (1.0, 1)  # Two None values are a perfect match
    if ref_value is None or resp_value is None:
        # Exactly one side is missing: every leaf of the side that is present counts as a
        # mismatch, so a missing object or list is penalised in proportion to its size.
        present_value = resp_value if ref_value is None else ref_value
        return (0.0, max(get_nested_element_count(present_value), 1))

    if isinstance(ref_value, dict) and isinstance(resp_value, dict):
        return compare_dicts_recursive(ref_dict=ref_value, resp_dict=resp_value)

    if isinstance(ref_value, list) and isinstance(resp_value, list):
        # List order does not matter
        return compare_lists_recursive(ref_list=ref_value, resp_list=resp_value)

    if isinstance(ref_value, str) and isinstance(resp_value, str):
        # A reference in YYYY-MM-DD form is treated as a date and compared exactly
        try:
            ref_date = datetime.strptime(ref_value, "%Y-%m-%d")
        except ValueError:
            # Not a date: fall back to character-based similarity
            return (string_similarity(ref_value, resp_value), 1)

        try:
            resp_date = datetime.strptime(resp_value, "%Y-%m-%d")
        except ValueError:
            return (0.0, 1)  # Response is not a recognisable date
        return (1.0, 1) if ref_date == resp_date else (0.0, 1)

    if isinstance(ref_value, (int, float)) and isinstance(resp_value, (int, float)):
        # Numeric comparison with a tolerance for floating point differences
        return (1.0, 1) if abs(ref_value - resp_value) < 1e-9 else (0.0, 1)

    # Remaining cases (booleans, mismatched types, ...): direct equality
    if ref_value == resp_value:
        return (1.0, 1)
    # On a mismatch, count every leaf the reference expected (1 for a scalar)
    return (0.0, max(get_nested_element_count(ref_value), 1))


def compare_dicts_recursive(ref_dict, resp_dict):
    """Compare two dicts key by key, driven by the reference's keys.

    Keys that exist only in the response are ignored: accuracy is measured
    against the reference structure.

    Returns:
        A tuple (total_score, comparison_count).
    """
    total_score = 0
    comparison_count = 0

    for key, ref_item in ref_dict.items():
        # A key absent from the response is compared against None, which scores 0.0
        # for every leaf of the reference value (or matches if the reference is None too)
        score, count = compare_values_recursive(ref_item, resp_dict.get(key))
        total_score += score
        comparison_count += count

    return (total_score, comparison_count)


def compare_lists_recursive(ref_list, resp_list):
    """Compare two lists, allowing elements to be in any order.

    Each reference item is greedily paired, in reference order, with the
    best-scoring response item that has not been paired yet. Reference items
    left without a partner and response items left over both add their leaf
    count to comparison_count with no score, so missing and extra items lower
    the accuracy without discarding the items that did match.

    Returns:
        A tuple (total_score, comparison_count).
    """
    if not ref_list and not resp_list:
        return (1.0, 1)  # Both empty lists are a perfect match

    total_score = 0
    comparison_count = 0
    matched = [False] * len(resp_list)  # matched[j] is True once resp_list[j] is paired

    for ref_item in ref_list:
        best_score = -1  # Lower than any possible score, so any candidate wins
        best_index = -1
        best_count = 0

        for j, resp_item in enumerate(resp_list):
            if matched[j]:
                continue
            try:
                score, count = compare_values_recursive(ref_item, resp_item)
            except Exception as e:
                # Not expected; treat the pair as a complete mismatch rather than aborting
                print(f"Error during recursive comparison: {e}", file=sys.stderr)
                score = 0.0
                count = get_nested_element_count(ref_item)

            if score > best_score:
                best_score = score
                best_index = j
                best_count = count

        if best_index != -1:
            total_score += best_score
            comparison_count += best_count
            matched[best_index] = True
        else:
            # The response has run out of items: the whole reference item is missing
            comparison_count += max(get_nested_element_count(ref_item), 1)

    # Extra response items add comparisons but no score, so they cannot inflate accuracy
    for j, resp_item in enumerate(resp_list):
        if not matched[j]:
            comparison_count += max(get_nested_element_count(resp_item), 1)

    return (total_score, comparison_count)


def get_nested_element_count(data):
    """Recursively count the leaf (non-dict, non-list) elements in a structure.

    An empty dict or list contains no leaves and counts as 0; any other value,
    including None, counts as 1.
    """
    if isinstance(data, dict):
        return sum(get_nested_element_count(value) for value in data.values())
    if isinstance(data, list):
        return sum(get_nested_element_count(item) for item in data)
    return 1


def calculate_accuracy(instance: dict[str, str]) -> dict[str, float]:
    """Score how closely the response JSON matches the reference JSON.

    The two documents are compared recursively: list elements may appear in any
    order and fields present only in the response are ignored.

    Args:
        instance: Evaluation row; its "reference" and "response" entries are read
            and are expected to be JSON strings.

    Returns:
        {"accuracy": score}, where score is the summed similarity divided by the
        number of comparisons made. The score is 0.0 when either entry cannot be
        parsed as JSON, and 1.0 when there was nothing to compare.
    """
    ref_json_string = instance["reference"]
    resp_json_string = instance["response"]

    try:
        reference_data = json.loads(ref_json_string)
        response_data = json.loads(resp_json_string)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the text is not valid JSON.
        # TypeError: the value is not text at all (for example None or NaN for a missing response).
        return {"accuracy": 0.0}

    # Perform a recursive comparison
    total_score, comparison_count = compare_values_recursive(reference_data, response_data)

    # Calculate overall accuracy
    overall_accuracy = total_score / comparison_count if comparison_count > 0 else 1.0

    return {"accuracy": overall_accuracy}


# Wrap calculate_accuracy as a CustomMetric named "accuracy" for use in the EvalTask.
accuracy = CustomMetric(name="accuracy", metric_function=calculate_accuracy)

### Define EvalTask & Experiment

In [None]:
# Define the evaluation task

extraction_eval_task = EvalTask(
    dataset=eval_dataset,
    experiment=EXPERIMENT_NAME,
    metrics=[
        # Exact match will only be 1 if the response is perfectly accurate, with no allowance
        # for inconsistent JSON formatting. Hence, the custom `accuracy` metric is the better metric.
        "exact_match",
        valid_schema,
        accuracy,
    ],
)

### Run the Evaluation

In [None]:
# Define the experiment run and run the evaluation

# Run name built from the `run_id` generated when the prompts were run.
experiment_run_name = f"eval-{run_id}"

eval_result = extraction_eval_task.evaluate(
    # NOTE(review): the run name above is computed but not passed, so it is currently unused.
    # Uncomment the next line to use it - confirm which behaviour is intended.
    # experiment_run_name = experiment_run_name
)

### Display the results

In [None]:
# Render the evaluation result with the SDK's notebook display helper.
notebook_utils.display_eval_result(eval_result=eval_result)

## Analyse the results

### Do a field-wise comparison

It is helpful to do a deeper comparison to investigate where each Gemini model is not extracting the information accurately.

In [None]:
def deep_compare_and_print(ref, resp, path="", output_buffer=None):
    """Recursively compare reference and response, printing every mismatch found.

    Args:
        ref: Reference (ground truth) value: dict, list or scalar.
        resp: Response value to compare against the reference.
        path: Dotted/indexed location of `ref` inside the whole document, used in
            the printed messages.
        output_buffer: File-like object that receives the mismatch report;
            defaults to sys.stdout. Unexpected comparison errors always go to
            sys.stderr.
    """
    if output_buffer is None:
        output_buffer = sys.stdout  # Default to stdout if no buffer is provided

    if isinstance(ref, dict) and isinstance(resp, dict):
        # Reference keys first (in reference order), then response-only keys, so the
        # report order is the same on every run (iterating a set of keys is not).
        ordered_keys = list(ref) + [key for key in resp if key not in ref]
        for key in ordered_keys:
            new_path = f"{path}.{key}" if path else key
            if key in ref and key in resp:
                deep_compare_and_print(ref[key], resp[key], new_path, output_buffer)
            elif key in ref:
                print(f"  Mismatch at {new_path}: Reference has value '{ref[key]}', Response is missing key.", file=output_buffer)
            else:  # key only in resp
                print(f"  Mismatch at {new_path}: Response has value '{resp[key]}', Reference is missing key.", file=output_buffer)

    elif isinstance(ref, list) and isinstance(resp, list):
        # Pair each reference item with its best-scoring, not yet paired response item
        matched = [False] * len(resp)

        for i, ref_item in enumerate(ref):
            new_path = f"{path}[{i}]"
            best_match_score = -1  # Lower than any possible score, so any candidate wins
            best_match_index = -1

            for j, resp_item in enumerate(resp):
                if matched[j]:
                    continue
                try:
                    score, _ = compare_values_recursive(ref_item, resp_item)
                except Exception as e:
                    # Not expected; treat the pair as no match rather than aborting the report
                    print(f"Error during recursive comparison: {e}", file=sys.stderr)
                    score = 0.0

                if score > best_match_score:
                    best_match_score = score
                    best_match_index = j

            if best_match_index != -1:
                # Report the differences between the paired items
                deep_compare_and_print(ref_item, resp[best_match_index], new_path, output_buffer)
                matched[best_match_index] = True
            else:
                # No unpaired response item was left for this reference item
                print(f"  Mismatch in list at {new_path}: Reference item '{ref_item}' has no close match in response.", file=output_buffer)

        # Report any extra items in the response list
        for j, resp_item in enumerate(resp):
            if not matched[j]:
                print(f"  Mismatch in list at {path}[extra_item_{j}]: Response has extra item '{resp_item}'.", file=output_buffer)

    else:
        # Scalars or mismatched types: score with compare_values_recursive and
        # report anything short of a perfect match (score < 1.0)
        try:
            score, _ = compare_values_recursive(ref, resp)
            if score < 1.0:
                print(f"  Mismatch at {path}: Reference='{ref}', Response='{resp}' (Similarity: {score:.2f})", file=output_buffer)
        except Exception as e:
            # Fall back to a plain equality check if scoring fails
            print(f"Error during recursive comparison: {e}", file=sys.stderr)
            if ref != resp:
                print(f"  Mismatch at {path}: Reference='{ref}', Response='{resp}' (Direct equality check)", file=output_buffer)



# Collect the whole report in memory so it can be both printed here and
# passed to Gemini in the next section.
comparison_output_buffer = io.StringIO()

print("Deep comparison for all models and images:", file=comparison_output_buffer)

# One report section per evaluation row, i.e. per (model, image) combination
for row in eval_dataset.itertuples(index=False):
    model_name, image_name = row.model, row.image

    print(f"\n--- Comparing {model_name} on {image_name} ---", file=comparison_output_buffer)

    try:
        reference_data = json.loads(row.reference)
        response_data = json.loads(row.response)
    except json.JSONDecodeError as e:
        # A row whose JSON cannot be parsed gets a note instead of a comparison
        print(f"Could not decode JSON for comparison: {e}", file=comparison_output_buffer)
    else:
        deep_compare_and_print(reference_data, response_data, output_buffer=comparison_output_buffer)

print("\n--- End of Deep Comparison ---", file=comparison_output_buffer)

# Keep the captured text in a variable for later cells, then show it
deep_comparison_output = comparison_output_buffer.getvalue()
print(deep_comparison_output)

### Use Gemini to analyse the results

In [None]:
# Let's get Gemini to analyse the results.

# Instructions for the analysis; the evaluation results and the field-wise
# comparison are appended to this text below.
summary_prompt = """
Analyze the following experiment results comparing Gemini models for extracted data from a scanned form.
The results include a summary table with overall metrics and row-based metrics, as well as a detailed field-wise comparison highlighting mismatches between the extracted data and the reference (ground truth).

Summarize the performance of each model based on the metrics provided (valid_schema, accuracy) from the summary table.
Analyze the detailed field-wise comparison to understand the *types* of errors and mismatches occurring for each model.
Identify which models performed best and worst for each metric and based on the detailed error analysis.
Draw conclusions about the strengths and weaknesses of Gemini models for this specific tabular data extraction task, considering both the overall accuracy and the nature of the errors.
Consider the different versions of Gemini and how their performance varies.
Provide a clear and concise summary of the overall results, followed by key conclusions supported by observations from the detailed comparison.

Experiment Results Summary Table:
"""

# The evaluation result is passed to the model as its plain str() rendering;
# if that conversion fails, the error text is sent (and printed) in its place.
try:
    eval_result_string = str(eval_result)
except Exception as e:
    eval_result_string = f"Could not convert evaluation results to string: {e}"
    print(eval_result_string)

# Instructions, then the results, then the detailed comparison
full_prompt = "".join(
    [
        summary_prompt,
        eval_result_string,
        "\n\nDetailed Field-wise Comparison:\n",
        deep_comparison_output,
    ]
)

# Ask Gemini 2.5 Flash for the analysis and render its answer as Markdown.
# Failures are reported rather than raised.
try:
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=full_prompt,
    )
    display(Markdown(response.text))
except Exception as e:
    print(f"An error occurred while calling Gemini: {e}")
    print("Please ensure you have access to Gemini 2.5 Flash and your project/location settings are correct.")

## Conclusions


This notebook has shown how to use the Gen AI Evaluation Service to evaluate Gemini's Structured Output, for a document processing task.

It uses a "bring your own response" approach and uses custom `valid_schema` and `accuracy` metrics as well as the `exact_match` metric.

It also does a deep "field-wise" comparison of the responses to understand inaccuracies, and uses Gemini to summarise and analyse the results.