In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate images with predefined Gecko

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaluate_images_with_predefined_gecko.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb">
      <img width="32px" src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>


<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_predefined_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

 | | | | |
 |-|-|-|-|
 |Author(s): | [Greg Breard](https://github.com/gregbreard) | Anant Nawalgaria | Olivia Wiles |

## Overview

This Colab notebook shows how to use the Vertex AI evaluation service to run [Gecko](https://arxiv.org/abs/2404.16820).

As with a more standard rubric approach, Gecko proceeds in two stages: a rubric generation step followed by a validator step. The key difference is that the rubric is generated based on the prompt.
This allows for a more fine-grained metric that can be customized to prompts with differing challenges.

In more detail, Gecko proceeds in two key steps: the QA generation step (i.e., the rubric generation step) followed by the VQA step (i.e., the validator step).

## The rubric generation step
Given a prompt, such as `A teddy bear riding a skateboard`, we prompt the Gemini model to generate a set of questions, each with answer choices and a corresponding ground truth (GT) answer. Each question is also tagged with a question type. Depending on the prompt, these questions can be either `yes`/`no` questions or multiple-choice ones.

`A teddy bear riding a skateboard` -->

- `Q1: Is there a teddy bear? Choices: [yes, no]. GT Answer: yes. Tag: Object.`
- `Q2: Is there a skateboard? Choices: [yes, no]. GT Answer: yes. Tag: Object.`
- `Q3: Is the teddy bear riding a skateboard? Choices: [yes, no]. GT Answer: yes. Tag: Action.`

## The validator step
Given a generated image and the questions above, we query the Gemini model for each question to give an answer. We then check if it matches the GT answer, with a result of 1 if it matches and 0 if it does not. We aggregate these results to give a final overall score, which can be broken down into scores per question. We can also aggregate scores based on tags.

For example, imagine we have a generated image `<image1>` which includes a teddy bear but no skateboard, and Gemini outputs the following results:

- `<image1> Is there a teddy bear? GT Answer: yes. Model Answer: yes. Result: 1.`
- `<image1> Is there a skateboard? GT Answer: yes. Model Answer: no. Result: 0.`
- `<image1> Is the teddy bear riding a skateboard? GT Answer: yes. Model Answer: no. Result: 0.`

The final score will be `0.33` (1 of 3 questions matched), with a score of `0.5` for the `Object` tag and `0.0` for the `Action` tag.

## Further exploration
We provide two predefined metrics, engineered for video and image generation tasks. Below, we show how to run Gecko for the image modality on a set of generations.

The quality can be analysed by exploring which questions are generated as well as the reliability of the validator step. Questions can also be added manually as desired for an application.

## Steps

1. Set up the environment.
2. Prepare the dataset for evaluation.
3. Run the evaluation.

## Costs
This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Get started

In [None]:
# @title ### Install Vertex AI SDK for Python and other required packages

# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# `--upgrade` together with the `>=1.122.0` specifier ensures the installed SDK
# (with its `evaluation` extra) is at least version 1.122.0.
# NOTE(review): a kernel restart may be needed if an older SDK version was
# already imported in this session — confirm.
%pip install --upgrade --quiet "google-cloud-aiplatform[evaluation]>=1.122.0"

In [None]:
# @title ### Authenticate your notebook environment (Colab only)
# @markdown If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

import sys

# `google.colab` is only looked up among already-imported modules, so outside
# Colab the import below is never attempted and this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # NOTE(review): presumably starts Colab's interactive sign-in flow for the
    # current user — confirm against the Colab documentation.
    auth.authenticate_user()

In [None]:
# @title ### Set Google Cloud project information
# @markdown To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).
# @markdown Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

# @markdown ---

# The two assignments below are Colab form fields; formatting is disabled so the
# trailing `# @param` annotations stay on the same line as their values.
# fmt: off
PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
LOCATION = "us-central1"  # @param {type: "string", placeholder: "us-central1", isTemplate: true}
# fmt: on

from vertexai import Client
from vertexai import types

# Single Vertex AI client, bound to the project and location chosen above and
# reused by the evaluation cells.
client = Client(
    project=PROJECT_ID,
    location=LOCATION,
)

# Prepare the dataset

In the following dataset, two prompts are used for each generated image. The first is the prompt that corresponds to the generated content. The second is a counterexample that is similar but does not exactly match the generated content. This is done to demonstrate the difference in the Gecko evaluation for high-quality and low-quality responses.

In [None]:
import pandas as pd

# Cloud Storage folder that holds the sample generated images.
IMAGE_URI_PREFIX = "gs://cloud-samples-data/generative-ai/evaluation/images"

# One entry per generated image:
#   (image name, prompt that corresponds to the image, similar counterexample prompt)
# Keeping each image next to its two prompts makes the pairing explicit instead
# of relying on two long parallel lists staying aligned by position.
EVAL_CASES = [
    (
        "coffee",
        "steaming cup of coffee and a croissant on a table",
        "steaming cup of coffee and toast in a cafe",
    ),
    (
        "sunset",
        "sunset over a calm ocean",
        "sunset over a tranquil forest",
    ),
    (
        "butterfly",
        "butterfly with colorful wings on a flower",
        "butterfly fluttering over a leaf",
    ),
    (
        "musician",
        "musician playing guitar on a street corner",
        "musician playing saxophone under lamp post",
    ),
    (
        "camera",
        "vintage camera with a worn leather strap",
        "new camera with a power zoom lens",
    ),
    (
        "abstract",
        "colorful abstract painting",
        "black and white painting",
    ),
    (
        "baker",
        "baker decorating a cake with frosting",
        "baker topping cupcakes with sprinkles",
    ),
    (
        "balloon",
        "hot air balloon floating above a field of lavender",
        "hot air balloon landing in a field of sunflowers",
    ),
]


def build_image_response(image_name: str) -> dict:
    """Return a model-role response whose single part references a sample PNG.

    Args:
        image_name: File stem of the image under ``IMAGE_URI_PREFIX``
            (for example ``"coffee"`` for ``coffee.png``).

    Returns:
        A new dict with a ``parts`` list holding one ``file_data`` entry
        (``mime_type`` and ``file_uri``) and ``role`` set to ``"model"``.
    """
    return {
        "parts": [
            {
                "file_data": {
                    "mime_type": "image/png",
                    "file_uri": f"{IMAGE_URI_PREFIX}/{image_name}.png",
                }
            }
        ],
        "role": "model",
    }


# Expand every case into two rows (matching prompt, then counterexample), each
# with its own freshly built response dict for the same image.
eval_rows = [
    (prompt, build_image_response(image_name))
    for image_name, *image_prompts in EVAL_CASES
    for prompt in image_prompts
]

prompts = [prompt for prompt, _ in eval_rows]
responses = [response for _, response in eval_rows]

eval_dataset = pd.DataFrame(
    {
        "prompt": prompts,
        "response": responses,
    }
)

# Run evaluation

In [None]:
# @title ### Generate rubrics
# @markdown First we generate rubrics for the user prompts.

# Inputs: `eval_dataset` is the prompt/response DataFrame, the predefined
# `GECKO_TEXT2IMAGE` spec is selected, and the rubrics are requested under the
# group name "gecko_image_rubrics".
# NOTE(review): this presumably calls the Vertex AI service once per row and
# returns the dataset with the generated rubrics attached — confirm against the
# SDK documentation.
data_with_rubrics = client.evals.generate_rubrics(
    src=eval_dataset,
    rubric_group_name="gecko_image_rubrics",
    predefined_spec_name=types.RubricMetric.GECKO_TEXT2IMAGE,
)

# Display the object returned by the rubric generation call.
data_with_rubrics.show()

In [None]:
# @title ### Evaluate with rubrics
# @markdown Then we use the generated rubrics to evaluate the quality of the responses.

# The dataset passed in is the output of the rubric generation step, and the
# same predefined `GECKO_TEXT2IMAGE` metric is the only metric requested.
# NOTE(review): presumably the metric picks up the rubrics already attached to
# each row rather than regenerating them — confirm against the SDK documentation.
eval_result = client.evals.evaluate(
    dataset=data_with_rubrics,
    metrics=[types.RubricMetric.GECKO_TEXT2IMAGE],
)

# Display the evaluation result returned by the service call.
eval_result.show()