In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using open autorater for running evaluations with Vertex AI Gen AI Evaluation

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fevaluation%2Fvertex_ai_tgi_evaluate_llm_with_open_judge.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_evaluate_llm_with_open_judge.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Ivan Nardini](https://github.com/inardini) |

## Overview

This notebook demonstrates how to evaluate the performance of a Large Language Model (LLM) using the Vertex AI Generative AI Evaluation service. Specifically, you will learn how to leverage an open judge model deployed on Vertex AI, such as `AtlaAI/Selene-1-Mini-Llama-3.1-8B`, to evaluate responses generated by another LLM against predefined criteria.

By following this tutorial, you will perform the following key steps:

- **Deploy Judge Model:** Upload and deploy the open-source Selene model to a Vertex AI Endpoint to serve as the autorater.
- **Test Prediction:** Send a sample request to the deployed Selene model to ensure it's operational.
- **Prepare Data:** Load or define an evaluation dataset containing prompts, reference answers, model responses, and corresponding human ratings.
- **Define Custom Metric:** Create a custom evaluation metric (e.g., 'Completeness') with a detailed prompt template and scoring rubric for the autorater.
- **Run Evaluation:** Execute the Vertex AI evaluation task, instructing the deployed Selene model to score the target LLM's responses based on the custom metric.
- **Evaluate the Autorater:** Compare the autorater's scores against the human ratings provided in the dataset to assess the autorater's alignment with human judgment (meta-evaluation).
- **Visualize Results:** Generate plots (distributions, confusion matrix, scatter plot) to visually analyze the agreement between the autorater and human ratings.

## Get started

### Install Vertex AI SDK and other required packages


In [None]:
%pip install --upgrade --quiet google-cloud-aiplatform

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Only Colab needs the interactive auth flow; on any other runtime this cell
# is a no-op because `google.colab` is never loaded.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Authenticate your Hugging Face account

Authenticate with Hugging Face Hub using the `interpreter_login` function from the `huggingface_hub` library, which will prompt you to enter a Hugging Face access token. This token authorizes the notebook to download the model artifacts and tokenizer.


In [None]:
# Interactive login: prompts for a Hugging Face access token (see the markdown
# above). Later cells read the token back with `huggingface_hub.get_token()`.
from huggingface_hub import interpreter_login

interpreter_login()

Read more about [Hugging Face Security](https://huggingface.co/docs/hub/en/security), specifically about [Hugging Face User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens).

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}

BUCKET_URI = f"gs://{BUCKET_NAME}"

!gsutil mb -l {LOCATION} {BUCKET_URI}

EXPERIMENT_NAME = "eval-open-judge"  # @param {type:"string"}

vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
    experiment=EXPERIMENT_NAME,
)

## Import libraries

Imports the necessary Python libraries used throughout the notebook.


In [None]:
import os
from typing import Any

from IPython.display import Markdown, display
from google.cloud import aiplatform
from huggingface_hub import get_token
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from transformers import AutoTokenizer
from vertexai.preview.evaluation import AutoraterConfig, EvalTask, PointwiseMetric
from vertexai.preview.evaluation.autorater_utils import evaluate_autorater

## Define constants

In [None]:
# Half-width of the uniform random offset added to every point of the jitter
# scatter plot (see `plot_jitter_scatter`).
JITTER_AMOUNT = 0.1
# Renderer name handed to plotly's `fig.show(renderer=...)` in `run_visual_analysis`.
# NOTE(review): hard-coded to "colab" -- confirm it renders on other runtimes
# (e.g. Vertex AI Workbench) before relying on it there.
PLOTLY_RENDERER = "colab"
# Default column names for the human ratings and the autorater scores.
DEFAULT_HUMAN_RATING_COL = "human_rating"
DEFAULT_SCORE_COL = "score"

## Define helpers

Defines helper functions to format user requests, prepare the rating data, and plot an evaluation report.


In [None]:
def format_user_content(user_content, tokenizer, **kwargs):
    """
    Applies tokenizer.apply_chat_template to a single user message.

    Args:
        user_content: Text for the 'user' role.
        tokenizer: Object exposing ``apply_chat_template(messages, **kwargs)``.
        **kwargs: Forwarded to ``apply_chat_template``. ``tokenize`` defaults to
            ``False`` and ``add_generation_prompt`` to ``True`` unless the
            caller overrides them.

    Returns:
        Whatever ``apply_chat_template`` returns, or ``None`` (after printing
        the error) when it raises.
    """
    message = [
        {"role": "user", "content": user_content},
    ]
    kwargs.setdefault("tokenize", False)
    kwargs.setdefault("add_generation_prompt", True)

    try:
        return tokenizer.apply_chat_template(message, **kwargs)
    except Exception as e:
        # str() keeps the preview from raising a second error inside the handler
        # when the content is not sliceable (e.g. None or a number).
        preview = str(user_content)[:50]
        print(f"Error applying chat template to content: '{preview}...'. Error: {e}")
        return None


def prepare_dataframe(
    raw_data: Any,
    human_col_name: str = DEFAULT_HUMAN_RATING_COL,
    score_col_name: str = DEFAULT_SCORE_COL,
) -> pd.DataFrame:
    """
    Builds a two-column numeric DataFrame of human ratings and model scores.

    Column selection is the same for every input type: if both
    ``human_col_name`` and ``score_col_name`` are present they are selected by
    name; otherwise the first two columns are taken positionally and renamed.
    Values are coerced to numeric, rows where either value is not numeric are
    dropped, and both columns are returned as ``float``. The original index
    labels of the surviving rows are kept.

    Args:
        raw_data: A DataFrame, or anything ``pd.DataFrame(...)`` accepts.
        human_col_name: Name of the human-rating column in the result.
        score_col_name: Name of the model-score column in the result.

    Returns:
        The prepared DataFrame. An empty DataFrame with the two expected
        columns is returned (after printing a message) when the input has
        fewer than two columns, is empty non-DataFrame data, or any error
        occurs.
    """
    empty_result = pd.DataFrame(columns=[human_col_name, score_col_name])

    try:
        is_frame = isinstance(raw_data, pd.DataFrame)
        df = raw_data.copy() if is_frame else pd.DataFrame(raw_data)

        if not is_frame and df.empty:
            print("Warning: Created empty DataFrame from raw_data.")
            return empty_result

        if len(df.columns) < 2:
            if is_frame:
                print(
                    f"Warning: Input DataFrame has < 2 columns. Expected at least '{human_col_name}' and '{score_col_name}'."
                )
            else:
                print(
                    f"Warning: DataFrame from raw_data has < 2 columns. Cannot set '{human_col_name}' and '{score_col_name}'."
                )
            return empty_result

        if human_col_name in df.columns and score_col_name in df.columns:
            # Select by name so column order (or extra columns) in the input
            # cannot swap ratings and scores.
            df = df[[human_col_name, score_col_name]].copy()
        else:
            # Fall back to position: first column = human rating, second = score.
            # .copy() so the assignments below never write into a view.
            df = df.iloc[:, :2].copy()
            df.columns = [human_col_name, score_col_name]

        df[human_col_name] = pd.to_numeric(df[human_col_name], errors="coerce")
        df[score_col_name] = pd.to_numeric(df[score_col_name], errors="coerce")
        df = df.dropna(subset=[human_col_name, score_col_name])
        return df.astype({human_col_name: float, score_col_name: float})

    except Exception as e:
        # Best-effort by design: the caller treats an empty frame as "stop".
        print(f"Error preparing DataFrame: {e}")
        return empty_result


def extract_completeness_metrics(
    metrics_data: list[dict[str, Any]] | None,
) -> tuple[list[list[Any]] | None, list[str] | None, list[float] | None]:
    """
    Extracts confusion matrix info from metrics data (expected at index 0).
    """
    if not metrics_data or not isinstance(metrics_data, list) or len(metrics_data) == 0:
        print("Warning: Metrics data is empty or not a list.")
        return None, None, None

    try:
        completeness_metrics = metrics_data[0]
        if (
            "confusion_matrix" not in completeness_metrics
            or "confusion_matrix_labels" not in completeness_metrics
        ):
            print(
                "Warning: 'confusion_matrix' or 'confusion_matrix_labels' not in first metrics item."
            )
            return None, None, None

        cm = completeness_metrics["confusion_matrix"]
        cm_labels = completeness_metrics["confusion_matrix_labels"]
        cm_labels_numeric = [float(cl) for cl in cm_labels]
        return cm, cm_labels, cm_labels_numeric
    except (KeyError, IndexError, ValueError, TypeError) as e:
        print(f"Warning: Could not extract confusion matrix info from metrics: {e}")
        return None, None, None


def plot_distribution_comparison(
    df: pd.DataFrame,
    human_col: str = DEFAULT_HUMAN_RATING_COL,
    score_col: str = DEFAULT_SCORE_COL,
) -> go.Figure:
    """Builds side-by-side bar charts of the human-rating and model-score value counts."""
    # (source column, trace name, bar colour) for the left and right panel, in order.
    panels = [
        (human_col, "Human Rating", "indianred"),
        (score_col, "Model Score", "lightsalmon"),
    ]
    counts_per_panel = [
        df[column].value_counts().sort_index() for column, _, _ in panels
    ]

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=("Human Rating Distribution", "Model Score Distribution"),
    )

    for position, ((_, trace_name, colour), counts) in enumerate(
        zip(panels, counts_per_panel), start=1
    ):
        bars = go.Bar(
            x=counts.index,
            y=counts.values,
            name=trace_name,
            marker_color=colour,
        )
        fig.add_trace(bars, row=1, col=position)

    # Each panel lists its own observed values as categories, in ascending order.
    human_categories, score_categories = (
        sorted(counts.index.unique()) for counts in counts_per_panel
    )

    fig.update_layout(
        title_text="Distribution of Human Ratings vs. Model Scores",
        bargap=0.2,
        xaxis1_title="Rating Value",
        yaxis1_title="Count",
        xaxis2_title="Score Value",
        yaxis2_title="Count",
        xaxis1_type="category",
        xaxis2_type="category",
        xaxis1={"categoryorder": "array", "categoryarray": human_categories},
        xaxis2={"categoryorder": "array", "categoryarray": score_categories},
        height=400,
    )
    return fig


def plot_confusion_matrix(
    cm: list[list[Any]] | None, cm_labels: list[str] | None
) -> go.Figure | None:
    """Builds a confusion-matrix heatmap, or returns None when either input is missing."""
    if cm is None or cm_labels is None:
        print("Skipping confusion matrix plot: missing data.")
        return None

    # Cell counts double as the in-cell annotation text.
    heatmap = go.Heatmap(
        z=cm,
        x=cm_labels,
        y=cm_labels,
        hoverongaps=False,
        colorscale="Blues",
        text=cm,
        texttemplate="%{text}",
        zmin=0,
    )

    # Both axes are categorical and keep the label order supplied by the caller.
    category_axis = {
        "type": "category",
        "categoryorder": "array",
        "categoryarray": cm_labels,
    }

    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title="Confusion Matrix: Human Rating vs. Model Score (Completeness)",
        xaxis_title="Predicted (Model Score)",
        yaxis_title="True (Human Rating)",
        yaxis=dict(category_axis),
        xaxis=dict(category_axis),
        height=600,
        width=600,
    )
    return fig


def plot_jitter_scatter(
    df: pd.DataFrame,
    cm_labels: list[str] | None,
    cm_labels_numeric: list[float] | None,
    human_col: str = DEFAULT_HUMAN_RATING_COL,
    score_col: str = DEFAULT_SCORE_COL,
) -> go.Figure:
    """Generates a jitter scatter plot comparing individual scores and ratings.

    Args:
        df: Frame holding numeric ``human_col`` and ``score_col`` columns.
        cm_labels: Tick labels taken from the confusion matrix, or None.
        cm_labels_numeric: Numeric positions matching ``cm_labels``, or None.
        human_col: Column plotted on the y axis.
        score_col: Column plotted on the x axis.

    Returns:
        A figure with the jittered points and a dashed y = x reference line.
        When either label list is missing, the axis range and ticks are derived
        from the values present in ``df``.
    """
    df_jitter = df[[human_col, score_col]].copy()

    if cm_labels is None or cm_labels_numeric is None:
        print(
            "Using data range for jitter plot axes due to missing confusion matrix labels."
        )
        min_val: float = (
            min(df_jitter[human_col].min(), df_jitter[score_col].min()) - 0.5
        )
        max_val: float = (
            max(df_jitter[human_col].max(), df_jitter[score_col].max()) + 0.5
        )
        plot_range = [min_val, max_val]
        # Both axes share these ticks, so include values from BOTH columns;
        # otherwise a score that no human ever gave would have no tick.
        tick_vals = sorted(set(df_jitter[human_col]) | set(df_jitter[score_col]))
        tick_text = [str(int(v)) if v == int(v) else str(v) for v in tick_vals]
    else:
        plot_range = [min(cm_labels_numeric) - 0.5, max(cm_labels_numeric) + 0.5]
        tick_vals = cm_labels_numeric
        tick_text = cm_labels

    # Add jitter so identical (score, rating) pairs do not hide each other.
    df_jitter[f"{human_col}_jitter"] = df_jitter[human_col] + np.random.uniform(
        -JITTER_AMOUNT, JITTER_AMOUNT, size=len(df_jitter)
    )
    df_jitter[f"{score_col}_jitter"] = df_jitter[score_col] + np.random.uniform(
        -JITTER_AMOUNT, JITTER_AMOUNT, size=len(df_jitter)
    )

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df_jitter[f"{score_col}_jitter"],
            y=df_jitter[f"{human_col}_jitter"],
            mode="markers",
            marker=dict(
                color="rgba(0, 100, 200, 0.7)",
                size=10,
                line=dict(width=1, color="DarkSlateGrey"),
            ),
            # Hover shows the original (un-jittered) values.
            text=[
                f"HR: {hr:.1f}, Score: {s:.1f}"
                for hr, s in zip(df_jitter[human_col], df_jitter[score_col])
            ],
            hoverinfo="text",
            name="Ratings",
        )
    )

    # Add ideal alignment line (y=x)
    fig.add_trace(
        go.Scatter(
            x=plot_range,
            y=plot_range,
            mode="lines",
            name="Ideal Alignment (Score = Human Rating)",
            line=dict(color="red", dash="dash"),
        )
    )

    fig.update_layout(
        title="Model Score vs. Human Rating (with Jitter)",
        xaxis_title="Model Score (Jittered)",
        yaxis_title="Human Rating (Jittered)",
        xaxis=dict(
            range=plot_range, tickvals=list(tick_vals), ticktext=tick_text
        ),  # tickvals expects list
        yaxis=dict(range=plot_range, tickvals=list(tick_vals), ticktext=tick_text),
        width=600,
        height=600,
        showlegend=True,
        hovermode="closest",
    )
    return fig


def _show_figure_or_note(fig: "go.Figure | None", caption: str, fallback: str) -> None:
    """Renders ``fig`` followed by ``caption``, or displays ``fallback`` when ``fig`` is None."""
    if fig is not None:
        fig.show(renderer=PLOTLY_RENDERER)
        display(Markdown(caption))
    else:
        display(Markdown(fallback))


def run_visual_analysis(
    df_data: Any,
    metrics: list[dict[str, Any]] | None,
    human_col_name: str = DEFAULT_HUMAN_RATING_COL,
    score_col_name: str = DEFAULT_SCORE_COL,
) -> None:
    """
    Runs the visual analysis comparing model scores and human ratings.

    Displays three sections in order: the two value distributions, the
    confusion-matrix heatmap, and the jitter scatter plot.

    Args:
        df_data: Rating data handed to ``prepare_dataframe``.
        metrics: Metrics list handed to ``extract_completeness_metrics``; the
            confusion matrix is read from its first entry.
        human_col_name: Name of the human-rating column.
        score_col_name: Name of the model-score column.

    Returns:
        None. Stops early, with an explanatory message, when no usable
        DataFrame can be built from ``df_data``.
    """
    display(Markdown("# Visual Analysis: Model Score vs. Human Rating Alignment "))

    # 1. Prepare Data
    df = prepare_dataframe(
        df_data, human_col_name=human_col_name, score_col_name=score_col_name
    )
    cm, cm_labels, cm_labels_numeric = extract_completeness_metrics(metrics)

    # Check if DataFrame creation failed
    if df.empty:
        display(
            Markdown(
                f"Could not create valid DataFrame with columns '{human_col_name}' and '{score_col_name}' from input data. Stopping analysis."
            )
        )
        return

    if human_col_name not in df.columns or score_col_name not in df.columns:
        display(
            Markdown(
                f"Expected columns '{human_col_name}' and '{score_col_name}' not found in DataFrame. Stopping analysis."
            )
        )
        return

    # 2. Visualization 1: Distribution Comparison
    display(Markdown("## 1. Distributions Comparison"))
    _show_figure_or_note(
        plot_distribution_comparison(
            df, human_col=human_col_name, score_col=score_col_name
        ),
        caption="*Shows if the model score distribution mirrors human ratings.*",
        fallback="*Could not generate distribution comparison plot.*",
    )
    print("\n")

    # 3. Visualization 2: Confusion Matrix Heatmap
    display(Markdown("## 2. Confusion Matrix (Human vs. Model)"))
    _show_figure_or_note(
        plot_confusion_matrix(cm, cm_labels),
        caption="*Visualizes agreement and disagreement between discrete human ratings and model scores. Ideal alignment is along the diagonal.*",
        # Emphasis markers are balanced here so the note renders as italic.
        fallback="*Confusion Matrix could not be generated (check metrics data or ensure it's at index 0).*",
    )
    print("\n")

    # 4. Visualization 3: Jitter Scatter Plot
    display(Markdown("## 3. Score vs. Rating Alignment (Jitter Plot)"))
    _show_figure_or_note(
        plot_jitter_scatter(
            df,
            cm_labels,
            cm_labels_numeric,
            human_col=human_col_name,
            score_col=score_col_name,
        ),
        caption="*Shows individual item alignment. Points close to the red dashed line indicate good agreement.*",
        fallback="*Could not generate jitter scatter plot.*",
    )

## Deploy your open judge model

Deploying an open model means making it available as a callable API endpoint within Vertex AI.

First, the `aiplatform.Model.upload(...)` function performs the registration of the model with Vertex AI, specifying its Hugging Face ID (`AtlaAI/Selene-1-Mini-Llama-3.1-8B`), the serving container image (a pre-built Text Generation Inference container), and necessary environment variables like your Hugging Face token.

In [None]:
# Environment handed to the serving container: which Hugging Face model to load,
# how many shards to use, and the token read back from the earlier login.
judge_container_env = {
    "MODEL_ID": "AtlaAI/Selene-1-Mini-Llama-3.1-8B",
    "NUM_SHARD": "1",
    "HUGGING_FACE_HUB_TOKEN": get_token(),
}

# Register the judge model with Vertex AI, served by the pre-built
# Text Generation Inference container image.
judge_model = aiplatform.Model.upload(
    display_name="google--selene-1-mini-llama-3.1-8b",
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-4.ubuntu2204.py311",
    serving_container_environment_variables=judge_container_env,
    serving_container_ports=[8080],
)
judge_model.wait()

After the model is uploaded, `judge_model.deploy(...)` deploys it to a newly created Vertex AI Endpoint, specifying the required hardware (machine type, GPU type and count). This makes the Selene model ready to receive prediction requests, in this case, requests to evaluate LLM responses.

In [None]:
# Create the endpoint first, then deploy the uploaded judge model onto it.
judge_endpoint = aiplatform.Endpoint.create(
    display_name="google--selene-1-mini-llama-3.1-8b-endpoint"
)

# One fixed replica on a g2-standard-4 machine with a single NVIDIA L4 GPU.
deployed_judge_model = judge_model.deploy(
    endpoint=judge_endpoint,
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    enable_access_logging=True,
)

## Generate predictions

You can verify that the deployed Selene model endpoint is active and responding correctly by loading the model's specific tokenizer, formatting a simple test message using the chat template, and then sending this formatted input to the endpoint via the `deployed_judge_model.predict(...)` method.


In [None]:
# Load the judge model's own tokenizer so the prompt uses its chat template.
tokenizer = AutoTokenizer.from_pretrained(
    "AtlaAI/Selene-1-Mini-Llama-3.1-8B", token=get_token()
)

messages = [dict(role="user", content="I heard you can evaluate my responses?")]

# Render the conversation to a plain prompt string ending with the generation prompt.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

generation_parameters = {"temperature": 0, "max_new_tokens": 512}
request_instance = {"inputs": inputs, "parameters": generation_parameters}

prediction = deployed_judge_model.predict(instances=[request_instance])
print(prediction.predictions[0])

## Evaluate your LLM

### Prepare the evaluation data

To evaluate your LLM, you start by preparing your evaluation data as a pandas DataFrame. The DataFrame contains the columns `user_input` (prompts), `ground_truth` (ideal answers), `assistant_response` (the actual responses from the LLM being evaluated), and `completeness/human_rating` (scores given by humans for one specific metric).


In [None]:
human_rated_dict = {
    "user_input": [
        "Researchers at the Institute for Advanced Studies have developed a new type of solar panel that boasts a 5% increase in efficiency compared to current market leaders. The innovation lies in a novel perovskite crystal structure that is both more stable and better at capturing a wider spectrum of light. Commercial production is expected within three years.",
        "Introducing the 'SilentStep' treadmill. Engineered with advanced noise-reduction technology, it allows for near-silent operation, perfect for apartment living or early morning workouts. It features 12 pre-set programs, a heart rate monitor, and folds easily for storage. Maximum user weight is 250 lbs.",
        "This study investigated the effects of intermittent fasting (IF) versus daily caloric restriction (DCR) on metabolic markers in overweight adults over 12 weeks. Both groups achieved similar weight loss. However, the IF group showed significantly better improvements in insulin sensitivity and reduction in visceral fat compared to the DCR group, suggesting potential unique metabolic benefits beyond weight loss alone.",
        "The old lighthouse stood sentinel on the cliff, its beam cutting through the thick fog rolling in from the sea. For generations, its light had guided ships safely to the harbor below. Elias, the keeper, felt the weight of that tradition as he climbed the winding stairs for his nightly duty, the rhythmic groan of the turning lens a familiar comfort.",
        "The project planning meeting concluded with action items assigned. Marketing (Jane) to finalize competitor analysis by Friday. Engineering (Tom) to provide a prototype schematic by next Wednesday. Budget approval pending confirmation from Finance (Mr. Davies). Next sync meeting scheduled for Thursday, 10 AM.",
        "To prepare the marinade, combine 1/4 cup soy sauce, 2 tablespoons honey, 1 tablespoon sesame oil, 2 minced garlic cloves, and 1 teaspoon grated ginger in a bowl. Whisk well. Add your protein (chicken, beef, or tofu) and ensure it's fully coated. Marinate for at least 30 minutes, or preferably 2 hours in the refrigerator.",
        "The Library of Alexandria, in Egypt, was one of the largest and most significant libraries of the ancient world. Flourishing under the Ptolemaic dynasty, it was dedicated to the Muses, the nine goddesses of the arts. It functioned more as a research institution, attracting scholars from across the Hellenistic world, but its eventual destruction remains a subject of debate among historians.",
        "A blockchain is a distributed, immutable ledger. Transactions are grouped into blocks, each cryptographically linked to the previous one using a hash. This chain structure, combined with decentralization across many computers, makes it extremely difficult to tamper with recorded data.",
        "Deforestation in the Amazon rainforest continues to be a major environmental concern, primarily driven by cattle ranching and agriculture. This loss of forest cover contributes significantly to global carbon emissions and biodiversity loss. Recent satellite data indicates a slight decrease in the rate of deforestation compared to the previous year, but levels remain alarmingly high.",
        "While the novel's premise was intriguing - a world where memories can be traded - the execution felt uneven. Character development was shallow, particularly for the protagonist, and the pacing dragged significantly in the middle third. However, the world-building details were imaginative and offered glimpses of a truly fascinating concept.",
    ],
    "ground_truth": [
        "A new solar panel developed by institute researchers shows a 5% efficiency gain over current leaders due to a novel, stable perovskite structure capturing more light. Commercialization is expected in three years.",
        "The 'SilentStep' treadmill offers near-silent operation suitable for shared spaces. It includes 12 programs, a heart rate monitor, easy folding for storage, and supports up to 250 lbs.",
        "A 12-week study comparing intermittent fasting (IF) and daily caloric restriction (DCR) in overweight adults found similar weight loss, but IF led to significantly better insulin sensitivity and visceral fat reduction, indicating unique metabolic advantages.",
        "An old lighthouse keeper, Elias, feels the weight of tradition as he tends the light that has guided ships through fog for generations, finding comfort in the familiar sounds of the lighthouse.",
        "Meeting takeaways: Jane (Marketing) to complete competitor analysis by Friday; Tom (Engineering) to deliver prototype schematic by next Wednesday. Budget approval awaits Finance confirmation. Next meeting: Thursday, 10 AM.",
        "Whisk together soy sauce, honey, sesame oil, minced garlic, and grated ginger for the marinade. Coat protein (chicken, beef, tofu) and marinate for at least 30 minutes (2 hours refrigerated recommended).",
        "The ancient Library of Alexandria in Egypt, significant under the Ptolemaic dynasty and dedicated to the Muses, served as a major research hub attracting scholars. Its destruction is historically debated.",
        "A blockchain is a distributed, unchangeable ledger where transactions are grouped into blocks cryptographically linked in a chain, making tampering difficult due to decentralization.",
        "Amazon deforestation, mainly from cattle ranching and agriculture, remains a critical environmental issue contributing to carbon emissions and biodiversity loss. While recent data shows a slight decrease, the rate is still alarmingly high.",
        "The novel's intriguing memory-trading premise suffered from uneven execution, with shallow character development and slow pacing, though its imaginative world-building was a highlight.",
    ],
    "assistant_response": [
        "Researchers developed a new solar panel. It is 5% more efficient. It uses a new perovskite crystal structure. This structure is stable and captures more light. It should be commercially produced in three years.",
        "This is the 'SilentStep' treadmill. It is quiet because of noise-reduction tech. Good for apartments. Has 12 programs. Has heart monitor. Folds. Max weight 250 lbs.",
        "Overweight adults were studied for 12 weeks comparing intermittent fasting (IF) and daily caloric restriction (DCR). Weight loss was similar. IF group had better insulin sensitivity and less visceral fat than DCR group. IF might have unique benefits.",
        "A lighthouse on a cliff guides ships in fog. Elias is the keeper. He climbs the stairs for his duty. He feels the tradition. The lens groans.",
        "The meeting ended. Jane will do competitor analysis by Friday. Tom will provide a schematic next Wednesday. Finance needs to approve the budget. The next meeting is Thursday at 10 AM.",
        "Make the marinade: mix 1/4c soy sauce, 2tbsp honey, 1tbsp sesame oil, 2 cloves garlic (minced), 1tsp ginger (grated). Whisk it. Put protein in. Cover it. Marinate 30+ minutes, better for 2 hours in fridge.",
        "The Library of Alexandria was a big ancient library in Egypt. It was important during the Ptolemaic rule and dedicated to the Muses. Scholars came there to research. How it was destroyed is debated by historians.",
        "Blockchain is like a shared digital book that cannot be changed easily. Information (transactions) goes into blocks. Blocks are linked using crypto hashes. Because it's spread out on many computers, changing data is very hard.",
        "Deforestation in the Amazon is a big worry. Cattle and farming are main causes. It increases carbon emissions and hurts biodiversity. Satellites show the rate decreased slightly last year, but it's still very high.",
        "The book had a cool idea about trading memories. But it wasn't done perfectly. Characters weren't deep, especially the main one. The middle part was slow. The world details were creative and showed a good concept.",
    ],
    "completeness/human_rating": ["4", "5", "5", "4", "5", "2", "4", "2", "5", "4"],
}

# Turn each raw passage into an explicit summarization instruction, then
# load the rated examples into a DataFrame for the evaluation service.
summarization_prompts = []
for passage in human_rated_dict["user_input"]:
    summarization_prompts.append(f"Summarize the following text:\n{passage}")
human_rated_dict["user_input"] = summarization_prompts

human_rated_dataset = pd.DataFrame(data=human_rated_dict)
human_rated_dataset.head()

### Define custom metrics

You define the criteria the autorater (Selene) will use to judge the LLM responses. This template instructs the autorater on its task, provides the scoring rubric (defining scores 1-5 for 'Completeness'), specifies the required output format (Reasoning and Result), and includes placeholders (`{user_input}`, `{assistant_response}`, etc.) that will be filled with data for each evaluation instance.

This template is then converted into a `PointwiseMetric` from the Vertex AI SDK, giving the metric a name (`completeness`) and linking it to the prompt template.

In [None]:
# Raw judge prompt for the "completeness" metric.
# It asks the judge for written feedback followed by an integer score from 1 to 5,
# in a fixed "**Reasoning:** / **Result:**" layout, and embeds the 1-5 rubric.
# The `{user_input}`, `{assistant_response}` and `{ground_truth}` placeholders are
# intentionally left unformatted here; they are filled in per evaluation instance.
# NOTE: the two-space indentation inside the literal is part of the prompt text.
completeness_prompt_template = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.

  Here are some rules of the evaluation:
  (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.

  Your reply should strictly follow this format:
  **Reasoning:** <Your feedback>

  **Result:** <an integer between 1 and 5>

  Here is the data:

  Instruction:
  '''
  {user_input}
  '''

  Response:
  '''
  {assistant_response}
  '''

  Score Rubrics:
  Does the response provide a sufficient explanation? Comprehensiveness and thoroughness of the response should be considered, which depends on the breadth of
  topics covered and the level of detail provided within each topic.
  Score 1: The response doesn't include any specifics or examples to support the statements
  made.
  Score 2: The response does not provide sufficient details or supportive examples, requiring
  a major effort to make the response more complete.
  Score 3: It is a decent response, but the breadth and depth of the response are rather limited.
  The details and examples used to substantiate the response may be insufficient.
  Score 4: The response provides detailed explanations, but there is room for enhancement.
  The response could be further improved by including more details and supportive examples.
  Score 5: The response fully provides comprehensive explanations. It delves deep into the
  topic, providing as much detail as possible, and it offers several examples to back up its
  points.

  Reference answer:
  {ground_truth}"""

# Present the raw prompt as a single user turn of a chat conversation.
completeness_messages = [dict(role="user", content=completeness_prompt_template)]

# Render the conversation to plain text (no token ids) and append the
# generation prompt so the rendered text ends where the judge should answer.
chat_template_options = {"tokenize": False, "add_generation_prompt": True}
completeness_prompt_template_fmt = tokenizer.apply_chat_template(
    completeness_messages, **chat_template_options
)

# Register the rendered prompt as a pointwise metric named "completeness".
completeness = PointwiseMetric(
    metric="completeness", metric_prompt_template=completeness_prompt_template_fmt
)

### Run evaluation

To run an evaluation using Gen AI Evaluation service, you define an `EvalTask`. The `EvalTask` is configured with the evaluation `dataset`, the list of `metrics` to compute (our custom 'completeness' metric), the `experiment` name for tracking, and the `autorater_config`.

The autorater configuration crucially specifies the `autorater_model` by providing the resource name of the deployed Selene endpoint.

Running the `.evaluate()` method initiates the job. This process iterates through the dataset, formats the evaluation prompt for each row using the metric template, sends it to the Selene autorater endpoint, parses the score and reasoning, and returns the results.

In [None]:
# Point the evaluation service at the deployed judge endpoint.
judge_autorater_config = AutoraterConfig(
    autorater_model=deployed_judge_model.resource_name
)

# Describe the evaluation: rated dataset, the custom metric, experiment
# name, and the bucket location passed as the output prefix.
completeness_eval_task = EvalTask(
    dataset=human_rated_dataset,
    metrics=[completeness],
    experiment=EXPERIMENT_NAME,
    autorater_config=judge_autorater_config,
    output_uri_prefix=BUCKET_URI + "/evaluation_results",
)

# Run the evaluation and keep the result for the following cells.
eval_result = completeness_eval_task.evaluate()

### Evaluate the autorater

After the autorater has generated its scores, you may want to assess how well those scores align with the pre-existing human ratings for the same metric ('completeness').

The `evaluate_autorater` utility function is used for this purpose. It takes the evaluation results table (containing both human and autorater scores) and the metric definition as input. The function calculates various comparison statistics, and a confusion matrix, which directly compares the distribution of scores between the human rater and the autorater model. These statistics help quantify the reliability of the autorater.

In [None]:
# Compare the judge's scores with the human ratings stored in the same
# metrics table, for the "completeness" metric only.
judged_rows = eval_result.metrics_table
evaluate_autorater_result = evaluate_autorater(
    evaluate_autorater_input=judged_rows,
    eval_metrics=[completeness],
)

In [None]:
# Keep only the human rating and the judge score for plotting.
rating_columns = ["completeness/human_rating", "completeness/score"]
df_data = eval_result.metrics_table[rating_columns]

# Result object produced by the autorater evaluation step.
metrics = evaluate_autorater_result.eval_result

### Visualize Judge model alignment

Visualize the comparison between the autorater's scores and the human ratings using the Plotly library. Three types of plots are generated:

1.  **Distribution Bar Charts:** Side-by-side histograms showing the frequency of each score (1-5) given by humans versus the autorater, allowing for a quick comparison of overall scoring tendencies.
2.  **Confusion Matrix Heatmap:** A heatmap visualizing the agreement and disagreement between human ratings (y-axis) and autorater scores (x-axis). Strong agreement appears along the diagonal.
3.  **Jitter Scatter Plot:** Each point represents an evaluated item, plotted by its autorater score (x) and human rating (y). A small amount of "jitter" (random noise) is added to prevent points from overlapping perfectly, making density clearer. An "Ideal Alignment" line (y=x) is included for reference; points close to this line indicate good agreement.


In [None]:
# Plot the human-vs-judge comparison from the paired ratings and the
# autorater evaluation result.
run_visual_analysis(
    df_data=df_data,
    metrics=metrics,
)

## Cleaning up

In [None]:
# Opt-in teardown switches: both are off by default, so running this cell
# as-is deletes nothing. Set a flag to True to enable that deletion.
delete_endpoint = False
delete_experiments = False

if delete_endpoint:
    # Delete the deployed judge endpoint, passing force=True.
    deployed_judge_model.delete(force=True)

if delete_experiments:
    # Delete every listed experiment whose name matches EXPERIMENT_NAME.
    for experiment in filter(
        lambda candidate: candidate.name == EXPERIMENT_NAME,
        aiplatform.Experiment.list(),
    ):
        experiment.delete()