In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

 # Vertex AI Model Garden - TranslationLLM Translation and Evaluation (Demo)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Ftranslation_and_evaluation_demo.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/translation_and_evaluation_demo.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

In this tutorial, you will learn how to use the *Vertex AI Python SDK* to generate translation responses and then use the *Gen AI Evaluation Service* to measure the translation quality of your LLM responses using [BLEU](https://en.wikipedia.org/wiki/BLEU), [MetricX](https://github.com/google-research/metricx) and [COMET](https://unbabel.github.io/COMET/html/index.html).

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Getting Started

### Install the Vertex AI Python SDK for the Gen AI Evaluation Service and the Cloud Translation Python client

In [None]:
# Install or upgrade the Vertex AI SDK (with its `evaluation` extra) and the Cloud
# Translation client into the user site-packages; `--quiet` reduces pip's output.
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation] google-cloud-translate

In [None]:
# @title Import libraries
import os

import pandas as pd
import vertexai
from google.cloud import aiplatform, translate_v3
from IPython.display import Markdown, display
from vertexai import evaluation
from vertexai.evaluation.metrics import pointwise_metric

### Define Google Cloud Project Information

In [None]:
# Read the project ID and region from environment variables.
# Each lookup raises KeyError if its variable is not set.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
# @markdown If you want to use a different region, please make sure the region is supported by Vertex AI Evaluation.
# @markdown Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#eval-locations.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# @markdown **[Optional]** Set the experiment name for your experiment.
# Passed to the evaluation task as its `experiment` argument.
EXPERIMENT_NAME = "my-eval-task-experiment"  # @param {type:"string"}

### Initialize Vertex AI SDK and Google Cloud Translation client.

In [None]:
# Module-level Cloud Translation client; `translate_text` uses it for every request.
client = translate_v3.TranslationServiceClient()
# Configure the Vertex AI SDK with the project and region read from the environment.
vertexai.init(project=PROJECT_ID, location=REGION)

## Helper Functions

In [None]:
# @title Display evaluation result.
def display_eval_result(eval_result, metrics=None, model_name=None, rows=0):
    """Display the summary metrics of an evaluation result, plus optional row samples.

    Args:
        eval_result: Object exposing `summary_metrics` (a dict of metric name to
          value) and `metrics_table` (a pandas DataFrame of per-row results).
        metrics: Optional iterable of metric selectors. A column is kept when any
          selector's name is a substring of the column name. Selectors may be
          strings or metric objects; objects are converted with `str()`
          (assumes `str(metric)` yields the metric's name -- TODO confirm for
          custom metric classes). When empty or None, nothing is filtered.
        model_name: Optional label shown in a heading above the results.
        rows: Number of leading rows of the per-row table to display; the
          per-row table is skipped when this is not greater than 0.
    """
    if model_name is not None:
        display(Markdown(f"## Eval Result for {model_name}"))

    summary_df = pd.DataFrame.from_dict(
        eval_result.summary_metrics, orient="index"
    ).T
    metrics_table = eval_result.metrics_table

    if metrics:
        # Normalise selectors to strings so that a list mixing metric names and
        # metric objects (as the evaluation task accepts) can be passed as-is;
        # a non-string left operand would make the `in` test below raise TypeError.
        selected_names = [str(metric) for metric in metrics]

        def _is_selected(column):
            """Return True when any selected metric name occurs in `column`."""
            return any(name in column for name in selected_names)

        summary_df = summary_df.filter(
            [column for column in summary_df.columns if _is_selected(column)]
        )
        metrics_table = metrics_table.filter(
            [column for column in metrics_table.columns if _is_selected(column)]
        )

    # Display the summary metrics
    display(Markdown("### Summary Metrics"))
    display(summary_df)
    if rows > 0:
        # Display samples from the metrics table
        display(Markdown("### Row-based Metrics"))
        display(metrics_table.head(rows))

In [None]:
# @title Translate text.
def translate_text(
    text: str,
    source_language_code: str,
    target_language_code: str,
) -> translate_v3.TranslateTextResponse:
    """Translate `text` with the Translation LLM model and print the result.

    Args:
        text: The content to translate.
        source_language_code: The language code for the text.
        target_language_code: The language code for the translation. E.g. "fr" for
          French, "es" for Spanish, etc. Available languages:
          https://cloud.google.com/translate/docs/languages#neural_machine_translation_model

    Returns:
        The response returned by `client.translate_text`. The translated strings
        are available as `response.translations[i].translated_text`.
    """
    # The request location is fixed to us-central1 and does not follow REGION.
    parent = f"projects/{PROJECT_ID}/locations/us-central1"
    # @markdown Translate text from `source_language_code` to `target_language_code` (your chosen languages) using the Translate LLM model.
    # @markdown 1. Translate LLM is available in us-central1.
    # @markdown 2. Supported mime types are listed in https://cloud.google.com/translate/docs/supported-formats.
    response = client.translate_text(
        contents=[text],
        target_language_code=target_language_code,
        parent=parent,
        mime_type="text/plain",
        source_language_code=source_language_code,
        model=f"{parent}/models/general/translation-llm",  # Use Translate LLM.
    )

    # Display the translation for each input text provided
    for translation in response.translations:
        print(f"Translated text: {translation.translated_text}")
    # Example response:
    # Translated text: Bonjour comment vas-tu aujourd'hui?

    return response

## Getting Translations

In [None]:
# @title Try out a translation.
# The full response object gets its own name so it is not confused with the
# `translations` list of strings that the following cell builds.
sample_response = translate_text(
    text="Dem Feuer konnte Einhalt geboten werden",
    source_language_code="de",
    target_language_code="en",
)
sample_response

In [None]:
# @title Generate translations.

# Source sentences to translate.
sources = [
    "Dem Feuer konnte Einhalt geboten werden",
    "Schulen und Kindergärten wurden eröffnet.",
]

# Translate every sentence from "de" to "en", keeping only the first translated
# string of each response.
translations = [
    translate_text(
        text=sentence, target_language_code="en", source_language_code="de"
    )
    .translations[0]
    .translated_text
    for sentence in sources
]

translations

## Evaluating Your Translations

In [None]:
# @title Prepare evaluation dataset.

# Reference translations sent for evaluation, listed in the same order as `sources`.
references = [
    "They were able to control the fire.",
    "Schools and kindergartens opened",
]

# One row per sentence: the original text, the generated translation and its reference.
eval_dataset = pd.DataFrame(
    dict(
        source=sources,
        response=translations,
        reference=references,
    )
)

### Set up eval metrics for your data.

You can evaluate the translation quality of your data generated from an LLM using any of the metrics below.

- [BLEU](https://en.wikipedia.org/wiki/BLEU):\
BLEU calculates a score from 0 to 1 based on how many matching words and phrases appear in a machine translation compared to a human reference, with higher scores indicating better quality.

- [COMET](https://unbabel.github.io/COMET/html/index.html):\
COMET uses a neural network to produce a score typically between 0 and 1, reflecting the similarity between a machine translation and a human reference, where higher scores mean better quality.

- [MetricX](https://github.com/google-research/metricx):\
MetricX is an LLM-based evaluation metric for translation quality measurement that aims at matching the Multidimensional Quality Metrics (MQM) score range of 0 (best) to 25 (worst). It is a newer and improved version of BLEURT-X that was released publicly by Google.

See the [documentation](https://github.com/googleapis/python-aiplatform/blob/main/vertexai/evaluation/metrics/pointwise_metric.py) for more information about supported COMET and MetricX versions.

In [None]:
# Metrics handed to the evaluation task: the string "bleu" plus COMET and MetricX
# metric objects, both constructed with no arguments (i.e. with the SDK's defaults).
metrics = [
    "bleu",
    pointwise_metric.Comet(),
    pointwise_metric.MetricX(),
]

### Run evaluation

With the evaluation dataset and metrics defined, you can run evaluation for an `EvalTask` on different models and applications, and many other use cases.

In [None]:
# Bundle the dataset and metrics into an evaluation task tied to EXPERIMENT_NAME.
eval_task = evaluation.EvalTask(
    dataset=eval_dataset, metrics=metrics, experiment=EXPERIMENT_NAME
)
# Run the evaluation; the result is displayed next and its metadata is read
# again by the clean-up cell.
eval_result = eval_task.evaluate()

You can view the summary metrics and row-based metrics for each response in the `EvalResult`.


In [None]:
# Show the summary metrics and the first two rows of the per-row metrics table.
display_eval_result(eval_result, rows=2)

## Clean up

In [None]:
# @title Delete ExperimentRun
# @markdown Tick the box to delete the experiment run created by the evaluation above.
delete_experiment = False  # @param {type:"boolean"}
if delete_experiment:
    # The run name and experiment are read from the evaluation result's metadata.
    run_to_delete = aiplatform.ExperimentRun(
        run_name=eval_result.metadata["experiment_run"],
        experiment=eval_result.metadata["experiment"],
    )
    run_to_delete.delete()