In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate and Select Model Generation Settings | Rapid Evaluation SDK Tutorial

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/notebook_template.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fnotebook_template.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/notebook_template.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/notebook_template.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| | |
|-|-|
| Author(s) | [Jason Dai](https://github.com/jsondai), [Bo Zheng](https://github.com/coolalexzb) |

## Overview

This tutorial shows how to evaluate and select model generation settings:

* Adjust the temperature and other generation settings of Gemini on a summarization task, then compare the evaluation results of the different settings on text quality, fluency, safety, and verbosity.

## Get started

### Install Vertex AI SDK and other required packages


In [None]:
!pip3 install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Fetch the running IPython application and ask its kernel to shut down;
# the True argument requests a restart rather than a plain shutdown.
kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Authentication is only attempted on Colab, detected by the google.colab
# module already being loaded in this interpreter.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type:"string", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Keep None (not the string "None") when GOOGLE_CLOUD_PROJECT is unset, so that
    # vertexai.init is not pointed at a project literally named "None".
    # NOTE(review): with project=None the SDK is expected to infer the project from
    # the ambient credentials — confirm against the vertexai.init documentation.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region falls back to us-central1 when GOOGLE_CLOUD_REGION is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [None]:
# Standard library
import html
import inspect
import json
import logging
import random
import string
import warnings
from uuid import uuid4

# Third-party
from IPython.display import display, Markdown, HTML
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
from tqdm import tqdm

# Google Cloud / Vertex AI
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, HarmCategory, HarmBlockThreshold

from vertexai.evaluation import (
    EvalTask,
    CustomMetric,
    PointwiseMetric,
    PairwiseMetric,
    PointwiseMetricPromptTemplate,
    PairwiseMetricPromptTemplate,
    MetricPromptTemplateExamples,
)

# google.colab can only be imported on Colab; guard the import so this cell also runs
# on other runtimes (for example Vertex AI Workbench).
try:
    from google.colab import auth
except ImportError:
    pass

In [None]:
# Keep the notebook output readable: ignore all Python warnings and only let
# urllib3 connection-pool log records at ERROR level or above through.
warnings.filterwarnings("ignore")

_pool_logger = logging.getLogger("urllib3.connectionpool")
_pool_logger.setLevel(logging.ERROR)

### Helper Functions

In [None]:
def generate_uuid(length: int = 8) -> str:
    """Return a random identifier that is ``length`` characters long (default 8).

    The characters are drawn, with replacement, from the lowercase ASCII letters
    and the digits using the ``random`` module.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def print_doc(function):
    """Print the name of ``function`` followed by its docstring as cleaned by ``inspect.getdoc``."""
    name = function.__name__
    doc = inspect.getdoc(function)
    print(f"{name}:\n{doc}\n")


def display_eval_report(eval_result, metrics=None):
    """Render one evaluation result as Markdown headings followed by two tables.

    Args:
        eval_result: A ``(title, summary_metrics, report_df)`` triple, where
            ``summary_metrics`` is a mapping of metric name to value and
            ``report_df`` is a DataFrame of per-row results.
        metrics: Optional list of substrings. When given (and non-empty), only the
            columns whose name contains at least one of them are kept in both tables.
    """
    title, summary_metrics, report_df = eval_result

    # Turn the summary mapping into a single-row frame: one column per metric.
    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T

    if metrics:

        def matching(columns):
            # Columns whose name contains any of the requested substrings.
            return [
                column
                for column in columns
                if any(wanted in column for wanted in metrics)
            ]

        metrics_df = metrics_df.filter(matching(metrics_df.columns))
        report_df = report_df.filter(matching(report_df.columns))

    # Title first, then the summary table, then the per-row table.
    display(Markdown(f"## {title}"))

    display(Markdown("### Summary Metrics"))
    display(metrics_df)

    display(Markdown("### Report Metrics"))
    display(report_df)


def display_explanations(df, metrics=None, n=1):
    """Display up to ``n`` randomly sampled rows of ``df``, one HTML section per column.

    Args:
        df: DataFrame of per-row evaluation results.
        metrics: Optional list of substrings. When given (and non-empty), only the
            columns "context", "reference", "completed_prompt" and "response" (those
            that exist) plus every column whose name contains one of the substrings
            are shown.
        n: Number of rows to sample. Capped at ``len(df)``, so asking for more rows
            than are available shows all of them instead of raising ``ValueError``.
            ``None`` is passed through to ``DataFrame.sample`` unchanged.
    """
    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"

    sample_size = n if n is None else min(n, len(df))
    df = df.sample(n=sample_size)

    if metrics:
        df = df.filter(
            ["context", "reference", "completed_prompt", "response"]
            + [
                column
                for column in df.columns
                if any(wanted in column for wanted in metrics)
            ]
        )

    for _, row in df.iterrows():
        for col in df.columns:
            # Cell values are free text (prompts, model responses, explanations):
            # escape them so "<", ">" and "&" are shown literally instead of being
            # interpreted as markup.
            label = html.escape(str(col))
            value = html.escape(str(row[col]))
            display(HTML(f"<h2>{label}:</h2> <div style='{style}'>{value}</div>"))
        display(HTML("<hr>"))


def plot_radar_plot(eval_results, metrics=None):
    """Overlay the summary metrics of several evaluation results on one radar chart.

    Args:
        eval_results: Iterable of ``(title, summary_metrics, report_df)`` triples.
            Only the title (used as the trace name) and the summary-metric mapping
            are used.
        metrics: Optional list of substrings. When given (and non-empty), only the
            summary metrics whose name contains one of them are plotted.
    """
    radar = go.Figure()

    for title, summary_metrics, _report_df in eval_results:
        if metrics:
            summary_metrics = {
                name: value
                for name, value in summary_metrics.items()
                if any(wanted in name for wanted in metrics)
            }

        # One filled polygon per evaluation result.
        trace = go.Scatterpolar(
            r=list(summary_metrics.values()),
            theta=list(summary_metrics.keys()),
            fill="toself",
            name=title,
        )
        radar.add_trace(trace)

    # The radial axis is fixed to the 0-5 range for every trace.
    radar.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])), showlegend=True
    )

    radar.show()


def plot_bar_plot(eval_results, metrics=None):
    """Show the summary metrics of several evaluation results as a grouped bar chart.

    Args:
        eval_results: Iterable of ``(title, summary_metrics, report_df)`` triples.
            Only the title (used as the series name) and the summary-metric mapping
            are used.
        metrics: Optional list of substrings. When given (and non-empty), only the
            summary metrics whose name contains one of them are plotted.
    """
    bars = []

    for title, summary_metrics, _ in eval_results:
        if metrics:
            summary_metrics = {
                name: value
                for name, value in summary_metrics.items()
                if any(wanted in name for wanted in metrics)
            }

        bars.append(
            go.Bar(
                x=list(summary_metrics.keys()),
                y=list(summary_metrics.values()),
                name=title,
            )
        )

    # Build the figure once, from the collected series (no throwaway empty figure).
    fig = go.Figure(data=bars)

    # Place the bars of the different results side by side per metric.
    fig.update_layout(barmode="group")
    fig.show()


def print_aggregated_metrics(job):
    """Display the ``rougeLSum`` score of ``job`` as a percentage with one decimal.

    Args:
        job: Object exposing a numeric ``rougeLSum`` attribute (presumably a ratio in
            [0, 1] — verify against the caller).
    """
    # Scale first and round last: rounding the ratio and then multiplying by 100
    # reintroduces binary floating-point noise (0.29 * 100 == 28.999999999999996).
    rougeLSum = round(job.rougeLSum * 100, 1)
    display(
        HTML(
            f"<h3>The {rougeLSum}% of the reference summary is represented by LLM when considering the longest common subsequence (LCS) of words.</h3>"
        )
    )


def print_pairwise_judgments(df, n=3, metric="pairwise_summarization_quality"):
    """Display up to ``n`` randomly sampled pairwise judgments from ``df``.

    A sampled row is shown only when its ``"{metric}/confidence"`` value is at least
    0.5; rows below that are skipped, so fewer than ``n`` judgments may appear.

    Args:
        df: DataFrame with the columns "context", "baseline_model_response",
            "response", "{metric}/explanation" and "{metric}/confidence".
        n: Number of rows to sample. Capped at ``len(df)``, so frames with fewer
            than ``n`` rows are shown in full instead of raising ``ValueError``.
            ``None`` is passed through to ``DataFrame.sample`` unchanged.
        metric: Prefix of the explanation and confidence column names.
    """
    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"

    sample_size = n if n is None else min(n, len(df))
    df = df.sample(n=sample_size)

    for _, row in df.iterrows():
        if row[f"{metric}/confidence"] >= 0.5:
            sections = (
                ("Context", row["context"]),
                ("Baseline Model Response", row["baseline_model_response"]),
                ("Candidate Model Response", row["response"]),
                ("Explanation", row[f"{metric}/explanation"]),
                ("Confidence score", row[f"{metric}/confidence"]),
            )
            for heading, value in sections:
                # Values are free text (articles, model output): escape them so
                # "<", ">" and "&" are shown literally instead of acting as markup.
                escaped = html.escape(str(value))
                display(
                    HTML(f"<h2>{heading}:</h2> <div style='{style}'>{escaped}</div>")
                )
            display(HTML("<hr>"))


def print_pairwise_win_metrics(scores, metric="pairwise_summarization_quality"):
    """Display the candidate model's win rate as a whole-number percentage.

    Args:
        scores: Mapping that contains the key ``"{metric}/candidate_model_win_rate"``.
        metric: Prefix of the win-rate key.
    """
    win_rate_key = f"{metric}/candidate_model_win_rate"
    candidate_model_win_rate = round(scores[win_rate_key] * 100)

    message = f"<h3>Win rate: Pairwise Autorater prefers {candidate_model_win_rate}% of time Candidate Model over Baseline Model</h3>"
    display(HTML(message))

## Define an Evaluation Dataset

In [None]:
# Instruction that is prepended to every article to form the model prompt.
instruction = "Summarize the following article: \n"

# Articles to summarize; context[i] pairs with reference[i] below.
context = [
    "Typhoon Phanfone has killed at least one person, a US airman on Okinawa who was washed away by high waves. Thousands of households have lost power and Japan's two largest airlines have suspended many flights. The storm also forced the suspension of the search for people missing after last week's volcanic eruption. The storm-tracking website Tropical Storm Risk forecasts that Phanfone will rapidly lose power over the next few hours as it goes further into the Pacific Ocean. Typhoon Phanfone was downgraded from an earlier status of a super typhoon, but the Japan Meteorological Agency had warned it was still a dangerous storm. Japan averages 11 typhoons a year, according to its weather agency. The typhoon made landfall on Monday morning near the central city of Hamamatsu, with winds of up to 180 km/h (112 mph). The airman was one of three US military personnel swept away by high waves whipped up by the typhoon off southern Okinawa island, where the US has a large military base. The remaining two are still missing. A police spokesman said they had been taking photographs of the sea. A university student who was surfing off the seas of Kanagawa Prefecture, south of Tokyo, was also missing, national broadcast NHK reports. It said at least 10 people had been injured and 9,500 houses were without power. The storm was expected to deposit about 100mm of rain on Tokyo over 24 hours, according to the Transport Ministry website. Many schools were closed on Monday and two car companies in Japan halted production at some plants ahead of the storm. More than 174 domestic flights were affected nationwide, NHK state broadcaster said on Sunday. On Sunday, heavy rain delayed the Japanese Formula One Grand Prix in Suzaka. French driver Jules Bianchi lost control in the wet conditions and crashed, sustaining a severe head injury.",
    "The blaze started at the detached building in Drivers End in Codicote, near Welwyn, during the morning. There was another fire at the building 20 years ago, after which fire-proof foil was placed under the thatch, which is protecting the main building. More than 15 fire engines and support vehicles were called to tackle the blaze. Roads in the area were closed and traffic diverted.",
    'The 18-year-old fell at the New Charter Academy on Broadoak Road in Ashton-under-Lyne at about 09:10 BST, Greater Manchester Police (GMP) said. GMP said he had gone to Manchester Royal Infirmary and his condition was "serious". Principal Jenny Langley said the school would remain "fully open" while police investigated. "Our thoughts are with the family and we\'re doing everything we can to support them along with staff and pupils," she said.',
    'But Belgian-born Dutchman Max Verstappen was unable to drive a car legally on his own in either country. That all changed on Wednesday when the youngster turned 18 and passed his driving test at the first attempt. Despite having competed in 14 grands prix since his debut in Australia in March, Verstappen admitted to feeling the pressure during his test. "It\'s a relief," said the Toro Rosso driver, who finished ninth in Japan on Sunday and had only started driving lessons a week ago. "I was a bit nervous to make mistakes, but the exam went well." A bonus of turning 18 is that Verstappen will now be able to drink the champagne if he ever makes it onto the podium.',
]

# One reference summary per article, in the same order as `context`.
reference = [
    "A powerful typhoon has brought many parts of Japan to a standstill and briefly battered Tokyo before heading out to sea.",
    "A major fire has been burning in the thatched roof of a large property in Hertfordshire.",
    "A student has been taken to hospital after falling from a balcony at a Greater Manchester school.",
    "He is Formula 1's youngest ever driver and in charge of a car that can reach over 200mph.",
]

# Evaluation dataset: the full prompt (instruction + article) and its reference summary.
eval_dataset = pd.DataFrame(
    {
        "prompt": [f"{instruction}{article}" for article in context],
        "reference": reference,
    }
)

### Evaluate Gemini Model with different Generation Settings

In [None]:
experiment_name = "eval-model-generation-settings"  # @param {type:"string"}

# Short alias for the namespace holding the predefined pointwise metric examples.
_pointwise = MetricPromptTemplateExamples.Pointwise

# One evaluation task, reused for every generation setting: same dataset,
# same four pointwise metrics, same experiment.
summarization_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[
        _pointwise.TEXT_QUALITY,
        _pointwise.FLUENCY,
        _pointwise.SAFETY,
        _pointwise.VERBOSITY,
    ],
    experiment=experiment_name,
)

In [None]:
# Three generation settings to compare: the output-token limit and the temperature
# both increase from setting 1 to setting 3.
generation_config_1 = dict(max_output_tokens=64, temperature=0.1)
generation_config_2 = dict(max_output_tokens=128, temperature=0.3)
generation_config_3 = dict(max_output_tokens=256, temperature=0.8)

# Every listed harm category gets the same BLOCK_NONE threshold.
safety_settings = {
    category: HarmBlockThreshold.BLOCK_NONE
    for category in (
        HarmCategory.HARM_CATEGORY_UNSPECIFIED,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    )
}

# The same model is used for all three settings; only the generation config differs.
MODEL_NAME = "gemini-1.5-pro"

gemini_1, gemini_2, gemini_3 = (
    GenerativeModel(
        MODEL_NAME,
        generation_config=config,
        safety_settings=safety_settings,
    )
    for config in (generation_config_1, generation_config_2, generation_config_3)
)

In [None]:
# Label -> model for every generation setting; the label is used in the
# experiment run names and report titles.
models = dict(
    zip(
        ("gemini-setting-1", "gemini-setting-2", "gemini-setting-3"),
        (gemini_1, gemini_2, gemini_3),
    )
)

In [None]:
# Collected as (title, summary_metrics, metrics_table) triples, one per model.
eval_results = []
# A single random suffix shared by all runs started from this cell.
run_id = generate_uuid()

# Iterate the label/model pairs directly; no index is needed, and `_` (IPython's
# last-output variable) is no longer rebound by the loop.
for model_name, model in tqdm(models.items(), total=len(models)):
    experiment_run_name = f"eval-{model_name}-{run_id}"

    eval_result = summarization_eval_task.evaluate(
        model=model,
        experiment_run_name=experiment_run_name,
    )

    eval_results.append(
        (f"Model {model_name}", eval_result.summary_metrics, eval_result.metrics_table)
    )

### Evaluation Results

In [None]:
# One report (title, summary table, per-row table) per evaluated setting.
for report_bundle in eval_results:
    display_eval_report(report_bundle)

In [None]:
# For each evaluated setting, show one sampled row restricted to the fluency columns.
for _title, _summary_metrics, metrics_table in eval_results:
    display_explanations(metrics_table, metrics=["fluency"])

In [None]:
# Show the runs recorded for this evaluation task.
# NOTE(review): presumably lists the experiment runs started by the evaluation loop —
# confirm against the EvalTask.display_runs documentation.
summarization_eval_task.display_runs()