In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Supervised Fine-tuning for Text Classification with Gemini

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/tuning/gemini_supervised_finetuning_text_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Ftuning%2Fgemini_supervised_finetuning_text_classification.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/tuning/gemini_supervised_finetuning_text_classification.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/tuning/gemini_supervised_finetuning_text_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| | | |
|-|-|-|
|Author(s) | [Gabriela Hernandez Larios](https://github.com/gabrielahrlr) | [Elia Secchi](https://github.com/eliasecchig)|


## Overview

This tutorial demonstrates how to perform text classification with Gemini models, covering both in-context learning (zero-shot and few-shot prompting) and in-weights learning (fine-tuning Gemini models).

### Objective
We'll cover the development cycle, from preparing the dataset to setting up an evaluation framework for text classification tasks with Gemini. Additionally, you'll learn how to create and log experiments, adapt Gemini models to the text classification task with in-context and in-weights (fine-tuning) learning approaches, and compare their performance.

This tutorial uses the following Google Cloud ML Services and Resources:

- Google Cloud Storage
- Vertex AI Experiments
- Vertex AI Fine-Tuning
- Gemini 1.0 Pro

The steps performed include:
- [Load and split dataset](#scrollTo=EdvJRUWRNGHE&line=1&uniqifier=1)
- [Evaluation and Experiment Setup](#scrollTo=c2YOsromfcuB&line=6&uniqifier=1)
- [In-Context learning (zero-shot and few-shot) using Gemini Models](#scrollTo=EfKnRU-SfcuB)
- [Fine-tuning Gemini 1.0 Pro for text classification](#scrollTo=Qs9eHiL5fcuD)
- Comparative Evaluation
- [[Optional] Heuristics for computing Confidence Scores](#scrollTo=KW7wPWQWuQT4)

### Dataset
The [BBC News dataset](http://mlg.ucd.ie/datasets/bbc.html) consists of 2225 articles from the BBC news website corresponding to five topical areas: business, entertainment, politics, sport, and tech.  This dataset was downloaded from http://mlg.ucd.ie/datasets/bbc.html

**Dataset Citation**

```
@inproceedings{greene06icml,
	Author = {Derek Greene and P\'{a}draig Cunningham},
	Booktitle = {Proc. 23rd International Conference on Machine learning (ICML'06)},
	Pages = {377--384},
	Publisher = {ACM Press},
	Title = {Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering},
	Year = {2006}}
```

## Installation

### Install Vertex AI SDK for Python and other required packages


In [None]:
# Install or upgrade the Vertex AI SDK and the extra libraries this notebook imports.
# NOTE(review): pandas, scikit-learn, tqdm and google-cloud-storage are imported below
# but not installed here — presumably they are preinstalled in the runtime; confirm.
%pip install --upgrade --user --quiet google-cloud-aiplatform datasets backoff multiprocess gcsfs

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only Colab needs an explicit kernel restart to pick up the freshly installed packages.
if "google.colab" in sys.modules:
    import IPython

    # Ask the running kernel to shut down; the `True` argument requests a restart.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
# `sys` is re-imported because the kernel restart above cleared the namespace.
import sys

# Interactive Google authentication is only available (and only needed) on Colab.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Import libraries


In [4]:
from collections import Counter
import json
from typing import Any, Callable, Dict, List, Optional, Union

# Data Handling and Processing
from datasets import load_dataset
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
import pandas as pd
import gcsfs
from google.cloud import storage

# Google Cloud Libraries
from google.api_core.exceptions import ResourceExhausted
from google.cloud import aiplatform
import vertexai
from vertexai.generative_models import (
    GenerativeModel,
    GenerationConfig,
    HarmBlockThreshold,
    HarmCategory,
)
from vertexai.preview.tuning import sft

# Multiprocessing
import multiprocess as mp
from tqdm import tqdm
import backoff
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import traceback

### Set Google Cloud project information, initialize Vertex AI SDK for Python and create a GCS bucket

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Replace the placeholder with your own Google Cloud project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Point the Vertex AI SDK at the project/region used by every call below.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Replace the placeholder with the name of your Cloud Storage bucket (without "gs://").
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}
# Full gs:// URI built from the bucket name.
BUCKET_URI = f"gs://{BUCKET_NAME}"

**Warning:** Run the following cell only if your bucket doesn't already exist; it creates your Cloud Storage bucket.


In [None]:
# Create the bucket in LOCATION under PROJECT_ID (skip this cell if the bucket already exists).
!gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents, for example by running `!gsutil ls -al $BUCKET_URI` in a new cell.

### Helper Functions

#### Batch Prediction - Helper functions

These helper functions streamline batch predictions using parallelization and multithreading with online Gemini Models. Gemini also offers the possibility to [perform batch text generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/batch-prediction-gemini) in **Public Preview** (July 2024). 

In [43]:
def backoff_hdlr(details) -> None:
    """
    Report a retry that is about to happen after a backoff event.

    Args:
        details: A mapping describing the backoff event. Only the "wait" entry
            (printed with one decimal, as seconds) and the "tries" entry are read.
    """
    wait_seconds = details["wait"]
    attempts = details["tries"]
    print(f"Backing off {wait_seconds:.1f} seconds after {attempts} tries")


def log_error(msg: str, *args: Any) -> None:
    """
    Record an error on the `multiprocess` logger, then abort by raising.

    Args:
        msg: The error message; it is also used as the text of the raised exception.
        *args: Extra positional arguments forwarded to the logger's `error` call.

    Raises:
        Exception: Always, carrying `msg`.
    """
    logger = mp.get_logger()
    logger.error(msg, *args)
    raise Exception(msg)


def handle_exception_threading(f: Callable) -> Callable:
    """
    A decorator that handles exceptions in a threaded environment.

    Any `Exception` raised by the wrapped function is reported through `log_error`
    together with its formatted traceback (`log_error` then raises). Exceptions that
    do not derive from `Exception` (e.g. `KeyboardInterrupt`, `SystemExit`) are
    deliberately left alone so the notebook can still be interrupted.

    Args:
        f: The function to decorate.

    Returns:
        The decorated function, carrying the name and docstring of `f`.
    """
    # Local import: only `partial` is imported from functools at file level.
    from functools import wraps

    @wraps(f)
    def applicator(*args: Any, **kwargs: Any) -> Any:
        try:
            return f(*args, **kwargs)
        except Exception:
            # Capture the traceback here, while the exception is still active.
            log_error(traceback.format_exc())

    return applicator


@handle_exception_threading
@backoff.on_exception(
    backoff.expo, ResourceExhausted, max_tries=30, on_backoff=backoff_hdlr
)
def _predict_message(message: str, model: GenerativeModel) -> Optional[str]:
    """
    Send a single message to the model and return the text of its response.

    Calls that raise `ResourceExhausted` are retried with exponential backoff
    (at most 30 tries; every retry is reported through `backoff_hdlr`). Other
    errors raised by the call are handled by `handle_exception_threading`, which
    logs the traceback via `log_error` and raises a generic `Exception`.

    Args:
        message: The message to predict.
        model: The GenerativeModel to use for prediction.

    Returns:
        The `text` attribute of the model response.
        NOTE(review): the annotation allows None, but no branch of this function
        returns None explicitly — failures surface as exceptions instead.
    """
    # Non-streaming call: the full response is returned in one piece.
    response = model.generate_content([message], stream=False)
    return response.text


def batch_predict(
    messages: List[str], model: GenerativeModel, max_workers: int = 4
) -> List[Optional[str]]:
    """
    Run `_predict_message` over a list of messages using a thread pool.

    Args:
        - messages: list of all messages to predict
        - model: model to use for predicting.
        - max_workers: number of worker threads used for the parallel calls

    Returns:
        - list with one prediction per message, in the same order as `messages`

    """
    predict_one = partial(_predict_message, model=model)
    with ThreadPoolExecutor(max_workers) as executor:
        # `map` yields results in input order; tqdm only adds a progress bar.
        ordered_results = executor.map(predict_one, messages)
        return list(tqdm(ordered_results, total=len(messages)))

#### Vertex AI Experiment Helper
We will define a `VertexAIExperimentManager` class to simplify the creation, logging and runs management of experiments using Vertex AI Experiments.

In [1]:
class VertexAIExperimentManager:
    """
    A class for managing experiments and runs in Vertex AI.

    It wraps the `aiplatform` calls used in this notebook to select an experiment,
    log a run (parameters + metrics) and retrieve the experiment data.
    """

    def __init__(self, project: str, location: str) -> None:
        """
        Store the project/location used for later `aiplatform.init` calls.

        Args:
            project: Google Cloud project ID.
            location: Google Cloud region.
        """
        self.project = project
        self.location = location
        # Name of the experiment selected by `init_experiment`; None until then.
        self.current_experiment = None

    def init_experiment(
        self, experiment_name: str, experiment_description: Optional[str] = None
    ) -> None:
        """
        Initialize or switch to a specific experiment.

        Args:
            experiment_name: Name of the experiment to select.
            experiment_description: Optional description passed to `aiplatform.init`.
        """
        self.current_experiment = experiment_name
        aiplatform.init(
            experiment=experiment_name,
            experiment_description=experiment_description,
            experiment_tensorboard=False,
            project=self.project,
            location=self.location,
        )

    def create_experiment(
        self, experiment_name: str, experiment_description: Optional[str] = None
    ) -> None:
        """Create an Experiment on Vertex AI Experiments (delegates to `init_experiment`)."""
        self.init_experiment(experiment_name, experiment_description)

    def log_run(
        self, run_name: str, params: Dict[str, Any], metrics: Dict[str, Any]
    ) -> None:
        """
        Log experiment run data to Vertex AI Experiments.

        Args:
            run_name: Name of the run to start.
            params: Parameters logged with `aiplatform.log_params`.
            metrics: Metrics logged with `aiplatform.log_metrics`.

        Raises:
            ValueError: If no experiment has been initialized yet.
        """
        if not self.current_experiment:
            raise ValueError("No experiment initialized. Call init_experiment first.")

        aiplatform.start_run(run=run_name)
        try:
            aiplatform.log_params(params)
            aiplatform.log_metrics(metrics)
        finally:
            # Always close the run, even when logging fails, so that a failed
            # attempt does not leave an open run behind for the next cell.
            aiplatform.end_run()

    def get_experiments_data_frame(self) -> Optional[pd.DataFrame]:
        """
        Retrieve a DataFrame of experiment data from Vertex AI Experiments.

        Raises:
            ValueError: If no experiment has been initialized yet.
        """
        if not self.current_experiment:
            raise ValueError("No experiment initialized. Call init_experiment first.")

        return aiplatform.get_experiment_df()

####  Helper Functions for Data Transformation 
This section contains helper functions designed to transform data from different formats into the specific format required for fine-tuning Gemini models on Vertex AI. These functions handle:

- Pandas DataFrames
- CSV files previously used for training AutoML text classifiers
- JSONL files previously used for training AutoML text classifiers

It also includes a function to validate the transformed dataset, ensuring it adheres to the correct format and roles for Gemini fine-tuning on Vertex AI.

In [None]:
def create_gemini_messages(
    text: str, label: str, system_prompt: Optional[str] = None
) -> dict:
    """
    Build one tuning example in the "messages" format used in this notebook.

    Args:
        text: The input text; stored as the "user" message.
        label: The expected class; stored as the "model" message.
        system_prompt: Optional instructions; when truthy they are placed first
            as a "system" message, otherwise no system message is added.
    Returns:
        A dictionary with a single "messages" key holding the list of messages.
    """
    system_turn = [{"role": "system", "content": system_prompt}] if system_prompt else []
    conversation = system_turn + [
        {"role": "user", "content": text},
        {"role": "model", "content": label},
    ]
    return {"messages": conversation}


def prepare_tuning_dataset_from_df(
    tuning_df: pd.DataFrame, system_prompt: Optional[str] = None
) -> pd.DataFrame:
    """
    Turn a labelled pandas DataFrame into the Gemini tuning format.
    Args:
        tuning_df: A pandas DataFrame with columns "text" and "label_text".
        system_prompt: Optional instructions added to every example.
    Returns:
        A pandas DataFrame with one row per input row, each row produced by
        `create_gemini_messages`.
    """
    examples = []
    for _, example in tuning_df.iterrows():
        examples.append(
            create_gemini_messages(example["text"], example["label_text"], system_prompt)
        )
    return pd.DataFrame(examples)


def convert_tuning_dataset_from_automl_csv(
    automl_gcs_csv_path: str,
    system_prompt: Optional[str] = None,
    partition: str = "training",
) -> pd.DataFrame:
    """
    Convert an AutoML text-classification CSV into the Gemini tuning format.
    Args:
        automl_gcs_csv_path: The GCS path to the AutoML CSV dataset. The file is read
            without a header row, as three columns: partition, text, label.
        system_prompt: The instructions to the model.
        partition: The partition to extract from the dataset (e.g., "training", "validation", "test"). Defaults to "training".
    Returns:
        A pandas DataFrame containing the selected rows in the Gemini tuning format.
    """
    automl_rows = pd.read_csv(automl_gcs_csv_path, names=["partition", "text", "label"])
    in_partition = automl_rows["partition"] == partition
    examples = []
    for _, record in automl_rows[in_partition].iterrows():
        examples.append(
            create_gemini_messages(record["text"], record["label"], system_prompt)
        )
    return pd.DataFrame(examples)


def convert_tuning_dataset_from_automl_jsonl(
    project_id: str,
    automl_gcs_jsonl_path: str,
    system_prompt: Optional[str] = None,
    partition: str = "training",
) -> pd.DataFrame:
    """
    Converts an AutoML JSONL dataset for text classification to the Gemini tuning format.
    Args:
        project_id: The Google Cloud project used to open the GCS file system.
        automl_gcs_jsonl_path: The GCS path to the AutoML JSONL dataset for text classification.
        system_prompt: The instructions to the model.
        partition: The partition to extract from the dataset (e.g., "training", "validation", "test"). Defaults to "training".
    Returns:
        A pandas DataFrame containing the data in the Gemini tuning format. It is
        empty when the file has no rows for the requested partition.
    Raises:
        KeyError: If a non-blank row lacks one of the expected AutoML fields.
        json.JSONDecodeError: If a non-blank row is not valid JSON.
    """
    gemini_dataset = []
    gcs_file_system = gcsfs.GCSFileSystem(project=project_id)
    with gcs_file_system.open(automl_gcs_jsonl_path) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            data = json.loads(line)
            # Read all three fields up front so malformed rows fail loudly,
            # whichever partition they belong to.
            label = data["classificationAnnotation"]["displayName"]
            text = data["textContent"]
            row_partition = data["dataItemResourceLabels"][
                "aiplatform.googleapis.com/ml_use"
            ]
            # Filtering while reading avoids a KeyError on the "partition"
            # column when the file turns out to be empty.
            if row_partition == partition:
                gemini_dataset.append(
                    create_gemini_messages(text, label, system_prompt)
                )

    return pd.DataFrame(gemini_dataset)


def _validate_gemini_messages(row_index: int, messages: list) -> List[Dict]:
    """Check every entry of one row's "messages" list and return the errors found."""
    errors = []
    for message_index, message in enumerate(messages):
        location = f"Row {row_index}, message {message_index}"
        if not isinstance(message, dict):
            errors.append(
                {
                    "error_type": "Invalid message format",
                    "row_index": row_index,
                    "message": f"{location}: Message is not a dictionary.",
                }
            )
        elif "role" not in message or "content" not in message:
            errors.append(
                {
                    "error_type": "Missing 'role' or 'content' key",
                    "row_index": row_index,
                    "message": f"{location}: Missing 'role' or 'content' key.",
                }
            )
        elif message["role"] not in ("system", "user", "model"):
            errors.append(
                {
                    "error_type": "Invalid 'role' value",
                    "row_index": row_index,
                    "message": f"{location}: Invalid 'role' value. "
                    "Expected 'system', 'user', or 'model'.",
                }
            )
    return errors


def validate_gemini_tuning_jsonl(gcs_jsonl_path: str) -> List[Dict]:
    """
    Validates a JSONL file on Google Cloud Storage against the Gemini tuning format.

    Each line must be a JSON object with a "messages" list whose items are
    dictionaries containing "role" (one of "system", "user", "model") and "content".
    At most one error is reported per message, and a row whose "messages" entry is
    missing or not a list gets a single row-level error.

    Args:
        gcs_jsonl_path: The GCS path to the JSONL file.

    Returns:
        A list of dictionaries representing the errors found in the file.
        Each dictionary has the following structure:
        {
            "error_type": "Error description",
            "row_index": The index of the row where the error occurred,
            "message": The error message
        }
    """

    errors = []
    storage_client = storage.Client()
    blob = storage.Blob.from_string(uri=gcs_jsonl_path, client=storage_client)

    with blob.open("r") as f:
        for row_index, line in enumerate(f):
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(
                    {
                        "error_type": "JSON Decode Error",
                        "row_index": row_index,
                        "message": f"Row {row_index}: JSON decoding error: {e}",
                    }
                )
                continue

            # A row that is valid JSON but not an object (e.g. a bare number)
            # cannot carry a "messages" key; report it instead of crashing.
            if not isinstance(data, dict) or "messages" not in data:
                errors.append(
                    {
                        "error_type": "Missing 'messages' key",
                        "row_index": row_index,
                        "message": f"Row {row_index} is missing the 'messages' key.",
                    }
                )
                continue

            messages = data["messages"]
            if not isinstance(messages, list):
                errors.append(
                    {
                        "error_type": "Invalid 'messages' type",
                        "row_index": row_index,
                        "message": f"Row {row_index}: 'messages' is not a list.",
                    }
                )
                continue

            errors.extend(_validate_gemini_messages(row_index, messages))

    return errors

## 1. Load and Split the Dataset
In this step, we will load the raw data and create training, validation and test sets. Later these datasets will be used to perform  different types of adaptations to Gemini models for the task under consideration.


Load the dataset from Hugging Face

In [None]:
# Load the "SetFit/bbc-news" dataset; the "train" and "test" partitions are read below.
datasets = load_dataset("SetFit/bbc-news")

Store in Pandas Dataframes the train and test partitions.

In [None]:
# One pandas DataFrame per partition provided by the dataset.
train = pd.DataFrame(datasets["train"])
test = pd.DataFrame(datasets["test"])

We now take a quick look at the data.

In [None]:
# Preview the first rows of the training partition.
train.head()

We want to check the distribution of the label values

In [None]:
# Number of training examples per class label.
train.label_text.value_counts()

In [None]:
# (rows, columns) of the train and test partitions before the validation split.
print(train.shape)
print(test.shape)

We are going to partition the test data into validation and test datasets, so that we have three datasets for our evaluations: train, val (validation) and test.

The test set will be larger than the validation set (a 75% / 25% split), because the validation dataset used while fine-tuning Gemini can contain at most 256 rows.

In [None]:
# Split the original test partition into validation (25%) and test (75%), stratified
# by label. The full partition is re-derived from `datasets` rather than read from
# the `test` variable, so re-running this cell does not split an already-reduced
# `test` DataFrame a second time.
test_full = pd.DataFrame(datasets["test"])
val, test = train_test_split(
    test_full,
    test_size=0.75,
    shuffle=True,
    stratify=test_full["label_text"],
    random_state=2,
)

In [None]:
# (rows, columns) of the validation and test sets after the split.
print(val.shape)
print(test.shape)

Verify that the values of the label column are following a similar distribution, in order to have comparable evaluations.

In [None]:
# Label distribution of the validation set.
val.label_text.value_counts()

In [None]:
# Label distribution of the test set (compare with the validation counts above).
test.label_text.value_counts()

##  2. Evaluation and Experiment Setup
We will create the required functions to evaluate our experiments and to log them in Vertex Experiments.


### Evaluation Setup
For this text classification task, we will track the following classification metrics to evaluate the performance of the models and their different adaptations.

- Overall Micro-F1
- Overall Macro-F1
- Overall Accuracy
- Overall Weighted Precision
- Overall Weighted Recall
- F1-Score (overall and per class)

The functions below allow us to clean the predicted labels and compute these metrics from a DataFrame of predictions.

In [2]:
def predictions_postprocessing(text: str) -> str:
    """
    Normalise a predicted class label so it can be compared with the targets.

    Args:
        text (str): The predicted class label string.

    Returns:
        str: The label with surrounding whitespace removed, in lower case.
    """
    trimmed = text.strip()
    return trimmed.lower()


def evaluate_predictions(
    df: pd.DataFrame,
    target_column: str = "label_text",
    predictions_column: str = "predicted_labels",
    postprocessing: bool = True,
    categories: Optional[List[str]] = None,
) -> Dict[str, float]:
    """
    Batch evaluation of predictions, returns a dictionary with the metrics.

    The input DataFrame is not modified: postprocessing is applied to a derived
    Series, so callers do not need to pass a copy.

    Args:
       - df (pandas.DataFrame):  a pandas dataframe with two mandatory columns, a target column with
       the actual true values, and a predictions column with the predicted values.
       - target_column (str): column name with the actual ground truth values
       - predictions_column (str): column name with the model predictions
       - postprocessing (bool): whether to apply postprocessing to predictions.
       - categories (list of str, optional): class names for which a per-class F1 score is
       reported. Defaults to the five BBC News classes used in this notebook.

    Returns:
        Dict[str, float]: Dictionary of evaluation metrics: "accuracy",
        "weighted precision", "weighted recall", "macro f1", "micro f1" and one
        "<category>_f1_score" entry for every category present in the report.
    """
    if categories is None:
        categories = ["business", "sport", "politics", "tech", "entertainment"]

    y_true = df[target_column]
    y_pred = df[predictions_column]
    if postprocessing:
        # `apply` returns a new Series, leaving the caller's column untouched.
        y_pred = y_pred.apply(predictions_postprocessing)

    metrics_report = classification_report(y_true, y_pred, output_dict=True)
    overall_macro_f1_score = f1_score(y_true, y_pred, average="macro")
    overall_micro_f1_score = f1_score(y_true, y_pred, average="micro")
    weighted_precision = precision_score(y_true, y_pred, average="weighted")
    weighted_recall = recall_score(y_true, y_pred, average="weighted")

    metrics = {
        "accuracy": metrics_report["accuracy"],
        "weighted precision": weighted_precision,
        "weighted recall": weighted_recall,
        "macro f1": overall_macro_f1_score,
        "micro f1": overall_micro_f1_score,
    }

    # Per-class F1 is only added for categories that appear in the report.
    for category in categories:
        if category in metrics_report:
            metrics[f"{category}_f1_score"] = metrics_report[category]["f1-score"]

    return metrics

### Experiment Setup
Before starting the development and experimentation process, we will setup Vertex AI Experiments, in order to log all the experiments we run and compare them using our defined metrics. 

In this part we will use some of the helper functions we defined in the [helper functions section](#scrollTo=0tKVjsJKfcuA), to create an experiment where we will log all our different runs.

For more information about Vertex Experiments, please refer to its [documentation](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments)

In [None]:
# Replace the placeholder with the name of the Vertex AI experiment to log runs to.
EXPERIMENT_NAME = "[your-experiment]"  # @param {type:"string"}

In [None]:
# Select the experiment that every `log_run` call below will write to.
experiment_manager = VertexAIExperimentManager(project=PROJECT_ID, location=LOCATION)
experiment_manager.create_experiment(
    experiment_name=EXPERIMENT_NAME,
    experiment_description="Fine-tuning Gemini 1.0 Pro for text classification",
)

We will create an evaluation DataFrame from our Test dataset, where we will store the predictions from all the experiments.

In [None]:
# Create an Evaluation dataframe to store the predictions from all the experiments.
df_evals = test.copy()

## 3. In-Context Adaptation using Gemini models

In this section we'll do in-context learning to instruct Gemini models to perform the text classification task under consideration, using zero-shot and few-shot prompt engineering techniques.  

The prompts presented in this section are crafted for this task, and in our experiments they demonstrate superior results compared to other simpler prompts.

**Before fine-tuning a model, it is important to find the best prompt** (system instructions, examples, structure, etc.) for the task under consideration. This helps you understand which prompt works best for the model in use, and can further boost performance when fine-tuning.

In this Colab, we are using Gemini 1.0 Pro, in order to compare the performances of the frozen model and after fine-tuning. But you can reuse this code to test also Gemini 1.5 Pro and Gemini 1.5 Flash by changing the model name in the code.

**Note:** Prompt Engineering is model-dependent. We recommend you to experiment with different prompting techniques per model. Techniques like Chain-of-Thought can increase performances, as well as Dynamic Few-Shots (using a RAG system to dynamically integrate the examples that are similar to the user input).

### Prompts Definition

We create the prompts we want to use for our experiments. In this case, we define two: zero-shot and few-shot prompts.

In [None]:
# Zero-shot system prompt: task description, allowed classes and output rules, no examples.
# The section headers (TASK / CLASSES / INSTRUCTIONS) all end with a colon, matching
# the few-shot prompt.
system_prompt_zero_shot = """TASK:
Classify the text into ONLY one of the following classes [business, entertainment, politics, sport, tech].

CLASSES:
- business
- entertainment
- politics
- sport
- tech

INSTRUCTIONS:
- Respond with ONLY one class.
- You MUST use the exact word from the list above.
- DO NOT create or use any other classes.
- CAREFULLY analyze the text before choosing the best-fitting category from [business, entertainment, politics, sport, tech].

"""

For the few-shot prompt, we'll pick one example from each category (the one at position 10 within that category) from the `train` dataset we previously loaded.

In [None]:
# Few-shot system prompt: same instructions as the zero-shot prompt plus one labelled
# example per class. Each example is the row at position 10 among the `train` rows of
# that class, so the prompt is deterministic for a given `train` DataFrame.
# The examples are numbered 1-5 (the last one was previously mislabelled "EXAMPLE 4").
system_prompt_few_shot = f"""TASK:
Classify the text into ONLY one of the following classes [business, entertainment, politics, sport, tech].

CLASSES:
- business
- entertainment
- politics
- sport
- tech

INSTRUCTIONS:
- Respond with ONLY one class.
- You MUST use the exact word from the list above.
- DO NOT create or use any other classes.
- CAREFULLY analyze the text before choosing the best-fitting category from [business, entertainment, politics, sport, tech].

EXAMPLES:
- EXAMPLE 1:
    <user>
    {train.loc[train["label_text"] == "business", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "business", "label_text"].iloc[10]}

- EXAMPLE 2:
    <user>
    {train.loc[train["label_text"] == "entertainment", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "entertainment", "label_text"].iloc[10]}

- EXAMPLE 3:
    <user>
    {train.loc[train["label_text"] == "politics", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "politics", "label_text"].iloc[10]}

- EXAMPLE 4:
    <user>
    {train.loc[train["label_text"] == "sport", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "sport", "label_text"].iloc[10]}

- EXAMPLE 5:
    <user>
    {train.loc[train["label_text"] == "tech", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "tech", "label_text"].iloc[10]}

"""

For the below evaluations, we'll use the respective functions we have already set up. For in-context learning, we recommend to use the validation set to find the optimal performance and then apply it to the test set, to make sure the metrics remain consistent. In this notebook, we'll directly evaluate on the test dataset, as the validation and prompt engineering part has been already done.

### Model Configuration Setup

We are going to define the generation configuration for doing the text classification task, and keep the same configuration across all of our experiments (both in-context and in-weights). 

We configure the temperature to 0, to make it as grounded as possible, and max output tokens to 10, as the categories are only one word, we don't need more than that.

We are also going to set the safety filters to only block responses which have high severity scores across all four categories. For more information about the Safety configurations, please refer to the [official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/configure-safety-attributes).

In [None]:
# Deterministic decoding (temperature 0) and a short output budget: a class name is one word.
generation_config = GenerationConfig(temperature=0, max_output_tokens=10)

# Apply the same "block only high-severity" threshold to each of the four harm categories.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_ONLY_HIGH
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

### 3.1 Gemini 1.0 Pro in-context Evaluation

#### Zero-Shot Evaluation

First we will compute the predictions using the frozen model with a prompt without examples (i.e. using the `system_prompt_zero_shot` prompt). 

In [None]:
# Frozen base model steered only by the zero-shot system prompt; it shares the
# generation and safety settings defined above.
gem_pro_1_model_zero = GenerativeModel(
    "gemini-1.0-pro-002",  # e.g. gemini-1.5-pro-001, gemini-1.5-flash-001
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

We convert the texts we want to predict to a list and run the online inference parallelizing the calls.

In [None]:
# Get the list of messages to predict
messages_to_predict = test["text"].to_list()
# Compute the preictions
predictions_zero_shot = batch_predict(
    messages=messages_to_predict, model=gem_pro_1_model_zero, max_workers=4
)

We store the predictions in the DataFrame we previously defined for storing all the evaluations.

In [None]:
# Keep the raw zero-shot outputs next to the ground-truth labels.
df_evals["gem1.0-zero-shot_predictions"] = predictions_zero_shot
# Show how many predictions were produced.
len(predictions_zero_shot)

We compute the evaluation metrics over the test set predictions obtained with the zero-shot prompt.

In [None]:
# Compute Evaluation Metrics for zero-shot prompt
metrics_zero_shot = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="gem1.0-zero-shot_predictions",
    postprocessing=True,
)
metrics_zero_shot

We finally log the run in the experiment we created in Vertex AI Experiments.

In [None]:
# Log Experiment with zero-shot Prompt with Gemini 1.0 Pro
params = {
    "model": "gemini-1.0-pro-002",
    "adaptation_type": "in-context zero-shot",
    "temperature": 0,
    "max_output_tokens": 10,
}

experiment_manager.log_run(
    run_name="gemini-1-0-pro-002-zero-shot", params=params, metrics=metrics_zero_shot
)

#### Few-shot Evaluation

We will now conduct experiments adding examples to our prompt to steer the model. For this, we will use the `system_prompt_few_shot` prompt.

In [None]:
# Test Few-Shot, and other prompts/possibilities
gem_pro_1_model_few = GenerativeModel(
    "gemini-1.0-pro-002",
    system_instruction=[system_prompt_few_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

We convert the texts we want to predict to a list and run the online inference parallelizing the calls.

In [None]:
# Run the few-shot model over the same `messages_to_predict` list that was used
# for the zero-shot run.
# NOTE(review): unlike the zero-shot call, max_workers is not passed here, so
# batch_predict's default applies — confirm this is intended.
predictions_few_shot = batch_predict(
    messages=messages_to_predict, model=gem_pro_1_model_few
)

We store the predictions on our designated DataFrame

In [None]:
df_evals["gem1.0-few-shot_predictions"] = predictions_few_shot
len(predictions_few_shot)

We compute the evaluation metrics for each text, using the few-shot prompt.

In [None]:
# Compute Evaluation Metrics for few-shot prompt
metrics_few_shot = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="gem1.0-few-shot_predictions",
    postprocessing=True,
)
metrics_few_shot

And finally, we also log this run in our experiment, for comparison purposes.

In [None]:
# Log Experiment with Few-Shot Prompt with Gemini 1.0 Pro

params = {
    "model": "gemini-1.0-pro-002",
    "adaptation_type": "in-context few-shot",
    "temperature": 0,
    "max_output_tokens": 10,
}

# The run name mirrors the zero-shot run ("gemini-1-0-pro-002-zero-shot") so that
# both runs identify the same model version when compared in the experiment.
experiment_manager.log_run(
    run_name="gemini-1-0-pro-002-few-shot", params=params, metrics=metrics_few_shot
)

## 4. Fine-tuning (Parameter Efficient) Gemini 1.0 Pro
Supervised fine-tuning helps adapt foundation models to new tasks using smaller, highly relevant datasets. To ensure success, focus on:

- Using domain-specific data: Choose data closely matching your real-world use case.
- Accurate labeling: High-quality annotations are crucial.
- Clean data: Remove duplicates, fix errors, and ensure relevance to your task.
- Diverse but focused examples: Include variety within your target domain, avoiding irrelevant data.
- Balanced classes (for classification): Maintain a balance to prevent bias towards a specific class.

### 4.1 Prepare tuning and validation datasets for fine-tuning Gemini Models on Vertex AI

Training data should be structured within a JSONL file located at a Google Cloud Storage (GCS) URI. Each line (or row) of the JSONL file must adhere to a specific schema: It should contain a "messages" array, with objects inside defining a "role" ("system" for the system context,  "user" for user input or "model" for model output) and the corresponding text "content". For example, a valid data row would look like this:

```
{
    "messages": [
      {
        "role": "system",
        "content": "You should classify the text into one of the following classes:[business, entertainment]"
      },
      { "role": "user", "content": "Diversify your investment portfolio" },
      { "role": "model", "content": "business" }
    ]
}
```

The role "system" is optional. You can find more information about the dataset format and preparation in the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-about)

To run a tuning job, you need to upload your tuning and validation(optional) datasets to a Cloud Storage bucket. You can either create a new Cloud Storage bucket or use an existing one to store dataset files. We recommend that you use a bucket that's in the same Google Cloud project where you plan to tune your model.


In this section, we will provide guidelines to prepare the training and validation (optional) datasets based on three options:

1. [Option 1] From scratch, using the datasets we loaded and split at the beginning of this notebook.

1. [Option 2] Providing a function to convert an AutoML Dataset on CSV format to the expected format to fine-tune and validate Gemini Models.

1. [Option 3] Providing a function to convert an AutoML Dataset on JSONL format to the expected format to fine-tune and validate Gemini Models.


#### [Option 1] Prepare tuning and validation datasets from scratch

We need to prepare our training and validation (optional) datasets for the text classification task. It is recommended to add a system role containing the instructions on how to classify. Since we are going to fine-tune the model, the need to add few-shot examples as part of the prompt is eliminated, and therefore we will reuse the `system_prompt_zero_shot` that we used previously.

##### Prepare Tuning Dataset for fine-tuning Gemini

We will create the tuning dataset by using our previously created `train` DataFrame, and formatting it in the expected structure.

In [None]:
tuning_gemini_df = prepare_tuning_dataset_from_df(
    tuning_df=train, system_prompt=system_prompt_zero_shot
)

Let's take a look at how it looks

In [None]:
tuning_gemini_df.head()

We store this dataset in Google Cloud Storage to later on pass it when setting up the tuning job.

The expected format is JSONL, thus we will convert the pandas DataFrame to JSONL when storing it on Cloud Storage.

In [None]:
# store tuning dataset in GCS
tuning_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/tuning_dataset_gemini.jsonl"  # @param {type: "string"}

tuning_gemini_df.to_json(tuning_data_gcs_path, orient="records", lines=True)

To make sure the dataset is in the expected format, and to avoid errors later when launching the tuning job, we'll use our custom function to validate that the dataset has the format and the roles required for tuning Gemini models.

If the output is an empty list, it means there were no errors encountered. 

In [None]:
validate_gemini_tuning_jsonl(gcs_jsonl_path=tuning_data_gcs_path)

##### Prepare Validation Dataset for Fine-tuning Gemini 1.0 Pro
We do the same but now using the validation dataset

In [None]:
validation_gemini_df = prepare_tuning_dataset_from_df(
    tuning_df=val, system_prompt=system_prompt_zero_shot
)
validation_gemini_df.head()

In [None]:
# Number of examples in the validation dataset (built-in len() instead of
# calling the __len__ dunder directly).
len(validation_gemini_df)

We store this dataset in Google Cloud Storage to later on pass it when setting up the tuning job.

The expected format is JSONL, thus we will convert the pandas DataFrame to JSONL when storing it on Cloud Storage.

In [None]:
# store validation dataset in GCS
validation_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/validation_dataset_gemini.jsonl"  # @param {type: "string"}
validation_gemini_df.to_json(validation_data_gcs_path, orient="records", lines=True)

To make sure the dataset is in the expected format, and to avoid errors later when launching the tuning job, we'll use our custom function to validate that the dataset has the format and the roles required for tuning Gemini models.

If the output is an empty list, it means there were no errors encountered. 

In [None]:
validate_gemini_tuning_jsonl(gcs_jsonl_path=validation_data_gcs_path)

------------------

#### [Option 2] Transform AutoML CSV training dataset format to the expected data format for fine-tuning Gemini.
If you were previously using Vertex AI AutoML for text classification, and you have your data in the below csv format expected by AutoML:

```
[ml_use],gcs_file_uri|"inline_text",label
```

```
test,"inline_text",label1
test,"inline_text",label2
training,"inline_text",label3
validation,"inline_text",label1
```

In the file `data_transformations_tuning.py` we have the function `convert_tuning_dataset_from_automl_csv(...)` to convert AutoML CSV datasets for text classification to the format expected for the tuning dataset to fine-tune Gemini models.

##### Prepare tuning dataset

In [None]:
# Usage Example for Training dataset
gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.csv"  # @param {type: "string"}
)
df_gemini_tuning = convert_tuning_dataset_from_automl_csv(
    automl_gcs_csv_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="training",
)

In [None]:
df_gemini_tuning.head()

We store this dataset in Google Cloud Storage to later on pass it when setting up the tuning job.

The expected format is JSONL, thus we will convert the pandas DataFrame to JSONL when storing it on Cloud Storage.

In [None]:
# store tuning dataset in GCS
gemini_tuning_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/tuning_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_tuning.to_json(gemini_tuning_data_gcs_path, orient="records", lines=True)

To make sure the dataset is in the expected format, and to avoid errors later when launching the tuning job, we'll use our custom function to validate that the dataset has the format and the roles required for tuning Gemini models.

If the output is an empty list, it means there were no errors encountered. 

In [None]:
validate_gemini_tuning_jsonl(gcs_jsonl_path=gemini_tuning_data_gcs_path)

##### Prepare Validation Dataset

We will repeat the same process for the validation dataset in case there is one available. It is not mandatory to provide a validation dataset when fine-tuning Gemini, but rather optional.

In [None]:
# Usage Example for validation dataset
gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.csv"  # @param {type: "string"}
)
df_gemini_validation = convert_tuning_dataset_from_automl_csv(
    automl_gcs_csv_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="validation",
)

We store this dataset in Google Cloud Storage to later on pass it when setting up the tuning job.

The expected format is JSONL, thus we will convert the pandas DataFrame to JSONL when storing it on Cloud Storage.

In [None]:
# store validation dataset in GCS
gemini_validation_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/validation_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_validation.to_json(
    gemini_validation_data_gcs_path, orient="records", lines=True
)

To make sure the dataset is in the expected format, and to avoid errors later when launching the tuning job, we'll use our custom function to validate that the dataset has the format and the roles required for tuning Gemini models.

If the output is an empty list, it means there were no errors encountered. 

In [None]:
validate_gemini_tuning_jsonl(gcs_jsonl_path=gemini_validation_data_gcs_path)

------------------------

####  [Option 3] AutoML JSONL training dataset format to Gemini tuning data format

If you were previously using Vertex AI AutoML for text classification, and you have your data in the below JSONL format expected by AutoML:

```
{
  "classificationAnnotation": {
    "displayName": "label"
  },
  "textContent": "inline_text",
  "dataItemResourceLabels": {
    "aiplatform.googleapis.com/ml_use": "training|test|validation"
  }
}
{
  "classificationAnnotation": {
    "displayName": "label2"
  },
  "textContent": "inline_text",
  "dataItemResourceLabels": {
    "aiplatform.googleapis.com/ml_use": "training|test|validation"
  }
}
```

In the file `data_transformations_tuning.py` we have the function `convert_tuning_dataset_from_automl_jsonl(...)` to convert AutoML JSONL datasets for text classification to the format expected for the tuning dataset to fine-tune Gemini models.

##### Prepare Tuning Dataset

In [None]:
# Usage Example for Training dataset

gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.jsonl"  # @param {type: "string"}
)

df_gemini_tuning = convert_tuning_dataset_from_automl_jsonl(
    project_id=PROJECT_ID,
    automl_gcs_jsonl_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="training",
)

In [None]:
# Preview only the first rows (as done for the CSV option) instead of rendering
# the entire DataFrame.
df_gemini_tuning.head()

We store this dataset in Google Cloud Storage to later on pass it when setting up the tuning job.

The expected format is JSONL, thus we will convert the pandas DataFrame to JSONL when storing it on Cloud Storage.

In [None]:
# store tuning dataset in GCS
gemini_tuning_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/tuning_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_tuning.to_json(gemini_tuning_data_gcs_path, orient="records", lines=True)

To make sure the dataset is in the expected format, and to avoid errors later when launching the tuning job, we'll use our custom function to validate that the dataset has the format and the roles required for tuning Gemini models.

If the output is an empty list, it means there were no errors encountered. 

In [None]:
validate_gemini_tuning_jsonl(gcs_jsonl_path=gemini_tuning_data_gcs_path)

##### Prepare Validation Dataset
Now we repeat the same process but with the validation dataset. When fine-tuning Gemini you can pass two datasets: a Training/Tuning Dataset (mandatory) and a Validation Dataset (optional). If the validation dataset is provided, you can also monitor the metrics on this dataset during the tuning process; however, providing a validation dataset is optional.

In [None]:
gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.jsonl"  # @param {type: "string"}
)

df_gemini_validation = convert_tuning_dataset_from_automl_jsonl(
    project_id=PROJECT_ID,
    automl_gcs_jsonl_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="validation",
)

We store this dataset in Google Cloud Storage to later on pass it when setting up the tuning job.

The expected format is JSONL, thus we will convert the pandas DataFrame to JSONL when storing it on Cloud Storage.

In [None]:
# store validation dataset in GCS
gemini_validation_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/validation_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_validation.to_json(
    gemini_validation_data_gcs_path, orient="records", lines=True
)

To make sure the dataset is in the expected format, and to avoid errors later when launching the tuning job, we'll use our custom function to validate that the dataset has the format and the roles required for tuning Gemini models.

If the output is an empty list, it means there were no errors encountered. 

In [None]:
validate_gemini_tuning_jsonl(gcs_jsonl_path=gemini_validation_data_gcs_path)

---------------------------

### 4.2 Start fine-tuning job

- source_model: Specifies the base Gemini model version you want to fine-tune.
- train_dataset: Path to your training data in JSONL format.

Optional parameters

- validation_dataset: If provided, this data is used to evaluate the model during tuning.
- adapter_size: A higher adapter size means more trainable parameters.
- epochs: The number of training epochs to run.
- learning_rate_multiplier: A value to scale the learning rate during training.

We recommend running several experiments with different hyperparameters. Based on our experiments, the configurations below are a good starting point if your dataset has on the order of thousands of examples and you are including the system role in your dataset.

1. epochs: 4, learning_rate_multiplier: 1, adapter_size: 1
1. epochs: 12, learning_rate_multiplier: 4,  adapter_size: 1

If you are not including the system role in your dataset, and only include the user role with the raw text and the model role with the label, then we recommend increasing the adapter size. Below are some configurations you can start experimenting with.

1. epochs: 12, learning_rate_multiplier: 4, adapter_size: 4
1. epochs: 24, learning_rate_multiplier: 4,  adapter_size: 4


First, we set the parameters values for the first fine tuning job. 

In [None]:
# Tune a model using `train` method.

tuned_model_name = "<add-name-for-tuned-model>"  # @param {type: "string"}
epochs = 4  # @param
learning_rate_multiplier = 1  # @param
adapter_size = 1  # @param

Now, we trigger the tuning job. After running the below cell, you'll get a link to the console where you can monitor the tuning job such as metrics, and get statistics of the dataset used for tuning. After the tuning job finishes, you can also find the details for it.

In [None]:
sft_tuning_job = sft.train(
    tuned_model_display_name=tuned_model_name,
    source_model="gemini-1.0-pro-002",
    train_dataset=tuning_data_gcs_path,
    # Optional:
    validation_dataset=validation_data_gcs_path,
    epochs=epochs,
    learning_rate_multiplier=learning_rate_multiplier,
    adapter_size=adapter_size,
)

# Get the tuning job info.
sft_tuning_job.to_dict()

In [None]:
# Get the resource name of the tuning job
sft_tuning_job_name = sft_tuning_job.resource_name
sft_tuning_job_name

### 4.3 Get the tuned model and test it

To retrieve the full path from the console, go to the [Vertex AI Studio tuning section](https://console.cloud.google.com/vertex-ai/generative/language/tuning), select the region where you launched your job, click on your tuning job and go to details. The last part of the Tuning Job path is the tuning job ID. Alternatively, you can also select the entire path and pass it directly as an argument to `sft.SupervisedTuningJob(...)`.

In [None]:
# Get tuning job
TUNING_JOB_ID = "<add your tuning job id>"  # @param example 952462564720115710
sft_tuning_job = sft.SupervisedTuningJob(
    f"projects/{PROJECT_ID}/locations/{LOCATION}/tuningJobs/{TUNING_JOB_ID}"
)

In [None]:
# tuned model endpoint name
tuned_model_endpoint_name = sft_tuning_job.tuned_model_endpoint_name
tuned_model_endpoint_name

In [None]:
# tuned model name
# NOTE: this rebinds `tuned_model_name`, which earlier held the display name
# passed to `sft.train(tuned_model_display_name=...)`, to the value reported by
# the tuning job.
tuned_model_name = sft_tuning_job.tuned_model_name
tuned_model_name

Initiate the tuned model and test it on a single example. We will use the same generation and safety configuration as when doing in-context learning.

In [None]:
tuned_gemini_pro = GenerativeModel(
    tuned_model_endpoint_name,
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
response = tuned_gemini_pro.generate_content([test["text"].iloc[4]], stream=False)

In [None]:
print("predicted", response.text)
print("ground truth", test["label_text"].iloc[4])

### 4.4 Run evaluations on tuned model and log experiment

In [None]:
# Get the list of messages to predict
messages_to_predict = test["text"].to_list()
# Compute the predictions using the zero-shot prompt
predictions_tuned_model = batch_predict(
    messages=messages_to_predict, model=tuned_gemini_pro, max_workers=4
)

In [None]:
# Store the tuned-model predictions next to the other runs in df_evals.
# NOTE(review): the column name says "rank4", but the tuning parameters cell sets
# adapter_size = 1 — confirm the name matches the job that was actually run.
df_evals["tuned-gem1.0-ep4-lrm1-rank4"] = predictions_tuned_model

In [None]:
metrics_tuned_gemini = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="tuned-gem1.0-ep4-lrm1-rank4",
    postprocessing=True,
)
metrics_tuned_gemini

In [None]:
# Log Experiment with the fine-tuned Gemini 1.0 Pro model

params = {
    "model": tuned_model_name,
    "adaptation_type": "fine-tuning Gemini 1.0 Pro 002",
    "temperature": 0,
    "max_output_tokens": 10,
    "epochs": epochs,
    "lrm": learning_rate_multiplier,
    "adapter_size": adapter_size,
}

experiment_manager.log_run(
    run_name="<your-experiment-run-name",  # replace the whole placeholder (it lacks a closing ">") with your run name
    params=params,
    metrics=metrics_tuned_gemini,
)

## 5.  Evaluation comparisons

To assess the performance of your experiments in Vertex AI, you have two primary options. You can programmatically retrieve a comprehensive DataFrame containing all experiments and their associated metrics for in-depth analysis. Alternatively, Vertex AI offers a user-friendly visual UI enabling you to compare experiments, select specific runs for side-by-side comparisons, and gain rapid insights. For detailed instructions on both approaches, refer to the [Vertex AI documentation on evaluation comparisons](https://cloud.google.com/vertex-ai/docs/experiments/compare-analyze-runs).

In [None]:
df_experiments = experiment_manager.get_experiments_data_frame()

In [None]:
df_experiments

> Note: In the experiments with this dataset the most performant model was achieved by fine-tuning Gemini 1.0 Pro  with the below parameters:

```
epochs=6, learning_rate_multiplier= 1, and adapter_size=4
```

## [Optional] 6. Heuristics for Computing Confidence Scores

Due to the multitask nature of LLMs, computing confidence scores is not as straightforward as it is with traditional predictive AI. Gemini models do not expose logprobs for the time being. However, the snippets below provide some options to use as a proxy for confidence scores in your predictions. You can adapt these options to your own use cases and needs.


### [Option 1] -  Getting multiple responses from the model and generate a majority voting ratio

The overall idea is to generate different answers with the same model. Then pick the most "voted/returned" answer, and calculate its "confidence score" by dividing the number of votes among the total number of responses/candidates.

First we will define the function that will help us do the prediction and the numerical confidence.

In [None]:
def get_prediction_with_numeric_score(
    text_to_predict: str, model: Any, candidate_counts: int
) -> Dict[str, Union[float, str]]:
    """
    Generates multiple predictions from a model and determines
    the most frequent response along with its confidence score.

    The model is called `candidate_counts` times with the same input. Each
    response is stripped of surrounding whitespace before voting, so responses
    that differ only by a trailing newline or space count as the same answer.
    The confidence score is the share of responses that agree with the
    majority answer. When several answers are tied, the one that was returned
    first wins.

    Args:
        text_to_predict: The input text for which to generate predictions.
        model: The prediction model to use. It must expose
            `generate_content(text)` returning an object with a `.text` attribute.
        candidate_counts: The number of predictions to generate (at least 1).

    Returns:
        A dictionary containing the majority prediction and its confidence score.
        For example: {"prediction": "business", "confidence_score": 0.75}

    Raises:
        ValueError: If `candidate_counts` is smaller than 1.
    """
    # Without at least one response there is nothing to vote on; fail with a
    # clear message instead of an opaque error further down.
    if candidate_counts < 1:
        raise ValueError("candidate_counts must be at least 1")

    # Normalize whitespace so that formatting noise does not split the vote.
    responses = [
        model.generate_content(text_to_predict).text.strip()
        for _ in range(candidate_counts)
    ]

    # most_common() orders equal counts by first occurrence, so ties resolve
    # to the answer that was seen first.
    majority_response, votes = Counter(responses).most_common(1)[0]
    confidence = votes / len(responses)
    return {"prediction": majority_response, "confidence_score": confidence}

Initialize the model to predict the class. In this example, we will use the tuned Gemini model we created before. We will use the same configurations used when doing in-context and in-weights learning.

In [None]:
tuned_gemini_pro = GenerativeModel(
    tuned_model_endpoint_name,
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

Get the predictions with its corresponding confidence score for an example text in our test dataset.

In [None]:
res = get_prediction_with_numeric_score(
    text_to_predict=test["text"].iloc[473], model=tuned_gemini_pro, candidate_counts=4
)

Print the response and the ground truth for comparison purposes

In [None]:
print("Predicted Response with Confidence Score: \n", res)
print("Ground Truth:\n", test["label_text"].iloc[473])

### [Option 2] -  Generating "Verbal Confidences" with an LLM

The idea is to make two calls per prediction: one to predict the class, and a second one to ask the LLM to judge how confident it is about that prediction, choosing among verbal confidences such as "low", "medium" and "high".

In this example, we will use our tuned Gemini model to predict the class and frozen Gemini 1.5 Pro to judge the prediction verbally.


First we will define the function that will help us do the prediction and the verbal confidence.

In [None]:
def get_prediction_with_verbal_score(
    text_to_predict: str,
    model_to_predict_class: Any,
    model_to_eval_prediction: Any,
    possible_classes: List[str] = [
        "business",
        "entertainment",
        "sport",
        "tech",
        "politics",
    ],
) -> Dict[str, str]:
    """
    Generates a prediction and then evaluates its confidence using a separate model.

    The first model predicts the class of the text. The second model receives
    the text, the predicted class and the remaining candidate classes, and
    returns its verbal judgement of the prediction. Both model outputs are
    stripped of surrounding whitespace.

    Args:
        text_to_predict: The input text for which to generate predictions.
        model_to_predict_class: The model to predict the class. It must expose
            `generate_content(text)` returning an object with a `.text` attribute.
        model_to_eval_prediction: The model to evaluate the confidence of the
            prediction (same `generate_content` interface).
        possible_classes: A list of possible classes. The list is never modified.

    Returns:
        A dictionary containing the prediction and its verbal confidence score,
        i.e. whatever text the evaluation model returns.
        For example: {"prediction": "business", "verbal_score": "High"}
    """
    predicted_response = model_to_predict_class.generate_content(text_to_predict)
    # Generated text may carry surrounding whitespace; normalize it so it can
    # be compared against the class names.
    prediction = predicted_response.text.strip()

    # Filter instead of list.remove(): remove() raises ValueError when the model
    # returns a label that is not in `possible_classes`. Filtering also builds a
    # new list, leaving the caller's (and the default) list untouched.
    remaining_classes = [label for label in possible_classes if label != prediction]

    formatted_prompt = f"""
    TEXT:
    {text_to_predict}

    PREDICTED CLASS:
    {prediction}

    OTHER POSSIBLE CLASSES:
    {remaining_classes}
    """
    judged_response = model_to_eval_prediction.generate_content(formatted_prompt)
    confidence = judged_response.text.strip()
    return {"prediction": prediction, "verbal_score": confidence}

Configure the model parameters and initialize the model to predict the class. In this example, we will use the tuned Gemini model we created before. We will use the same configurations used when doing in-context and in-weights learning.

In [None]:
model_to_predict_class = GenerativeModel(
    tuned_model_endpoint_name,
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

Define the Prompt to steer the evaluation and verbal confidence.

In [39]:
# Define the configurations for the model which will evaluate the predictions

eval_prompt = """
You will get a text about a particular topic, the predicted class for the topic and a list of the other different classes that the model could have chosen.
Your task is to judge how well the predicted class fitted the text, based on the other possible classes.
You need to evaluate and judge your prediction, indicating how confident you are with your answer. You will judge the prediction as follows:

- If you are confident the text is correctly labeled with the given prediction, then respond with "High"
- If it can be that the model could match other classes, or you are not very sure the class corresponds to the text, then respond with "Medium"
- If you believe it makes no sense the class predicted for that text, then respond with "Low".

You MUST only output "High", "Medium" or "Low" without any further explanation.
"""

Initialize the model to be used for computing the verbal confidence

In [None]:
model_to_eval_class = GenerativeModel(
    "gemini-1.5-pro-001",
    system_instruction=[eval_prompt],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

Call the function to compute the prediction including its verbal score

In [None]:
# Call the function to get the predictions with verbal score
res_verbal_conf = get_prediction_with_verbal_score(
    text_to_predict=test["text"].iloc[473],
    model_to_predict_class=model_to_predict_class,
    model_to_eval_prediction=model_to_eval_class,
    possible_classes=["business", "entertainment", "sport", "tech", "politics"],
)

In [None]:
print("Predicted Response with Verbal Score: \n", res_verbal_conf)
print("Ground Truth:\n", test["label_text"].iloc[473])

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.


Otherwise, you can delete the individual resources you created in this tutorial.

Refer to these [instructions](https://cloud.google.com/vertex-ai/docs/tutorials/image-classification-custom/cleanup#delete_resources) to delete the resources from the console.

In [None]:
# Delete Experiment.
delete_experiments = True
if delete_experiments:
    experiments_list = aiplatform.Experiment.list()
    for experiment in experiments_list:
        if experiment.resource_name == EXPERIMENT_NAME:
            print(experiment.resource_name)
            experiment.delete()
            break

print("***" * 10)

# Delete Endpoint.
delete_endpoint = True
# If force is set to True, all deployed models on this
# Endpoint will be first undeployed.
if delete_endpoint:
    for endpoint in aiplatform.Endpoint.list():
        if endpoint.resource_name == tuned_model_endpoint_name:
            print(endpoint.resource_name)
            endpoint.delete(force=True)
            break

print("***" * 10)

# Delete Model.
delete_model = True
if delete_model:
    # Remove version from model name.
    tuned_model_name = tuned_model_name.split("@")[0]
    for model in aiplatform.Model.list():
        if model.resource_name == tuned_model_name:
            print(model.resource_name)
            model.delete()
            break

print("***" * 10)

# Delete Cloud Storage Bucket.
delete_bucket = True
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI