In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Text Classification with Gemini

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/tuning/gemini_supervised_finetuning_text_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Ftuning%2Fgemini_supervised_finetuning_text_classification.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/tuning/gemini_supervised_finetuning_text_classification.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/tuning/gemini_supervised_finetuning_text_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| | | |
|-|-|-|
|Author(s) | [Gabriela Hernandez Larios](https://github.com/gabrielahrlr) | [Elia Secchi](https://github.com/eliasecchig)|


## Overview

This tutorial demonstrates how to perform text classification with Gemini models, from in-context learning (zero-shot and few-shot prompting) to in-weights learning (fine-tuning Gemini models for text classification).

### Objective
We'll cover the development cycle, from preparing the dataset to setting up an evaluation framework for text classification with Gemini. Additionally, you'll learn how to create and log experiments, adapt Gemini models to the text classification task with in-context and in-weights (fine-tuning) learning approaches, and compare their performance.

This tutorial uses the following Google Cloud ML Services and Resources:

- Google Cloud Storage
- Vertex AI Experiments
- Vertex AI Tuning
- Gemini 1.0 Pro, Gemini 1.5 Pro and Gemini 1.5 Flash

The steps performed include:
- [Load and split dataset](#scrollTo=EdvJRUWRNGHE&line=1&uniqifier=1)
- [Evaluation and Experiment Setup](#scrollTo=c2YOsromfcuB&line=6&uniqifier=1)
- [In-Context learning (zero-shot and few-shot) with Gemini Model](#scrollTo=EfKnRU-SfcuB)
- [Fine-tuning Gemini 1.0 Pro for text classification](#scrollTo=Qs9eHiL5fcuD)
- [Comparative Evaluation]()
- [[Optional] Heuristics for computing Confidence Scores](#scrollTo=KW7wPWQWuQT4)

### Dataset
The [BBC News dataset](http://mlg.ucd.ie/datasets/bbc.html) consists of 2,225 articles from the BBC news website corresponding to five topical areas: business, entertainment, politics, sport, and tech. This dataset was downloaded from http://mlg.ucd.ie/datasets/bbc.html

**Dataset Citation**

```
@inproceedings{greene06icml,
	Author = {Derek Greene and P\'{a}draig Cunningham},
	Booktitle = {Proc. 23rd International Conference on Machine learning (ICML'06)},
	Pages = {377--384},
	Publisher = {ACM Press},
	Title = {Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering},
	Year = {2006}}
```

## Installation

### Install Vertex AI SDK for Python and other required packages


In [None]:
%pip install --upgrade --user --quiet google-cloud-aiplatform
%pip install --user --quiet datasets
%pip install --user --quiet backoff
%pip install --user --quiet multiprocess

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Outside Google Colab this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    # Shut the current kernel down, passing True to request a restart, so the
    # freshly installed packages can be imported afterwards.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Outside Google Colab this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate the notebook user for the Google Cloud calls made below.
    auth.authenticate_user()

### Import libraries


In [None]:
import backoff
import multiprocess as mp
import traceback
import nest_asyncio
import warnings
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import logging
import vertexai

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from google.api_core.exceptions import ResourceExhausted
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Union
from functools import partial
from tqdm import tqdm
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score
from typing import Dict
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, HarmCategory, HarmBlockThreshold
from vertexai.preview.tuning import sft

### Set Google Cloud project information, initialize Vertex AI SDK for Python and create a GCS bucket

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Replace the placeholder with your own project ID; adjust the region if needed.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Point the Vertex AI SDK at this project and region.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Replace the placeholder with the name of your Cloud Storage bucket
# (the name only, without the "gs://" prefix).
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}
BUCKET_URI = f"gs://{BUCKET_NAME}"

**warning:** Only if your bucket doesn't already exist: Run the following cell to create your Cloud Storage bucket.


In [None]:
# Create the bucket in LOCATION under PROJECT_ID (run only if it does not exist yet).
! gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents

In [None]:
# List the bucket contents to confirm this environment can access it.
! gsutil ls -al $BUCKET_URI

### Helper Functions

#### Batch Prediction - Helper functions

In [None]:
def backoff_hdlr(details):
    """Print a one-line notice describing an upcoming retry.

    Args:
        details: mapping that must provide ``wait`` (seconds until the next
            attempt) and ``tries`` (attempts made so far).
    """
    wait_seconds = details["wait"]
    attempts = details["tries"]
    print(f"Backing off {wait_seconds:0.1f} seconds after {attempts} tries ")


def log_error(msg, *args):
    """Log ``msg`` at error level on the ``multiprocess`` logger, then raise it.

    Args:
        msg: message to log; it is also used as the exception text.
        *args: extra arguments forwarded to the logger's ``error`` call.

    Raises:
        Exception: always, carrying ``msg``.
    """
    logger = mp.get_logger()
    logger.error(msg, *args)
    raise Exception(msg)


def handle_exception_threading(f):
    """Decorate ``f`` so that a failure is logged together with its traceback.

    When ``f`` raises an ``Exception``, the formatted traceback is passed to
    ``log_error``, which logs it and raises a new ``Exception`` carrying that
    text. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not
    intercepted, so interrupting a long batch still works.

    Args:
        f: the callable to wrap.

    Returns:
        A wrapper with the same call signature and metadata as ``f``.
    """
    from functools import wraps  # local import: only this decorator needs it

    @wraps(f)
    def applicator(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except Exception:
            log_error(traceback.format_exc())

    return applicator


@handle_exception_threading
@backoff.on_exception(
    backoff.expo, ResourceExhausted, max_tries=30, on_backoff=backoff_hdlr
)
def _predict_message(message: str, model: GenerativeModel):
    """Send a single message to ``model`` and return the text of its reply.

    Calls that raise ``ResourceExhausted`` are retried with exponential
    backoff (at most 30 tries, each retry reported by ``backoff_hdlr``).
    Failures that escape the retry logic are handled by
    ``handle_exception_threading``.

    Args:
        message: the text to send as the only content part.
        model: the model used to generate the reply.

    Returns:
        The ``text`` attribute of the (non-streamed) response.
    """
    reply = model.generate_content([message], stream=False)
    reply_text = reply.text
    return reply_text


def batch_predict(messages: list, model: GenerativeModel, max_workers: int = 4) -> list:
    """
    Predicts the classes for a list of messages, in parallel.

    Args:
        - messages: list of all messages to predict
        - model: model to use for predicting.
        - max_workers: number of worker threads to use for parallel predictions

    Returns:
        - list of predicted labels, one per message and in the same order
          as ``messages``

    """
    predict_one = partial(_predict_message, model=model)
    with ThreadPoolExecutor(max_workers) as pool:
        # pool.map yields results in input order, so predictions[i] belongs to
        # messages[i]; tqdm only wraps the iterator to show progress.
        return list(tqdm(pool.map(predict_one, messages), total=len(messages)))

#### Experiment Logging - Helper Functions

In [None]:
def create_experiment(
    experiment_name: str,
    project: str,
    location: str,
    experiment_description: str = "",
):
    """
    Create an Experiment on Vertex AI Experiments by initialising the SDK with it.

    Args:
        - experiment_name: Name of the experiment.
        - project: Your Google Cloud project ID.
        - location: Region in which the experiment is created.
        - experiment_description: Optional free-text description of the experiment.
    """
    init_options = {
        "experiment": experiment_name,
        "experiment_description": experiment_description,
        # False is passed explicitly for the experiment's TensorBoard option.
        "experiment_tensorboard": False,
        "project": project,
        "location": location,
    }
    aiplatform.init(**init_options)


def log_experiment_run_vertexai(
    experiment_name: str,
    run_name: str,
    params: Dict[str, Union[float, int, str]],
    metrics: Dict[str, float],
    project: str,
    location: str,
):
    """
    Log one run (its parameters and metrics) to a Vertex AI experiment.

    Args:
        - experiment_name: experiment the run belongs to.
        - run_name: name of the run to start.
        - params: parameters to record for the run.
        - metrics: metric values to record for the run.
        - project: Your Google Cloud project ID.
        - location: region of the experiment.
    """
    aiplatform.init(experiment=experiment_name, project=project, location=location)
    # Using the run as a context manager guarantees it is ended even when
    # logging raises, instead of leaving a dangling active run behind.
    with aiplatform.start_run(run=run_name):
        aiplatform.log_params(params)
        aiplatform.log_metrics(metrics)


def get_experiments_data_frame_sample(
    experiment: str,
    project: str,
    location: str,
):
    """
    Return the result of ``aiplatform.get_experiment_df()`` for an experiment.

    Args:
        - experiment: name of the experiment to read.
        - project: Your Google Cloud project ID.
        - location: region of the experiment.
    """
    aiplatform.init(experiment=experiment, project=project, location=location)
    return aiplatform.get_experiment_df()

## 1. Load and Split Dataset
In this step, we will load the raw data and create training, validation and test sets. Later, these datasets will be used to perform different types of adaptation of Gemini models to the task under consideration.


In [None]:
# Load the "SetFit/bbc-news" dataset; its "train" and "test" splits are used below.
datasets = load_dataset("SetFit/bbc-news")

In [None]:
# Materialise both splits as pandas DataFrames.
train, test = (
    pd.DataFrame(datasets[split_name]) for split_name in ("train", "test")
)

In [None]:
# Preview the first training rows.
train.head()

In [None]:
# Number of training examples per class.
train.label_text.value_counts()

In [None]:
# Show (rows, columns) of the train and test frames, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Split the original test split in half (stratified by label) into validation
# and test sets. Starting from the untouched split every time keeps this cell
# idempotent: re-running it no longer halves `test` again.
test_split_full = pd.DataFrame(datasets["test"])
val, test = train_test_split(
    test_split_full,
    test_size=0.5,
    shuffle=True,
    stratify=test_split_full["label_text"],
    random_state=2,
)

In [None]:
# Show (rows, columns) of the validation and test frames, one per line.
print(val.shape, test.shape, sep="\n")

In [None]:
# Number of validation examples per class (the split above is stratified by label).
val.label_text.value_counts()

In [None]:
# Number of test examples per class (the split above is stratified by label).
test.label_text.value_counts()

##  2. Evaluation and Experiment Setup
We will create the required functions to evaluate our experiments and to log them in Vertex Experiments.


### Evaluation Setup
For this text classification task, we will use the following classification metrics to evaluate the performance of the models and their different adaptations, and we will track them throughout development:

- Overall Micro-F1
- Overall Macro-F1
- Overall Accuracy
- Overall Weighted Precision
- Overall Weighted Recall
- F1-Score (overall and per class)

In [None]:
from sklearn.metrics import classification_report, f1_score


def predictions_postprocessing(text):
    """
    Normalise a predicted class label.

    Leading and trailing whitespace is removed and the result is lower-cased.

    Args:
        text (str): The predicted class label string.

    Returns:
        str: The cleaned class label string.
    """
    return text.strip().lower()


def evaluate_predictions(
    df: pd.DataFrame,
    target_column: str = "label_text",
    predictions_column: str = "predicted_labels",
    postprocessing: bool = True,
    class_labels: tuple = ("business", "sport", "politics", "tech", "entertainment"),
) -> dict:
    """
    Batch evaluation of predictions; returns a dictionary with the metrics.

    The input DataFrame is not modified: when ``postprocessing`` is enabled
    the cleaning is applied to a derived Series only.

    Args:
       - df (pandas.DataFrame): a pandas dataframe with two mandatory columns, a target column with
       the actual true values, and a predictions column with the predicted values.
       - target_column (str): column name with the actual ground truth values
       - predictions_column (str): column name with the model predictions
       - postprocessing (bool): whether to clean the predictions with
       ``predictions_postprocessing`` before scoring.
       - class_labels (tuple): classes for which a ``"<label>_f1_score"`` entry is reported.

    Returns:
        dict with the keys "accuracy", "weighted precision", "weighted recall",
        "macro f1", "micro f1" and one "<label>_f1_score" per entry of ``class_labels``.

    Raises:
        KeyError: if a label in ``class_labels`` is missing from the classification report.
    """
    y_true = df[target_column]
    y_pred = df[predictions_column]
    if postprocessing:
        # Series.apply returns a new Series, so the caller's frame stays untouched.
        y_pred = y_pred.apply(predictions_postprocessing)

    metrics_report = classification_report(y_true, y_pred, output_dict=True)

    metrics = {
        "accuracy": metrics_report["accuracy"],
        "weighted precision": precision_score(y_true, y_pred, average="weighted"),
        "weighted recall": recall_score(y_true, y_pred, average="weighted"),
        "macro f1": f1_score(y_true, y_pred, average="macro"),
        "micro f1": f1_score(y_true, y_pred, average="micro"),
    }
    for label in class_labels:
        metrics[f"{label}_f1_score"] = metrics_report[label]["f1-score"]
    return metrics

### Experiment Setup
Before starting the development and experimentation process, we will set up Vertex AI Experiments in order to log all the experiments we run and compare them based on the stipulated metrics. Some of the functions for setting up Vertex AI Experiments are in the [helper functions section](#scrollTo=0tKVjsJKfcuA). In this part we will just create an experiment where we will log all our different runs.

For more information about Vertex Experiments, please refer to its [documentation](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments)

In [None]:
# Name of the Vertex AI experiment that the runs in this notebook are logged to.
EXPERIMENT_NAME = "[your-experiment]"  # @param {type:"string"}

In [None]:
# Initialise the SDK with the experiment; the description keeps its default (empty).
create_experiment(
    experiment_name=EXPERIMENT_NAME, project=PROJECT_ID, location=LOCATION
)

In [None]:
## Create an evaluation dataframe to store the predictions from all the experiments.
## It starts as a copy of the test set, so `test` itself is never modified.
df_evals = test.copy()

## 3. In-Context Adaptation using Gemini models

In this section we'll do in-context learning to instruct Gemini models to perform the text classification task under consideration, using zero-shot and few-shot prompt engineering techniques.  We'll use and compare the performances of Gemini 1.0 Pro, Gemini 1.5 Pro and Gemini 1.5 Flash using the same prompts.

The prompts presented in this section are crafted for this task, and in our experiments they demonstrate superior results compared to other simpler prompts.

**Before fine-tuning a model, it is important to find the best prompt** (system instructions, examples, structure, etc.) for the task under consideration. This helps you understand which prompt works best for the model in use, and can further boost performance when fine-tuning.

**Note:** Prompt Engineering is model-dependent. We recommend you to experiment with different prompting techniques per model. Techniques like Chain-of-Thought can increase performances, as well as Dynamic Few-Shots (using a RAG system to dynamically integrate the examples that are similar to the user input).

In [None]:
# System instructions used for the zero-shot runs; the same text is reused
# later as the "system" message of the tuning and validation examples.
system_prompt_zero_shot = """TASK:
Classify the text into ONLY one of the following classes [business, entertainment, politics, sport, tech].

CLASSES:
- business
- entertainment
- politics
- sport
- tech

INSTRUCTIONS
- Respond with ONLY one class.
- You MUST use the exact word from the list above.
- DO NOT create or use any other classes.
- CAREFULLY analyze the text before choosing the best-fitting category from [business, entertainment, politics, sport, tech].

"""

For the few-shot prompt, we'll pick one example per category from the `train` dataset we loaded previously (for each category, the article at position 10).

In [None]:
# Few-shot system instructions: the zero-shot task description followed by one
# worked example per class (the article at position 10 of each class in `train`).
system_prompt_few_shot = f"""TASK:
Classify the text into ONLY one of the following classes [business, entertainment, politics, sport, tech].

CLASSES:
- business
- entertainment
- politics
- sport
- tech

INSTRUCTIONS:
- Respond with ONLY one class.
- You MUST use the exact word from the list above.
- DO NOT create or use any other classes.
- CAREFULLY analyze the text before choosing the best-fitting category from [business, entertainment, politics, sport, tech].

EXAMPLES:
- EXAMPLE 1:
    <user>
    {train.loc[train["label_text"] == "business", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "business", "label_text"].iloc[10]}

- EXAMPLE 2:
    <user>
    {train.loc[train["label_text"] == "entertainment", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "entertainment", "label_text"].iloc[10]}

- EXAMPLE 3:
    <user>
    {train.loc[train["label_text"] == "politics", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "politics", "label_text"].iloc[10]}

- EXAMPLE 4:
    <user>
    {train.loc[train["label_text"] == "sport", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "sport", "label_text"].iloc[10]}

- EXAMPLE 5:
    <user>
    {train.loc[train["label_text"] == "tech", "text"].iloc[10]}
    <model>
    {train.loc[train["label_text"] == "tech", "label_text"].iloc[10]}

"""

For the evaluations below, we'll use the functions we have already set up. For in-context learning, we recommend using the validation set to find the best-performing prompt and then applying it to the test set, to make sure the metrics remain consistent. In this notebook, we'll evaluate directly on the test dataset, as the validation and prompt engineering part has already been done.

### 3.1 Gemini 1.0 Pro in-context Evaluation

In [None]:
# A class name is only a few tokens long, so the output is capped at 10 tokens;
# temperature 0 is used for the classification runs.
generation_config = dict(max_output_tokens=10, temperature=0)

# Apply BLOCK_NONE to each of the four harm categories.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

# Gemini 1.0 Pro driven by the zero-shot system prompt.
gem_pro_1_model_zero = GenerativeModel(
    "gemini-1.0-pro-002",
    generation_config=generation_config,
    safety_settings=safety_settings,
    system_instruction=[system_prompt_zero_shot],
)

In [None]:
# Get the list of messages to predict
messages_to_predict = test["text"].to_list()
# Compute the predictions
predictions_zero_shot = batch_predict(
    messages=messages_to_predict, model=gem_pro_1_model_zero, max_workers=4
)

In [None]:
# The messages were taken from test["text"] in row order, so the predictions
# line up row-for-row with df_evals (a copy of test).
df_evals["gem1.0-zero-shot_predictions"] = predictions_zero_shot
len(predictions_zero_shot)

In [None]:
# Compute Evaluation Metrics for zero-shot prompt
# (evaluated on a copy so df_evals keeps the raw model outputs)
metrics_zero_shot = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="gem1.0-zero-shot_predictions",
    postprocessing=True,
)
metrics_zero_shot

In [None]:
# Log Experiment with zero-shot Prompt with Gemini 1.0 Pro
# The recorded temperature and max_output_tokens mirror generation_config.
params = {
    "model": "gemini-1.0-pro-002",
    "adaptation_type": "in-context zero-shot",
    "temperature": 0,
    "max_output_tokens": 10,
}

log_experiment_run_vertexai(
    experiment_name=EXPERIMENT_NAME,
    run_name="gemini-1-0-pro-002-zero-shot",
    params=params,
    metrics=metrics_zero_shot,
    project=PROJECT_ID,
    location=LOCATION,
)

In [None]:
# Test Few-Shot, and other prompts/possibilities
# Same model name and settings as the zero-shot run; only the system prompt differs.
gem_pro_1_model_few = GenerativeModel(
    "gemini-1.0-pro-002",
    system_instruction=[system_prompt_few_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
# Reuses messages_to_predict from the zero-shot cell; max_workers keeps its default (4).
predictions_few_shot = batch_predict(
    messages=messages_to_predict, model=gem_pro_1_model_few
)

In [None]:
# Store the few-shot predictions next to the ground-truth labels.
df_evals["gem1.0-few-shot_predictions"] = predictions_few_shot
len(predictions_few_shot)

In [None]:
# Compute Evaluation Metrics for few-shot prompt
# (evaluated on a copy so df_evals keeps the raw model outputs)
metrics_few_shot = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="gem1.0-few-shot_predictions",
    postprocessing=True,
)
metrics_few_shot

In [None]:
# Log Experiment with Few-Shot Prompt with Gemini 1.0 Pro
# The recorded temperature and max_output_tokens mirror generation_config.

params = {
    "model": "gemini-1.0-pro-002",
    "adaptation_type": "in-context few-shot",
    "temperature": 0,
    "max_output_tokens": 10,
}

log_experiment_run_vertexai(
    experiment_name=EXPERIMENT_NAME,
    run_name="gemini-1-0-pro-few-shot",
    params=params,
    metrics=metrics_few_shot,
    project=PROJECT_ID,
    location=LOCATION,
)

### 3.2 Gemini 1.5 Pro in-context Evaluation

In [None]:
# Define the model and configurations
# (generation and safety settings are re-declared with the same values as in section 3.1)
generation_config = {"max_output_tokens": 10, "temperature": 0}

safety_settings = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
}

# Gemini 1.5 Pro driven by the zero-shot system prompt.
gem_pro_1_5_model_zero = GenerativeModel(
    "gemini-1.5-pro-001",
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
# Get the list of messages to predict.
# The whole test set is used, in row order (as in sections 3.1 and 3.3): the
# predictions are stored as a column of df_evals, which requires exactly one
# prediction per test row. A random 50-row sample has the wrong length and
# loses the row alignment.
messages_to_predict = test["text"].to_list()
# Compute the predictions using the zero-shot prompt
predictions_zero_shot_15_pro = batch_predict(
    messages=messages_to_predict, model=gem_pro_1_5_model_zero
)

In [None]:
# NOTE(review): this assignment needs exactly one prediction per row of
# df_evals, in the same row order as the test set.
df_evals["gem1.5-pro-zero-shot_predictions"] = predictions_zero_shot_15_pro

In [None]:
# Compute Evaluation Metrics for zero-shot prompt
# (evaluated on a copy so df_evals keeps the raw model outputs)
metrics_zero_shot_15_pro = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="gem1.5-pro-zero-shot_predictions",
    postprocessing=True,
)
metrics_zero_shot_15_pro

In [None]:
# Log Experiment with zero-shot Prompt with Gemini 1.5 Pro
# The recorded temperature and max_output_tokens mirror generation_config.

params = {
    "model": "gemini-1.5-pro-001",
    "adaptation_type": "in-context zero-shot",
    "temperature": 0,
    "max_output_tokens": 10,
}

log_experiment_run_vertexai(
    experiment_name=EXPERIMENT_NAME,
    run_name="gemini-1-5-pro-zero-shot",
    params=params,
    metrics=metrics_zero_shot_15_pro,
    project=PROJECT_ID,
    location=LOCATION,
)

### 3.3 Gemini 1.5 Flash in-context Evaluation

In [None]:
# Define the model and configurations
# (generation and safety settings are re-declared with the same values as in section 3.1)
generation_config = {"max_output_tokens": 10, "temperature": 0}

safety_settings = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
}

# Gemini 1.5 Flash driven by the zero-shot system prompt.
gem_1_5_flash_model_zero = GenerativeModel(
    "gemini-1.5-flash-001",
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
# Get the list of messages to predict (the full test set, in row order)
messages_to_predict = test["text"].to_list()
# Compute the predictions using the zero-shot prompt
predictions_zero_shot_15_flash = batch_predict(
    messages=messages_to_predict, model=gem_1_5_flash_model_zero
)

In [None]:
# Store the Gemini 1.5 Flash zero-shot predictions next to the ground-truth labels.
df_evals["gem1.5-flash-zero-shot_predictions"] = predictions_zero_shot_15_flash

In [None]:
# Compute Evaluation Metrics for zero-shot prompt
# (evaluated on a copy so df_evals keeps the raw model outputs)
metrics_zero_shot_15_flash = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="gem1.5-flash-zero-shot_predictions",
    postprocessing=True,
)
metrics_zero_shot_15_flash

In [None]:
# Log Experiment with zero-shot Prompt with Gemini 1.5 Flash
# The recorded temperature and max_output_tokens mirror generation_config.

params = {
    "model": "gemini-1.5-flash-001",
    "adaptation_type": "in-context zero-shot",
    "temperature": 0,
    "max_output_tokens": 10,
}

# `params` is a required argument of log_experiment_run_vertexai; leaving it
# out raises a TypeError and the parameters above would never be recorded.
log_experiment_run_vertexai(
    experiment_name=EXPERIMENT_NAME,
    run_name="gemini-1-5-flash-zero-shot",
    params=params,
    metrics=metrics_zero_shot_15_flash,
    project=PROJECT_ID,
    location=LOCATION,
)

## 4. Fine-tuning (Parameter Efficient) Gemini 1.0 Pro
Supervised fine-tuning helps adapt foundation models to new tasks using smaller, highly relevant datasets. To ensure success, focus on:

- Using domain-specific data: Choose data closely matching your real-world use case.
- Accurate labeling: High-quality annotations are crucial.
- Clean data: Remove duplicates, fix errors, and ensure relevance to your task.
- Diverse but focused examples: Include variety within your target domain, avoiding irrelevant data.
- Balanced classes (for classification): Maintain a balance to prevent bias towards a specific class.

### 4.1 Prepare tuning datasets for fine-tuning Gemini Models on Vertex AI

Training data should be structured within a JSONL file located at a Google Cloud Storage (GCS) URI. Each line (or row) of the JSONL file must adhere to a specific schema: It should contain a "messages" array, with objects inside defining a "role" ("system" for the system context,  "user" for user input or "model" for model output) and the corresponding text "content". For example, a valid data row would look like this:

```
{
    "messages": [
      {
        "role": "system",
        "content": "You should classify the text into one of the following classes:[business, entertainment]"
      },
      { "role": "user", "content": "Diversify your investment portfolio" },
      { "role": "model", "content": "business" }
    ]
}
```

The role "system" is optional. You can find more information about the dataset format and preparation in the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-about)

To run a tuning job, you need to upload your tuning and validation(optional) datasets to a Cloud Storage bucket. You can either create a new Cloud Storage bucket or use an existing one to store dataset files. We recommend that you use a bucket that's in the same Google Cloud project where you plan to tune your model.


In this section, we will provide guidelines to prepare the training and validation (optional) datasets based on three options:

1. [Option 1] From scratch, using the datasets we loaded and split at the beginning of this notebook.

1. [Option 2] Providing a function to convert an AutoML dataset in CSV format to the format expected for fine-tuning and validating Gemini models.

1. [Option 3] Providing a function to convert an AutoML dataset in JSONL format to the format expected for fine-tuning and validating Gemini models.



#### [Option 1] Prepare tuning and validation datasets from scratch

We need to prepare our training and validation (optional) datasets for the text classification task. It is recommended to add a system role with the instructions on how to classify as part of the dataset. Since we are going to fine-tune the model, there is no longer a need to include few-shot examples in the prompt, and therefore we will reuse the `system_prompt_zero_shot` that we used previously.

In [None]:
# Create tuning dataset
# Shuffle into a new frame with a fixed seed: the order is reproducible and
# `train` itself is left untouched, so re-running this cell gives the same result.
train_shuffled = train.sample(frac=1, random_state=2).reset_index(drop=True)

# One record per training article: system instructions, the article text as
# the user turn, and the ground-truth label as the model turn.
gemini_tuning_dataset = [
    {
        "messages": [
            {"role": "system", "content": system_prompt_zero_shot},
            {"role": "user", "content": text},
            {"role": "model", "content": label},
        ]
    }
    for text, label in zip(train_shuffled["text"], train_shuffled["label_text"])
]

gemini_tuning_df = pd.DataFrame(gemini_tuning_dataset)
print(gemini_tuning_df.shape)
gemini_tuning_df.head()

In [None]:
# store tuning dataset in GCS
tuning_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/tuning_dataset_gemini.jsonl"  # @param {type: "string"}

# orient="records" with lines=True writes one JSON object per line (JSONL).
# NOTE(review): writing straight to a gs:// URI presumably needs a GCS
# filesystem backend (e.g. gcsfs) in the environment — confirm.
gemini_tuning_df.to_json(tuning_data_gcs_path, orient="records", lines=True)

In [None]:
# Create validation dataset
# Shuffle with a fixed seed so the order is reproducible across runs.
val_sampled = val.sample(frac=1, random_state=2).reset_index(drop=True)

# One record per validation article: system instructions, the article text as
# the user turn, and the ground-truth label as the model turn.
gemini_validation_dataset = [
    {
        "messages": [
            {"role": "system", "content": system_prompt_zero_shot},
            {"role": "user", "content": text},
            {"role": "model", "content": label},
        ]
    }
    for text, label in zip(val_sampled["text"], val_sampled["label_text"])
]

gemini_validation_df = pd.DataFrame(gemini_validation_dataset)
print(gemini_validation_df.shape)
gemini_validation_df.head()

In [None]:
# store validation dataset in GCS
validation_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/validation_dataset_gemini.jsonl"  # @param {type: "string"}
# orient="records" with lines=True writes one JSON object per line (JSONL).
# NOTE(review): writing straight to a gs:// URI presumably needs a GCS
# filesystem backend (e.g. gcsfs) in the environment — confirm.
gemini_validation_df.to_json(validation_data_gcs_path, orient="records", lines=True)

#### [Option 2] AutoML CSV training dataset format to Gemini tuning data format.
If you were previously using Vertex AI AutoML for text classification, and you have your data in the below csv format expected by AutoML:

```
[ml_use],gcs_file_uri|"inline_text",label
```

```
test,"inline_text",label1
test,"inline_text",label2
training,"inline_text",label3
validation,"inline_text",label1
```

You can use the function below to convert your AutoML CSV datasets for text classification to the format expected for the training datasets used to fine-tune Gemini models.

In [None]:
import pandas as pd


def convert_automl_csv_dataset_to_gemini(
    automl_gcs_csv_path: str, system_prompt: str = "", partition="training"
):
    """
    Converts an AutoML CSV dataset for text classification to the Gemini tuning format.

    The CSV is read without a header row and its three columns are interpreted as
    ``partition``, ``text`` and ``label``. Rows whose ``partition`` equals the
    requested one are turned into one record each of the form
    ``{"messages": [...]}``, where every message is a dictionary with "role" and
    "content" keys. A "system" message is only included when ``system_prompt``
    is non-empty; the "user" message carries the text and the "model" message
    carries the label.

    Args:
      automl_gcs_csv_path: Path to the AutoML CSV dataset (anything
        ``pandas.read_csv`` can open, e.g. a GCS path).
      system_prompt: The instructions to the model. When empty, no system
        message is added to the records.
      partition: The partition to extract from the dataset (e.g., "training",
        "validation", "test"). Defaults to "training".

    Returns:
      A pandas DataFrame with a single "messages" column, one row per example
      of the selected partition (an empty DataFrame if nothing matches).
    """
    # Read from the path that was passed in rather than a notebook-level global.
    df = pd.read_csv(automl_gcs_csv_path, names=["partition", "text", "label"])
    df_automl = df.loc[df["partition"] == partition]

    gemini_dataset = []
    for _, row in df_automl.iterrows():
        messages = []
        # The system turn is optional; without a prompt only user/model turns are emitted.
        if system_prompt != "":
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": row["text"]})
        messages.append({"role": "model", "content": row["label"]})
        gemini_dataset.append({"messages": messages})

    return pd.DataFrame(gemini_dataset)

In [None]:
## Usage Example for Training dataset
# Path of the AutoML CSV export to convert (placeholder: replace with your own file).
gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.csv"  # @param {type: "string"}
)
# Keep only the "training" rows and attach the zero-shot system prompt to every example.
df_gemini_tuning = convert_automl_csv_dataset_to_gemini(
    automl_gcs_csv_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="training",
)

# store tuning dataset in GCS (JSON Lines, one record per line)
gemini_tuning_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/tuning_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_tuning.to_json(gemini_tuning_data_gcs_path, orient="records", lines=True)

In [None]:
## Usage Example for validation dataset
# Path of the AutoML CSV export to convert (placeholder: replace with your own file).
gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.csv"  # @param {type: "string"}
)
# Keep only the "validation" rows and attach the zero-shot system prompt to every example.
df_gemini_validation = convert_automl_csv_dataset_to_gemini(
    automl_gcs_csv_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="validation",
)

# store validation dataset in GCS (JSON Lines, one record per line)
gemini_validation_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/validation_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_validation.to_json(
    gemini_validation_data_gcs_path, orient="records", lines=True
)

####  [Option 3] AutoML JSONL training dataset format to Gemini tuning data format

If you were previously using Vertex AI AutoML for text classification, and you have your data in the below JSONL format expected by AutoML:

```
{
  "classificationAnnotation": {
    "displayName": "label"
  },
  "textContent": "inline_text",
  "dataItemResourceLabels": {
    "aiplatform.googleapis.com/ml_use": "training|test|validation"
  }
}
{
  "classificationAnnotation": {
    "displayName": "label2"
  },
  "textContent": "inline_text",
  "dataItemResourceLabels": {
    "aiplatform.googleapis.com/ml_use": "training|test|validation"
  }
}
```

You can utilize the below function to convert your AutoML JSONL datasets for text classification to the format expected for the training datasets to fine-tune Gemini models.

In [None]:
import pandas as pd


def convert_automl_jsonl_dataset_to_gemini(
    automl_gcs_jsonl_path: str, system_prompt: str = "", partition="training"
):
    """
    Converts an AutoML JSONL dataset for text classification to the Gemini tuning format.

    Each line of the input is expected to be one JSON object with the keys
    ``classificationAnnotation.displayName`` (the label), ``textContent`` (the
    text) and ``dataItemResourceLabels["aiplatform.googleapis.com/ml_use"]``
    (the partition). Lines belonging to the requested partition are turned into
    one record each of the form ``{"messages": [...]}``, where every message is
    a dictionary with "role" and "content" keys. A "system" message is only
    included when ``system_prompt`` is non-empty.

    Args:
        automl_gcs_jsonl_path: Path to the AutoML JSONL dataset for text
            classification (anything ``pandas.read_json`` can open, e.g. a GCS
            path or a local file).
        system_prompt: The instructions to the model. When empty, no system
            message is added to the records.
        partition: The partition to extract from the dataset (e.g., "training",
            "validation", "test"). Defaults to "training".

    Returns:
        A pandas DataFrame with a single "messages" column, one row per example
        of the selected partition (an empty DataFrame if nothing matches).
    """
    ml_use_key = "aiplatform.googleapis.com/ml_use"

    # pandas is used for reading (instead of the builtin open()) so that gs:// paths
    # work the same way they do for the to_json() calls elsewhere in this notebook.
    # dtype/convert_dates are disabled so text and labels are kept exactly as written.
    df_automl = pd.read_json(
        automl_gcs_jsonl_path, lines=True, dtype=False, convert_dates=False
    )

    gemini_dataset = []
    for _, item in df_automl.iterrows():
        if item["dataItemResourceLabels"][ml_use_key] != partition:
            continue
        messages = []
        # The system turn is optional; without a prompt only user/model turns are emitted.
        if system_prompt != "":
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": item["textContent"]})
        messages.append(
            {
                "role": "model",
                "content": item["classificationAnnotation"]["displayName"],
            }
        )
        gemini_dataset.append({"messages": messages})

    return pd.DataFrame(gemini_dataset)

In [None]:
## Usage Example for Training dataset
# Path of the AutoML JSONL export to convert (placeholder: replace with your own file).
gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.jsonl"  # @param {type: "string"}
)
# Keep only the "training" items and attach the zero-shot system prompt to every example.
df_gemini_tuning = convert_automl_jsonl_dataset_to_gemini(
    automl_gcs_jsonl_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="training",
)

# store tuning dataset in GCS (JSON Lines, one record per line)
gemini_tuning_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/tuning_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_tuning.to_json(gemini_tuning_data_gcs_path, orient="records", lines=True)

In [None]:
## Usage Example for validation dataset
# Path of the AutoML JSONL export to convert (placeholder: replace with your own file).
gcs_path_automl_dataset = (
    "gs://<your-bucket-path>/<your-data>.jsonl"  # @param {type: "string"}
)
# Keep only the "validation" items and attach the zero-shot system prompt to every example.
df_gemini_validation = convert_automl_jsonl_dataset_to_gemini(
    automl_gcs_jsonl_path=gcs_path_automl_dataset,
    system_prompt=system_prompt_zero_shot,
    partition="validation",
)

# store validation dataset in GCS (JSON Lines, one record per line)
gemini_validation_data_gcs_path = f"gs://{BUCKET_NAME}/tuning_experiments/validation_dataset_gemini.jsonl"  # @param {type: "string"}
df_gemini_validation.to_json(
    gemini_validation_data_gcs_path, orient="records", lines=True
)

### 4.2 Start fine-tuning job

- source_model: Specifies the base Gemini model version you want to fine-tune.
- train_dataset: Path to your training data in JSONL format.

Optional parameters

- validation_dataset: If provided, this data is used to evaluate the model during tuning.
- adapter_size: A higher adapter size means more trainable parameters.
- epochs: The number of training epochs to run.
- learning_rate_multiplier: A value to scale the learning rate during training.

We recommend running several experiments with different hyperparameters. Based on our experiments, the configurations below are good starting points if your dataset contains thousands of examples and you include the system role in your dataset.

1. epochs: 4, learning_rate_multiplier: 1, adapter_size: 1
1. epochs: 12, learning_rate_multiplier: 4,  adapter_size: 1

If you do not include the system role in your dataset, and only use the user role with the raw text and the model role with the label, then we recommend increasing the adapter size. Below are some configurations you can start experimenting with.

1. epochs: 12, learning_rate_multiplier: 4, adapter_size: 4
1. epochs: 24, learning_rate_multiplier: 4,  adapter_size: 4


In [None]:
# Tune a model using `train` method.

# Display name for the tuned model and the tuning hyperparameters (Colab form fields).
tuned_model_name = "<add-name-for-tuned-model>"  # @param {type: "string"}
epochs = 4  # @param
learning_rate_multiplier = 1  # @param
adapter_size = 1  # @param


# Start the supervised tuning job from the base model, using the training and
# validation JSONL paths defined in the earlier "store ... dataset in GCS" cells.
sft_tuning_job = sft.train(
    tuned_model_display_name=tuned_model_name,
    source_model="gemini-1.0-pro-002",
    train_dataset=tuning_data_gcs_path,
    # Optional:
    validation_dataset=validation_data_gcs_path,
    epochs=epochs,
    learning_rate_multiplier=learning_rate_multiplier,
    adapter_size=adapter_size,
)

# Get the tuning job info.
sft_tuning_job.to_dict()

In [None]:
# Get the resource name of the tuning job and display it as the cell output
sft_tuning_job_name = sft_tuning_job.resource_name
sft_tuning_job_name

### 4.3 Get the tuned model and test it is working as expected

You can get the full path by going to your tuning jobs in the console and then to details.

In [None]:
# Get tuning job
# Re-attach to an existing tuning job by its ID; this rebinds `sft_tuning_job`,
# replacing the object returned by `sft.train` above.
TUNING_JOB_ID = "<add your tuning job id>"  # @param
sft_tuning_job = sft.SupervisedTuningJob(
    f"projects/{PROJECT_ID}/locations/{LOCATION}/tuningJobs/{TUNING_JOB_ID}"
)

In [None]:
# tuned model endpoint name (used below as the model identifier for predictions)
tuned_model_endpoint_name = sft_tuning_job.tuned_model_endpoint_name
tuned_model_endpoint_name

In [None]:
# tuned model name
# Note: this rebinds `tuned_model_name`, which previously held the display name
# passed to `sft.train`; from here on it holds the value reported by the tuning job.
tuned_model_name = sft_tuning_job.tuned_model_name
tuned_model_name

In [None]:
# Load the tuned model through its endpoint, with the same zero-shot system prompt
# that was used to build the tuning examples.
tuned_model = GenerativeModel(
    tuned_model_endpoint_name, system_instruction=[system_prompt_zero_shot]
)

# Smoke test: classify a single test example (the row at position 4).
response = tuned_model.generate_content([test["text"].iloc[4]], stream=False)

In [None]:
# Compare the model's answer with the label of the same test row (position 4).
print("predicted", response.text)
print("ground truth", test["label_text"].iloc[4])

### 4.4 Run evaluations on tuned model and log experiment

In [None]:
# Get the list of messages to predict
messages_to_predict = test["text"].to_list()
# Compute the predictions with the tuned model endpoint, using the zero-shot system prompt
predictions_tuned_model = batch_predict(
    messages=messages_to_predict,
    model_name=tuned_model_endpoint_name,
    system_prompt=system_prompt_zero_shot,
    temperature=0,
    max_workers=4,
)

In [None]:
# Store the tuned-model predictions as a new column of the evaluation frame.
# NOTE(review): the column name encodes ep6/lrm1/rank4, while the tuning cell above
# defaults to epochs=4 and adapter_size=1 — confirm the label matches the evaluated job.
df_evals["tuned-gem1.0-ep6-lrm1-rank4"] = predictions_tuned_model

In [None]:
# Score the tuned-model predictions column against the ground-truth labels.
# A copy of df_evals is passed, so the frame held in this notebook is not the one handed to the helper.
metrics_tuned_gemini = evaluate_predictions(
    df_evals.copy(),
    target_column="label_text",
    predictions_column="tuned-gem1.0-ep6-lrm1-rank4",
    postprocessing=True,
)
metrics_tuned_gemini

In [None]:
# Log the experiment run for the fine-tuned Gemini 1.0 Pro model

# Run parameters describing the tuned model and its tuning hyperparameters.
params = {
    "model": "<your_tuned_model>",
    "adaptation_type": "fine-tuning gemini 1.0 Pro 002",
    "temperature": 0,
    "max_output_tokens": 10,
    "epochs": epochs,
    "lrm": learning_rate_multiplier,  # learning rate multiplier
    "adapter_size": adapter_size,
}

# NOTE(review): `params` is built above but is not passed to the call below —
# confirm whether log_experiment_run_vertexai expects it as an argument.
# NOTE(review): the run-name placeholder is missing its closing ">"; replace it
# with a real run name before executing.
log_experiment_run_vertexai(
    experiment_name=EXPERIMENT_NAME,
    run_name="<your-experiment-run-name",
    metrics=metrics_tuned_gemini,
    project=PROJECT_ID,
    location=LOCATION,
)

## 5.  Evaluation comparisons

To assess the performance of your experiments in Vertex AI, you have two primary options. You can programmatically retrieve a comprehensive table (pandas DataFrame) containing all experiments and their associated metrics for in-depth analysis. Alternatively, Vertex AI offers a user-friendly visual UI enabling you to compare experiments, select specific runs for side-by-side comparisons, and gain rapid insights. For detailed instructions on both approaches, refer to the [Vertex AI documentation on evaluation comparisons](https://cloud.google.com/vertex-ai/docs/experiments/compare-analyze-runs).

In [None]:
# Fetch the logged runs of this experiment for comparison.
df_experiments = get_experiments_data_frame_sample(
    experiment=EXPERIMENT_NAME, project=PROJECT_ID, location=LOCATION
)

In [None]:
# Display the experiments comparison table as the cell output.
df_experiments



> In our experiments with this dataset the most performant model was achieved by tuning Gemini 1.0 Pro @002, with the below parameters:

```
epochs=6, learning_rate_multiplier= 1, and adapter_size=4
```

## [Optional] 6. Heuristics for Computing Confidence Scores

Due to the multitask nature of LLMs, computing confidence scores is not as straightforward as it is with traditional predictive AI. Gemini models do not expose logprobs for the time being. However, the snippets below provide some options to use as a proxy for confidence scores in your predictions. You can adapt these options to your own use cases and needs.





### [Option 1] -  Getting multiple responses from the model and generate a majority voting ratio

The overall idea is to generate several answers for the same input with the same model. Then pick the most "voted/returned" answer, and calculate its "confidence score" by dividing its number of votes by the total number of responses/candidates.

In [None]:
from collections import Counter


def get_prediction_with_numeric_score(
    text_to_predict: str, model: Any, candidate_counts
):
    """
    Queries the model several times with the same text and returns the most
    frequent answer together with the share of calls that produced it.

    Args:
        text_to_predict (str): The input text for which to generate predictions.
        model (Any): Object exposing ``generate_content(text)`` whose result has a
            ``text`` attribute.
        candidate_counts (int): How many times the model is called.

    Returns:
        dict: The winning answer and its vote ratio,
              e.g. {"prediction": "business", "confidence_score": 0.75}.
              On a tie, the answer that appeared first wins.
    """
    # One model call per requested candidate; only the response text is kept.
    votes = [
        model.generate_content(text_to_predict).text for _ in range(candidate_counts)
    ]

    tally = Counter(votes)
    top_count = max(tally.values())
    # Counter keeps first-seen order, so ties resolve to the earliest answer.
    winner = next(answer for answer, count in tally.items() if count == top_count)

    return {"prediction": winner, "confidence_score": top_count / len(votes)}

In [None]:
# Define the model and configurations
# NOTE(review): with "temperature": 0 the repeated calls made for majority voting are
# likely to return the same answer every time — consider a higher temperature here; confirm.
generation_config = {"max_output_tokens": 10, "temperature": 0, "candidate_count": 1}

# Set every listed harm category to BLOCK_NONE.
safety_settings = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
}

# Tuned model endpoint, with the zero-shot system prompt, used to sample the candidate responses.
tuned_model_multiple_responses = GenerativeModel(
    tuned_model_endpoint_name,
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
# Ask the tuned model 4 times about one test example (row position 473)
# and keep the majority answer with its vote ratio.
res = get_prediction_with_numeric_score(
    text_to_predict=test["text"].iloc[473],
    model=tuned_model_multiple_responses,
    candidate_counts=4,
)

In [None]:
# Show the majority-vote result next to the label of the same test row.
print("Predicted Response with Confidence Score: \n", res)
print("Ground Truth:\n", test["label_text"].iloc[473])

### [Option 2] -  Generating "Verbal Confidences" with an LLM

The idea is to make two calls per prediction: one to predict the class, and a second one to ask the LLM to judge how confident it is in that prediction, choosing among verbal confidence levels such as "low", "medium" and "high".

In this example, we will use our tuned gemini model to predict the class and frozen Gemini 1.5 Pro to judge the prediction verbally.


In [None]:
def get_prediction_with_verbal_score(
    text_to_predict: str,
    model_to_predict_class: Any,
    model_to_eval_prediction: Any,
    possible_classes: list = ["business", "entertainment", "sport", "tech", "politics"],
):
    """
    Predicts the class of a text with one model and asks a second model to
    rate that prediction.

    Args:
        text_to_predict (str): The text to classify.
        model_to_predict_class (Any): Object exposing ``generate_content(text)``
            whose result has a ``text`` attribute; its answer is the predicted class.
        model_to_eval_prediction (Any): Object with the same interface; it receives
            a prompt containing the text, the predicted class and the other
            possible classes, and its answer is returned as the verbal score.
        possible_classes (list): All class labels. The list is only read, never
            modified.

    Returns:
        dict: ``{"prediction": <predicted class>, "verbal_score": <evaluator answer>}``.
    """
    prediction = model_to_predict_class.generate_content(text_to_predict).text

    # Build a new list instead of calling list.remove(): remove() returns None,
    # mutates the shared default (or the caller's list), and raises if the model
    # answers with a label that is not in the list. Surrounding whitespace in the
    # model answer is ignored for the comparison.
    predicted_label = prediction.strip()
    remaining_classes = [
        label for label in possible_classes if label != predicted_label
    ]

    formatted_prompt = f"""TEXT:
    {text_to_predict}

    PREDICTED CLASS:
    {prediction}

    OTHER POSSIBLE CLASSES:
    {remaining_classes}
    """
    confidence = model_to_eval_prediction.generate_content(formatted_prompt).text
    result = {"prediction": prediction, "verbal_score": confidence}
    return result

In [None]:
# Define the configurations for the model which will conduct the class prediction
# (same generation and safety settings as the ones defined for Option 1 above)
generation_config = {"max_output_tokens": 10, "temperature": 0, "candidate_count": 1}

# Set every listed harm category to BLOCK_NONE.
safety_settings = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
}

# Tuned model endpoint, with the zero-shot system prompt, used to predict the class.
model_to_predict_class = GenerativeModel(
    tuned_model_endpoint_name,
    system_instruction=[system_prompt_zero_shot],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
# Define the configurations for the model which will evaluate the predictions

# System instruction for the evaluator: it must answer only "High", "Medium" or "Low".
eval_prompt = f"""
You will get a text about a particular topic, the predicted class for the topic and a list of the other different classes that the model could have chosen.
Your task is to judge how well the predicted class fitted the text, based on the other possible classes.
You need to evaluate and judge your prediction, indicating how confidente you are with your answer. You will judge the prediction as follows:

- If you are confident the text is correctly labeled with the given prediction, then respond with "High"
- If it can be that the model could match other classes, or you are not very sure the class corresponds to the text, then respond with "Medium"
- If you believe it makes no sense the class predicted for that text, then respond with "Low".

You MUST only output "High", "Medium" or "Low" without any further explanation.
"""

# The evaluator reuses the `generation_config` and `safety_settings` defined for the predictor.
model_to_eval_class = GenerativeModel(
    "gemini-1.5-pro-001",
    system_instruction=[eval_prompt],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
# Call the function to get the predictions with verbal score.
# The predictor is the `model_to_predict_class` defined for this option, so this cell
# does not depend on the model created in the Option 1 cells.
res_verbal_conf = get_prediction_with_verbal_score(
    text_to_predict=test["text"].iloc[473],
    model_to_predict_class=model_to_predict_class,
    model_to_eval_prediction=model_to_eval_class,
    possible_classes=["business", "entertainment", "sport", "tech", "politics"],
)

In [None]:
# Show the prediction and its verbal score next to the label of the same test row.
print("Predicted Response with Verbal Score: \n", res_verbal_conf)
print("Ground Truth:\n", test["label_text"].iloc[473])

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.


Otherwise, you can delete the individual resources you created in this tutorial.

Refer to these [instructions](https://cloud.google.com/vertex-ai/docs/tutorials/image-classification-custom/cleanup#delete_resources) to delete the resources from the console.

In [None]:
# Delete Experiment.
# Each section below is guarded by its own boolean flag; set a flag to False to keep that resource.
delete_experiments = True
if delete_experiments:
    experiments_list = aiplatform.Experiment.list()
    for experiment in experiments_list:
        # NOTE(review): this compares the experiment's resource name with EXPERIMENT_NAME;
        # if EXPERIMENT_NAME holds a short name rather than a full resource name,
        # nothing will match and nothing will be deleted — confirm.
        if experiment.resource_name == EXPERIMENT_NAME:
            print(experiment.resource_name)
            experiment.delete()
            break

print("***" * 10)

# Delete Endpoint.
delete_endpoint = True
# If force is set to True, all deployed models on this
# Endpoint will be first undeployed.
if delete_endpoint:
    for endpoint in aiplatform.Endpoint.list():
        # Only the endpoint of the tuned model is deleted; the loop stops at the first match.
        if endpoint.resource_name == tuned_model_endpoint_name:
            print(endpoint.resource_name)
            endpoint.delete(force=True)
            break

print("***" * 10)

# Delete Model.
delete_model = True
if delete_model:
    # Remove version from model name.
    # (Rebinds `tuned_model_name` to the part before "@".)
    tuned_model_name = tuned_model_name.split("@")[0]
    for model in aiplatform.Model.list():
        if model.resource_name == tuned_model_name:
            print(model.resource_name)
            model.delete()
            break

print("***" * 10)

# Delete Cloud Storage Bucket.
# NOTE(review): the dataset cells in this section build their paths from BUCKET_NAME;
# confirm that BUCKET_URI is defined earlier in the notebook.
delete_bucket = True
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI