In [None]:
# @title Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Getting Tuned Text-Embeddings on Vertex AI

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/tuned_text-embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo">Open in Colab
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/generative_ai/tuned_text-embeddings.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/tuned_text-embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook will walk you through the process of fine-tuning text-embeddings. By adapting a text-embedding model to your specific domain or task, you can achieve better results. See also [tuning text-embeddings](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings).

### Objective

In this tutorial, you learn how to tune a text-embedding model, such as `text-embedding-004` or `textembedding-gecko`.

This tutorial uses the following Google Cloud ML services and resources:

- Vertex AI
- Cloud Storage

The steps include:

- Run a text-embedding model tuning job on Vertex AI Pipelines.
- Deploy your tuned text-embedding model to an endpoint.
- Examine the quality metrics to assess your tuned model.
- Get embeddings from your tuned model for downstream tasks.

### Dataset

This notebook uses a synthetic dataset (corpus, queries, and labels) from [Alphabet's annual financial performance report (10K form)](https://abc.xyz/assets/ff/7c/06d6f493f6462caf08e8502ffc33/596de1b094c32cf0592a08edfe84ae74.html).

* [corpus.jsonl](https://storage.googleapis.com/cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/corpus.jsonl) (397Ki)
* [queries.jsonl](https://storage.googleapis.com/cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/queries.jsonl) (321Ki)
* [test.tsv](https://storage.googleapis.com/cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/test.tsv) (3.7Ki)
* [train.tsv](https://storage.googleapis.com/cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/train.tsv) (29Ki)
* [validation.tsv](https://storage.googleapis.com/cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/validation.tsv) (3.7Ki)

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and
[Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

This tutorial requires you to install the `google-cloud-aiplatform` package.

In [None]:
# @title (Required) Install AI platform package { run: "auto" }
AI_PLATFORM_REQUIREMENT = "1.56.0"  # @param {type:"raw"}

import builtins
import os
import sys

try:
    import vertexai

    assert vertexai.__version__ >= AI_PLATFORM_REQUIREMENT
    print(f"Got {vertexai.__version__=}.")
except:
    !pip3 install f"google-cloud-aiplatform>={AI_PLATFORM_REQUIREMENT}"

    running_in_colab = "google.colab" in sys.modules and hasattr(
        builtins, "get_ipython"
    )
    if running_in_colab and not os.getenv("IS_TESTING"):
        print("Please wait a minute. Colab session is rebooting.")
        get_ipython().instance().kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,documentai.googleapis.com).

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

See also [setting up permissions & resources for tuning text-embedding models](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#project-setup).

### Initialize Vertex AI Platform

Import and initialize the Vertex AI SDK for your project and region.

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See also [Locate the project ID](https://support.google.com/googleapi/answer/7014113).

See also [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# @title (Required) Set PROJECT_ID and REGION
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}
if not PROJECT_ID.strip():
    raise ValueError("'PROJECT_ID' is required.")
if not REGION.strip():
    raise ValueError("'REGION' is required.")
!gcloud config set project {PROJECT_ID}

import vertexai
from google.cloud.aiplatform import pipeline_jobs
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

vertexai.init(project=PROJECT_ID, location=REGION)

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Colab:**

In [None]:
# @title (Required on Colab) `authenticate_user()`
import builtins
import os
import sys

# Colab is recognized by an IPython session that has loaded 'google.colab'.
running_in_colab = hasattr(builtins, "get_ipython") and "google.colab" in sys.modules

if running_in_colab:
    # Interactive authentication is skipped when IS_TESTING is set.
    if not os.getenv("IS_TESTING"):
        from google.colab import auth

        auth.authenticate_user(project_id=PROJECT_ID)

**2. Vertex AI Workbench**

Make sure that [the Compute Engine default service account](https://cloud.google.com/compute/docs/access/service-accounts#default_service_account) running a Vertex AI workbench instance has the permission iam.serviceAccounts.actAs (most likely through [roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)) at [the IAM & Admin page of the Cloud Console](https://console.cloud.google.com/iam-admin). This permission allows a workbench instance to act as the service account when interacting with other Google Cloud services.

**3. Local JupyterLab instance, uncomment and run:**

In [None]:
# !gcloud auth login

## Tune Text-Embeddings

(Optional) To resume this tutorial from where you left off in a previous tuning session, set **`TUNING_JOB_ID`** accordingly. To start a fresh tuning session instead, clear **`TUNING_JOB_ID`**.

This tutorial creates a tuning job of a Vertex AI pipeline for tuning a text-embedding model within your project. See also [creating tuning jobs with parameters and defaults](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#create-embedding-tuning-job) and the latest [text-embedding models and eligible tasks](https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#api_changes_to_models_released_on_or_after_august_2023).

In [None]:
# @title (Optional) Set `TUNING_JOB_ID` to resume an existing tuning session: { run: "auto" }
# Leave empty to start a fresh tuning session; a non-empty value makes the
# tuning cell look up the existing pipeline job instead of creating a new one.
TUNING_JOB_ID = ""  # @param {type: "string"}

In [None]:
# @title (Required) Resume an existing or start a fresh tuning session (depending on TUNING_JOB_ID)
BASE_MODEL = "text-embedding-004"  # @param ["textembedding-gecko@003", "text-embedding-004", "textembedding-gecko-multilingual@001", "text-multilingual-embedding-002"]
TASK = "DEFAULT"  # @param ["DEFAULT", "RETRIEVAL_QUERY", "RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY", "CLASSIFICATION", "CLUSTERING", "QUESTION_ANSWERING", "FACT_VERIFICATION"]
# QUESTION_ANSWERING and FACT_VERIFICATION are only accepted for the two
# models listed here; reject any other combination before submitting a job.
if TASK in ["QUESTION_ANSWERING", "FACT_VERIFICATION"] and BASE_MODEL not in [
    "text-embedding-004",
    "text-multilingual-embedding-002",
]:
    raise ValueError(f"TASK '{TASK}' is not valid for model '{BASE_MODEL}'.")

CORPUS_DATA = "gs://cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/corpus.jsonl"  # @param {type: "string"}
QUERIES_DATA = "gs://cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/queries.jsonl"  # @param {type: "string"}
TRAINING_DATA = "gs://cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/train.tsv"  # @param{type: "string"}
VALIDATION_DATA = "gs://cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/validation.tsv"  # @param{type: "string"}
TEST_DATA = "gs://cloud-samples-data/ai-platform/embedding/goog-10k-2024/r11/test.tsv"  # @param{type: "string"}
BATCH_SIZE = 128  # @param {type: "number"}
TRAIN_STEPS = 1000  # @param {type: "number"}

base_model = TextEmbeddingModel.from_pretrained(BASE_MODEL)
if "TUNING_JOB_ID" in locals() and TUNING_JOB_ID:
    # Resume: look up the existing pipeline job by its user-facing ID.
    # ('job_filter' rather than 'filter', so the builtin is not shadowed for
    # the rest of the kernel session.)
    job_filter = f'pipelineJobUserId="{TUNING_JOB_ID}"'
    matching_job = next(
        iter(pipeline_jobs.PipelineJob.list(filter=job_filter)), None
    )
    if matching_job is None:
        raise ValueError(
            f"No tuning job found for TUNING_JOB_ID '{TUNING_JOB_ID}'. "
            "Check the ID, PROJECT_ID and REGION, or clear 'TUNING_JOB_ID' "
            "to start a fresh tuning session."
        )
    tuning_job = matching_job
    # A 'tuning_result' left over from an earlier fresh run in this kernel
    # belongs to a different session; drop it so that the deployment cell
    # works from the resumed 'tuning_job' instead.
    globals().pop("tuning_result", None)
    print(
        f"Got an existing tuning job '{tuning_job.name}' (state: {tuning_job.state.name})."
    )
else:
    # Fresh session: submit a new tuning pipeline job for the base model.
    tuning_result = base_model.tune_model(
        task_type=TASK,
        corpus_data=CORPUS_DATA,
        queries_data=QUERIES_DATA,
        training_data=TRAINING_DATA,
        validation_data=VALIDATION_DATA,
        test_data=TEST_DATA,
        batch_size=BATCH_SIZE,
        train_steps=TRAIN_STEPS,
        tuned_model_location=REGION,
    )
    tuning_job = pipeline_jobs.PipelineJob.get(tuning_result.pipeline_job_name)
    print(
        f"Got a fresh tuning job '{tuning_job.name}' (state: {tuning_job.state.name})."
    )
    print(
        f"(OPTIONAL) Set 'TUNING_JOB_ID' to '{tuning_job.name}' when you want to resume this tuning session."
    )

### Deploy the Tuned Text-Embedding Model

You can deploy a tuned model to an endpoint, once a tuning job is completed. Deploying a model can be done in one of these ways:

* Calling the `deploy_tuned_model` method on the tuning result object returned by the base model's `tune_model` method.
* Calling the class method `TextEmbeddingModel.deploy_tuned_model` with a tuned model resource name. You can retrieve the uploaded, tuned model's resource name from a tuning job. This resource name is a string that follows the pattern `projects/{PROJECT_ID}/locations/{REGION}/models/{MODEL_ID}`.

**IMPORTANT**: Deployment of tuned text-embedding models, being custom trained, requires the allocation of serving resources like machines and accelerators within your project. `deploy_tuned_model`, upon its initial call, creates a tuned model deployment onto a fresh serving resource. Upon subsequent calls, it simply retrieves the tuned model deployment from the existing serving resource.

See also [using tuned models](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#use-tuned-model).


In [None]:
import os

MACHINE_TYPE = "a2-highgpu-1g"  # @param {type: "string"}
ACCELERATOR = "NVIDIA_TESLA_A100"  # @param {type: "string"}
ACCELERATOR_COUNT = 1  # @param {type: "number"}

# Deployment is skipped entirely when IS_TESTING is set.
running_interactively = not os.getenv("IS_TESTING")
if running_interactively:
    if "tuning_job" not in locals():
        message = "[Action Required] Run the preceding code cells to define 'tuning_job'."
        raise RuntimeError(message)
    # CAVEAT: Colab disruptions may cause 'tuning_result' to be undefined.
    if "tuning_result" in locals():
        # Fresh session: deploy straight from the tuning result object.
        model = tuning_result.deploy_tuned_model(
            machine_type=MACHINE_TYPE,
            accelerator=ACCELERATOR,
            accelerator_count=ACCELERATOR_COUNT,
        )
        print(f"Got deployed, tuned {model=}")
    else:
        # Resumed session: wait for the pipeline job, then read the tuned
        # model's resource name from the metadata of its uploader task.
        tuning_job.wait()
        tasks = tuning_job.task_details
        upload_task = next((t for t in tasks if "uploader" in t.task_name), None)
        if upload_task is None:
            raise RuntimeError(
                f"Tuning job '{tuning_job.name}' has no model uploader task; "
                "cannot determine the tuned model to deploy."
            )
        upload_metadata = dict(upload_task.execution.metadata)
        tuned_model_name = upload_metadata["output:model_resource_name"]
        model = TextEmbeddingModel.deploy_tuned_model(
            tuned_model_name=tuned_model_name,
            machine_type=MACHINE_TYPE,
            accelerator=ACCELERATOR,
            accelerator_count=ACCELERATOR_COUNT,
        )
        print(f"Got deployed, tuned model '{tuned_model_name}'.")

### Examine Quality Metrics from Tuning Job

The pipeline assesses the performance of both the base and tuned models during tuning by calculating NDCG@10 metrics using validation and test datasets. These metrics are available in the pipeline job's `metrics` artifact.

In [None]:
import pandas as pd

# Metrics are only read once a tuning job exists and reports itself as done;
# otherwise this cell does nothing.
if "tuning_job" in locals() and tuning_job.done():
    tasks = tuning_job.task_details
    # The evaluator task is the one that carries the 'metrics' output artifact.
    eval_task = next(task for task in tasks if "evaluator" in task.task_name)
    metrics_artifact = eval_task.outputs["metrics"].artifacts[0]
    metrics = dict(metrics_artifact.metadata)
    # One row per metric: its name and its value.
    metrics_df = pd.DataFrame(
        {"metric": list(metrics), "value": list(metrics.values())}
    )
    sorted_metrics_df = metrics_df.sort_values(by="metric", ignore_index=True)
    display(sorted_metrics_df)

### Get Tuned Embeddings for Downstream Tasks

Get embeddings from your tuned model for downstream tasks.

In [None]:
import pandas as pd

# Runs only when a deployed 'model' is available from the deployment cell.
if "model" in locals():
    texts = ["banana muffins?"]  # @param {type:"raw"}
    titles = ["none"]  # @param {type:"raw"}
    # Pair each text with its title and wrap the pair as an embedding input.
    text_title_pairs = zip(texts, titles)
    embedding_inputs = [
        TextEmbeddingInput(title=title, text=text, task_type=TASK)
        for text, title in text_title_pairs
    ]
    embedding_response = model.get_embeddings(embedding_inputs)
    # One pandas Series of embedding values per input.
    tuned_embeddings = [
        pd.Series(embedding.values) for embedding in embedding_response
    ]
    display(tuned_embeddings)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.