In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Text Embeddings for Semantic Search


<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/generative_ai/text_embedding_api_cloud_next_new_models.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/generative_ai/text_embedding_api_cloud_next_new_models.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/generative_ai/text_embedding_api_cloud_next_new_models.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

**_NOTE_**: This notebook has been tested in the following environment:

* Python version = 3.10

## Overview

This colab is used as a code example for how to call our newly released text embedding models (textembedding-gecko@latest and textembedding-gecko-multilingual@latest).

Learn more about [text embedding api](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings).

This tutorial uses the following Google Cloud ML services and resources:
- Vertex LLM SDK

The steps performed include:
- Installation and imports
- Generate embeddings


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [2]:
from google.colab import auth
auth.authenticate_user()

## Installation

Install the following packages required to execute this notebook.

**Remember to restart the runtime after installation.**

In [3]:
!pip install git+https://github.com/googleapis/python-aiplatform.git

Collecting git+https://github.com/googleapis/python-aiplatform.git
  Cloning https://github.com/googleapis/python-aiplatform.git to /tmp/pip-req-build-rii_fnl3
  Running command git clone --filter=blob:none --quiet https://github.com/googleapis/python-aiplatform.git /tmp/pip-req-build-rii_fnl3
  Resolved https://github.com/googleapis/python-aiplatform.git to commit 50c15917e9c102241319d94efddb60d548ea98fd
  Preparing metadata (setup.py) ... [?25l[?25hdone


### Please restart the runtime.
In your top menu:
**Runtime < Restart runtime**

Install ScaNN

In [1]:
!pip install scann



### Imports libraries

In [2]:
import vertexai
import json
import time

import numpy as np
import pandas as pd
import scann

from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

#### Set your project ID and initiate Vertex AI

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [3]:
PROJECT_ID = "acn-lkmaigcp"  # @param {type:"string"}
REGION = "us-central1"

# Set the project id
! gcloud config set project {PROJECT_ID}

# Initiate Vertex AI
vertexai.init(project=PROJECT_ID, location=REGION)

Updated property [core/project].


## Generate embeddings

In [4]:
# Set the model name.
MODEL_NAME = "textembedding-gecko@latest"  # @param ["textembedding-gecko@latest", "textembedding-gecko-multilingual@latest"]

# Set the task_type, text and optional title as the model inputs.
TASK_TYPE = "RETRIEVAL_DOCUMENT"  # @param ["RETRIEVAL_QUERY", "RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY", "CLASSIFICATION", "CLUSTERING"]
TITLE = "Google"  # @param {type:"string"}
TEXT = "Embed text."  # @param {type:"string"}

# Verify the input is valid.
if not MODEL_NAME:
    raise ValueError("Please set MODEL_NAME.")
if not TASK_TYPE:
    raise ValueError("Please set TASK_TYPE.")
if not TEXT:
    raise ValueError("Please set TEXT.")
if TITLE and TASK_TYPE != "RETRIEVAL_DOCUMENT":
    raise ValueError("Title can only be provided if the task_type is RETRIEVAL_DOCUMENT")

In [5]:
def text_embedding(
  model_name: str, task_type: str, text: str, title: str = "") -> list:
    """Generate text embedding with a Large Language Model."""
    model = TextEmbeddingModel.from_pretrained(model_name)

    text_embedding_input = TextEmbeddingInput(
        task_type=task_type, title=title, text=text)
    embeddings = model.get_embeddings([text_embedding_input])
    return embeddings[0].values

embedding = text_embedding(
    model_name=MODEL_NAME, task_type=TASK_TYPE, text=TEXT, title=TITLE)
print(len(embedding))

768


In [6]:
model = TextEmbeddingModel.from_pretrained("google/textembedding-gecko@001")

# Create embedding dataset
The dataset demonstrates the use of the Text Embedding API with a vector database. It is not intended to be used for any other purpose, such as evaluating models. The dataset is small and does not represent a comprehensive sample of all possible text.

In [7]:
DATASET_URI = "gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl"  # @param {type:"string"}

In [8]:
!gsutil cp gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl .

Copying gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl...
/ [0 files][    0.0 B/ 14.1 KiB]                                                / [1 files][ 14.1 KiB/ 14.1 KiB]                                                
Operation completed over 1 objects/14.1 KiB.                                     


In [9]:
# reads a JSON file and stores the records in a list
records = []
with open("wide_and_deep_trainer_container_tests_input.jsonl") as f:
    for line in f:
        record = json.loads(line)
        records.append(record)

In [10]:
# Peek at the data.
df = pd.DataFrame(records)
df.head(10)

Unnamed: 0,textContent,classificationAnnotation,dataItemResourceLabels
0,"Cats are good pets, for they are clean and are...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
1,More RVs were seen in the storage lot than at ...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
2,"When he asked her favorite number, she answere...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
3,Greetings from the real universe.,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
4,As he entered the church he could hear the sof...,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
5,"They got there early, and they got really good...","{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
6,Pink horses galloped across the sea.,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
7,Even though he thought the world was flat he d...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
8,He wondered if she would appreciate his toenai...,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
9,They say people remember important moments in ...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}


In [11]:
# This function takes a text string as input
# and returns the embedding of the text


def get_embedding(text: str) -> list:
    try:
        embeddings = model.get_embeddings([text])
        return embeddings[0].values
    except:
        return []


get_embedding.counter = 0

# This may take several minutes to complete.
df["embedding"] = df["textContent"].apply(lambda x: get_embedding(x))

In [12]:
# Peek at the data.
df.head()

Unnamed: 0,textContent,classificationAnnotation,dataItemResourceLabels,embedding
0,"Cats are good pets, for they are clean and are...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.037178341299295425, -0.010241042822599411, ..."
1,More RVs were seen in the storage lot than at ...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'},"[-0.011439577676355839, -0.032596345990896225,..."
2,"When he asked her favorite number, she answere...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.020134523510932922, -0.0022515689488500357,..."
3,Greetings from the real universe.,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'},"[-0.016334038227796555, 0.0007779737352393568,..."
4,As he entered the church he could hear the sof...,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.005115862935781479, -0.034255024045705795, ..."


In [13]:
record_count = len(records)
dataset = np.array([df.embedding[i] for i in range(record_count)])


normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]
# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README

# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
searcher = (
    scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
    .tree(
        num_leaves=record_count,
        num_leaves_to_search=record_count,
        training_sample_size=record_count,
    )
    .score_ah(2, anisotropic_quantization_threshold=0.2)
    .reorder(100)
    .build()
)

In [14]:
def search(query: str) -> None:
    start = time.time()
    query = model.get_embeddings([query])[0].values
    neighbors, distances = searcher.search(query, final_num_neighbors=3)
    end = time.time()

    for id, dist in zip(neighbors, distances):
        print(f"[docid:{id}] [{dist}] -- {df.textContent[int(id)][:125]}...")
    print("Latency (ms):", 1000 * (end - start))

In [15]:
search("tell me about an animal")

[docid:20] [0.6497225761413574] -- Hit me with your pet shark!...
[docid:24] [0.6352805495262146] -- Today I dressed my unicorn in preparation for the race....
[docid:30] [0.6284741163253784] -- A kangaroo is really just a rabbit on steroids....
Latency (ms): 56.44989013671875


In [16]:
search("tell me about an important moment or event in your life")

[docid:9] [0.6281773447990417] -- They say people remember important moments in their life well, yet no one even remembers their own birth....
[docid:19] [0.5852192640304565] -- The near-death experience brought new ideas to light....
[docid:36] [0.5711853504180908] -- The most exciting eureka moment I've had was when I realized that the instructions on food packets were just guidelines....
Latency (ms): 56.664466857910156
