In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Semantic Search using Embeddings

Semantic search is a type of search that uses the meaning of words, phrases, and context to find the most relevant results. Semantic searches rely on vector embeddings which can best match the user query to the most similar result.

An embedding in this scenario is a vector that represents a piece of text. The closer two items are in the vector space, the more similar they are. So when you query an index of embeddings, the items that are the closest match to your input (from the data you indexed) are returned.

In this tutorial, we demonstrate how to create embeddings from text and perform a semantic search over them. The embeddings are generated with the Vertex AI text embedding model, and then indexed and searched using [Google ScaNN: Efficient Vector Similarity Search](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html).

## Prerequisites
- Vertex LLM SDK
- ScaNN ([GitHub](https://github.com/google-research/google-research/tree/master/scann))

## Install Vertex LLM SDK


In [None]:
import sys

# Authenticate the current Colab user. The `google.colab` package is only
# importable inside Colab, so the step is skipped elsewhere instead of stopping
# the notebook with a ModuleNotFoundError.
if "google.colab" in sys.modules:
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

In [None]:
# NOTE(review): this installs the SDK from a pull-request merge ref
# (refs/pull/2345/merge) rather than a released version; confirm the ref still
# resolves, or switch to a released google-cloud-aiplatform build.
!pip install --upgrade git+https://github.com/googleapis/python-aiplatform.git@refs/pull/2345/merge "shapely<2.0"

Collecting git+https://github.com/googleapis/python-aiplatform.git@refs/pull/2345/merge
  Cloning https://github.com/googleapis/python-aiplatform.git (to revision refs/pull/2345/merge) to /tmp/pip-req-build-tt32diok
  Running command git clone --filter=blob:none --quiet https://github.com/googleapis/python-aiplatform.git /tmp/pip-req-build-tt32diok
[0m  Running command git fetch -q https://github.com/googleapis/python-aiplatform.git refs/pull/2345/merge
  Running command git checkout -q df9993a1263f09202bb4d1bf9fe33111aa764dd4
  Resolved https://github.com/googleapis/python-aiplatform.git to commit df9993a1263f09202bb4d1bf9fe33111aa764dd4
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Project and location passed to vertexai.init() below.
# Replace the placeholder with your own Google Cloud project ID.
PROJECT_ID = "YOUR_PROJECT_ID"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

import vertexai

# Initialise the SDK with the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

**Attention**: after installing the SDK, you need to restart the runtime so that the newly installed package version is the one that gets loaded.

## Import TextEmbeddingModel

In [None]:
from vertexai.preview.language_models import TextEmbeddingModel

# Load the pretrained text-embedding model. `model` is used later both to embed
# the dataset rows and to embed each search query.
model = TextEmbeddingModel.from_pretrained("google/textembedding-gecko@001")

## Install ScaNN Package

In [None]:
# ScaNN provides the searcher that is built in the "Create an Index" section.
!pip install scann



## Import packages

In [None]:
import json
import time

import numpy as np
import pandas as pd
import scann

## Create Embedding Dataset

The dataset is solely to demonstrate the use of the Text Embedding API with a vector database. It is not intended to be used for any other purpose, such as evaluating models. The dataset is small and does not represent a comprehensive sample of all possible text.

In [None]:
# Copy the sample JSONL dataset from Cloud Storage into the current working
# directory; the next cell opens it by its bare file name.
!gsutil cp gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl .

Copying gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl...
/ [1 files][ 14.1 KiB/ 14.1 KiB]                                                
Operation completed over 1 objects/14.1 KiB.                                     


In [None]:
# Parse the JSON Lines file into a list with one parsed object per line.
# Blank lines (for example a trailing empty line at the end of the file) are
# skipped, because json.loads() raises JSONDecodeError on an empty string.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(
    "wide_and_deep_trainer_container_tests_input.jsonl", encoding="utf-8"
) as f:
    records = [json.loads(line) for line in f if line.strip()]

In [None]:
# Peek at the data: build a DataFrame from the parsed records and display up to
# the first 50 rows.
df = pd.DataFrame(records)
df.head(50)

Unnamed: 0,textContent,classificationAnnotation,dataItemResourceLabels
0,"Cats are good pets, for they are clean and are...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
1,More RVs were seen in the storage lot than at ...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
2,"When he asked her favorite number, she answere...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
3,Greetings from the real universe.,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
4,As he entered the church he could hear the sof...,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
5,"They got there early, and they got really good...","{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
6,Pink horses galloped across the sea.,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
7,Even though he thought the world was flat he d...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}
8,He wondered if she would appreciate his toenai...,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'}
9,They say people remember important moments in ...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'}


In [None]:
def get_embedding(text):
    """Return the embedding vector for ``text``, or ``[]`` if the request fails.

    Every call is counted on the function attribute ``get_embedding.counter``;
    each 100th call sleeps for 3 seconds before issuing the request
    (presumably to stay under an API rate limit -- TODO confirm).

    Args:
        text: The string to embed with the module-level ``model``.

    Returns:
        The ``values`` of the first embedding returned by
        ``model.get_embeddings([text])``, or an empty list when that call
        raises an ``Exception``.
    """
    # Tolerate the counter not having been initialised yet (e.g. fresh kernel).
    get_embedding.counter = getattr(get_embedding, "counter", 0) + 1
    try:
        if get_embedding.counter % 100 == 0:
            time.sleep(3)
        embeddings = model.get_embeddings([text])
        return embeddings[0].values
    except Exception as exc:
        # Best-effort: keep processing the remaining rows, but report the
        # failure instead of hiding it. Catching Exception (not a bare
        # ``except:``) lets KeyboardInterrupt/SystemExit propagate, so a long
        # run can still be interrupted.
        print(f"get_embedding failed for {text!r}: {exc!r}")
        return []


# Start the call counter at zero; get_embedding() uses it to decide when to pause.
get_embedding.counter = 0

# Embed the text of every row, one request per row.
# This may take several minutes to complete.
df["embedding"] = df["textContent"].apply(get_embedding)

In [None]:
# Display the frame again to check that the new `embedding` column was added.
df

Unnamed: 0,textContent,classificationAnnotation,dataItemResourceLabels,embedding
0,"Cats are good pets, for they are clean and are...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.037178341299295425, -0.010241042822599411, ..."
1,More RVs were seen in the storage lot than at ...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'},"[-0.011439577676355839, -0.032596345990896225,..."
2,"When he asked her favorite number, she answere...","{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.020134523510932922, -0.0022515689488500357,..."
3,Greetings from the real universe.,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'},"[-0.016334038227796555, 0.0007779737352393568,..."
4,As he entered the church he could hear the sof...,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.005115862935781479, -0.034255024045705795, ..."
5,"They got there early, and they got really good...","{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'},"[-0.05407888814806938, -0.02428186498582363, -..."
6,Pink horses galloped across the sea.,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[-0.03886456787586212, 0.007716272957623005, -..."
7,Even though he thought the world was flat he d...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.0053230952471494675, 0.016568051651120186, ..."
8,He wondered if she would appreciate his toenai...,"{'displayName': 'FirstClass', 'annotationResou...",{'aiplatform.googleapis.com/ml_use': 'training'},"[0.043843794614076614, 0.03215761110186577, 0...."
9,They say people remember important moments in ...,"{'displayName': 'SecondClass', 'annotationReso...",{'aiplatform.googleapis.com/ml_use': 'training'},"[-0.0407303050160408, 0.01760895736515522, -0...."


## Create an Index

In [None]:
record_count = len(records)
embeddings = df["embedding"].tolist()

# get_embedding() returns an empty list when a request fails. Report those rows
# explicitly instead of letting NumPy fail with an opaque shape error. They are
# not dropped silently, because search() maps neighbor ids back to rows of `df`
# by position.
missing = [i for i, vector in enumerate(embeddings) if len(vector) == 0]
if missing:
    raise ValueError(
        f"{len(missing)} of {record_count} rows have no embedding "
        f"(first row positions: {missing[:10]}); re-run the embedding step."
    )

# One row per record. The embedding width comes from the data rather than a
# hard-coded constant.
dataset = np.asarray(embeddings, dtype=np.float64)

# Scale every row to unit L2 norm, so the "dot_product" measure used below
# ranks rows by the cosine of their angle to the query vector.
normalized_dataset = dataset / np.linalg.norm(dataset, axis=1, keepdims=True)

# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README

# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
searcher = (
    scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
    .tree(
        num_leaves=record_count,
        num_leaves_to_search=record_count,
        training_sample_size=record_count,
    )
    .score_ah(2, anisotropic_quantization_threshold=0.2)
    .reorder(100)
    .build()
)

## Query the Index

In [None]:
def search(query, num_neighbors=3):
    """Embed ``query``, look up its nearest neighbors, and print them.

    Prints one line per neighbor -- its id in the index, its score, and the
    first 125 characters of the matching ``df.textContent`` entry -- followed by
    the elapsed time in milliseconds. The timing covers both the embedding
    request and the index lookup.

    Args:
        query: Free-text query string.
        num_neighbors: Number of results to retrieve and print. Defaults to 3.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    # NOTE(review): the index is built from L2-normalized vectors but the query
    # embedding is used as returned by the model; scores equal cosine similarity
    # only if the model returns unit-length vectors -- confirm.
    query_embedding = model.get_embeddings([query])[0].values
    neighbors, distances = searcher.search(
        query_embedding, final_num_neighbors=num_neighbors
    )
    elapsed_ms = 1000 * (time.perf_counter() - start)

    for doc_id, dist in zip(neighbors, distances):
        print(f"[docid:{doc_id}] [{dist}] -- {df.textContent[int(doc_id)][:125]}...")
    print("Latency (ms):", elapsed_ms)

In [None]:
# Example query about animals; prints the three closest sentences and the latency.
search("tell me about an animal")

[docid:20] [0.7676745057106018] -- Hit me with your pet shark!...
[docid:30] [0.6126559972763062] -- A kangaroo is really just a rabbit on steroids....
[docid:21] [0.6056331396102905] -- Most shark attacks occur about 10 feet from the beach since that's where the people are....
Latency (ms): 91.91465377807617


In [None]:
# Example query about life events; prints the three closest sentences and the latency.
search("tell me about an important moment or event in your life")

[docid:9] [0.6281773447990417] -- They say people remember important moments in their life well, yet no one even remembers their own birth....
[docid:19] [0.5852192640304565] -- The near-death experience brought new ideas to light....
[docid:36] [0.5711853504180908] -- The most exciting eureka moment I've had was when I realized that the instructions on food packets were just guidelines....
Latency (ms): 82.97181129455566
