In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using Vertex AI Matching Engine for StackOverflow Questions
![ ](https://www.google-analytics.com/collect?v=2&tid=G-L6X3ECH596&cid=1&en=page_view&sid=1&dt=sdk_matching_engine_create_stack_overflow_embeddings.ipynb&dl=notebooks%2Fofficial%2Fmatching_engine%2Fsdk_matching_engine_create_stack_overflow_embeddings.ipynb)
<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/matching_engine/sdk_matching_engine_create_stack_overflow_embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/matching_engine/sdk_matching_engine_create_stack_overflow_embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
      <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/matching_engine/sdk_matching_engine_create_stack_overflow_embeddings.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

## Overview

This example demonstrates how to encode custom text embeddings using the StackOverflow dataset and the sentence-T5 model, and how to upload those embeddings to the Vertex AI Matching Engine service. Matching Engine is a high-scale, low-latency solution for finding similar vectors (or, more specifically, "embeddings") in a large corpus. Moreover, it is a fully managed offering, which further reduces operational overhead. It is built upon [Approximate Nearest Neighbor (ANN) technology](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html) developed by Google Research.

**Pre-requisite**: This notebook requires you to already have a VPC network set up. See the "Prepare a VPC network" section in [Create Vertex AI Matching Engine index notebook](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/matching_engine/sdk_matching_engine_for_indexing.ipynb).

Learn more about [Vertex AI Matching Engine](https://cloud.google.com/vertex-ai/docs/matching-engine/overview).

### Objective

In this notebook, you learn how to encode custom text embeddings, create an Approximate Nearest Neighbor (ANN) index, and query against indexes.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Matching Engine`

The steps performed include:

* Create ANN index
* Create an index endpoint with VPC Network
* Deploy ANN index
* Perform online query


### Dataset

The dataset used for this tutorial is the [StackOverflow dataset](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow).

> Stack Overflow is the largest online community for programmers to learn, share their knowledge, and advance their careers. Updated on a quarterly basis, this BigQuery dataset includes an archive of Stack Overflow content, including posts, votes, tags, and badges. This dataset is updated to mirror the Stack Overflow content on the Internet Archive, and is also available through the Stack Exchange Data Explorer.

## Installation

Install the latest version of Cloud Storage, BigQuery, and the Vertex AI SDK for Python.

In [None]:
# Install the packages
! pip3 install --upgrade google-cloud-aiplatform \
                        google-cloud-storage \
                        'google-cloud-bigquery[pandas]'

Install the latest versions of `tensorflow`, `tensorflow_text`, and `tensorflow-hub`, which are used to load the text encoder and compute the embeddings.

In [None]:
# Install the packages
! pip3 install --upgrade tensorflow \
                        tensorflow_text \
                        tensorflow-hub

Install the latest version of the Redis Python client (`redis`), which is used later in this notebook for low-latency retrieval of question titles by ID.

In [None]:
# Install the redis package
! pip install --upgrade redis

### Colab only: Uncomment the following cell to restart the kernel.

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin
#### Set your project ID

If you don't know your project ID, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
PROJECT_ID = "[YOUR-PROJECT-ID]"

# Set the project id
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [2]:
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [3]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [4]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

* Authentication: Rerun the `gcloud auth login` command in the Vertex AI Workbench notebook terminal when you are logged out and need the credential again.

### Colab only: Uncomment the following cell to restart the kernel.

In [5]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [6]:
BUCKET_URI = "gs://your-bucket-name-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

## Prepare the data

You will use the [Stack Overflow dataset](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow) of questions and answers hosted on BigQuery.

> This public dataset is hosted in Google BigQuery and is included in BigQuery's 1TB/mo of free tier processing. This means that each user receives 1TB of free BigQuery processing every month, which can be used to run queries on this public dataset.

In [None]:
%%time
from google.cloud import bigquery

# BigQuery client bound to the project configured earlier in the notebook.
client = bigquery.Client(project=PROJECT_ID)

# Upper bound on the number of joined question/answer rows to fetch.
NUM_ROWS = 1000

# Join positively scored questions to their positively scored answers and keep
# only rows whose tag string contains "python".
# NOTE(review): the join emits one row per (question, answer) pair, so a question
# with several answers can appear more than once and `id` is not guaranteed to be
# unique in the result.
# NOTE(review): both ORDER BY clauses sit inside subqueries and the outer query has
# no ORDER BY, so which rows LIMIT keeps is presumably not deterministic — confirm
# against BigQuery's ordering guarantees.
QUERY = f"""
        SELECT distinct q.id, q.title, q.body, q.tags, a.body as answers, a.score 
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` where Score>0 ORDER BY View_Count desc) AS q 
        INNER JOIN (SELECT * FROM `bigquery-public-data.stackoverflow.posts_answers`  where Score>0 ORDER BY Score desc) AS a ON q.id = a.parent_id 
        where q.tags like '%python%'
        LIMIT {NUM_ROWS};
        """

# Submit the query and wait for its result rows.
query_job = client.query(QUERY)
rows = query_job.result()

In [9]:
# Convert to a dataframe.
# The query joins questions to answers, so a question with several answers comes
# back as several rows. Later cells use `id` as a unique key (index datapoint id,
# Redis key), so keep a single row per question.
df = (
    rows.to_dataframe()
    .drop_duplicates(subset="id", keep="first")
    .reset_index(drop=True)
)

# Examine the data
df.head()

Unnamed: 0,id,title,body,tags,answers,score
0,28819470,Numbers of Day in Month,"<p>I have a data frame with a date time index,...",python|pandas,"<p>There is now a <a href=""https://pandas.pyda...",11
1,35064304,"""RuntimeError: Make sure the Graphviz executab...",<p>I downloaded <code>Graphviz 2.38</code> MSI...,python|installation|graphviz,<p><strong>Step 1:</strong> Install Graphviz b...,22
2,18783390,python pip specify a library directory and an ...,<p>I am using pip and trying to install a pyth...,python|shared-libraries|pip|include-path|pyodbc,<p>Another way to indicate the location of inc...,19
3,1066933,How to extract top-level domain name (TLD) fro...,<p>how would you extract the domain name from ...,python|url|parsing|dns|extract,"<p>No, there is no ""intrinsic"" way of knowing ...",52
4,3817529,syntax for creating a dictionary into another ...,<blockquote>\n <p><strong>Possible Duplicate:...,python,<p>You can declare a dictionary inside a dicti...,106


In [None]:
# Pull the question ids and titles out of the dataframe as two parallel lists
ids = df["id"].tolist()
questions = df["title"].tolist()

# Sanity-check how many questions were extracted
len(ids)

#### Instantiate the text encoding model

Use the [sentence-t5 encoder](https://tfhub.dev/google/sentence-t5/st5-base/1) developed by Google for converting text to embeddings.

> The sentence-T5 family of models encode text into high-dimensional vectors that can be used for text classification, semantic similarity, clustering and other natural language processing tasks.
>
> Our model is built on top of T5 (i.e. the Text-To-Text Transfer Transformer). It is trained on a variety of data sources and initialized from pre-trained T5 models with different model sizes. The input is variable-length English text and the output is a 768-dimensional vector. The sentence-T5 base model employs a 12-layer transformer architecture as the T5 base model does.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# Registers the ops.
import tensorflow_text as text  # noqa: F401

# TF Hub handle of the sentence-T5 base encoder
hub_url = "https://tfhub.dev/google/sentence-t5/st5-base/1"

# Wrap the hub model as a Keras layer; it is called on batches of text later on
encoder = hub.KerasLayer(hub_url)

#### Defining an encoding function

Define a function to be used later that will take sentences and convert them to embeddings.

In [None]:
from typing import List

import numpy as np
from tqdm.auto import tqdm


def encode_text_to_embedding(
    text_encoder: hub.KerasLayer, sentences: List[str], batch_size: int = 100
) -> np.ndarray:
    """Encode sentences into embeddings, one batch at a time.

    The sentences are split into consecutive slices of at most ``batch_size``
    items; each slice is wrapped in ``tf.constant`` and passed to
    ``text_encoder``. The per-batch outputs are combined with
    ``np.column_stack`` and every length-1 axis is then removed with
    ``np.squeeze``.

    Args:
        text_encoder: Callable encoder layer applied to each batch of text.
        sentences: Texts to encode.
        batch_size: Maximum number of sentences sent to the encoder per call.

    Returns:
        The squeezed array of embeddings. Because of the squeeze, a result
        with a single row loses that axis.

    NOTE(review): stacking with ``np.column_stack`` presumably relies on the
    encoder returning a one-element list holding a (batch, dim) tensor —
    confirm against the model's output signature.
    """
    # Encoding slice by slice keeps memory bounded for large inputs
    batch_starts = range(0, len(sentences), batch_size)
    encoded_batches = [
        text_encoder(tf.constant(sentences[start : start + batch_size]))
        for start in tqdm(batch_starts)
    ]

    return np.squeeze(np.column_stack(encoded_batches))

#### Test the encoding function

Encode a subset of data and see if the embeddings and distance metrics make sense.

According to the [sentence-T5 research paper](https://arxiv.org/pdf/2108.08877.pdf), the similarity of embeddings is calculated using the dot-product. 

In [None]:
# Encode a sample of questions to sanity-check the embeddings.
# Slice instead of rebinding `questions`: the full list has to stay aligned with
# `ids`, because the cell that writes the embeddings file iterates over
# `questions` and pairs each chunk with the matching slice of `ids`.
TEST_SAMPLE_SIZE = 500
question_embeddings = encode_text_to_embedding(
    text_encoder=encoder, sentences=questions[:TEST_SAMPLE_SIZE]
)

Save the dimension size for later usage when creating the index.

In [21]:
# Embedding width: the number of values in each row of the embedding matrix
DIMENSIONS = question_embeddings.shape[1]

DIMENSIONS

768

In [27]:
question_index = 0

print(f"Query question = {questions[question_index]}")
# Dot-product similarity of the query embedding against every encoded question
query_vector = question_embeddings[question_index]
scores = np.dot(query_vector, question_embeddings.T)

# Rank (question, score) pairs from most to least similar and show the top 20
ranked_matches = sorted(
    zip(questions, scores), key=lambda pair: pair[1], reverse=True
)
for index, (question, score) in enumerate(ranked_matches[:20]):
    print(f"\t{index}: {question}: {score}")

Query question = Numbers of Day in Month
	0: Numbers of Day in Month: 0.9999998807907104
	1: Python: Number of the Week in a Month: 0.8649452924728394
	2: Simulate Poisson arrival times given count of arrivals per day: 0.8294509053230286
	3: How to workout if a datetime is older than x months in Python: 0.808755099773407
	4: How to efficiently add seconds place to date time: 0.7867542505264282
	5: Show if restaurant is open or closed based on weekday, opening and closing time: 0.7858442068099976
	6: How to subtract datetimes based on transition events in another column: 0.7808101177215576
	7: Doing DateTime Comparisons in Filter SQLAlchemy: 0.7684456706047058
	8: Python datetime module and getting current time: 0.7682346701622009
	9: Multiples of 10 in a list: 0.7647985816001892
	10: Counting differences between two strings: 0.7633612155914307
	11: Function That Computes Sum of Squares of Numbers in List: 0.7624084949493408
	12: Python/Matplotlib - Colorbar Range and Display Values: 0.

#### Save the train split in JSONL format.

The data must be formatted in JSONL format, which means each embedding dictionary is written as a JSON string on its own line.

See more information in the docs at [Input data format and structure](https://cloud.google.com/vertex-ai/docs/matching-engine/match-eng-setup#input-data-format).

In [None]:
import tempfile

# Create temporary file to write embeddings to.
# Only the path is needed: the file is reopened by name when the embeddings are
# written, so close this handle right away instead of leaving it open for the
# rest of the session. `delete=False` keeps the file on disk after closing.
embeddings_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
embeddings_file.close()

embeddings_file.name

In [None]:
import json

BATCH_SIZE = 100

# Open in write mode so that re-running this cell rewrites the file instead of
# appending a second copy of every record.
with open(embeddings_file.name, "w") as f:
    for i in tqdm(range(0, len(questions), BATCH_SIZE)):
        id_chunk = ids[i : i + BATCH_SIZE]

        # encode_text_to_embedding squeezes its result, so a chunk holding a
        # single question comes back 1-D. Restore the (rows, dimensions) layout
        # so the zip below always pairs one id with one embedding vector.
        question_chunk_embeddings = np.atleast_2d(
            encode_text_to_embedding(
                text_encoder=encoder, sentences=questions[i : i + BATCH_SIZE]
            )
        )

        # One JSON object per line: {"id": ..., "embedding": [...]}
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(question_id),
                    "embedding": [str(value) for value in embedding],
                }
            )
            + "\n"
            for question_id, embedding in zip(id_chunk, question_chunk_embeddings)
        ]
        f.writelines(embeddings_formatted)

Upload the training data to a Google Cloud Storage bucket.

In [None]:
# Destination folder inside the bucket; the same URI is later passed to the index
# creation call as `contents_delta_uri`.
UNIQUE_FOLDER_NAME = "embeddings_folder_unique"
remote_folder = f"{BUCKET_URI}/{UNIQUE_FOLDER_NAME}/"
# Copy the local embeddings file into that folder
! gsutil cp {embeddings_file.name} {remote_folder}

## Create Indexes


### Create ANN Index (for Production Usage)

In [39]:
DISPLAY_NAME = "stack_overflow"
DESCRIPTION = "questions from stackoverflow"

Create the ANN index configuration:

To learn more about configuring the index, see [Input data format and structure](https://cloud.google.com/vertex-ai/docs/matching-engine/match-eng-setup#input-data-format).


In [40]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [None]:
# Build a tree-AH index from the embeddings file(s) uploaded to `remote_folder`.
tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=remote_folder,
    # Width of the embedding vectors, measured on the encoded sample earlier
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    # Same similarity (dot product) that the sanity-check cell used with np.dot
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=80,
    description=DESCRIPTION,
)

In [None]:
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME

Using the resource name, you can retrieve an existing MatchingEngineIndex.

In [None]:
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

## Create an IndexEndpoint with VPC Network

In [None]:
# Retrieve the project number
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]

VPC_NETWORK = "[your-network-name]"
VPC_NETWORK_FULL = "projects/{}/global/networks/{}".format(PROJECT_NUMBER, VPC_NETWORK)
VPC_NETWORK_FULL

In [None]:
# Create the index endpoint on the VPC network named by VPC_NETWORK_FULL.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    # Use the descriptive text, as the index does, instead of repeating the display name
    description=DESCRIPTION,
    network=VPC_NETWORK_FULL,
)

## Deploy Indexes

### Deploy ANN Index

In [None]:
DEPLOYED_INDEX_ID = "deployed_index_id_unique"

In [None]:
# Deploy the index to the endpoint under DEPLOYED_INDEX_ID; the value returned by
# deploy_index replaces the previous endpoint reference.
my_index_endpoint = my_index_endpoint.deploy_index(
    index=tree_ah_index, deployed_index_id=DEPLOYED_INDEX_ID
)

# Show what is currently deployed on the endpoint
my_index_endpoint.deployed_indexes

## Create Online Queries

After you build and deploy your index, you can query the deployed index to find nearest neighbors.

Note: For the DOT_PRODUCT_DISTANCE distance type, the "distance" property returned with each MatchNeighbor actually refers to the similarity.

In [None]:
test_embeddings = encode_text_to_embedding(
    text_encoder=encoder, sentences=["How do I install tensorflow with GPU support?"]
)

In [None]:
# Test query
NUM_NEIGHBOURS = 20

# A single query embedding is wrapped in a list for the `queries` argument.
# NOTE(review): assumes test_embeddings is a 1-D vector (one sentence, squeezed by
# encode_text_to_embedding) — confirm before passing several sentences at once.
response = my_index_endpoint.match(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=[test_embeddings.tolist()],
    num_neighbors=NUM_NEIGHBOURS,
)

response

Print titles to verify neighbors make sense

In [None]:
# Matches for the first (and only) query
neighbors = response[0]
neighbor_ids = [match.id for match in neighbors]
neighbor_distances = [match.distance for match in neighbors]

# Compare ids as strings; compute the string view of the id column only once
question_ids_as_str = df.id.astype(str)

for match_index, neighbor in enumerate(neighbors):
    titles = df[question_ids_as_str == neighbor.id].title.tolist()

    # Skip neighbors whose id is not present in the dataframe
    if titles:
        print(
            f"{match_index}: title = '{titles[0]}', distance = {neighbor.distance:0.2f}"
        )

## Storing and retrieving titles from a Redis data store
When you productionize this code into a service, you will need to convert the nearest-neighbor IDs returned from Vertex AI Matching Engine into data usable by downstream services.

In this case, you'll need to convert the IDs to titles.

You can use Google Cloud's Memorystore to deploy a managed Redis instance to save the id-title key-value pairs.

See more information on [Memorystore](https://cloud.google.com/memorystore/docs/redis/create-manage-instances?hl=en)

In [None]:
REDIS_INSTANCE_NAME = "stackoverflow-questions-unique"

# Create a Redis instance
! gcloud redis instances create '{REDIS_INSTANCE_NAME}' --size=5 --region={REGION} --network={VPC_NETWORK_FULL} --connect-mode=private-service-access

In [None]:
# Get host and port info
# Each command lists the instances in REGION filtered by instance name and prints
# only the requested field.
REDIS_HOST = ! gcloud redis instances list --filter="INSTANCE_NAME:'{REDIS_INSTANCE_NAME}'" --region {REGION}  --format='value(HOST)'
REDIS_PORT = ! gcloud redis instances list --filter="INSTANCE_NAME:'{REDIS_INSTANCE_NAME}'" --region {REGION} --format='value(PORT)'

# The shell capture yields a list of output lines; keep the first line only.
# NOTE(review): an empty result (no matching instance) would make [0] raise — confirm
# the instance exists before running this cell.
if isinstance(REDIS_HOST, list):
    REDIS_HOST = REDIS_HOST[0]

if isinstance(REDIS_PORT, list):
    REDIS_PORT = REDIS_PORT[0]

print(f"REDIS_HOST = {REDIS_HOST}")
print(f"REDIS_PORT = {REDIS_PORT}")

In [None]:
# Connect to the instance
import redis

redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT)

In [None]:
# Build the id -> title mapping (both as strings) and store it in Redis in one call
id_to_title = {
    str(question_id): str(title) for question_id, title in zip(df.id, df.title)
}
redis_client.mset(id_to_title)

In [None]:
# Spot-check the first ten rows: show each title next to what Redis returns for its id
first_rows = df.head(10)
[
    f"Actual = {title}, Retrieved = {redis_client.get(str(question_id))}"
    for question_id, title in zip(first_rows.id, first_rows.title)
]

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.
You can also manually delete resources that you created by running the following code.

In [None]:
# Force undeployment of indexes and delete endpoint
my_index_endpoint.delete(force=True)

In [None]:
# Delete indexes
tree_ah_index.delete()

In [None]:
# Delete redis instance
! gcloud redis instances delete '{REDIS_INSTANCE_NAME}' --region {REGION} --quiet