Semantic search is a type of search that uses the meaning of words and phrases to find relevant results.

In this tutorial, we demonstrate how to do semantic search over Stack Overflow questions (taken from a public sample dataset in Google Cloud BigQuery): we generate text embeddings for the questions and use Google ScaNN (Efficient Vector Similarity Search) to retrieve the most semantically relevant ones. The embedding of a search query is compared against the embeddings of the questions, which live in the same vector space, to perform a vector similarity search.

In [None]:
#Install Vertex AI Python SDK
!pip3 install google-cloud-aiplatform>=1.25 "shapely<2.0.0"

In [None]:
# Initialize the Vertex AI SDK
# Google Cloud project ID and region handed to vertexai.init() below.
# Replace PROJECT_ID with your own project before running.
PROJECT_ID = "acn-lkmaigcp"
LOCATION = "us-central1"

import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Import Text Embedding Model
from vertexai.preview.language_models import TextEmbeddingModel

# Load the pretrained "textembedding-gecko@001" model. `model` is the global
# that get_embedding() and search() call to turn text into embedding vectors.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [None]:
# Install the ScaNN package (provides the `scann` module imported in the next cell)
!pip3 install scann

In [None]:
# Import other required packages
import json
import time

import numpy as np
import pandas as pd
import scann

from typing import Union
from google.cloud import aiplatform
from google.cloud import bigquery

Getting Stack Overflow data from BigQuery

In [None]:
def run_bq_query(sql: str) -> pd.DataFrame:
    """
    Run a BigQuery query and return its result as a DataFrame.

    The query is submitted twice: first as a dry run, so that problems with
    the query are reported before any real job is started, and then for real.

    Args:
        sql: SQL query, as a string, to execute in BigQuery
    Returns:
        df: DataFrame of results from query
    Raises:
        Nothing is caught here: any exception raised by the BigQuery client
        (during the dry run, the real run, or the result download) propagates
        to the caller.
    """

    bq_client = bigquery.Client()

    # Try dry run before executing query to catch any errors.
    # The query cache is bypassed for this validation pass.
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    bq_client.query(sql, job_config=dry_run_config)

    # If dry run succeeds without errors, proceed to run query
    # with a default job configuration.
    query_job = bq_client.query(sql, job_config=bigquery.QueryJobConfig())
    job_id = query_job.job_id

    # Wait for query/job to finish running, then get & return data frame
    # (result rows are converted via Arrow to pandas).
    df = query_job.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")

    return df

In [None]:
# Run the query and keep the result in `records`.
# Each row pairs a question with its accepted answer:
#   input_text  - question title immediately followed by the question body
#                 (CONCAT inserts no separator between them)
#   output_text - body of the accepted answer
# Only questions whose tags contain "python" and whose accepted answer was
# created on or after 2020-01-01 are kept, capped at 100 rows.
records = run_bq_query(
    """SELECT
    CONCAT(q.title, q.body) as input_text,
    a.body AS output_text
FROM
    `bigquery-public-data.stackoverflow.posts_questions` q
JOIN
    `bigquery-public-data.stackoverflow.posts_answers` a
ON
    q.accepted_answer_id = a.id
WHERE
    q.accepted_answer_id IS NOT NULL AND
    REGEXP_CONTAINS(q.tags, "python") AND
    a.creation_date >= "2020-01-01"
LIMIT
    100
"""
)
records.head()

In [None]:
# Peek at the data.
# `df` is the working DataFrame for the rest of the notebook: the embedding
# column is added to it and search() reads input_text from it.
df = pd.DataFrame(records)
# Display up to the first 50 rows.
df.head(50)

In [None]:
# Get Embeddings
def get_embedding(text):
    """
    Return the embedding vector for one piece of text.

    Calls the global text-embedding `model` with a single-element batch and
    returns the values of the first (only) embedding. The number of calls is
    tracked in the function attribute `get_embedding.counter`, and every
    100th call sleeps for 3 seconds before the request (presumably to stay
    under the API request quota).

    Args:
        text: The text to embed.
    Returns:
        The embedding values, or an empty list if the request failed. Failures
        are reported on stdout rather than raised, so one bad row does not
        abort a long-running batch.
    """
    get_embedding.counter += 1
    if get_embedding.counter % 100 == 0:
        time.sleep(3)
    try:
        return model.get_embeddings([text])[0].values
    except Exception as err:
        # Best-effort: keep going, but make the failure visible. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the cell.
        print(f"Embedding request #{get_embedding.counter} failed: {err!r}")
        return []


# Call counter used for the periodic pause inside get_embedding().
get_embedding.counter = 0

# This may take several minutes to complete.
df["embedding"] = df["input_text"].apply(get_embedding)

Create an index

In [None]:
record_count = len(records)

# Stack the per-row embedding lists into one (record_count, dim) float matrix.
# The embedding width is taken from the data instead of being hard-coded.
# get_embedding() returns [] for a failed request, so detect those rows up
# front and fail with a clear message rather than an opaque NumPy shape error.
embeddings = df["embedding"].tolist()
missing = [pos for pos, vec in enumerate(embeddings) if len(vec) == 0]
if missing:
    raise ValueError(
        f"{len(missing)} row(s) have no embedding (first positions: "
        f"{missing[:10]}); re-run the embedding step before building the index."
    )
dataset = np.asarray(embeddings, dtype=np.float64)

# Scale every row to unit length so that the dot product used by the index
# equals cosine similarity.
normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]
# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README

# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
# NOTE(review): one tree leaf per record, with every leaf searched, presumably
# because the sample is tiny; revisit these values for larger datasets.
searcher = (
    scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
    .tree(
        num_leaves=record_count,
        num_leaves_to_search=record_count,
        training_sample_size=record_count,
    )
    .score_ah(2, anisotropic_quantization_threshold=0.2)
    .reorder(100)
    .build()
)

Rebuild the index (same steps as the previous cell)

In [None]:
# NOTE(review): this cell repeats the index construction from the previous
# cell; running it rebinds record_count, dataset, normalized_dataset and
# searcher. Confirm whether it is still needed.
record_count = len(records)
# One row per record; 768 is the hard-coded embedding width expected here.
dataset = np.empty((record_count, 768))
for i in range(record_count):
    dataset[i] = df.embedding[i]

# Scale every row to unit length so that the dot product used by the index
# equals cosine similarity.
normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]
# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README

# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
searcher = (
    scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
    .tree(
        num_leaves=record_count,
        num_leaves_to_search=record_count,
        training_sample_size=record_count,
    )
    .score_ah(2, anisotropic_quantization_threshold=0.2)
    .reorder(100)
    .build()
)

Query the index

In [None]:
def search(query, final_num_neighbors=3):
    """
    Embed a text query, look up its nearest neighbours in the ScaNN index
    and print them together with the lookup latency.

    Uses the notebook globals `model` (text embedding model), `searcher`
    (ScaNN index) and `df` (source rows, read via `df.input_text`).

    Args:
        query: Text to search for.
        final_num_neighbors: How many results to retrieve and print.
            Defaults to 3.
    Returns:
        None. One line per hit (index id, score, first 125 characters of the
        matching input_text) and the elapsed time are printed.
    """
    # The timed span covers both the embedding request and the index lookup.
    # perf_counter() is a monotonic clock suited to measuring short intervals.
    start = time.perf_counter()
    query_embedding = model.get_embeddings([query])[0].values
    neighbors, distances = searcher.search(
        query_embedding, final_num_neighbors=final_num_neighbors
    )
    end = time.perf_counter()

    for doc_id, dist in zip(neighbors, distances):
        print(f"[docid:{doc_id}] [{dist}] -- {df.input_text[int(doc_id)][:125]}...")
    print("Latency (ms):", 1000 * (end - start))

In [None]:
# Example query: prints the closest indexed questions and the lookup latency.
search("How can I convert videos")

In [None]:
# Second example query, phrased as a request rather than a question.
search("tell me about Pandas")