In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Custom Embeddings with Vertex AI Search

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/search/custom-embeddings/custom_embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/search/custom-embeddings/custom_embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/search/custom-embeddings/custom_embeddings.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>

---

* Author: Holt Skinner

---

This notebook demonstrates how to:

  - Get text embeddings using [`textembedding-gecko` in Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)
  - Convert embeddings into the [format expected by Vertex AI Search](https://cloud.google.com/generative-ai-app-builder/docs/prepare-data#unstructured)
  - [Create a search app with custom embeddings](https://cloud.google.com/generative-ai-app-builder/docs/bring-embeddings)



## Getting started

### Install libraries

In [None]:
# Install/upgrade the client libraries for Vertex AI, Discovery Engine, Cloud Storage
# and BigQuery (with the pandas extra), plus ipywidgets and retrying.
# NOTE(review): requests, bs4 and tqdm are imported later but are not installed here —
# presumably they are preinstalled on Colab/Workbench; confirm.
%pip install -q --upgrade --user google-cloud-aiplatform google-cloud-discoveryengine google-cloud-storage google-cloud-bigquery[pandas] ipywidgets retrying


---
#### ⚠️ Do not forget to click the "RESTART RUNTIME" button above.
---

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, you will need to authenticate your environment. To do this, run the cell below. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
import sys

# Only run the Colab login flow when the `google.colab` module is already
# loaded in this kernel; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()


### Import libraries

In [None]:
import itertools
from typing import List, Optional
from urllib.parse import urlparse

import numpy as np
import numpy.linalg
import pandas as pd
import requests
import vertexai
from bs4 import BeautifulSoup, Tag
from retrying import retry

# NOTE: this rebinds `retry`, shadowing the `retrying` import above.
# The original order is kept on purpose so the final binding is unchanged.
from google.api_core import retry
from tqdm.auto import tqdm
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput

# Enable tqdm's pandas integration (makes `progress_apply` available on pandas
# objects); none of the cells shown here call it.
tqdm.pandas()

## Configure notebook environment

### Set the following constants to reflect your environment

In [None]:
# Define project information for Vertex AI.
# Replace the placeholder with your own Google Cloud project ID before running;
# every client created later in the notebook uses this value.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=LOCATION)


## Creating embeddings with Vertex AI

### Data Preparation

We will be using [the Stack Overflow public dataset](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow) hosted on BigQuery table `bigquery-public-data.stackoverflow.posts_questions`.

This is a very large dataset (about 23 million rows) that does not fit into memory, so we limit it to 1000 rows for this tutorial.

- Fetch the data from BigQuery
- Get the HTML from the StackOverflow Question page
   - Upload it to GCS as the Document Store/for displayed search results
- Concat the Title and Body, and create embeddings from the text.
- Save the rest of the fields as Metadata
- Create a JSONL file and upload to Cloud Storage
- Import JSONL file as Unstructured with Metadata

In [None]:
# load the BQ Table into a Pandas Dataframe
import pandas as pd
from google.cloud import bigquery

# Number of questions to fetch; interpolated into the LIMIT clause of the query below
QUESTIONS_SIZE = 1000

bq_client = bigquery.Client(project=PROJECT_ID)

# Distinct questions with a positive score, ordered by view count (highest first)
query = f"""
SELECT
  DISTINCT 
  q.id,
  q.title,
  q.body,
  q.answer_count,
  q.comment_count,
  q.creation_date,
  q.favorite_count,
  q.last_activity_date,
  q.score,
  q.tags,
  q.view_count
FROM
  `bigquery-public-data.stackoverflow.posts_questions` AS q
WHERE
  q.score > 0
ORDER BY
  q.view_count DESC
LIMIT
  {QUESTIONS_SIZE};
"""

# Submit the query, fetch its result rows and materialize them as a DataFrame
query_job = bq_client.query(query)
rows = query_job.result()
df = rows.to_dataframe()

# Convert ID to String (it is later used as the document ID and in the question URL)
df["id"] = df["id"].apply(str)

# examine the data
df.head()

### Call the API to generate embeddings

With the Stack Overflow dataset, we will concatenate the `title` and `body` columns of each question and generate an embedding for the combined text with the Embeddings for Text API. The API is available under the [`vertexai`](https://cloud.google.com/python/docs/reference/aiplatform/latest/vertexai) package of the SDK.

You may see some warning messages from the TensorFlow library but you can ignore them.

From the package, import [`TextEmbeddingModel`](https://cloud.google.com/python/docs/reference/aiplatform/latest/vertexai.language_models.TextEmbeddingModel) and get a model.

In [None]:
# Load the text embeddings model (pinned to version 003 of textembedding-gecko);
# `model` is the global used by the embedding helper defined below.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")


In [None]:
import time
from tqdm import tqdm  # to show a progress bar

# Get embeddings for a list of texts
BATCH_SIZE = 5


def get_embeddings_wrapper(
    texts,
    batch_size: int = BATCH_SIZE,
    task_type: str = "RETRIEVAL_DOCUMENT",
    delay_seconds: float = 1.0,
) -> List:
    """Embed `texts` in batches using the module-level `model`.

    Args:
        texts: Any iterable of strings (list, pandas Series, generator, ...).
        batch_size: Number of texts sent to the model per request. Must be >= 1.
        task_type: Task type passed to each `TextEmbeddingInput`. The default
            produces embeddings optimized for document retrieval.
        delay_seconds: Pause before every request, to stay under the API quota.

    Returns:
        A list with one embedding (the `values` of each model result) per input
        text, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make `range` empty and silently return no embeddings.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialize once so every kind of iterable is sliced by position below.
    texts = list(texts)

    embs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        time.sleep(delay_seconds)  # to avoid the quota error

        result = model.get_embeddings(
            [
                TextEmbeddingInput(text=text, task_type=task_type)
                for text in texts[i : i + batch_size]
            ]
        )
        embs.extend([e.values for e in result])
    return embs

Get embeddings for the question titles/body and add them as the `"embedding"` column.

In [None]:
# Title and body are joined with a newline so a single embedding covers the
# whole question; the vectors are then attached as the "embedding" column.
df["title_body"] = df["title"] + "\n" + df["body"]

df = df.assign(
    embedding=get_embeddings_wrapper(df["title_body"]),
)
df.head()

## Scrape HTML from Question Pages

- Grab HTML to upload to Cloud Storage
- This will be used for the search results links

In [None]:
QUESTION_BASE_URL = "https://stackoverflow.com/questions/"
JSONL_MIME_TYPE = "application/jsonl"
HTML_MIME_TYPE = "text/html"

# Cloud Storage destination for the scraped pages; set BUCKET_NAME to a bucket
# you can write to.
BUCKET_NAME = "ucs-demo"
DIRECTORY = "embeddings-stackoverflow"
BLOB_PREFIX = f"{DIRECTORY}/html/"

GCS_URI_PREFIX = f"gs://{BUCKET_NAME}/{BLOB_PREFIX}"

from google.cloud import storage

# Use the configured project explicitly, consistent with the BigQuery client,
# instead of relying on whatever default project the environment provides.
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)


def scrape_question(id_: str, timeout: float = 30.0) -> Optional[str]:
    """Download a Stack Overflow question page and store its HTML in Cloud Storage.

    Args:
        id_: Question ID, appended to `QUESTION_BASE_URL`.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The `gs://` URI of the uploaded HTML object, or `None` when the page
        could not be fetched (network error, non-200 status or empty body).
    """
    question_url = f"{QUESTION_BASE_URL}{id_}"

    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(question_url, timeout=timeout)
    except requests.RequestException as error:
        # Treat network failures like HTTP failures: report and skip this row.
        print(f"ID: {id_} Error: {error}")
        return None

    if response.status_code != 200 or not response.content:
        print(f"ID: {id_} Code: {response.status_code}")
        return None

    print(f"scraping {question_url}")

    # Name the object after the last path segment of the final (post-redirect)
    # URL. Parsing the path keeps query strings and trailing slashes out of the
    # object name; fall back to the ID if the path has no usable segment.
    slug = urlparse(response.url).path.rstrip("/").split("/")[-1] or str(id_)
    link_title = slug + ".html"
    gcs_uri = f"{GCS_URI_PREFIX}{link_title}"

    # Upload HTML to Google Cloud Storage
    blob = bucket.blob(f"{BLOB_PREFIX}{link_title}")
    blob.upload_from_string(response.content, content_type=HTML_MIME_TYPE)
    time.sleep(1)  # throttle successive requests
    return gcs_uri

In [None]:
# Scrape only the first 100 questions. The result is assigned by index, so rows
# past the first 100 get a missing value in "uri", as do questions whose scrape
# returned None.
df["uri"] = df["id"].head(100).apply(scrape_question)


Restructure the embeddings data to follow Vertex AI Search format (Unstructured with Metadata)

In [None]:
def format_row(row):
    """Build one Vertex AI Search document record from a question row.

    `row` is any mapping-like object exposing the keys "id", "uri",
    "embedding", "title", "body" and "answer_count". The returned dict holds
    the document ID, a content reference to the stored HTML page, and the
    embedding plus selected fields under "structData".
    """
    content = {"mimeType": HTML_MIME_TYPE, "uri": row["uri"]}

    # The embedding goes first, followed by the metadata fields copied as-is.
    struct_data = {"embedding_vector": row["embedding"]}
    for field in ("title", "body", "answer_count"):
        struct_data[field] = row[field]

    return {"id": row["id"], "content": content, "structData": struct_data}


jsonl_filename = "vais_embeddings.jsonl"

# Only the first 100 rows were scraped. Of those, skip any row whose scrape
# failed (missing "uri"): a document without a content URI cannot be imported.
documents = df.head(100).dropna(subset=["uri"]).apply(format_row, axis=1)

# force_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of the platform default.
with open(jsonl_filename, "w", encoding="utf-8") as f:
    f.write(
        documents.to_json(orient="records", lines=True, force_ascii=False)
        # to_json emits "/" as "\/"; undo that so URIs stay readable.
        # ("\\/" is the two characters backslash + slash; the former "\/"
        # literal was an invalid escape sequence.)
        .replace("\\/", "/")
    )

Upload the JSONL file to Google Cloud Storage

In [None]:
# Destination object for the JSONL file inside the bucket directory.
# `gsutil mv` is a move, so the local file is gone afterwards; re-run the cell
# that writes the file before re-running this one.
embeddings_file = f"gs://{BUCKET_NAME}/{DIRECTORY}/{jsonl_filename}"
!gsutil mv {jsonl_filename} {embeddings_file}

In [None]:
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine_v1alpha as discoveryengine

DATA_STORE_LOCATION = "global"

# The "global" location needs no client options; any other location gets an
# explicit `<location>-discoveryengine.googleapis.com` API endpoint.
if DATA_STORE_LOCATION == "global":
    client_options = None
else:
    client_options = ClientOptions(
        api_endpoint=f"{DATA_STORE_LOCATION}-discoveryengine.googleapis.com"
    )

In [None]:
from google.api_core.exceptions import GoogleAPICallError

def create_data_store(
    project_id: str, location: str, data_store_name: str, data_store_id: str
):
    """Create a generic search data store in the project's default collection.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location, e.g. "global".
        data_store_name: Display name of the data store.
        data_store_id: ID to assign to the new data store.

    A failure reported by the long-running operation does not raise; it is
    printed so the notebook can continue.
    """
    # Create a client
    client = discoveryengine.DataStoreServiceClient(client_options=client_options)

    # Initialize request argument(s)
    data_store = discoveryengine.DataStore(
        display_name=data_store_name,
        industry_vertical="GENERIC",
        content_config="CONTENT_REQUIRED",
        solution_types=["SOLUTION_TYPE_SEARCH"],
    )

    request = discoveryengine.CreateDataStoreRequest(
        parent=discoveryengine.DataStoreServiceClient.collection_path(
            project_id, location, "default_collection"
        ),
        data_store=data_store,
        data_store_id=data_store_id,
    )
    operation = client.create_data_store(request=request)

    try:
        operation.result()
    except GoogleAPICallError as error:
        # Still best-effort (presumably to tolerate re-runs against an existing
        # data store), but report the failure instead of hiding it.
        print(f"Data store '{data_store_id}' was not created: {error}")

def import_documents(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str,
) -> str:
    """Start importing documents from Cloud Storage into a data store.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location, e.g. "global".
        data_store_id: ID of the target data store.
        gcs_uri: `gs://` URI of the file to import.

    Returns:
        The name of the long-running import operation. The import is only
        started here, not awaited, so the name is what lets a caller check
        its progress later.
    """
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        gcs_source=discoveryengine.GcsSource(
            input_uris=[gcs_uri]
        ),
        # Options: `FULL`, `INCREMENTAL`
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.FULL,
    )

    # Make the request
    operation = client.import_documents(request=request)

    # Hand the operation name back instead of dropping the handle.
    return operation.operation.name


def update_schema(project_id: str, location: str, data_store_id: str):
    """Declare the custom embedding field in the data store's default schema.

    The schema gets one property, "embedding_vector": an array of numbers with
    dimension 768, mapped to the "embedding_vector" key property. The call
    blocks until the update operation finishes and prints its response.
    """
    client = discoveryengine.SchemaServiceClient(client_options=client_options)

    schema_name = client.schema_path(
        project_id, location, data_store_id, "default_schema"
    )

    # Definition of the field that carries the embedding of each document.
    embedding_property = {
        "type": "array",
        "keyPropertyMapping": "embedding_vector",
        "dimension": 768,
        "items": {"type": "number"},
    }
    struct_schema = {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "type": "object",
        "properties": {"embedding_vector": embedding_property},
    }

    schema = discoveryengine.Schema(name=schema_name, struct_schema=struct_schema)
    request = discoveryengine.UpdateSchemaRequest(schema=schema)
    operation = client.update_schema(request=request)

    print("Waiting for operation to complete...")

    # Block until the schema update is done, then show what came back.
    response = operation.result()
    print(response)


def create_engine(
    project_id: str, location: str, data_store_name: str, data_store_id: str
):
    """Create an enterprise-tier search engine (with the LLM add-on) over a data store.

    `data_store_name` is used as the engine's display name and also as its
    engine ID. The call waits up to 90 seconds for the creation operation.
    """
    client = discoveryengine.EngineServiceClient(client_options=client_options)

    engine = discoveryengine.Engine(
        display_name=data_store_name,
        solution_type="SOLUTION_TYPE_SEARCH",
        industry_vertical="GENERIC",
        data_store_ids=[data_store_id],
        search_engine_config=discoveryengine.Engine.SearchEngineConfig(
            search_tier="SEARCH_TIER_ENTERPRISE",
            search_add_ons=["SEARCH_ADD_ON_LLM"],
        ),
    )

    # Engines are created under the project's default collection.
    collection = discoveryengine.DataStoreServiceClient.collection_path(
        project_id, location, "default_collection"
    )
    request = discoveryengine.CreateEngineRequest(
        parent=collection,
        engine=engine,
        engine_id=engine.display_name,
    )

    # Send the request and wait (bounded) for the engine to be created.
    client.create_engine(request=request).result(timeout=90)

In [None]:
# Display name and ID of the data store; create_engine also reuses the display
# name as the engine ID.
DATA_STORE_NAME = "stackoverflow-embeddings2"
DATA_STORE_ID = f"{DATA_STORE_NAME}-id"

# Create the data store, declare the embedding field in its schema, then start
# importing the JSONL file uploaded earlier.
create_data_store(PROJECT_ID, DATA_STORE_LOCATION, DATA_STORE_NAME, DATA_STORE_ID)
update_schema(PROJECT_ID, DATA_STORE_LOCATION, DATA_STORE_ID)
import_documents(PROJECT_ID, DATA_STORE_LOCATION, DATA_STORE_ID, embeddings_file)

# NOTE: import_documents does not wait for the import to finish, so the engine
# is created while documents may still be importing.
create_engine(PROJECT_ID, DATA_STORE_LOCATION, DATA_STORE_NAME, DATA_STORE_ID)

# TODO: add a step that specifies the embedding globally for the data store
# https://cloud.google.com/generative-ai-app-builder/docs/bring-embeddings#global