In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Vertex AI Search with Custom Embeddings

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/search/custom-embeddings/custom_embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/search/custom-embeddings/custom_embeddings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/search/custom-embeddings/custom_embeddings.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>

---

* Author: Holt Skinner

---

This notebook demonstrates how to:

  - Get text embeddings using [`textembedding-gecko` in Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)
  - Convert embeddings into the [format expected by Vertex AI Search](https://cloud.google.com/generative-ai-app-builder/docs/prepare-data#unstructured)
  - [Create a search app with custom embeddings](https://cloud.google.com/generative-ai-app-builder/docs/bring-embeddings)



## Getting started

### Install libraries

In [5]:
# Install/upgrade the Google Cloud client libraries into the user site-packages
# (`--user`); `-q` keeps pip's output short. Restart the kernel afterwards so the
# upgraded packages are picked up.
%pip install -q --upgrade --user google-cloud-aiplatform google-cloud-discoveryengine google-cloud-storage 'google-cloud-bigquery[pandas]'



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


---
#### ⚠️ Do not forget to click the "RESTART RUNTIME" button above.
---

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, you will need to authenticate your environment. To do this, run the cell below. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
import sys

# Run the login flow only when the `google.colab` package has been loaded,
# i.e. the notebook is running on Colab; elsewhere this cell is a no-op.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    # (imported here because the package is only importable on Colab)
    from google.colab import auth

    auth.authenticate_user()


### Import libraries

In [1]:
import requests
import itertools
import numpy as np
import pandas as pd
import numpy.linalg
import vertexai

from google.api_core import retry
from vertexai.language_models import TextEmbeddingModel
from tqdm.auto import tqdm
from bs4 import BeautifulSoup, Tag

tqdm.pandas()


  from .autonotebook import tqdm as notebook_tqdm


## Configure notebook environment

### Set the following constants to reflect your environment

In [2]:
# Define project information for Vertex AI.
# Replace the placeholder below with your own Google Cloud project ID. The same
# PROJECT_ID is reused by later cells (e.g. for the BigQuery client), so it must
# be set exactly once, here.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=LOCATION)


## Creating embeddings with Vertex AI

### Data Preparation

We will be using [the Stack Overflow public dataset](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow) hosted on BigQuery table `bigquery-public-data.stackoverflow.posts_questions`.

This is a very large dataset (23 million rows) that doesn't fit into memory, so we are going to limit it to 1000 rows for this tutorial.

- Fetch the data from BigQuery
- Get the HTML from the StackOverflow Question page
   - Upload it to GCS as the Document Store/for displayed search results
- Concat the Title and Body, and create embeddings from the text.
- Save the rest of the fields as Metadata
- Create a JSONL file and upload to Cloud Storage
- Import JSONL file as Unstructured with Metadata

In [3]:
# Load rows of the Stack Overflow questions table from BigQuery into a pandas
# DataFrame named `df`.
import pandas as pd
from google.cloud import bigquery

# Maximum number of questions to fetch; interpolated into the LIMIT clause below.
QUESTIONS_SIZE = 1000

bq_client = bigquery.Client(project=PROJECT_ID)
# The query keeps only questions with a positive score and orders them by
# view count (highest first) before applying the row limit.
query = f"""
SELECT
  DISTINCT 
  q.id,
  q.title,
  q.body,
  q.answer_count,
  q.comment_count,
  q.creation_date,
  q.favorite_count,
  q.last_activity_date,
  q.score,
  q.tags,
  q.view_count
FROM
  `bigquery-public-data.stackoverflow.posts_questions` AS q
WHERE
  q.score > 0
ORDER BY
  q.view_count DESC
LIMIT
  {QUESTIONS_SIZE};
"""

# Submit the query, wait for its result rows, and convert them to a DataFrame.
query_job = bq_client.query(query)
rows = query_job.result()
df = rows.to_dataframe()

# examine the data
df.head()


Unnamed: 0,id,title,body,answer_count,comment_count,creation_date,favorite_count,last_activity_date,score,tags,view_count
0,927358,How do I undo the most recent local commits in...,<p>I accidentally committed the wrong files to...,100,12,2009-05-29 18:09:14.627000+00:00,7666,2022-09-09 08:13:22.747000+00:00,24809,git|version-control|git-commit|undo,11649204
1,5767325,How can I remove a specific item from an array?,<p>How do I remove a specific value from an ar...,137,7,2011-04-23 22:17:18.487000+00:00,1631,2022-09-16 16:24:04.310000+00:00,10953,javascript|arrays,10493798
2,2003505,How do I delete a Git branch locally and remot...,<h4>Failed Attempts to Delete a Remote Branch:...,42,10,2010-01-05 01:12:15.867000+00:00,5953,2022-09-20 09:16:37.687000+00:00,19556,git|version-control|git-branch|git-push|git-re...,10278934
3,16956810,How to find all files containing specific text...,<p>How do I find all files containing a specif...,53,9,2013-06-06 08:06:45.533000+00:00,3270,2022-09-04 13:42:00.477000+00:00,6894,linux|text|grep|directory|find,9378947
4,4114095,How do I revert a Git repository to a previous...,<p>How do I revert from my current state to a ...,41,3,2010-11-06 16:58:14.550000+00:00,4116,2022-09-02 06:25:46.480000+00:00,7617,git|git-checkout|git-reset|git-revert,8956751


### Call the API to generate embeddings

With the Stack Overflow dataset, we will concatenate the `title` and `body` columns (the question title and its HTML body) and generate an embedding for the combined text with the Embeddings for Text API. The API is available under the [`vertexai`](https://cloud.google.com/python/docs/reference/aiplatform/latest/vertexai) package of the SDK.

You may see some warning messages from the TensorFlow library but you can ignore them.

From the package, import [`TextEmbeddingModel`](https://cloud.google.com/python/docs/reference/aiplatform/latest/vertexai.language_models.TextEmbeddingModel) and get a model.

In [4]:
# Load the text embeddings model.
# Note: this import rebinds `TextEmbeddingModel` (imported earlier from
# `vertexai.language_models`) to the `vertexai.preview` version and also brings
# in `TextEmbeddingInput`, which the embedding helper uses to set a task type.
from vertexai.preview.language_models import TextEmbeddingModel, TextEmbeddingInput

# Pretrained model version used for every embedding request in this notebook.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@002")


In [13]:
import time
from tqdm import tqdm # to show a progress bar

# Number of texts sent to the embedding model in a single request.
# (An earlier commented-out draft of the helper was removed from this cell: it
# contained a stray uncommented `)` that made the whole cell a syntax error.)
BATCH_SIZE = 10

def get_embeddings_wrapper(texts):
    """Embed `texts` in batches of `BATCH_SIZE` and return the raw vectors.

    Args:
        texts: A sliceable, sized collection of strings (e.g. a list or a
            pandas Series).

    Returns:
        A list holding the `.values` of every embedding returned by the model,
        accumulated batch by batch.
    """
    vectors = []
    batch_starts = range(0, len(texts), BATCH_SIZE)
    for start in tqdm(batch_starts):
        # Pause before every request to avoid the quota error
        time.sleep(1)

        batch = texts[start : start + BATCH_SIZE]
        # Create embeddings optimized for document retrieval
        # (supported in textembedding-gecko@002)
        inputs = [
            TextEmbeddingInput(text=item, task_type="RETRIEVAL_DOCUMENT")
            for item in batch
        ]
        for embedding in model.get_embeddings(inputs):
            vectors.append(embedding.values)
    return vectors


Get embeddings for the question titles/body and add them as the `"embedding"` column.

In [14]:
# Build the text to embed: question title, a newline, then the question body.
df = df.assign(title_body=df["title"] + "\n" + df["body"])

# Embed the combined text and attach the vectors as the `embedding` column.
df = df.assign(embedding=get_embeddings_wrapper(df["title_body"]))
df.head()


100%|██████████| 100/100 [02:06<00:00,  1.26s/it]


Unnamed: 0,id,title,body,answer_count,comment_count,creation_date,favorite_count,last_activity_date,score,tags,view_count,title_body,embedding
0,927358,How do I undo the most recent local commits in...,<p>I accidentally committed the wrong files to...,100,12,2009-05-29 18:09:14.627000+00:00,7666,2022-09-09 08:13:22.747000+00:00,24809,git|version-control|git-commit|undo,11649204,How do I undo the most recent local commits in...,"[0.0137466536834836, 2.998062200276763e-06, 0...."
1,5767325,How can I remove a specific item from an array?,<p>How do I remove a specific value from an ar...,137,7,2011-04-23 22:17:18.487000+00:00,1631,2022-09-16 16:24:04.310000+00:00,10953,javascript|arrays,10493798,How can I remove a specific item from an array...,"[-0.01670827530324459, -0.0213627852499485, 0...."
2,2003505,How do I delete a Git branch locally and remot...,<h4>Failed Attempts to Delete a Remote Branch:...,42,10,2010-01-05 01:12:15.867000+00:00,5953,2022-09-20 09:16:37.687000+00:00,19556,git|version-control|git-branch|git-push|git-re...,10278934,How do I delete a Git branch locally and remot...,"[-0.015323596075177193, -0.013528505340218544,..."
3,16956810,How to find all files containing specific text...,<p>How do I find all files containing a specif...,53,9,2013-06-06 08:06:45.533000+00:00,3270,2022-09-04 13:42:00.477000+00:00,6894,linux|text|grep|directory|find,9378947,How to find all files containing specific text...,"[-0.015853306278586388, -0.0010032434947788715..."
4,4114095,How do I revert a Git repository to a previous...,<p>How do I revert from my current state to a ...,41,3,2010-11-06 16:58:14.550000+00:00,4116,2022-09-02 06:25:46.480000+00:00,7617,git|git-checkout|git-reset|git-revert,8956751,How do I revert a Git repository to a previous...,"[0.003642600728198886, -0.005185546353459358, ..."


## Scrape HTML from Question Pages

- Grab HTML to upload to Cloud Storage

In [53]:
from google.cloud import storage

# Base URL of a question page (the question ID is appended to it) and MIME type
# constants.
QUESTION_BASE_URL = "https://stackoverflow.com/questions/"
HTML_MIME_TYPE = "text/html"
JSONL_MIME_TYPE = "application/jsonl"

# Cloud Storage destination: bucket, top-level directory, and the object-name
# prefix / gs:// URI prefix derived from them.
BUCKET_NAME = "ucs-demo"
DIRECTORY = "embeddings-stackoverflow"
BLOB_PREFIX = f"{DIRECTORY}/html"
GCS_URI_PREFIX = f"gs://{BUCKET_NAME}/{BLOB_PREFIX}/"

# Storage client and a handle to the destination bucket.
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

def scrape_question(ids_):
    """Download each question page and store its HTML in Cloud Storage.

    Args:
        ids_: Iterable of Stack Overflow question IDs.

    Returns:
        A list with one entry per ID, in input order: the `gs://` URI of the
        uploaded HTML object, or `None` when the page did not return HTTP 200
        with a non-empty body.

    Raises:
        requests.RequestException: If a request fails at the network level or
            exceeds the timeout.
    """
    uris = []
    for id_ in ids_:
        question_url = f"{QUESTION_BASE_URL}{id_}"
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(question_url, timeout=30)
        if response.status_code != 200 or not response.content:
            # Keep the output aligned with the input so it can be assigned
            # directly as a DataFrame column.
            uris.append(None)
            continue

        # Name the object after the last path segment of the final
        # (post-redirect) URL.
        link_title = response.url.rstrip("/").split("/")[-1] + ".html"

        # Upload HTML to Google Cloud Storage. The object name and the returned
        # URI are built from the same prefix so the URI points at the object
        # that was actually written.
        blob = bucket.blob(f"{BLOB_PREFIX}/{link_title}")
        blob.upload_from_string(response.content, content_type=HTML_MIME_TYPE)
        uris.append(f"{GCS_URI_PREFIX}{link_title}")
    return uris


Restructure the embeddings data to follow Vertex AI Search format (Unstructured with Metadata)

In [54]:

# Scrape the question pages for every ID in the DataFrame and store what
# `scrape_question` returns in a new `uri` column.
df = df.assign(uri=scrape_question(df.id))

# Reference sketch of a Vertex AI Search document record (Unstructured with
# Metadata). It is left commented out; nothing in this cell builds it.
# {
#     "id": id_,
#     "content": {
#         "mimeType": "text/html",
#         "uri": uri
#     },
#     "structData": {
#         "embedding_vector": embedding,
#         "title": title,
#         "body": body,
#         "answer_count": answer_count
#     },
# }


scraping


In [56]:
# Inspect the `uri` column added to the DataFrame.
df["uri"]


0      gs://ucs-demo/embeddings-stackoverflow/html/ho...
1      gs://ucs-demo/embeddings-stackoverflow/html/ho...
2      gs://ucs-demo/embeddings-stackoverflow/html/ho...
3      gs://ucs-demo/embeddings-stackoverflow/html/ho...
4      gs://ucs-demo/embeddings-stackoverflow/html/ho...
                             ...                        
995    gs://ucs-demo/embeddings-stackoverflow/html/ho...
996    gs://ucs-demo/embeddings-stackoverflow/html/ho...
997    gs://ucs-demo/embeddings-stackoverflow/html/ho...
998    gs://ucs-demo/embeddings-stackoverflow/html/ho...
999    gs://ucs-demo/embeddings-stackoverflow/html/ho...
Name: uri, Length: 1000, dtype: object