In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Augment Gemini Output with Vector Embeddings from BigQuery

---

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fretrieval-augmented-generation%2Frag_vector_embedding_in_bigquery.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/bigquery/v1/32px.svg" alt="BigQuery Studio logo"><br> Open in BigQuery Studio
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_vector_embedding_in_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| | |
|-|-|
|Author(s) | [Logan Ramalingam](https://github.com/logan-google) |

## Overview

This notebook shows how to use BigQuery to generate embeddings from text in a BigQuery table, store them in BigQuery, and then use vector search over those embeddings to retrieve context that augments the output of an LLM.

In this notebook, we create text embeddings for publicly available abstracts from [patents data](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data) and use them in our LLM search. Google Patents Public Data, provided by IFI CLAIMS Patent Services, is a worldwide bibliographic and US full-text dataset of patent publications.


```patents-public-data.google_patents_research.publications```

This notebook references the steps mentioned in [Perform semantic search and retrieval-augmented generation](https://cloud.google.com/bigquery/docs/vector-index-text-search-tutorial)

## Required roles and permissions

To create a connection, you need membership in the following Identity and Access Management (IAM) role:

* ```roles/bigquery.connectionAdmin```

To grant permissions to the connection's service account, you need the following permission:

* ```resourcemanager.projects.setIamPolicy```

The IAM permissions needed in this tutorial for the remaining BigQuery operations are included in the following two roles:


*   BigQuery Data Editor (```roles/bigquery.dataEditor```) to create models, tables, and indexes.

*   BigQuery User (```roles/bigquery.user```) to run BigQuery jobs.

## Getting Started

### Install Vertex AI SDK and other required packages

In [None]:
pip install --upgrade --user --quiet google-cloud-aiplatform google-cloud-bigquery lxml google-cloud-bigquery-connection

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it has restarted, continue to the next step.

In [None]:
import sys

# Restart the kernel only when running inside Google Colab, so the packages
# installed above are picked up by a fresh interpreter.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Interactive user authentication is only performed when the notebook is
# running inside Google Colab; elsewhere this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

## Set Google Cloud project information and initialize BigQuery Connect

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# --- Project and location ---------------------------------------------------
# Google Cloud project that owns every resource created by this notebook.
PROJECT_ID = "your-project-id"  # @param {type:"string"}

# Location used for the BigQuery connection.
REGION = "US"  # @param {type: "string"}

# --- BigQuery resource names ------------------------------------------------
# Dataset that holds the embeddings table and the remote models.
DATASET_ID = "bq_vector_embeddings"  # @param {type:"string"}

# Cloud resource connection that remote models are created with.
CONN_NAME = "bqml_llm_conn"

# Remote model (in BigQuery) used to generate embeddings.
EMBEDDINGS_MODEL_ID = "llm_gecko"  # @param {type:"string"}

# Table that stores the generated embeddings.
EMBEDDINGS_TABLE_ID = "embeddings"  # @param {type:"string"}

# Remote model (in BigQuery) used for text generation.
LLM_MODEL_ID = "llm_gemini"  # @param {type:"string"}

# --- Vertex AI endpoints ----------------------------------------------------
# Endpoint backing the embeddings remote model.
EMBEDDINGS_ENDPOINT_TYPE = "text-embedding-004"  # @param {type:"string"}

# Endpoint backing the text generation remote model.
LLM_ENDPOINT_TYPE = "gemini-1.5-pro"  # @param {type:"string"}

In [None]:
# Set the project id
# Makes PROJECT_ID the default project of the gcloud CLI, so gcloud commands
# run later without an explicit project argument target the same project.
! gcloud config set project {PROJECT_ID}

## Import libraries

Let's start by importing the libraries that we will need for this tutorial

In [None]:
# `sys` is imported here as well so this cell does not depend on the optional
# Colab-only cells above having been run.
import sys

from google.cloud import bigquery
from google.cloud import bigquery_connection_v1 as bq_connection
from google.cloud.exceptions import NotFound

# if in Colab, enable data_table format
if "google.colab" in sys.modules:
    from google.colab import data_table

    data_table.enable_dataframe_formatter()

## Setup BigQuery Environment

### Initialize Google BigQuery Client

In [None]:
# Notebook-wide BigQuery client, bound to PROJECT_ID; the query helper and the
# dataset create/delete calls below all go through this object.
client = bigquery.Client(project=PROJECT_ID)

### Wrapper to use BigQuery client to run query and return result

In [None]:
def run_bq_query(sql: str):
    """Run a SQL statement in BigQuery and wait for it to finish.

    Args:
        sql: SQL text to execute with the notebook-level ``client``.

    Returns:
        The object returned by ``query_job.result()``. Callers in this
        notebook either iterate over it or call ``.to_dataframe()`` on it.

    Raises:
        Exception: Any error raised while submitting the query or waiting for
            its result propagates unchanged, so the original exception type
            and traceback remain available for debugging.
    """
    query_job = client.query(sql)
    # Fetch the result before printing so the reported state is the job's
    # state after completion.
    result = query_job.result()
    print(f"JOB ID: {query_job.job_id} STATUS: {query_job.state}")
    return result

### Create BigQuery Dataset

In [None]:
# Fully-qualified ID ("project.dataset") of the dataset to create.
dataset = f"{PROJECT_ID}.{DATASET_ID}"

# Construct a full Dataset object to send to the API.
dataset_object = bigquery.Dataset(dataset)

# Create the dataset in the configured REGION, which is the same location the
# BigQuery connection is created in.
dataset_object.location = REGION

# Look the dataset up first and only create it when the lookup reports that it
# does not exist, so re-running this cell is safe.
try:
    client.get_dataset(dataset_object)  # Make an API request.
    print(f"Dataset {dataset} already exists")
except NotFound:
    # Create with an explicit timeout.
    client.create_dataset(dataset_object, timeout=30)  # Make an API request.
    print(f"Created dataset {client.project}.{dataset_object.dataset_id}")

### Create BigQuery Cloud resource connection

You will need to create a [Cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection) to enable BigQuery to interact with Vertex AI services:

In [None]:
conn_client = bq_connection.ConnectionServiceClient()
new_conn_parent = f"projects/{PROJECT_ID}/locations/{REGION}"
exists_conn_parent = f"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}"
cloud_resource_properties = bq_connection.CloudResourceProperties({})

# Reuse the connection if it already exists. Only a "not found" answer triggers
# creation; any other failure (for example missing permissions) is surfaced
# as-is instead of being hidden behind a second, unrelated create error.
try:
    connection = conn_client.get_connection(
        request=bq_connection.GetConnectionRequest(name=exists_conn_parent)
    )
except NotFound:
    connection = conn_client.create_connection(
        bq_connection.CreateConnectionRequest(
            {
                "parent": new_conn_parent,
                "connection_id": CONN_NAME,
                "connection": bq_connection.types.Connection(
                    {
                        "friendly_name": CONN_NAME,
                        "cloud_resource": cloud_resource_properties,
                    }
                ),
            }
        )
    )

# IAM member string for the connection's service account; the next cell grants
# project-level roles to this member.
conn_service_account = (
    f"serviceAccount:{connection.cloud_resource.service_account_id}"
)
print(conn_service_account)

### Set permissions for Service Account
The resource connection service account requires certain project-level permissions which are outlined in the <a href="https://cloud.google.com/bigquery/docs/bigquery-ml-remote-model-tutorial#set_up_access" target="_blank">Vertex AI function documentation</a>.

<br>

**Note:** If you are using Vertex AI Workbench, the service account used by Vertex AI may not have sufficient permissions to add IAM policy bindings.

The [IAM Grant Access](https://cloud.google.com/iam/docs/granting-changing-revoking-access#grant-single-role) page gives instructions on how these policy bindings can be added using Cloud Shell.

In [None]:
import time

!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={conn_service_account} --role='roles/serviceusage.serviceUsageConsumer'
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={conn_service_account} --role='roles/bigquery.connectionUser'
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={conn_service_account} --role='roles/aiplatform.user'
!gcloud services enable bigqueryconnection.googleapis.com
# wait 60 seconds, give IAM updates time to propagate, otherwise, following cells will fail
time.sleep(60)

# Configure Vertex AI Embeddings Model in BigQuery

## Create the remote model for text embeddings generation
Create a remote model that represents a hosted Vertex AI text embeddings generation model.

The query takes several seconds to complete, after which the model ```EMBEDDINGS_MODEL_ID``` appears in the ```DATASET_ID``` in the Explorer pane.

In [None]:
# Fully-qualified names of the remote model to (re)create and of the Cloud
# resource connection it is created with.
embeddings_model_ref = f"{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_MODEL_ID}"
connection_ref = f"{PROJECT_ID}.{REGION}.{CONN_NAME}"

sql = f"""CREATE OR REPLACE MODEL
            `{embeddings_model_ref}`
          REMOTE WITH CONNECTION
            `{connection_ref}`
          OPTIONS (ENDPOINT = '{EMBEDDINGS_ENDPOINT_TYPE}');"""
result = run_bq_query(sql)

## Generate text embeddings
Generate text embeddings from patent abstracts using the ```ML.GENERATE_TEXT_EMBEDDING``` function, and then write them to a BigQuery table so that they can be searched.

**Note: Query might take up to 10 minutes to run.**

In [None]:
# Destination table, embeddings model and source table for the query below.
embeddings_table_ref = f"{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_TABLE_ID}"
embeddings_model_ref = f"{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_MODEL_ID}"
patents_table_ref = "patents-public-data.google_patents_research.publications"

# Embed the abstract (aliased to `content`) of every selected patent and keep
# only the rows for which a non-empty embedding came back.
sql = f"""
      CREATE OR REPLACE TABLE `{embeddings_table_ref}` AS
      SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(
        MODEL `{embeddings_model_ref}`,
        (
          SELECT *, abstract AS content
          FROM `{patents_table_ref}`
          WHERE LENGTH(abstract) > 0 AND LENGTH(title) > 0 AND country = 'Singapore'
        )
      )
      WHERE ARRAY_LENGTH(text_embedding) > 0;
      """
result = run_bq_query(sql)

## Create Vector index

A vector index is a data structure designed to let the ```VECTOR_SEARCH``` function perform a more efficient vector search of embeddings. When ```VECTOR_SEARCH``` is able to use a vector index, the function uses the Approximate Nearest Neighbor search technique to help improve search performance, with the trade-off of reducing recall and thus returning more approximate results.

**NOTE: Query might take up to 5 minutes to run.**

In [None]:
# Table to index and the IVF options (a JSON literal embedded in the SQL).
embeddings_table_ref = f"{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_TABLE_ID}"
ivf_options_json = '{"num_lists":500}'

# (Re)create an IVF index with cosine distance on the `text_embedding` column.
sql = f"""CREATE OR REPLACE VECTOR INDEX my_index ON `{embeddings_table_ref}`(text_embedding) OPTIONS(index_type = 'IVF',
distance_type = 'COSINE',   ivf_options = '{ivf_options_json}')"""
result = run_bq_query(sql)

### Verify vector index creation

The vector index is populated asynchronously. You can check whether the index is ready to be used by querying the ```INFORMATION_SCHEMA.VECTOR_INDEXES``` view and verifying that the ```coverage_percentage``` column value is greater than 0 and the ```last_refresh_time``` column value isn't ```NULL```.

In [None]:
# Check vector index creation status, 'coverage_percentage' should be 100
sql = f"""

    SELECT table_name, index_name, index_status, coverage_percentage, last_refresh_time, disable_reason
    FROM `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.VECTOR_INDEXES`
    WHERE table_name = '{EMBEDDINGS_TABLE_ID}'
    """

result = run_bq_query(sql).to_dataframe()
print(result)

## Perform a text similarity search using the vector index

Use the ```VECTOR_SEARCH``` function to search for the top 5 relevant patents that match embeddings generated from a text query. The model you use to generate the embeddings in this query must be the same as the one you use to generate the embeddings in the table you are comparing against, otherwise the search results won't be accurate.

In [None]:
# Embed the query text with the same remote model that produced the stored
# embeddings, then return the 5 nearest rows of the embeddings table.
# The doubled braces are f-string escapes for the literal JSON options.
sql = f"""
  SELECT
    query.query,
    base.content,
    distance
  FROM
    VECTOR_SEARCH( TABLE `{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_TABLE_ID}`,
      'text_embedding',
      (
      SELECT
        text_embedding,
        content AS query
      FROM
        ML.GENERATE_TEXT_EMBEDDING( MODEL `{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_MODEL_ID}`,
          (
          SELECT 'improving password security' AS content))
      ),
      top_k => 5,
      OPTIONS => '{{"fraction_lists_to_search":0.01}}');"""

result = run_bq_query(sql).to_dataframe()

# End the cell with the DataFrame itself (instead of print) so the notebook
# renders it with its rich table display.
result

# Generate text using embeddings

## Create the remote model for text generation

Create a remote model that represents a hosted Gemini Model

In [None]:
# Fully-qualified names of the text generation remote model to (re)create and
# of the Cloud resource connection it is created with.
llm_model_ref = f"{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_ID}"
connection_ref = f"{PROJECT_ID}.{REGION}.{CONN_NAME}"

sql = f"""
      CREATE OR REPLACE MODEL
        `{llm_model_ref}`
        REMOTE WITH CONNECTION
          `{connection_ref}`
        OPTIONS (ENDPOINT = '{LLM_ENDPOINT_TYPE}');
      """
result = run_bq_query(sql)

## Generate text augmented by vector search results

Feed the search results as prompts to generate text with the ```ML.GENERATE_TEXT``` function

In [None]:
# Build a single prompt from the title and abstract of the 5 patents closest to
# the query text, then send it to the text generation remote model.
# NOTE(review): the instruction asks the model to add a patent url, but only
# title and abstract are put into the context - confirm whether a url column
# should be included in the FORMAT call.
sql = f"""SELECT ml_generate_text_llm_result AS generated, prompt
FROM ML.GENERATE_TEXT(
  MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_ID}`,
  (
    SELECT CONCAT(
      'Propose some project ideas to improve user password security using the context below. Add the patent title and url to each idea: ',
      STRING_AGG(
        FORMAT("patent title: %s, patent abstract: %s", base.title, base.abstract))
      ) AS prompt,
    FROM VECTOR_SEARCH(
      TABLE `{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_TABLE_ID}`, 'text_embedding',
      (
        SELECT text_embedding, content AS query
        FROM ML.GENERATE_TEXT_EMBEDDING(
          MODEL `{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_MODEL_ID}`,
         (SELECT 'improving password security' AS content)
        )
      ),
    top_k => 5, options => '{{"fraction_lists_to_search": 0.01}}')
  ),
  STRUCT(600 AS max_output_tokens, TRUE AS flatten_json_output));"""

# Go through the shared helper, like every other query in this notebook, so the
# job id and final state are reported as well.
rows = run_bq_query(sql)

# Read the generated text by column name rather than by position.
for row in rows:
    print(row["generated"])

## Cleaning up

Clean up resources created in this notebook

In [None]:
# Every DROP uses IF EXISTS so this cell can be re-run after a partial cleanup
# without stopping at an object that has already been removed.

# Delete Vector Index
sql = f"""DROP VECTOR INDEX IF EXISTS my_index ON `{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_TABLE_ID}`"""
result = run_bq_query(sql)

# Delete Gemini Model
sql = f"""DROP MODEL IF EXISTS `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_ID}`"""
result = run_bq_query(sql)

# Delete Embeddings Model
sql = f"""DROP MODEL IF EXISTS `{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_MODEL_ID}`"""
result = run_bq_query(sql)

# Delete Embeddings Table
sql = f"""DROP TABLE IF EXISTS `{PROJECT_ID}.{DATASET_ID}.{EMBEDDINGS_TABLE_ID}`"""
result = run_bq_query(sql)

# Delete BigQuery Connection (a connection that is already gone is not an error)
try:
    request = bq_connection.DeleteConnectionRequest({"name": exists_conn_parent})
    response = conn_client.delete_connection(request)
except NotFound:
    print(f"Connection {exists_conn_parent} not found, nothing to delete")

# Delete Dataset
client.delete_dataset(dataset_object, delete_contents=True, not_found_ok=True)

# Close the BigQuery client
client.close()