In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run RAG Pipelines in BigQuery with BQML and Vector Search

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fretrieval-augmented-generation%2Frag_with_bigquery.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/bigquery/v1/32px.svg" alt="BigQuery Studio logo"><br> Open in BigQuery Studio
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_with_bigquery.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| | |
|-|-|
| Author(s) | [Jeff Nelson](https://github.com/jeffonelson/), Eric Hao |

## Overview

This notebook demonstrates a basic end-to-end retrieval-augmented generation (RAG) pipeline using [BigQuery](https://cloud.google.com/bigquery/) and [BigQuery ML](https://cloud.google.com/bigquery/docs/bqml-introduction) functions. To do so, we:

* Complete setup steps to download sample data and access [Vertex AI](https://cloud.google.com/vertex-ai) from BigQuery
* Generate an [object table](https://cloud.google.com/bigquery/docs/object-table-introduction) to access unstructured PDFs that reside in [Cloud Storage](https://cloud.google.com/storage)
* Create a remote model, so BigQuery can call [Document AI](https://cloud.google.com/document-ai) to parse the PDF inputs
* Parse response from Document AI into chunks and metadata, then generate vector embeddings for the chunks
* Run a [vector search](https://cloud.google.com/bigquery/docs/vector-search) against embeddings in BigQuery, return relevant chunks, and summarize them with Gemini

## How to open this notebook in BigQuery Studio

This notebook was written to be compatible for use within BigQuery Studio. To open this notebook in BigQuery, click to [Run in Colab Enterprise](https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fretrieval-augmented-generation%2Frag_with_bigquery.ipynb). This will open a new window in the Cloud Console and prompt you to confirm import. Then, navigate to BigQuery, where you will find the notebook available in the Explorer pane under Notebooks.

## About the dataset

This example uses [the Federal Reserve's 2023 Survey of Consumer Finances](https://www.federalreserve.gov/publications/files/scf23.pdf) (SCF) report. The document contains information around US family income, net worth, credit use, and other common household financial indicators.

## Services and Costs

This tutorial uses the following Google Cloud data analytics and ML services, which are billable components of Google Cloud:

* BigQuery & BigQuery ML [(pricing)](https://cloud.google.com/bigquery/pricing)
* Vertex AI Generative AI models [(pricing)](https://cloud.google.com/vertex-ai/generative-ai/pricing)
* Document AI [(pricing)](https://cloud.google.com/document-ai/pricing)
* Cloud Storage [(pricing)](https://cloud.google.com/storage/pricing)

Use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Setup Steps to access Vertex AI models from BigQuery and enable APIs

### Install Document AI SDK

In [None]:
%pip install --quiet google-cloud-documentai==2.31.0

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Ask the running kernel to shut down and restart (the True argument requests a restart)
# so the packages installed above can be imported.
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [3]:
import sys

# Run the Colab authentication flow only when the google.colab package is loaded
# in this interpreter; on other runtimes this cell does nothing.
running_on_colab = "google.colab" in sys.modules

if running_on_colab:
    from google.colab import auth

    auth.authenticate_user()
    print("Authenticated")

### Define your Google Cloud project

In [4]:
# Project settings: replace the placeholder values before running the cells that follow.
# PROJECT_ID is interpolated into the bq / gcloud / gsutil commands and BigQuery queries below.
PROJECT_ID = "your-project-id"  # @param {type: "string"}
# NOTE(review): PROJECT_NUMBER is not referenced by any other cell visible in this notebook — confirm whether it is still needed.
PROJECT_NUMBER = "your-project-number"  # @param {type: "string"}

### Enable Data Table Display

This makes it easier to visualize tabular data within a Notebook environment later on.

In [None]:
# Turn on Colab's interactive table rendering when it is available.
# The extension lives in the google.colab package; on a runtime without that package
# (for example a plain Jupyter kernel) a bare `%load_ext` raises an ImportError and
# stops "Run all", so the failure is caught and the default display is kept instead.
try:
    get_ipython().run_line_magic("load_ext", "google.colab.data_table")
except ImportError:
    print("google.colab is not available; using the default table display.")

### Create a new dataset in BigQuery

This will house any tables created throughout this notebook.

In [None]:
# Create the `docai_demo` dataset in location `us` under the configured project.
!bq mk --location=us --dataset --project_id={PROJECT_ID} docai_demo

### Create a Cloud resource connection

[Cloud resource connections](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection) enable BigQuery to access other Cloud services, like Cloud Storage and Vertex AI.

In [None]:
# Create a Cloud resource connection named `demo_conn` in location `us`.
!bq mk --connection --connection_type=CLOUD_RESOURCE --location=us --project_id={PROJECT_ID} "demo_conn"
# Print the connection details; the service account ID shown here is needed in the next step.
!bq show --location=us --connection --project_id={PROJECT_ID} "demo_conn"

### Add permissions to Cloud resource connection service account

The Cloud resource connection is associated with a service account. The following cell enables the service account to access services like Document AI, Cloud Storage, and Vertex AI.

**Note:** Copy the service account ID from the prior cell and input it below. It will look like `your-copied-service-account@gcp-sa-bigquery-condel.iam.gserviceaccount.com`.

In [None]:
connection_service_account = "your-copied-service-account@gcp-sa-bigquery-condel.iam.gserviceaccount.com"  # @param {type: "string"}
connection_member = f"serviceAccount:{connection_service_account}"


!gcloud projects add-iam-policy-binding {PROJECT_ID} --member={connection_member} --role='roles/documentai.viewer' --condition=None --quiet
!gcloud projects add-iam-policy-binding {PROJECT_ID} --member={connection_member} --role='roles/storage.objectViewer' --condition=None --quiet
!gcloud projects add-iam-policy-binding {PROJECT_ID} --member={connection_member} --role='roles/aiplatform.user' --condition=None --quiet

### Download the sample PDF used for this notebook and store it in a new Cloud Storage bucket

In [None]:
import random

# Create a unique Cloud Storage bucket name
bucket_name = f"{PROJECT_ID}-{random.randint(10000, 99999)}"

# Create the bucket
!gsutil mb -l US -p {PROJECT_ID} gs://{bucket_name}

# Download the PDF sample
!wget scf23.pdf "https://www.federalreserve.gov/publications/files/scf23.pdf"

# Upload the PDF sample to the newly created Cloud Storage bucket
!gsutil cp scf23.pdf gs://{bucket_name}/

# Print confirmation
print(f"PDF uploaded to gs://{bucket_name}/scf23.pdf")

## Create an object table

An object table allows BigQuery to read unstructured data in Google Cloud Storage. This uses the BigQuery Python client library to continue using the `bucket_name` variable.

In [None]:
from google.cloud import bigquery

# This client is reused by a later cell that also submits SQL from Python.
client = bigquery.Client(project=PROJECT_ID)

# Built as an f-string so the bucket created earlier is interpolated into the object URI.
create_object_table_sql = f"""
CREATE OR REPLACE EXTERNAL TABLE `docai_demo.object_table`
WITH CONNECTION `us.demo_conn`  -- Replace with your connection ID
OPTIONS (
  uris = ['gs://{bucket_name}/scf23.pdf'],
  object_metadata = 'DIRECTORY'
);
"""

# Submit the statement and block until BigQuery reports completion.
client.query(create_object_table_sql).result()

print("External table docai_demo.object_table created or replaced successfully.")

### Show the object table

Confirm that the results display the PDF document in your Cloud Storage bucket.

In [None]:
%%bigquery --project $PROJECT_ID

-- List the contents of the object table; the uploaded sample PDF should appear here.
SELECT
  *
FROM
  `docai_demo.object_table`;

## Use BQML and Document AI to parse documents

### Create a Layout Parser Processor in Document AI

[Create a new processor](https://cloud.google.com/document-ai/docs/create-processor#documentai_fetch_processor_types-python) in Document AI with the type `LAYOUT_PARSER_PROCESSOR`.

In [None]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

location = "us"
processor_display_name = "layout_parser_processor"
processor_type = "LAYOUT_PARSER_PROCESSOR"


def create_processor_sample(
    PROJECT_ID: str, location: str, processor_display_name: str, processor_type: str
) -> str:
    """Create a Document AI processor and return its ID.

    Args:
        PROJECT_ID: Google Cloud project in which the processor is created.
        location: Document AI location; also used to build the regional API endpoint.
        processor_display_name: Display name given to the new processor.
        processor_type: Processor type to create, e.g. "LAYOUT_PARSER_PROCESSOR".

    Returns:
        The processor ID, i.e. the last path segment of the created
        processor's resource name.
    """
    # Document AI is addressed through a location-specific endpoint.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    docai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the location
    parent = docai_client.common_location_path(PROJECT_ID, location)

    # Create a processor
    processor = docai_client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_=processor_type
        ),
    )

    # Return only the trailing ID segment; it is what the remote-model cell interpolates
    # into its `document_processor` option.
    return processor.name.split("/")[-1]


# Call this function to create the processor and return its ID
processor_id = create_processor_sample(
    PROJECT_ID, location, processor_display_name, processor_type
)

### Create a remote model in BigQuery that connects with your Document AI Layout Parser Processor

This one-time setup step allows BigQuery to reference the Document AI Processor you just created.

**Note:** If you receive a 400 GET error "permission denied for document processor", you may need to wait a minute for permissions to propagate from earlier steps.

In [None]:
# Remote model that points BigQuery at the Document AI processor whose ID is held in `processor_id`.
create_layout_parser_sql = f"""
CREATE OR REPLACE MODEL `docai_demo.layout_parser` 
REMOTE WITH CONNECTION `us.demo_conn`
OPTIONS(remote_service_type="CLOUD_AI_DOCUMENT_V1", document_processor="{processor_id}")
"""

# Submit the statement and block until BigQuery reports completion.
client.query(create_layout_parser_sql).result()

print("Remote model docai_demo.layout_parser created or replaced successfully.")

### Process the document using BigQuery ML

Use the [`ML.PROCESS_DOCUMENT` function](https://cloud.google.com/bigquery/docs/process-document) from BigQuery to call your Document AI processor and pass through the PDF. This uses the Layout Parser configuration and chunks your document.

**Note:** this may take a minute or so to complete.

In [None]:
%%bigquery --project $PROJECT_ID --location us

-- Run every object in the object table through the layout parser model and store the
-- raw response. The chunk size is passed through the processor's chunking configuration.
CREATE OR REPLACE TABLE docai_demo.demo_result AS (
  SELECT
    *
  FROM
    ML.PROCESS_DOCUMENT(
      MODEL docai_demo.layout_parser,
      TABLE docai_demo.object_table,
      PROCESS_OPTIONS => (JSON '{"layout_config": {"chunking_config": {"chunk_size": 250}}}')
    )
);

### Parse the JSON results returned to BigQuery

The `ML.PROCESS_DOCUMENT` function parses the PDF from Cloud Storage and returns a JSON blob to BigQuery. In this step, we'll parse the JSON, extract document chunks and metadata, and return it to a new BigQuery table.

In [None]:
%%bigquery --project $PROJECT_ID --location us

-- Expand the chunk array in the Document AI response into one row per chunk and
-- pull the chunk text plus its metadata out of each JSON element.
CREATE OR REPLACE TABLE docai_demo.demo_result_parsed AS (
  SELECT
    uri,
    JSON_EXTRACT_SCALAR(chunk, '$.chunkId') AS id,
    JSON_EXTRACT_SCALAR(chunk, '$.content') AS content,
    JSON_EXTRACT_SCALAR(chunk, '$.pageFooters[0].text') AS page_footers_text,
    JSON_EXTRACT_SCALAR(chunk, '$.pageSpan.pageStart') AS page_span_start,
    JSON_EXTRACT_SCALAR(chunk, '$.pageSpan.pageEnd') AS page_span_end
  FROM
    docai_demo.demo_result,
    UNNEST(JSON_EXTRACT_ARRAY(ml_process_document_result.chunkedDocument.chunks, '$')) AS chunk
);

### Display the parsed document chunks

Show a preview of the parsed results and metadata.

In [None]:
%%bigquery --project $PROJECT_ID --location us

-- Preview five rows of the parsed chunk table, ordered by chunk id.
SELECT
  *
FROM
  docai_demo.demo_result_parsed
ORDER BY
  id
LIMIT 5;

## Connect to Vertex AI embedding generation and Gemini access

### Connect to a text embedding model

[Create a remote model](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-remote-model) allowing BigQuery access to a text embedding model hosted in Vertex AI.

In [None]:
%%bigquery --project $PROJECT_ID

-- Remote model that reaches the text embedding endpoint through the `demo_conn` connection.
CREATE OR REPLACE MODEL `docai_demo.embedding_model`
  REMOTE WITH CONNECTION `us.demo_conn`
  OPTIONS (endpoint = "text-embedding-004")

### Generate embeddings

Use the [`ML.GENERATE_EMBEDDING` function](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding) in BigQuery to generate embeddings for all text chunks in the document.

In [None]:
%%bigquery --project $PROJECT_ID

-- Run the embedding model over the parsed chunk table and keep the full output as a new table.
CREATE OR REPLACE TABLE `docai_demo.embeddings` AS
SELECT
  *
FROM
  ML.GENERATE_EMBEDDING(
    MODEL `docai_demo.embedding_model`,
    TABLE `docai_demo.demo_result_parsed`
  );

### Connect to a Gemini LLM endpoint

Create a remote model allowing BigQuery access to a Gemini foundation model hosted in Vertex AI.

In [None]:
%%bigquery --project $PROJECT_ID

-- Remote model that reaches the Gemini endpoint through the `demo_conn` connection.
CREATE OR REPLACE MODEL `docai_demo.gemini_flash`
  REMOTE WITH CONNECTION `us.demo_conn`
  OPTIONS (endpoint = "gemini-1.5-flash")

## Run vector search, return results, and pass them to Gemini for text generation

### Sample BigQuery vector search

Run a sample BigQuery vector search against your chunks. This query takes your text input, creates an embedding using the `ML.GENERATE_EMBEDDING` function, and then passes the embedding through to the [`VECTOR_SEARCH` function](https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions#vector_search). The results are the top ten chunks that are most semantically related to your input.

In the search query below, the input text asks "Did the typical family net worth increase? If so, by how much?"

In [None]:
%%bigquery --project $PROJECT_ID

-- Embed the question with the same model used for the chunks, then fetch the ten nearest chunks.
-- VECTOR_SEARCH reports a distance, where a smaller value means a closer match, so the rows
-- are sorted ascending to put the most relevant chunk first.
SELECT query.query, base.uri, base.id, base.content, distance
FROM
  VECTOR_SEARCH(
    TABLE `docai_demo.embeddings`,
    'ml_generate_embedding_result',
    (
      SELECT
        ml_generate_embedding_result,
        content AS query
      FROM
        ML.GENERATE_EMBEDDING(
          MODEL `docai_demo.embedding_model`,
          (SELECT 'Did the typical family net worth increase? If so, by how much?' AS content)
        )
    ),
    top_k => 10,
    OPTIONS => '{"fraction_lists_to_search": 0.01}'
  )
ORDER BY distance ASC;

## Generate text augmented by vector search results

This step builds upon the prior one - but instead of simply returning the top text chunks, it calls the `ML.GENERATE_TEXT` function to summarize them alongside the question we input.

In this query you:
* **Retrieve** the closest chunks semantically using the `VECTOR_SEARCH` function (this is what was done in the prior query)
* **Augment** the Gemini LLM with this knowledge
* **Generate** a succinct answer using the `ML.GENERATE_TEXT` function

In [None]:
%%bigquery --project $PROJECT_ID

-- Retrieve the ten nearest chunks for the question, fold them into a single prompt,
-- and pass that prompt to the Gemini remote model.
SELECT
  ml_generate_text_llm_result AS generated,
  -- prompt -- Commented out, but please feel free to uncomment if you would like to see the full context passed to the Gemini model
FROM
  ML.GENERATE_TEXT(
    MODEL `docai_demo.gemini_flash`,
    (
      SELECT
        CONCAT(
          'Did the typical family net worth change? How does this compare the SCF survey a decade earlier? Be concise and use the following context:',
          STRING_AGG(FORMAT("context: %s and reference: %s", base.content, base.uri), ',\n')
        ) AS prompt,
      FROM
        VECTOR_SEARCH(
          TABLE `docai_demo.embeddings`,
          'ml_generate_embedding_result',
          (
            SELECT
              ml_generate_embedding_result,
              content AS query
            FROM
              ML.GENERATE_EMBEDDING(
                MODEL `docai_demo.embedding_model`,
                (SELECT 'Did the typical family net worth change? How does this compare the SCF survey a decade earlier?' AS content)
              )
          ),
          top_k => 10,
          OPTIONS => '{"fraction_lists_to_search": 0.01}'
        )
    ),
    STRUCT(512 AS max_output_tokens, TRUE AS flatten_json_output)
  );

### Sample questions to try out:

Here is a list of a few other questions to spark your imagination. Feel free to try your own too!
* Did the amount of debt families owe on their home increase between 2019 and 2022?
* Did younger or older families see their net worth increase more?
* How much did the median family income change between 2018 and 2021?

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial by uncommenting the code below:

In [None]:
# Everything in this cell is intentionally commented out so that "Run all" never deletes resources.
# Uncomment the lines below to remove the dataset, connection, bucket and Document AI processor
# created by this notebook.
# NOTE(review): the project-level IAM role bindings added for the connection service account
# are not removed by this cell — confirm whether they should be cleaned up as well.

# # Deletes the BigQuery assets and Google Cloud Storage bucket

# !bq rm -r -f $PROJECT_ID:docai_demo
# !bq rm --connection --project_id=$PROJECT_ID --location=us demo_conn
# !gsutil rm -r gs://{bucket_name}


# # Deletes the Document AI processor
# def delete_processor_sample(
#     PROJECT_ID: str, location: str, processor_id: str
# ) -> None:
#     """Deletes a processor."""

#     opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

#     client = documentai.DocumentProcessorServiceClient(client_options=opts)

#     # The full resource name of the processor
#     name = f"projects/{PROJECT_ID}/locations/{location}/processors/{processor_id}"

#     try:
#         client.delete_processor(name=name)
#         print(f"Processor {processor_id} deleted successfully.")
#     except Exception as e:
#         print(f"Error deleting processor: {e}")


# # Call this function to delete the processor
# delete_processor_sample(PROJECT_ID, location, processor_id)

# Wrap up

This notebook demonstrates an example of how to achieve a basic end-to-end retrieval-augmented generation pipeline using BigQuery. It integrates BigQuery ML functions like `ML.PROCESS_DOCUMENT` to call Document AI and parse PDFs, `ML.GENERATE_EMBEDDING` to generate embeddings on text chunks and input queries, and `ML.GENERATE_TEXT` to provide a concise answer. It also uses the `VECTOR_SEARCH` function to identify similar text (using embeddings) in BigQuery using familiar SQL syntax.

To continue learning, check out our documentation on [BigQuery ML](https://cloud.google.com/bigquery/docs/bqml-introduction) and [BigQuery Vector Search](https://cloud.google.com/bigquery/docs/vector-search).