In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Grounding with Vertex AI Search

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/grounding/grounding_with_vais.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fgrounding%2Fgrounding_with_vais.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/grounding/grounding_with_vais.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/grounding/grounding_with_vais.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/grounding/grounding_with_vais.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/grounding/grounding_with_vais.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/grounding/grounding_with_vais.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/grounding/grounding_with_vais.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/grounding/grounding_with_vais.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
| Author(s) |  [Diem Vu](https://github.com/diemtvu/) [Zhen Hu](https://github.com/undertwig/) |

## Overview

This notebook demonstrates how to use Vertex AI Search for grounding Vertex LLMs.  For more general information on grounding, see [Getting Started with Grounding with Gemini in Vertex AI](./intro-grounding-gemini.ipynb).

In this tutorial, we will cover:

* How to create a Vertex AI Search datastore with your data.
* How to send an LLM request that uses this data for grounding.

## Get started

### Install Vertex AI SDK and other required packages

In [None]:
# Install or upgrade the Vertex AI SDK (google-cloud-aiplatform) and the
# Discovery Engine client library (google-cloud-discoveryengine).
%pip install --upgrade --user --quiet google-cloud-aiplatform google-cloud-discoveryengine

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Shut the running kernel down and ask for a restart so that the packages
# installed above become importable.
app = IPython.Application.instance()
app.kernel.do_shutdown(restart=True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says "Your session crashed for an unknown reason." This is expected. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The google.colab package is only loaded when the notebook runs in Colab;
# in any other environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Setup OAuth using service accounts

The Vertex AI Search API lets you perform both keyword search and semantic search on your own data. Since it's **your data**, this needs stricter access controls than API keys. Authenticate with OAuth with [service accounts](#service-oauth).

This quickstart uses a simplified authentication approach meant for a testing environment, and service account setups are typically easier to start from. For a production environment, learn about [authentication and authorization](https://developers.google.com/workspace/guides/auth-overview) before choosing the [access credentials](https://developers.google.com/workspace/guides/create-credentials#choose_the_access_credential_that_is_right_for_you) that are appropriate for your app.

Follow the steps below to set up OAuth using service accounts:

1. Enable the [Discovery Engine API](https://console.cloud.google.com/flows/enableapi?apiid=discoveryengine.googleapis.com) (a.k.a. the Vertex AI Search API).

<img width=400 src="https://ai.google.dev/tutorials/images/semantic_retriever_enable_api.png">

2. Create the Service Account by following the [documentation](https://developers.google.com/identity/protocols/oauth2/service-account#creatinganaccount).

 * After creating the service account, generate a service account key.
 * Grant the `Discovery Engine Admin`, `GCE Storage Bucket Admin`, and `Storage Admin` IAM roles to the service account.

<img width=400 src="https://ai.google.dev/tutorials/images/semantic_retriever_service_account.png">

3. Upload your service account file by using the file icon on the left sidebar, then the upload icon, as shown in the screenshot below.

 * Rename the uploaded file to `service_account_key.json` or change the variable `service_account_file_name` in the code below.

<img width=400 src="https://ai.google.dev/tutorials/images/colab_upload.png">

In [None]:
# Install or upgrade google-auth-oauthlib.
%pip install -U google-auth-oauthlib

In [None]:
from google.oauth2 import service_account

# Name of the uploaded service account key file. The setup steps above ask you
# to rename the upload to `service_account_key.json`; change this value if you
# kept a different file name.
service_account_file_name = "service_account_key.json"  # @param {type:"string"}

# Load the service account credentials from the key file.
credentials = service_account.Credentials.from_service_account_file(
    service_account_file_name
)

# Derive credentials limited to the cloud-platform scope; these are the
# credentials passed to every client created later in this notebook.
scoped_credentials = credentials.with_scopes(
    ["https://www.googleapis.com/auth/cloud-platform"]
)

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Fall back to environment variables when no project ID is entered below.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# Either the untouched placeholder or an empty value counts as "not provided".
if PROJECT_ID == "[your-project-id]" or not PROJECT_ID:
    PROJECT_ID = str(os.getenv("GOOGLE_CLOUD_PROJECT"))

# Region comes from GOOGLE_CLOUD_REGION and defaults to us-central1.
LOCATION = os.getenv("GOOGLE_CLOUD_REGION", "us-central1")

You also need to choose a region for your Vertex AI Search datastore.  Vertex AI Search is a multi-region service, supporting `us`, `eu`, and `global` regions.

In [None]:
# Vertex AI Search location (`us`, `eu`, or `global`); the client setup below
# uses it to choose the Discovery Engine API endpoint.
VAIS_LOCATION = "global"  # @param {type: "string"}

## Create Vertex AI Search datastore

### Initialize the VAIS client library using the service account credentials.

In [None]:
import json
import time

from google.api_core.client_options import ClientOptions
import google.cloud.discoveryengine_v1 as vais

# The "global" location keeps the default endpoint (no client options);
# every other location uses its location-prefixed Discovery Engine endpoint.
if VAIS_LOCATION == "global":
    client_options = None
else:
    client_options = ClientOptions(
        api_endpoint=f"{VAIS_LOCATION}-discoveryengine.googleapis.com"
    )

# Both service clients share the endpoint options and the service account credentials.
shared_client_kwargs = {
    "client_options": client_options,
    "credentials": scoped_credentials,
}
data_store_service_client = vais.DataStoreServiceClient(**shared_client_kwargs)
document_service_client = vais.DocumentServiceClient(**shared_client_kwargs)


def wait_for_operation_finish(operation, poll_interval_seconds=2):
    """Block until a long-running operation reports that it has finished.

    Args:
        operation: Long-running operation object exposing a ``done()`` method,
            for example the object returned by ``delete_data_store``.
        poll_interval_seconds: Number of seconds to sleep between two
            consecutive ``done()`` polls. Defaults to 2.
    """
    # `done` must be called: the bare bound method is always truthy, which
    # would make the loop condition False and skip the wait entirely.
    while not operation.done():
        time.sleep(poll_interval_seconds)

### Create a DataStore

To ingest your documents and search over them, you first need to create a datastore, which is the corpus that holds your documents.

Here we create a generic search datastore for PDF documents.

In [None]:
DATASTORE_ID = "your-data-store-id"  # @param {type:"string"}

# Describe a generic search datastore with unstructured data content.
data_store = vais.DataStore(
    display_name="Grounding Demo DataStore",
    industry_vertical="GENERIC",
    solution_types=["SOLUTION_TYPE_SEARCH"],
    content_config="CONTENT_REQUIRED",
)

# The datastore is created in the default collection of the location selected
# in VAIS_LOCATION.
create_data_store_request = vais.CreateDataStoreRequest(
    parent=f"projects/{PROJECT_ID}/locations/{VAIS_LOCATION}/collections/default_collection",
    data_store=data_store,
    data_store_id=DATASTORE_ID,
)

# The API returns a long-running operation as its response.
create_data_store_operation = data_store_service_client.create_data_store(
    create_data_store_request
)
print(
    f"Waiting for create datastore operation to complete: {create_data_store_operation.operation.name}"
)
# Keep the full resource name of the datastore; later cells use it to address
# the datastore and the documents inside it.
created_data_store = create_data_store_operation.result()
data_store_name = created_data_store.name
print(f"Data store {data_store_name} is created.")

In [None]:
# Read the datastore back by its resource name and show what the service returns.
print("Getting data store:")
get_data_store_request = vais.GetDataStoreRequest(name=data_store_name)
get_data_store_response = data_store_service_client.get_data_store(
    request=get_data_store_request
)
print(get_data_store_response)

### Create a document

Upload your own documents to the datastore.

#### Prepare the document in Google Cloud Storage

Vertex AI Search supports a variety of data sources. This example uses Google Cloud Storage.

In [None]:
# Install or upgrade the Google Cloud Storage client library.
%pip install -U google-cloud-storage

In [None]:
from google.cloud import storage

# @markdown a GCS bucket used for VAIS document ingestion
BUCKET_NAME = "your-demo-bucket"  # @param {type:"string"}

# Storage client authenticated with the service account credentials.
storage_client = storage.Client(credentials=scoped_credentials)

# Create the bucket and report the name of the bucket that was created.
bucket = storage_client.create_bucket(BUCKET_NAME)
print(f"Bucket {bucket.name} created.")

In [None]:
# Download the PDF documents and write them into GCS
import requests

file_urls = [
    "https://abc.xyz/assets/investor/static/pdf/2022_Q1_Earnings_Transcript.pdf",
    "https://abc.xyz/assets/investor/static/pdf/2022_Q2_Earnings_Transcript.pdf",
    "https://abc.xyz/assets/investor/static/pdf/2022_Q3_Earnings_Transcript.pdf",
    "https://abc.xyz/assets/investor/static/pdf/2022_Q4_Earnings_Transcript.pdf",
]

# Upper bound, in seconds, for each download so that a stalled connection
# cannot hang the cell indefinitely.
DOWNLOAD_TIMEOUT_SECONDS = 60

bucket = storage_client.bucket(BUCKET_NAME)

for url in file_urls:
    # The last path segment of the URL is used as the object name in the bucket.
    file_name = url.split("/")[-1]
    print(f"Downloading: {file_name}")

    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()

        # Objects are stored at the bucket root (no prefix).
        blob_name = file_name
        blob = bucket.blob(blob_name)

        # Record the real content type instead of the text/plain default.
        blob.upload_from_string(response.content, content_type="application/pdf")
        print(f"Uploaded: {blob_name}")  # Print the uploaded blob path
    except requests.exceptions.RequestException as e:
        # A failed download is reported and skipped; the remaining files are still processed.
        print(f"Error downloading {file_name}: {e}")

In [None]:
# List the names of the objects currently stored in the bucket.
stored_object_names = (stored_object.name for stored_object in bucket.list_blobs())
for stored_object_name in stored_object_names:
    print(stored_object_name)

#### Upload documents to Datastore


In [None]:
# Create one datastore document per PDF file stored in GCS.

created_document_names = []
# Documents are numbered from 1, giving the IDs document-1, document-2, ...
for document_number, url in enumerate(file_urls, start=1):
    file_name = url.split("/")[-1]
    # Keep the source URL as metadata on the document.
    metadata = {"uri": url}

    document = vais.Document(
        content=vais.Document.Content(
            mime_type="application/pdf",
            uri=f"gs://{BUCKET_NAME}/{file_name}",
        ),
        # Metadata is passed as a JSON string.
        json_data=json.dumps(metadata),
    )

    create_document_request = vais.CreateDocumentRequest(
        parent=f"{data_store_name}/branches/0",
        document_id=f"document-{document_number}",
        document=document,
    )
    created_document = document_service_client.create_document(create_document_request)
    # Remember the resource name so the cleanup section can delete the document.
    document_name = created_document.name
    created_document_names.append(document_name)
    print(f"Document {document_name} is created:")

## LLM Grounding with your data

Indexing may take a few minutes to complete. Once your datastore is ready, you can use it as a grounding source in a Vertex LLM call, as shown below:

In [None]:
import vertexai
from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
    Tool,
    grounding,
)

# Initialize the Vertex AI SDK with the project, region and service account
# credentials configured earlier in this notebook.
vertexai.init(project=PROJECT_ID, location=LOCATION, credentials=scoped_credentials)

model = GenerativeModel("gemini-1.5-flash-001")

# A newly created datastore and its documents may take up to 15 minutes to
# become available for grounding.
tool = Tool.from_retrieval(
    grounding.Retrieval(
        grounding.VertexAISearch(
            datastore=DATASTORE_ID,
            project=PROJECT_ID,
            location=VAIS_LOCATION,
        )
    )
)

prompt = "What is the 2022 Q2 revenue?"
response = model.generate_content(
    prompt,
    tools=[tool],
    generation_config=GenerationConfig(
        temperature=0.0,
    ),
)

print(response.text)

## Cleaning up

Use [`DeleteCorpusRequest`](https://ai.google.dev/api/python/google/generativeai/protos/DeleteCorpusRequest) to delete a user corpus and all associated `Document`s & `Chunk`s.

Note that deleting a non-empty corpus throws an error unless you specify the `force=True` flag. If you set `force=True`, any `Chunk`s and objects related to this `Document` will also be deleted.

If `force=False` (the default) and the `Document` contains any `Chunk`s, a `FAILED_PRECONDITION` error will be returned.

In [None]:
# Remove every document that was created earlier in this notebook.
for document_name in created_document_names:
    document_service_client.delete_document(
        request=vais.DeleteDocumentRequest(name=document_name)
    )
    print(f"Successfully deleted document: {document_name}")

In [None]:
# Request deletion of the datastore; the call returns a long-running operation.
delete_data_store_operation = data_store_service_client.delete_data_store(
    request=vais.DeleteDataStoreRequest(name=data_store_name)
)
print(
    f"Waiting for delete datastore operation to complete: {delete_data_store_operation.operation.name}"
)
# Poll the operation before reporting success.
wait_for_operation_finish(delete_data_store_operation)
print(f"Successfully deleted data store {data_store_name}")

## Appendix: Set up OAuth with user credentials

Follow the steps below from the [OAuth Quickstart](https://ai.google.dev/docs/oauth_quickstart) to set up OAuth authentication.

1. [Configure the OAuth consent screen](https://ai.google.dev/docs/oauth_quickstart#configure-oauth).

1. [Authorize credentials for a desktop application](https://ai.google.dev/docs/oauth_quickstart#authorize-credentials). To run this notebook in Colab, first rename your credential file (usually `client_secret_*.json`) to just `client_secret.json`. Then upload the file by using the file icon on the left sidebar, then the upload icon, as shown in the screenshot below.

<img width=400 src="https://ai.google.dev/tutorials/images/colab_upload.png">

In [None]:
# Replace your-email@gmail.com with the email added as a test user in the OAuth Quickstart
EMAIL = "your-email@gmail.com"  #  @param {type:"string"}

# IMPORTANT: Follow the instructions from the output - you must copy the command
# to your terminal and copy the output after authentication back here.
!gcloud config set project $PROJECT_ID
!gcloud config set account $EMAIL

# NOTE: The simplified project setup in this tutorial triggers a "Google hasn't verified this app." dialog.
# This is normal, click "Advanced" -> "Go to [app name] (unsafe)"
!gcloud auth application-default login --no-browser --client-id-file=$service_account_file_name --scopes="https://www.googleapis.com/auth/generative-language.retriever,https://www.googleapis.com/auth/cloud-platform"