In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Feature Store Based LLM Grounding Tutorial

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

## Overview

In this tutorial, you learn how to chunk user-provided data, and then generate embedding vectors for each chunk using a Vertex LLM (Large Language Model) having embedding generation capabilities. The resulting embedding vector dataset can then be loaded into Vertex AI Feature Store, enabling fast feature retrieval and efficient online serving.

Learn more about [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore/overview).

### Objective

In this tutorial, you learn how to create and use an online feature store instance to host and serve data from `BigQuery` with `Vertex AI Feature Store`, in an end-to-end workflow that covers feature serving and vector retrieval.

This tutorial uses the following Google Cloud ML services and resources:

- `Vertex AI Feature Store`

The steps performed include:

- Provision an online feature store instance to host and serve data.
- Create a feature view instance to serve a `BigQuery` table.
- Use the online server to search nearest neighbors.

### Note
This is a Preview release. By using the feature, you acknowledge that you're aware of the open issues and that this preview is provided “as is” under the pre-GA terms of service.


### Dataset

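This tutorial uses the [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) dataset (its `dev.docs` and `dev.all.queries` files). The archive is downloaded from its public URL and copied to Cloud Storage; a pipeline then chunks the text, generates embeddings, and writes the results to `BigQuery`.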


### Costs

This tutorial uses billable components of Google Cloud:

* `Vertex AI`
* `BigQuery`
* `Cloud Storage`

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),
[BigQuery pricing](https://cloud.google.com/bigquery/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the following packages required to run this notebook.

In [None]:
# Install the packages
# The Vertex AI SDK and the BigQuery client are used throughout this notebook;
# db-dtypes is presumably needed for the `to_dataframe()` call made later.
! pip3 install --upgrade --quiet google-cloud-aiplatform\
                                 google-cloud-bigquery\
                                 db-dtypes

# kfp is installed separately, with dependency-conflict warnings suppressed.
! pip3 install --upgrade kfp -q --no-warn-conflicts

### Colab only: Uncomment the following cell to restart the kernel.

In [None]:
# # Automatically restart the kernel after installation so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

4. If you're running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

#### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with your own Google Cloud project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id
# Points the gcloud CLI configuration at the project chosen above.
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations). Note that the new Feature Store capability shown in this notebook is currently only available in the following regions:
* `us-central1`
* `us-east1`
* `us-west1`
* `europe-west4`
* `asia-southeast1`

In [None]:
# Location used for the Vertex AI SDK, the Cloud Storage bucket, and the
# BigQuery dataset created later in this notebook.
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you might have to manually authenticate. Follow these instructions:

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Import libraries

In [None]:
import uuid

from google.cloud import aiplatform, bigquery
from google.cloud.aiplatform_v1beta1 import (
    FeatureOnlineStoreAdminServiceClient, FeatureOnlineStoreServiceClient)
from google.cloud.aiplatform_v1beta1.types import NearestNeighborQuery
from google.cloud.aiplatform_v1beta1.types import \
    feature_online_store as feature_online_store_pb2
from google.cloud.aiplatform_v1beta1.types import \
    feature_online_store_admin_service as \
    feature_online_store_admin_service_pb2
from google.cloud.aiplatform_v1beta1.types import \
    feature_online_store_service as feature_online_store_service_pb2
from google.cloud.aiplatform_v1beta1.types import \
    feature_view as feature_view_pb2

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
# Set the default project and location for subsequent Vertex AI SDK calls.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Regional Vertex AI endpoint; passed to the admin client created later.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

## Set up and start online serving

To serve embedding data in Vertex AI Feature Store, do the following:

1. Prepare the data source in BigQuery.
2. Create a `FeatureOnlineStore` instance to host the data.
3. Define the data (`FeatureView`) to be served by the newly-created instance.

### Prepare BigQuery data source for feature view creation

In [None]:
# Full bucket URI, including the gs:// scheme. It holds the input text files,
# the chunking outputs, and the pipeline root used later in this notebook.
GCS_BUCKET = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.


In [None]:
# Create the bucket in REGION under PROJECT_ID.
! gsutil mb -l {REGION} -p {PROJECT_ID} {GCS_BUCKET}

#### Prepare data in Google Cloud Storage (GCS)

In [None]:
# Cloud Storage folder that receives the raw text files; it is also passed to
# the pipeline as the `input_text_gcs_dir` parameter.
INPUT_TEXT_GCS_DIR = f"{GCS_BUCKET}/fs_grounding/data"

import tarfile
from urllib.request import urlretrieve


def untar(file_name):
    """Extract a tar archive into a folder named after it and return the folder.

    The output folder name is `file_name` with its last seven characters
    removed, which strips a ".tar.gz" suffix.

    Args:
      file_name: Path to the archive; expected to end in ".tar.gz".

    Returns:
      The path of the folder the archive was extracted into.
    """
    output_folder_name = file_name[:-7]
    # Open the archive in a context manager so the file handle is closed even
    # if extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this helper on archives from a trusted source.
    with tarfile.open(file_name) as archive:
        archive.extractall(output_folder_name)
    return output_folder_name


# Download data from https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz
url = "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz"
filename = "nfcorpus.tar.gz"
# urlretrieve saves the archive under `filename` and returns the local path.
path, _ = urlretrieve(url, filename)
print(f"Downloaded {path}")

# Copy text files to GCS.
# untar() extracts into a folder named after the archive; the files are then
# read from a nested nfcorpus/ folder (presumably the archive's top-level
# directory).
output_folder_name = f"{untar(path)}/nfcorpus"
dev_all_queries = f"{output_folder_name}/dev.all.queries"
dev_docs = f"{output_folder_name}/dev.docs"
# The two files are uploaded under the object names `queries` and `docs`.
! gsutil cp {dev_all_queries} {INPUT_TEXT_GCS_DIR}/queries
! gsutil cp {dev_docs} {INPUT_TEXT_GCS_DIR}/docs

#### Create BigQuery dataset

In [None]:
bq_client = bigquery.Client(project=PROJECT_ID)

BQ_DATASET_ID = "fs_grounding"  # @param {type:"string"}
dataset = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET_ID}")
# Keep the dataset in the same location as the Vertex AI resources.
dataset.location = REGION
# exists_ok=True makes this cell safe to re-run: an existing dataset is reused
# instead of raising an error.
dataset = bq_client.create_dataset(
    dataset, exists_ok=True, timeout=30
)  # Make an API request.

# Confirm dataset created.
# Print the fully qualified dataset name rather than the Dataset object's repr.
print(f"Created dataset {dataset.project}.{dataset.dataset_id}")

#### Launch pipeline

In [None]:
# Unique suffix for this run; it is used in the pipeline job ID and in the
# Cloud Storage output paths below so that reruns do not collide.
run_id = str(uuid.uuid4())

# Compiled pipeline template that is submitted as a Vertex AI pipeline job.
PIPELINE_TEMPLATE_URI = "gs://vertex-evaluation-pipeline-templates/20240117_0005/feature_store_grounding_pipeline_pipeline.yaml"
# BigQuery tables used as batch prediction input and output. The input table is
# later also used as the feature view data source.
BIGQUERY_BP_INPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET_ID}.batch_predict_input"
BIGQUERY_BP_OUTPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET_ID}.batch_predict_output"

# Parameter values handed to the pipeline job.
PARAMS = {
    "project": PROJECT_ID,
    "location": REGION,
    "bigquery_bp_input_uri": BIGQUERY_BP_INPUT_URI,
    "bigquery_bp_output_uri": BIGQUERY_BP_OUTPUT_URI,
    "input_text_gcs_dir": INPUT_TEXT_GCS_DIR,
    "output_text_gcs_dir": f"{GCS_BUCKET}/fs_grounding_{run_id}/chunking_output",
    "output_error_file_path": f"{GCS_BUCKET}/fs_grounding_{run_id}/chunking_error_output",
    "model_name": "publishers/google/models/textembedding-gecko@latest",
    # NOTE(review): meaning of this threshold is defined by the pipeline
    # template, not visible here; confirm before changing.
    "generation_threshold_microseconds": "0",
}


def run_pipeline(
    parameters: dict,
    project: str,
    pipeline_root: str,
    location: str = "us-central1",
) -> aiplatform.PipelineJob:
    """Submit the feature store grounding pipeline and return the job handle.

    The job is built from the module-level `PIPELINE_TEMPLATE_URI` and named
    using the module-level `run_id`. The job is submitted but not awaited;
    the caller decides whether to wait on it.

    Args:
      parameters: Parameter values passed to the pipeline.
      project: Google Cloud project in which the job runs.
      pipeline_root: Cloud Storage location used as the pipeline root.
      location: Region in which the job runs.

    Returns:
      The submitted `aiplatform.PipelineJob`.
    """
    aiplatform.init(project=project, location=location)

    test_prefix = "your-test-prefix"  # @param {type:"string"}
    pipeline_name = "feature-store-grounding-pipeline"  # @param {type:"string"}

    # One identifier doubles as both the display name and the job ID.
    job_name = f"{test_prefix}-{pipeline_name}-{run_id}"
    pipeline_job = aiplatform.PipelineJob(
        display_name=job_name,
        template_path=PIPELINE_TEMPLATE_URI,
        job_id=job_name,
        pipeline_root=pipeline_root,
        parameter_values=parameters,
        enable_caching=False,
    )
    pipeline_job.submit()
    return pipeline_job


# Submit the pipeline in the configured project and region.
job = run_pipeline(
    parameters=PARAMS,
    project=PROJECT_ID,
    pipeline_root=f"{GCS_BUCKET}/fs_based/pipeline_root",
    location=REGION,
)
# Block until the pipeline run finishes; later cells read the tables it fills.
job.wait()

#### BQ format conversion

In [None]:
def compose_bq_query_format_conversion(
    bigquery_bp_input_uri: str, bigquery_bp_output_uri: str
) -> str:
    """Build the BigQuery statement that fills in the `embedding` column.

    The statement updates the batch prediction input table: each row's
    `embedding` is set to the float64 array parsed from
    `$.embeddings.values` of the matching prediction in the output table.
    Rows are matched on `vertex_generated_chunk_id`.

    Args:
      bigquery_bp_input_uri: URI of the BigQuery table used as batch
        prediction input (the table being updated). A leading "bq://" is
        accepted.
      bigquery_bp_output_uri: URI of the BigQuery table holding the batch
        prediction output (the table being read). A leading "bq://" is
        accepted.

    Returns:
      The complete UPDATE ... FROM ... WHERE statement, terminated by ";".
    """
    bq_scheme = "bq://"

    # Table references inside the query must not carry the URI scheme.
    dest_table = bigquery_bp_input_uri
    if dest_table.startswith(bq_scheme):
        dest_table = dest_table.replace(bq_scheme, "")

    source_table = bigquery_bp_output_uri
    if source_table.startswith(bq_scheme):
        source_table = source_table.replace(bq_scheme, "")

    # The fragments are joined with single spaces to form the final statement.
    query_fragments = [
        # UPDATE clause: write the parsed embedding into the input table.
        f"UPDATE `{dest_table}` destTable",
        "SET embedding=ARRAY( select cast (str_element as float64) from",
        "unnest(JSON_VALUE_ARRAY(prediction, '$.embeddings.values')) as",
        "str_element)",
        # FROM clause: one row per prediction in the output table.
        "FROM (SELECT vertex_generated_chunk_id, prediction FROM",
        f"`{source_table}` cross join",
        "unnest(JSON_EXTRACT_ARRAY(predictions)) as prediction) sourceTable",
        "WHERE",
        "destTable.vertex_generated_chunk_id=sourceTable.vertex_generated_chunk_id",
    ]
    return " ".join(query_fragments) + ";"


# Build the UPDATE statement that copies embeddings from the batch prediction
# output table into the `embedding` column of the input table.
bq_query = compose_bq_query_format_conversion(
    bigquery_bp_input_uri=BIGQUERY_BP_INPUT_URI,
    bigquery_bp_output_uri=BIGQUERY_BP_OUTPUT_URI,
)

# Run the statement and block until it completes.
bq_job = bq_client.query(bq_query)
bq_job.result()

### Initialize Admin Service Client

Load the Feature Store SDK.

In [None]:
# Admin client used below to create, inspect, sync, and delete online stores
# and feature views. It is bound to the regional API endpoint.
admin_client = FeatureOnlineStoreAdminServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

### Create Feature Online Store

Create a feature online store with embedding management enabled.

In [None]:
# ID of the online store; it becomes the last segment of the store's resource
# name in the calls below.
FEATURE_ONLINE_STORE_ID = "my_feature_online_store_unique"  # @param {type: "string"}

In [None]:
# Online store definition: Bigtable-backed, autoscaling between 1 and 3 nodes
# with a CPU utilization target of 50, and with embedding management enabled.
online_store_config = feature_online_store_pb2.FeatureOnlineStore(
    bigtable=feature_online_store_pb2.FeatureOnlineStore.Bigtable(
        auto_scaling=feature_online_store_pb2.FeatureOnlineStore.Bigtable.AutoScaling(
            min_node_count=1, max_node_count=3, cpu_utilization_target=50
        )
    ),
    embedding_management=feature_online_store_pb2.FeatureOnlineStore.EmbeddingManagement(
        enabled=True
    ),
)

# Start creation of the store. The call returns a long-running operation whose
# result is awaited in the next cell.
create_store_lro = admin_client.create_feature_online_store(
    feature_online_store_admin_service_pb2.CreateFeatureOnlineStoreRequest(
        parent=f"projects/{PROJECT_ID}/locations/{REGION}",
        feature_online_store_id=FEATURE_ONLINE_STORE_ID,
        feature_online_store=online_store_config,
    )
)

### Verify online store instance creation

After the long-running operation (LRO) is complete, show the result.

> **Note:** This operation might take up to 10 minutes to complete.

In [None]:
# Wait for the LRO to finish and get the LRO result.
# result() blocks until the store creation operation completes.
print(create_store_lro.result())

#### Verify `FeatureOnlineStore` instance creation by retrieving the online store instance

In [None]:
# Use get to verify the store is created.
# The store is looked up by its full resource name.
admin_client.get_feature_online_store(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)

#### List all online stores for the location

In [None]:
# Use list to verify the store is created.
# Lists every online store under this project and region.
admin_client.list_feature_online_stores(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}"
)

### Create feature view instance

After creating a `FeatureOnlineStore` instance, you can define the features to serve with it. To do this, create a `FeatureView` instance, which specifies the following:

* A data source (BigQuery table or view URI or `FeatureGroup/features`) synced to the `FeatureOnlineStore` instance for serving.
* The [cron](https://en.wikipedia.org/wiki/Cron) schedule to run the sync pipeline.

During feature view creation, a sync job is scheduled, and is either started immediately or run according to the cron schedule. In the sync job, data is exported to Cloud Bigtable, and an index is built and deployed to a GKE cluster.

In [None]:
FEATURE_VIEW_ID = "fs_grounding_test_new"  # @param {type: "string"}
# A schedule will be created based on cron setting.
# If cron is unspecified, a sync job is started immediately.
# The value below reads as minute 00, hour 13, day 11, month 8, any weekday in
# the America/Los_Angeles time zone, i.e. once a year. The tutorial therefore
# triggers a sync manually later instead of waiting for the schedule.
CRON_SCHEDULE = "TZ=America/Los_Angeles 00 13 11 8 *"  # @param {type: "string"}

In [None]:
# Vector search configs
# DIMENSIONS is passed as the embedding dimension of the feature view and is
# also the length of the dummy query vector built later.
DIMENSIONS = 768  # @param {type: "number"}
# Name of the BigQuery column that holds the embedding vectors.
EMBEDDING_COLUMN = "embedding"  # @param {type: "string"}
# Optional
LEAF_NODE_EMBEDDING_COUNT = 10000  # @param {type: "number"}
# Optional
# NOTE(review): the example column names below do not appear anywhere else in
# this notebook; they look like leftovers from a different dataset.
# CROWDING_COLUMN = "cited_by_filing_date"  # @param {type: "string"}
# # Optional
# FILTER_COLUMNS = ["country"]  # @param

In [None]:
# The feature view reads from the batch prediction input table, whose
# `embedding` column was filled in by the format conversion query.
DATA_SOURCE = BIGQUERY_BP_INPUT_URI

In [None]:
# Source table for the feature view; each row is keyed by its chunk ID.
big_query_source = feature_view_pb2.FeatureView.BigQuerySource(
    uri=DATA_SOURCE, entity_id_columns=["vertex_generated_chunk_id"]
)

# Sync schedule for the feature view.
sync_config = feature_view_pb2.FeatureView.SyncConfig(cron=CRON_SCHEDULE)

# Vector search settings: which column holds the embeddings, their dimension,
# and the Tree-AH index configuration.
vector_search_config = feature_view_pb2.FeatureView.VectorSearchConfig(
    embedding_column=EMBEDDING_COLUMN,
    # filter_columns=FILTER_COLUMNS,
    # crowding_column=CROWDING_COLUMN,
    embedding_dimension=DIMENSIONS,
    # Pass the configured leaf size; previously LEAF_NODE_EMBEDDING_COUNT was
    # defined but never used, so changing it had no effect.
    tree_ah_config=feature_view_pb2.FeatureView.VectorSearchConfig.TreeAHConfig(
        leaf_node_embedding_count=LEAF_NODE_EMBEDDING_COUNT
    ),
)

print(f"vector_search_config: {vector_search_config}")

# Start creation of the feature view inside the online store. The call returns
# a long-running operation.
create_view_lro = admin_client.create_feature_view(
    feature_online_store_admin_service_pb2.CreateFeatureViewRequest(
        parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}",
        feature_view_id=FEATURE_VIEW_ID,
        feature_view=feature_view_pb2.FeatureView(
            big_query_source=big_query_source,
            sync_config=sync_config,
            vector_search_config=vector_search_config,
        ),
    )
)

Wait for the LRO to complete and show the result.

In [None]:
# result() blocks until the feature view creation operation completes.
print(create_view_lro.result())

### Verify feature view creation

Verify `FeatureView` instance creation by retrieving the feature view.

In [None]:
# Look up the feature view by its full resource name.
admin_client.get_feature_view(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

Verify that the FeatureView instance is created by listing all the feature views within the online store.

In [None]:
# List every feature view that belongs to this online store.
admin_client.list_feature_views(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)

In [None]:
# Optional: Delete feature views to avoid exceeding the deployed index nodes quota.
# views = admin_client.list_feature_views(
#     parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
# )
# for view in views:
#     admin_client.delete_feature_view(name=view.name)

### Feature view syncs

The sync pipeline executes according to the schedule specified in the `FeatureView` instance.

To skip the wait and execute the sync pipeline immediately, start the sync manually.

In [None]:
# Trigger a sync now instead of waiting for the cron schedule. The response is
# used in the next cell to look up the sync job.
sync_response = admin_client.sync_feature_view(
    feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

The `sync_response` contains the ID of the sync job.

#### Use `get_feature_view_sync` to check the status of the job

In [None]:
import time

# Poll the sync job every 30 seconds until it reports an end time, then print
# whether it succeeded.
while True:
    feature_view_sync = admin_client.get_feature_view_sync(
        name=sync_response.feature_view_sync
    )
    if feature_view_sync.run_time.end_time.seconds <= 0:
        # No end time recorded yet, so the job is still running.
        print("Sync ongoing, waiting for 30 seconds.")
        time.sleep(30)
        continue

    # A final status code of 0 is reported as success.
    if feature_view_sync.final_status.code == 0:
        status = "Succeed"
    else:
        status = "Failed"
    print(f"Sync {status} for {feature_view_sync.name}.")
    # wait a little more for the job to properly shutdown
    time.sleep(30)
    break

#### Use `list_feature_view_syncs` to view all your syncs

In [None]:
# List the sync jobs recorded for this feature view.
admin_client.list_feature_view_syncs(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

### Start online serving

After the data sync is complete, retrieve the public endpoint domain name of the online store, and then use the `FetchFeatureValues` and `SearchNearestEntities` APIs to serve data from that endpoint.

Get public endpoint domain name.

In [None]:
# Fetch the online store again to read its dedicated serving endpoint.
featore_online_store_instance = admin_client.get_feature_online_store(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)
# Domain name that the data client below connects to for online serving.
PUBLIC_ENDPOINT = (
    featore_online_store_instance.dedicated_serving_endpoint.public_endpoint_domain_name
)

print(f"PUBLIC_ENDPOINT for online serving: {PUBLIC_ENDPOINT}")

#### Initialize the data client

In [None]:
# Serving client for search and fetch requests. Unlike the admin client, it is
# bound to the store's public endpoint rather than the regional API endpoint.
data_client = FeatureOnlineStoreServiceClient(
    client_options={"api_endpoint": PUBLIC_ENDPOINT}
)

#### Search with `ENTITY_ID`

In [None]:
# Read one row from the source table to obtain an existing chunk ID.
bq_query = f'SELECT * FROM `{BIGQUERY_BP_INPUT_URI.replace("bq://", "")}` LIMIT 1'

bq_query_job = bq_client.query(bq_query)
result = bq_query_job.result().to_dataframe()

print(result)

# Entity ID used by the search and fetch examples below.
ENTITY_ID = result["vertex_generated_chunk_id"][0]

In [None]:
# Find the 5 nearest neighbors of an entity already in the feature view,
# identified by its vertex_generated_chunk_id.
data_client.search_nearest_entities(
    request=feature_online_store_service_pb2.SearchNearestEntitiesRequest(
        feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
        query=NearestNeighborQuery(
            entity_id=ENTITY_ID,
            neighbor_count=5,
        ),
        return_full_entity=True,  # returning entities with metadata
    )
)

#### Search with `Embedding`

In [None]:
# Placeholder query vector: all ones, with the configured embedding dimension.
# Replace it with a real embedding to get meaningful neighbors.
EMBEDDINGS = [1] * DIMENSIONS

In [None]:
# Find the 10 nearest neighbors of an explicit query vector rather than of a
# stored entity.
data_client.search_nearest_entities(
    request=feature_online_store_service_pb2.SearchNearestEntitiesRequest(
        feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
        query=NearestNeighborQuery(
            embedding=NearestNeighborQuery.Embedding(value=EMBEDDINGS),
            neighbor_count=10,
        ),
        return_full_entity=True,  # returning entities with metadata
    )
)

#### Use the `FetchFeatureValues` API to retrieve the full data without search


In [None]:
# Look up the stored feature values of a single entity by ID; no vector search
# is involved.
data_client.fetch_feature_values(
    request=feature_online_store_service_pb2.FetchFeatureValuesRequest(
        feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
        id=ENTITY_ID,
    )
)

## Cleaning up

To clean up all Google Cloud resources used in this project, [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, delete the individual resources you created in this tutorial.

In [None]:
# Delete Feature View
# NOTE(review): the return value is not awaited before the store is deleted;
# the store deletion below is called with force=True.
admin_client.delete_feature_view(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

# Delete Feature Online Store
admin_client.delete_feature_online_store(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}",
    force=True,
)

# NOTE(review): the BigQuery dataset created earlier in this notebook is not
# deleted by this cell; remove it separately if it is no longer needed.

# Delete Cloud Storage objects that were created
import os

# The bucket is only removed when delete_bucket is set to True or the
# IS_TESTING environment variable is set.
delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $GCS_BUCKET