In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Feature Store Based LLM Grounding Tutorial - Offline Version

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

## Overview

In this tutorial, you learn how to chunk user-provided data, and then generate embedding vectors for each chunk using a Vertex LLM (Large Language Model) having embedding generation capabilities. The resulting embedding vector dataset can then be loaded into Vertex AI Feature Store, enabling fast feature retrieval and efficient online serving.

Learn more about [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore/overview).

### Objective

In this tutorial, you learn how to create and use an offline feature store instance to host and serve data in `BigQuery` with `Vertex AI Feature Store`.

This tutorial uses the following Google Cloud ML services and resources:

- `Vertex AI Feature Store`

The steps performed include:

- Provision an offline feature store (a feature group) to host the data.
- Register a `BigQuery` table as the data source of the feature group.
- Perform CRUD operations
- Note the data lineage

### Note
This is a Preview release. By using the feature, you acknowledge that you're aware of the open issues and that this preview is provided “as is” under the pre-GA terms of service.


### Dataset

This tutorial uses the [Google Patents Public Data](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data) dataset from the `BigQuery` public datasets.


### Costs

This tutorial uses billable components of Google Cloud:

* `Vertex AI`
* `BigQuery`
* `Cloud Storage`

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),
[BigQuery pricing](https://cloud.google.com/bigquery/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the following packages required to run this notebook.

In [None]:
# Install the packages
! pip3 install --upgrade --quiet google-cloud-aiplatform\
                                 google-cloud-bigquery\
                                 db-dtypes

! pip3 install --upgrade kfp -q --no-warn-conflicts

### Colab only: Uncomment the following cell to restart the kernel.

In [None]:
# # Automatically restart the kernel after installation so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

4. If you're running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

#### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
PROJECT_ID = "your-project-id"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations). Note that the new Feature Store capability shown in this notebook is currently only available in the following regions:
* `us-central1`
* `us-east1`
* `us-west1`
* `europe-west4`
* `asia-southeast1`

In [None]:
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you might have to manually authenticate. Follow these instructions:

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# Only needed on a local JupyterLab instance: uncomment the next line to
# authenticate interactively. It stays commented out so that "Run All" does not
# trigger a login prompt in environments that are already authenticated.
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Import libraries

In [None]:
import uuid

from google.cloud import aiplatform, bigquery
from google.cloud.aiplatform_v1beta1 import (
    FeatureRegistryServiceClient,
    CreateFeatureGroupRequest,
    FeatureGroup,
    CreateFeatureRequest,
    ListFeatureGroupsRequest,
    ListFeaturesRequest,
    DeleteFeatureGroupRequest,
)

from google.cloud.aiplatform_v1beta1.types import (
    Feature,
)

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
aiplatform.init(project=PROJECT_ID, location=REGION)

API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

## Set up and start online serving

To serve embedding data in Vertex AI Feature Store, do the following:

1. Prepare the data source in BigQuery.
2. Create a `FeatureOnlineStore` instance to host the data.
3. Define the data (`FeatureView`) to be served by the newly-created instance.

### Prepare BigQuery data source for feature view creation

In [None]:
GCS_BUCKET = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.


In [None]:
! gsutil mb -l {REGION} -p {PROJECT_ID} {GCS_BUCKET}

#### Prepare data in Google Cloud Storage (GCS)

In [None]:
INPUT_TEXT_GCS_DIR = f"{GCS_BUCKET}/fs_grounding/data"

import tarfile
from urllib.request import urlretrieve


def untar(file_name):
    """Extract a ``.tar.gz`` archive into a folder named after the archive.

    Args:
      file_name: Path of the archive to extract. The output folder name is
        `file_name` with its last seven characters (the length of ".tar.gz")
        removed.

    Returns:
      The name of the folder the archive was extracted into.
    """
    output_folder_name = file_name[:-7]
    # Open the archive in a context manager so the handle is closed even when
    # extraction raises; previously the archive was left open.
    # NOTE(review): extractall() trusts the member paths stored in the archive,
    # so only use this helper on archives from a trusted source.
    with tarfile.open(file_name) as archive:
        archive.extractall(output_folder_name)
    return output_folder_name


# Download data from https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz
url = "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz"
filename = "nfcorpus.tar.gz"
path, _ = urlretrieve(url, filename)
print(f"Downloaded {path}")

# Copy text files to GCS.
output_folder_name = f"{untar(path)}/nfcorpus"
dev_all_queries = f"{output_folder_name}/dev.all.queries"
dev_docs = f"{output_folder_name}/dev.docs"
! gsutil cp {dev_all_queries} {INPUT_TEXT_GCS_DIR}/queries
! gsutil cp {dev_docs} {INPUT_TEXT_GCS_DIR}/docs

#### Create BigQuery dataset

In [None]:
bq_client = bigquery.Client(project=PROJECT_ID)

BQ_DATASET_ID = "fs_grounding"  # @param {type:"string"}
dataset = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET_ID}")
dataset.location = REGION
# exists_ok=True keeps this cell re-runnable when the dataset already exists.
dataset = bq_client.create_dataset(
    dataset, exists_ok=True, timeout=30
)  # Make an API request.

# Confirm dataset created. Interpolate the project and dataset IDs instead of
# the Dataset object itself, whose repr is not a readable identifier (and which
# previously caused the dataset ID to be printed twice).
print(f"Created dataset {dataset.project}.{dataset.dataset_id}")

#### Launch pipeline

In [None]:
run_id = str(uuid.uuid4())

PIPELINE_TEMPLATE_URI = "gs://vertex-evaluation-pipeline-templates/20240117_0005/feature_store_grounding_pipeline_pipeline.yaml"
BIGQUERY_BP_INPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET_ID}.batch_predict_input"
BIGQUERY_BP_OUTPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET_ID}.batch_predict_output"

PARAMS = {
    "project": PROJECT_ID,
    "location": REGION,
    "bigquery_bp_input_uri": BIGQUERY_BP_INPUT_URI,
    "bigquery_bp_output_uri": BIGQUERY_BP_OUTPUT_URI,
    "input_text_gcs_dir": INPUT_TEXT_GCS_DIR,
    "output_text_gcs_dir": f"{GCS_BUCKET}/fs_grounding_{run_id}/chunking_output",
    "output_error_file_path": f"{GCS_BUCKET}/fs_grounding_{run_id}/chunking_error_output",
    "model_name": "publishers/google/models/textembedding-gecko@003",
    "generation_threshold_microseconds": "0",
}


def run_pipeline(
    parameters: dict,
    project: str,
    pipeline_root: str,
    location: str = "us-central1",
) -> aiplatform.PipelineJob:
    """Submit the grounding pipeline template as a Vertex AI pipeline job.

    Relies on the notebook-level `run_id` and `PIPELINE_TEMPLATE_URI` values
    for the job name and the template location.

    Args:
      parameters: Parameter values forwarded to the pipeline job.
      project: Project ID used to initialize the Vertex AI SDK.
      pipeline_root: Pipeline root passed to the pipeline job.
      location: Region used to initialize the Vertex AI SDK.

    Returns:
      The `aiplatform.PipelineJob` after `submit()` has been called on it.
    """
    test_prefix = "your-test-prefix"  # @param {type:"string"}
    pipeline_name = "feature-store-grounding-pipeline"  # @param {type:"string"}

    aiplatform.init(project=project, location=location)

    # The same string serves as both the display name and the job ID.
    job_name = "-".join((test_prefix, pipeline_name, run_id))
    pipeline_job = aiplatform.PipelineJob(
        display_name=job_name,
        template_path=PIPELINE_TEMPLATE_URI,
        job_id=job_name,
        pipeline_root=pipeline_root,
        parameter_values=parameters,
        enable_caching=False,
    )
    pipeline_job.submit()
    return pipeline_job


job = run_pipeline(
    parameters=PARAMS,
    project=PROJECT_ID,
    pipeline_root=f"{GCS_BUCKET}/fs_based/pipeline_root",
    location=REGION,
)
job.wait()

#### BQ format conversion

This step loads the generated embeddings into the table. The table also has a few special fields, such as `feature_timestamp`, which tells Feature Store which value is the latest for a given `entity_id`. Note that you can use different columns in place of the default `entity_id` column by setting `entity_id_columns` below.

For any BigQuery table or view that you associate with a feature group, you need to [ensure the following](https://cloud.google.com/vertex-ai/docs/featurestore/latest/create-featuregroup):

- The schema of the data source conforms to the Data source preparation guidelines.

- The data source contains the entity IDs as string values in a column named entity_id.

- The data source contains the feature timestamps of type timestamp in a column called feature_timestamp.

In [None]:
def compose_bq_query_format_conversion(
    bigquery_bp_input_uri: str, bigquery_bp_output_uri: str
) -> str:
    """Build the BigQuery UPDATE statement that fills in the embedding column.

    The statement updates the batch prediction input table, setting `embedding`
    from the `predictions` column of the batch prediction output table, with
    rows matched on `vertex_generated_chunk_id`.

    Args:
      bigquery_bp_input_uri: URI of the BigQuery table used as the batch
        prediction input; this is the table the statement updates. A leading
        "bq://" is stripped.
      bigquery_bp_output_uri: URI of the BigQuery table holding the batch
        prediction output; this is the table the statement reads from. A
        leading "bq://" is stripped.

    Returns:
      The complete query text, terminated with a semicolon.
    """
    scheme = "bq://"
    dest_table, source_table = (
        uri.replace(scheme, "") if uri.startswith(scheme) else uri
        for uri in (bigquery_bp_input_uri, bigquery_bp_output_uri)
    )

    # Cast each JSON string element of the prediction into a float64 array.
    update_clause = " ".join(
        [
            f"UPDATE `{dest_table}` destTable",
            "SET embedding=ARRAY( select cast (str_element as float64) from",
            "unnest(JSON_VALUE_ARRAY(prediction, '$.embeddings.values')) as",
            "str_element)",
        ]
    )
    # Join the flattened predictions back to the destination rows by chunk id.
    from_clause = " ".join(
        [
            "FROM (SELECT vertex_generated_chunk_id, prediction FROM",
            f"`{source_table}` cross join",
            "unnest(JSON_EXTRACT_ARRAY(predictions)) as prediction) sourceTable",
            "WHERE",
            "destTable.vertex_generated_chunk_id=sourceTable.vertex_generated_chunk_id",
        ]
    )
    return f"{update_clause} {from_clause};"


bq_query = compose_bq_query_format_conversion(
    bigquery_bp_input_uri=BIGQUERY_BP_INPUT_URI,
    bigquery_bp_output_uri=BIGQUERY_BP_OUTPUT_URI,
)
DATA_SOURCE = BIGQUERY_BP_INPUT_URI
bq_job = bq_client.query(bq_query)
bq_job.result()

# Goal Offline use of embeddings using SDK CRUD ops

1. Set the parent resource path

In [None]:
# set parent
parent = f"projects/{PROJECT_ID}/locations/{REGION}"

print(parent)

#### 0. Configure the client and explore SDK options

Not required but you can understand the options of the admin client by exploring the methods and classes in the module below

In [None]:
admin_client_offline_fs = FeatureRegistryServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

In [None]:
# Playground: list the public methods exposed by the admin client. Inspecting
# the client has no side effects, whereas calling `create_feature()` with no
# arguments issues a real API request that cannot succeed without a parent and
# a feature, which would stop a "Run All" at this cell.
[name for name in dir(admin_client_offline_fs) if not name.startswith("_")]

#### 2. Make a feature group

In [None]:
print("Our bq table for the feature group: ", DATA_SOURCE)
# Feature group IDs are documented as allowing only lowercase letters, digits
# and underscores (not starting with a digit), so use underscores, not hyphens.
FG_ID = "my_embedding_feature_group"

In [None]:
from typing import Dict, List


def sample_create_feature_group(
    bq_uri: str,
    parent: str,
    feature_group_name: str,
    entity_id_columns: List[str],
    client_options: Dict[str, str] = {"api_endpoint": API_ENDPOINT},
):
    """Create a BigQuery-backed feature group and print the result.

    Args:
      bq_uri: URI of the BigQuery source set as the feature group's input.
      parent: Resource path under which the feature group is created.
      feature_group_name: ID passed as `feature_group_id` in the request.
      entity_id_columns: Column names set as the feature group's entity ID
        columns.
      client_options: Options passed to `FeatureRegistryServiceClient`.
    """
    registry_client = FeatureRegistryServiceClient(client_options=client_options)

    # Describe the feature group: its BigQuery source and entity ID columns.
    new_group = FeatureGroup()
    new_group.big_query.big_query_source.input_uri = bq_uri
    new_group.big_query.entity_id_columns = entity_id_columns

    lro = registry_client.create_feature_group(
        request=CreateFeatureGroupRequest(
            parent=parent,
            feature_group=new_group,
            feature_group_id=feature_group_name,
        )
    )

    print("Waiting for operation to complete...")

    # Block until the operation finishes, then show what it returned.
    created_group = lro.result()
    print(created_group)

In [None]:
sample_create_feature_group(
    bq_uri=DATA_SOURCE,
    entity_id_columns=["vertex_generated_chunk_id"],
    parent=parent,
    feature_group_name=FG_ID,
)

#### Add features to the Feature Group


 IMPORTANT NOTE:

 BILLING SERVICES ENABLED: Lineage requires both the Data Lineage API and the Data Catalog API to be enabled in your project. Enabling them may have billing implications.


In [None]:
def sample_create_feature(
    parent: str,
    feature_group: str,
    feature_name: str,
    feature_description: str,
    client_options: Dict[str, str] = {"api_endpoint": API_ENDPOINT},
):
    """Create one feature inside a feature group and print the result.

    Args:
      parent: Resource path that contains the feature group.
      feature_group: ID of the feature group that receives the feature.
      feature_name: ID of the feature; also the last segment of its resource
        name.
      feature_description: Description stored on the feature.
      client_options: Options passed to `FeatureRegistryServiceClient`.
    """
    # Resource name layout:
    # projects/{project}/locations/{location}/featureGroups/{feature_group}/features/{feature}
    group_path = f"{parent}/featureGroups/{feature_group}"
    feature_path = f"{group_path}/features/{feature_name}"

    registry_client = FeatureRegistryServiceClient(client_options=client_options)
    lro = registry_client.create_feature(
        request=CreateFeatureRequest(
            parent=group_path,
            feature=Feature(name=feature_path, description=feature_description),
            feature_id=feature_name,
        )
    )

    print("Waiting for operation to complete...")

    # Block until the operation finishes, then show what it returned.
    created_feature = lro.result()
    print(created_feature)

In [None]:
sample_create_feature(
    parent=parent,
    feature_group=FG_ID,
    feature_name="embedding",
    feature_description="text gecko003 experimental test i did",
)

In [None]:
sample_create_feature(
    parent=parent,
    feature_group=FG_ID,
    feature_name="content",
    feature_description="the source text for the text gecko test i did",
)

In [None]:
sample_create_feature(
    parent=parent,
    feature_group=FG_ID,
    feature_name="chunk_size",
    feature_description="chunk size for the embedding passage",
)
sample_create_feature(
    parent=parent,
    feature_group=FG_ID,
    feature_name="overlap_size",
    feature_description="number of characters the chunks overlapped for the embedding passages",
)

#### 3. list feature groups

In [None]:
def sample_list_feature_groups(
    parent: str, client_options: Dict[str, str] = {"api_endpoint": API_ENDPOINT}
):
    """Print and return every feature group found under `parent`.

    Args:
      parent: Resource path whose feature groups are listed.
      client_options: Options passed to `FeatureRegistryServiceClient`.

    Returns:
      A list with each item yielded by the list call, in the order received.
    """
    # Create a client
    client = FeatureRegistryServiceClient(client_options=client_options)

    # Initialize request argument(s)
    request = ListFeatureGroupsRequest(
        parent=parent,
    )

    # Make the request
    page_result = client.list_feature_groups(request=request)

    # Handle the response: print each group and keep it, so that a caller who
    # assigns the result (e.g. `fgs = sample_list_feature_groups(...)`) gets
    # the feature groups instead of None.
    feature_groups = []
    for response in page_result:
        print(response)
        feature_groups.append(response)
    return feature_groups

In [None]:
fgs = sample_list_feature_groups(parent=parent)

#### 4. List Features

In [None]:
def sample_list_features(
    parent: str,
    feature_group_name: str,
    client_options: Dict[str, str] = {"api_endpoint": API_ENDPOINT},
):
    """Print every feature registered in a feature group.

    Args:
      parent: Resource path that contains the feature group.
      feature_group_name: ID of the feature group whose features are listed.
      client_options: Options passed to `FeatureRegistryServiceClient`.
    """
    registry_client = FeatureRegistryServiceClient(client_options=client_options)

    # Features are listed with the feature group's resource path as parent.
    group_path = f"{parent}/featureGroups/{feature_group_name}"
    pager = registry_client.list_features(
        request=ListFeaturesRequest(parent=group_path)
    )

    for feature in pager:
        print(feature)

In [None]:
sample_list_features(parent=parent, feature_group_name=FG_ID)

## Cleaning up

To clean up all Google Cloud resources used in this project, [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, delete the individual resources you created in this tutorial.

In [None]:
# Delete Feature Group

def sample_delete_feature_group(
    parent: str,
    feature_group_name: str,
    client_options: Dict[str, str] = {"api_endpoint": API_ENDPOINT},
    force: bool = False,
):
    """Delete a feature group and print the result of the operation.

    Args:
      parent: Resource path that contains the feature group.
      feature_group_name: ID of the feature group to delete.
      client_options: Options passed to `FeatureRegistryServiceClient`.
      force: Forwarded to `DeleteFeatureGroupRequest.force`. Set it to True to
        also delete the features inside the group; with the default (False) the
        request is only expected to succeed for a group without features.
    """
    # Create a client
    client = FeatureRegistryServiceClient(client_options=client_options)
    fqn = f"{parent}/featureGroups/{feature_group_name}"

    # Initialize request argument(s)
    request = DeleteFeatureGroupRequest(
        name=fqn,
        force=force,
    )

    # Make the request
    operation = client.delete_feature_group(request=request)

    print("Waiting for operation to complete...")

    response = operation.result()

    # Handle the response
    print(response)


# Use the ID the feature group was created with (`FG_ID`); `FG_NAME` is never
# defined in this notebook. force=True because the group still contains the
# features added earlier in the tutorial.
sample_delete_feature_group(parent=parent, feature_group_name=FG_ID, force=True)


# Delete Cloud Storage objects that were created
import os

delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $GCS_BUCKET