In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Fetch multiple entities

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/online_feature_serving_fetching_multiple_entities.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Ffeature_store%2Fonline_feature_serving_fetching_multiple_entities.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/feature_store/online_feature_serving_fetching_multiple_entities.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/online_feature_serving_fetching_multiple_entities.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

In this tutorial, you will learn how to use the Vertex AI SDK for Python to
fetch multiple entities (and their feature values) in real-time. This notebook
introduces the *StreamingFetchFeatureValues* (SFFV) API for faster multi-entity
lookup.

This tutorial uses the following Google Cloud ML services and resources: 

* Vertex AI Feature Store
* BigQuery

The steps performed include the following:

* Set up BigQuery data
* Set up Feature Online Store
* Set up Feature View
* Fetch multiple entities
* Cleanup

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform bigframes

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only Colab needs an explicit restart to pick up the freshly installed packages;
# Colab is detected by the presence of the `google.colab` module.
if "google.colab" in sys.modules:
    import IPython

    # Shut the running kernel down; the True argument requests a restart.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Interactive user authentication is only performed when running on Colab,
# which is detected by the `google.colab` module being loaded.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Replace the placeholder with your own Google Cloud project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region passed to the Vertex AI SDK; also used later to build the API endpoint.
LOCATION = "us-central1"  # @param {type:"string"}


import vertexai

# Initialize the Vertex AI SDK with the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

## Multiple entity fetch tutorial

### Imports and IDs

Import the packages required to set up and use real-time serving in the
Vertex AI SDK for Python.

In [None]:
import bigframes
import bigframes.pandas
from google.cloud import bigquery
from vertexai.resources.preview.feature_store import (FeatureOnlineStore,
                                                      FeatureView)
from vertexai.resources.preview.feature_store import utils as fs_utils

The following variables set BigQuery and Feature Store resources that will be
used or created.

In [None]:
# BigQuery dataset and table that will hold the feature data.
BQ_DATASET_ID = "sffv_dataset_unique"  # @param {type:"string"}
BQ_TABLE_ID = "sffv_table_unique"  # @param {type:"string"}
# Fully qualified table ID in `project.dataset.table` form.
BQ_TABLE_URI = f"{PROJECT_ID}.{BQ_DATASET_ID}.{BQ_TABLE_ID}"

# IDs of the feature online store (FOS) and feature view (FV) created below.
FOS_ID = "sffv_fos_unique"  # @param {type:"string"}
FV_ID = "sffv_fv_unique"  # @param {type:"string"}

### Write BigQuery table for use in Feature View

The following query joins per-product order counts from
`bigquery-public-data.thelook_ecommerce.order_items` onto the product attributes in
`bigquery-public-data.thelook_ecommerce.products` to build the data used by the
Feature View.

In [None]:
# SQL that builds the feature table:
#   * product_order_agg: per-product counts of "good" (Shipped/Complete) and
#     "bad" (Returned/Cancelled) order items created during the 30 days before today.
#   * product_basic: lower-cased name/category/brand plus cost and retail price.
# The aggregate is LEFT JOINed onto the product rows by `entity_id`, so products
# without matching order items keep NULL counts, and every row is stamped with
# `feature_timestamp` = the time the query runs.
QUERY = """
WITH
 product_order_agg AS (
   SELECT cast(product_id as string) as entity_id,
     countif(status in ("Shipped", "Complete")) as good_order_count,
     countif(status in ("Returned", "Cancelled")) as bad_order_count
   FROM `bigquery-public-data.thelook_ecommerce.order_items`
   WHERE
     timestamp_trunc(created_at, day) >= timestamp_trunc(timestamp_sub(CURRENT_TIMESTAMP(), interval 30 day), day) and
     timestamp_trunc(created_at, day) < timestamp_trunc(CURRENT_TIMESTAMP(), day)
   group by 1
   order by entity_id),
 product_basic AS (
   SELECT cast(id as string) AS entity_id,
     lower(name) as name,
     lower(category) as category,
     lower(brand) as brand,
     cost,
     retail_price
   FROM `bigquery-public-data.thelook_ecommerce.products`)
SELECT *, current_timestamp() as feature_timestamp
FROM product_basic
LEFT OUTER JOIN product_order_agg
USING (entity_id)
"""

Use BigQuery DataFrames to load our query.

In [None]:
# Open a BigQuery DataFrames session bound to this project in the "US" location.
session = bigframes.connect(
    bigframes.BigQueryOptions(
        project=PROJECT_ID,
        location="US",
    )
)
# Load the query result as a BigQuery DataFrame. The write to BQ_TABLE_URI is
# intentionally not done here: it has its own cell after the preview, and
# writing in both places would replace the same table twice.
df = session.read_gbq_query(QUERY)

Let's preview the data to be written to the table.

In [None]:
# Display the first rows of the query result.
df.head()

And finally write the DataFrame to the target BigQuery table.

In [None]:
# Write the DataFrame to BQ_TABLE_URI; if_exists="replace" overwrites a table
# left over from a previous run.
df.to_gbq(BQ_TABLE_URI, if_exists="replace")

### Create Feature Online Store

Create a Bigtable feature online store.

In [None]:
# Create the Bigtable-backed online store identified by FOS_ID; `fos` is the
# handle used below to create the feature view and, at the end, to clean up.
fos: FeatureOnlineStore = FeatureOnlineStore.create_bigtable_store(FOS_ID)

### Create Feature View

Under the previously created feature online store, create a feature view which
pulls data from the BigQuery table.

In [None]:
# Create feature view FV_ID under the online store `fos`. Its source is the
# BigQuery table at BQ_TABLE_URI, with the `entity_id` column as the entity key.
fv: FeatureView = fos.create_feature_view(
    FV_ID,
    source=fs_utils.FeatureViewBigQuerySource(
        uri=f"bq://{BQ_TABLE_URI}",
        entity_id_columns=["entity_id"],
    ),
)

### Sync the feature view

Trigger the data sync for the feature view.

In [None]:
# Start a sync of the feature view; the returned object is polled in the next cell.
fv_sync = fv.sync()

Wait for the feature view sync to be complete.

In [None]:
import time

# Poll the sync every 10 seconds. A non-zero `run_time.end_time` marks the sync
# as finished; `final_status.code == 0` is then reported as success.
while True:
    # Re-fetch the sync resource so the latest state is inspected each round.
    fv_sync = FeatureView.FeatureViewSync(fv_sync.resource_name)
    sync_resource = fv_sync._gca_resource
    if sync_resource.run_time.end_time.seconds > 0:
        break

    print("Waiting for FeatureViewSync...")
    time.sleep(10)

status = "Success" if sync_resource.final_status.code == 0 else "Failed"
print(f"Sync: {status}")

# Re-fetch the sync by resource name and display its underlying resource so the
# final run details can be inspected.
FeatureView.FeatureViewSync(fv_sync.resource_name)._gca_resource

### Reading multiple entities

The following sections demonstrate a few different ways to read multiple entities. Using the FetchFeatureValues API results in higher latency due to sequential calls - using threads slightly reduces latency. Using the StreamingFetchFeatureValues API reduces latency greatly as batching of reads is done by the Feature Store server.

Issue one initial read so that the client is created up front and the latencies measured below are not affected by gRPC channel creation. Inspecting the output also helps check that the data is synced and present in the FeatureView. If a NOT_FOUND error is received, certain server-side caches likely have not expired yet - please retry the read.

In [None]:
# Warm-up read of the entity with key "1"; the output shows its feature values.
fv.read("1")

#### Using FetchFeatureValues API

##### Sequential call

Create a helper function to read multiple entities by issuing sequential read calls.

In [None]:
def read_multiple_sequential(low, high):
    """Read entities ``low`` .. ``high - 1`` one after another.

    Args:
        low: First entity ID to read (inclusive).
        high: Entity ID at which to stop (exclusive).

    Returns:
        A list with the result of ``fv.read`` for each entity, in ascending
        entity-ID order. Each call passes a single-element key list.
    """
    results = []
    for entity_id in range(low, high):
        results.append(fv.read([str(entity_id)]))
    return results

The following fetches entities 1-10. The `%%timeit` magic command times the execution of the cell. This allows rough latency & performance estimates.

In [None]:
%%timeit
# Time ten sequential reads (entity IDs 1 through 10; the upper bound is exclusive).
read_multiple_sequential(1, 11)

##### Using thread pool

Create a helper function to read multiple entities by issuing concurrent read
calls by using a thread pool.

In [None]:
from concurrent.futures import ThreadPoolExecutor


def read_multiple_with_thread_pool(low, high):
    """Read entities ``low`` .. ``high - 1`` concurrently using a thread pool.

    Args:
        low: First entity ID to read (inclusive).
        high: Entity ID at which to stop (exclusive).

    Returns:
        A list with the result of ``fv.read`` for each entity, in ascending
        entity-ID order. Each call passes a single-element key list.
    """
    # One key list per entity. The previous version submitted ["1"] on every
    # iteration, so it read entity 1 repeatedly instead of the requested range.
    entity_keys = [[str(entity_id)] for entity_id in range(low, high)]
    with ThreadPoolExecutor() as executor:
        # Executor.map yields results in input order; materialize them before
        # the pool is shut down on leaving the `with` block.
        return list(executor.map(fv.read, entity_keys))

Fetch entities 1-10 concurrently using the thread pool.

In [None]:
%%timeit
# Time ten concurrent reads (entity IDs 1 through 10; the upper bound is exclusive).
read_multiple_with_thread_pool(1, 11)

#### Using the StreamingFetchFeatureValues API

##### Set up helper function

Create a function that helps with managing some of the details of using a Streaming gRPC
API.

In [None]:
from typing import List

from google.cloud.aiplatform_v1beta1 import FeatureOnlineStoreServiceClient
from google.cloud.aiplatform_v1beta1.types import \
    feature_online_store_service as feature_online_store_service_pb2


def sffv(data_client, feature_view, keys_list: List[List[str]]):
    """Fetch feature values for groups of keys over one streaming call.

    Args:
        data_client: Client object exposing ``streaming_fetch_feature_values``.
        feature_view: Feature view identifier set on every request (the
            notebook passes ``fv.resource_name``).
        keys_list: Groups of entity keys. Each inner list becomes one
            ``StreamingFetchFeatureValuesRequest`` on the stream.

    Returns:
        A list containing every response yielded by the streaming call.
    """

    def _build_request(keys):
        # One request per key group, each key wrapped in a FeatureViewDataKey.
        return feature_online_store_service_pb2.StreamingFetchFeatureValuesRequest(
            feature_view=feature_view,
            data_keys=[
                feature_online_store_service_pb2.FeatureViewDataKey(key=key)
                for key in keys
            ],
        )

    # Requests are built lazily, as the client consumes the stream.
    request_stream = (_build_request(keys) for keys in keys_list)
    return list(data_client.streaming_fetch_feature_values(requests=request_stream))

##### Initialize data client

In [None]:
# Regional Vertex AI endpoint derived from LOCATION.
API_ENDPOINT = f"{LOCATION}-aiplatform.googleapis.com"
# Client used by `sffv()`; created once here so the timed cells below reuse it.
data_client = FeatureOnlineStoreServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

##### Fetch multiple entities using StreamingFetch

Fetch entities 1-10 using the `StreamingFetchFeatureValues` API.

In [None]:
%%timeit
sffv(
    data_client=data_client,
    feature_view=fv.resource_name,
    keys_list=[[f"{num}" for num in range(1, 11)]],
)

Read 1000 entities using the SFFV API. The latency
increases, but should roughly be 100ms-200ms. You can further improve this by changing the way the keys are grouped.

In [None]:
%%timeit
sffv(
    data_client=data_client,
    feature_view=fv.resource_name,
    keys_list=[[f"{num}" for num in range(1, 1001)]],
)

##### Changing grouping of keys (batch size tuning)

In the `keys_list` argument to `sffv()`, change the grouping of the keys. This affects how Vertex AI Feature Store reads the keys based on the request.

The following `keys_list` list tells Vertex AI Feature Store to perform two reads. During the first read, Vertex AI Feature Store reads entities 1-500. During the second read, Vertex AI Feature Store reads entities 501-1000. Placing contiguous entities in the same read will help reduce latency.

In [None]:
%%timeit
sffv(
    data_client=data_client,
    feature_view=fv.resource_name,
    keys_list=[
        [f"{num}" for num in range(1, 501)],
        [f"{num}" for num in range(501, 1001)],
    ],
)

You can try your own "batch size" to see what works best.

In [None]:
%%timeit
batch_size = 50  # @param {type:"number"}
sffv(
    data_client=data_client,
    feature_view=fv.resource_name,
    keys_list=[
        [f"{num}" for num in range(i, i + batch_size + 1)]
        for i in range(1, 1001, batch_size)
    ],
)

#### Using StreamingFetchFeatureValues with REST

The following is an example of using the `StreamingFetchFeatureValues` API via REST.

The following two print statements will print details that need to be copied into the curl command.

In [None]:
# The host is built from LOCATION rather than a hard-coded region, so the URL
# stays correct when the notebook is run against a region other than us-central1.
print(
    f"URL: https://{LOCATION}-aiplatform.googleapis.com/v1beta1/{fv.resource_name}:streamingFetchFeatureValues"
)  # Copy output into {URL}

print("FV resource name: ", fv.resource_name)  # Copy output into {FEATURE_VIEW}

Uncomment the cell. After copying the relevant details into {URL} and {FEATURE_VIEW}, run the cell.

In [None]:
# %%bash
# # Uncomment and run
# curl {URL} \
#   -X POST \
#   -H "Authorization: Bearer $(gcloud auth print-access-token)" \
#   -H "Content-Type: application/json" \
#   -d '[
#   {
#     data_keys: [{key: "1"}, {key: "2"}, {key: "3"}],
#     feature_view: "{FEATURE_VIEW}"
#   },
#   {
#     data_keys: [{key: "5"}, {key: "6"}, {key: "7"}],
#     feature_view: "{FEATURE_VIEW}"
#   }
# ]'

## Cleaning up

### Delete feature view and feature online store

In [None]:
# Delete the feature view first; it was created under the online store deleted next.
fv.delete()

In [None]:
# Delete the feature online store created for this tutorial.
fos.delete()

### Delete BigQuery dataset and table

In [None]:
# Bind the client to PROJECT_ID explicitly instead of relying on a default
# project being discoverable from the environment, which is not guaranteed
# (for example on Colab after user authentication).
client = bigquery.Client(project=PROJECT_ID)

In [None]:
# Delete the table written earlier in this notebook.
client.delete_table(f"{BQ_TABLE_URI}")

In [None]:
# Delete the dataset that held the table, addressed as `project.dataset`.
client.delete_dataset(f"{PROJECT_ID}.{BQ_DATASET_ID}")