In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# FraudFinder - Feature Engineering (batch) (New Feature Store)

## Overview

[FraudFinder](https://github.com/googlecloudplatform/fraudfinder) is a series of labs on how to build a real-time fraud detection system on Google Cloud. Throughout the FraudFinder labs, you will learn how to read historical bank transaction data stored in data warehouse, read from a live stream of new transactions, perform exploratory data analysis (EDA), do feature engineering, ingest features into a feature store, train a model using feature store, register your model in a model registry, evaluate your model, deploy your model to an endpoint, do real-time inference on your model with feature store, and monitor your model.

### Objective

As you engineer features for model training, it's important to consider how the features are computed when making predictions with new data. For online predictions, you may have features that can be pre-computed via _batch feature engineering_. You may also have features that need to be computed on-the-fly via _streaming-based feature engineering_. For these Fraudfinder labs, for computing features based on the last n _days_, you will use _batch_ feature engineering in BigQuery; for computing features based on the last n _minutes_, you will use _streaming-based_ feature engineering using Dataflow.

This notebook shows how to generate new features on bank transactions by customer and terminal over the last n days, by doing batch feature engineering in SQL with BigQuery. Then, you will create a feature store using Vertex AI Feature Store, and ingest your newly-created features from BigQuery into Vertex AI Feature Store, so that a feature store can become the single source of data for both training and model inference. 

You will also create some placeholder values for streaming-based feature engineering, which is covered in the next notebook, `03_feature_engineering_streaming.ipynb`.

This lab uses the following Google Cloud services and resources:

- [Vertex AI](https://cloud.google.com/vertex-ai/)
- [BigQuery](https://cloud.google.com/bigquery/)


Steps performed in this notebook:

- Build customer and terminal-related features
- Create Feature store, entities and features
- Ingest feature values in Feature store from BigQuery table
- Read features from the feature store

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* BigQuery

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Load configuration settings from the setup notebook

Set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

In [None]:
# Resolve the active gcloud project; the `!` capture yields the command's output lines.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = f"{PROJECT_ID}-fraudfinder"
# Read the settings file stored in the project bucket and execute it in this namespace.
# NOTE(review): exec() runs whatever is stored at that path, so anyone who can write to
# the bucket can run code in this kernel — confirm the bucket's access controls.
# NOTE(review): later cells use REGION, which is never assigned in this notebook, so it
# presumably comes from this config file — confirm.
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
print(config.n)
exec(config.n)

### Import libraries

In [None]:
# General
import datetime as dt
import json
import os
import random
import sys
import time
from datetime import datetime, timedelta
from typing import List, Union

# Data Engineering
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 500)

# Vertex AI and Vertex AI Feature Store
from google.cloud import aiplatform as vertex_ai
from google.cloud import bigquery
from google.cloud.aiplatform import EntityType, Feature, Featurestore

### Define constants

In [None]:
# Define the date range of transactions for feature engineering (last 10 days up until yesterday)
# NOTE(review): datetime.today() is the kernel's local time, not UTC — confirm that this
# matches the time zone of the transaction timestamps.
YESTERDAY = datetime.today() - timedelta(days=1)
YEAR_MONTH_PREFIX = YESTERDAY.strftime("%Y-%m")
DATAPROCESSING_START_DATE = (YESTERDAY - timedelta(days=10)).strftime("%Y-%m-%d")
DATAPROCESSING_END_DATE = YESTERDAY.strftime("%Y-%m-%d")

# Define BigQuery dataset and tables to calculate features.
RAW_BQ_TRANSACTION_TABLE_URI = f"{PROJECT_ID}.tx.tx"

# Destination table for the ingestion pipeline; it is the source of the customer feature view.
INGESTION_BQ_TRANSACTION_TABLE_URI = f"{PROJECT_ID}.tx.ingestion_tx_records"

RAW_BQ_LABELS_TABLE_URI = f"{PROJECT_ID}.tx.txlabels"
FEATURES_BQ_TABLE_URI = f"{PROJECT_ID}.tx.wide_features_table"

# Define Vertex AI Feature store settings.
# CUSTOMERS_TABLE_NAME = f"customers_{DATAPROCESSING_END_DATE.replace('-', '')}"
# CUSTOMERS_BQ_TABLE_URI = f"{PROJECT_ID}.tx.{CUSTOMERS_TABLE_NAME}"

# Views that expose the batch (n-day) features.
CUSTOMERS_FE_BQ_VIEW_URI = f"{PROJECT_ID}.tx.v_customers_features"

TERMINALS_TABLE_NAME = f"terminals_{DATAPROCESSING_END_DATE.replace('-', '')}"

# TERMINALS_BQ_TABLE_URI = f"{PROJECT_ID}.tx.{TERMINALS_TABLE_NAME}"

TERMINALS_FE_BQ_VIEW_URI = f"{PROJECT_ID}.tx.v_terminals_features"

# Tables that hold the streaming (n-minute) features.
CUSTOMERS_STREAMING_FE_TABLE_URI = f"{PROJECT_ID}.tx.t_customers_streaming_features"
TERMINALS_STREAMING_FE_TABLE_URI = f"{PROJECT_ID}.tx.t_terminals_streaming_features"

# NOTE(review): the online store created later in this notebook hard-codes its node
# counts instead of reading ONLINE_STORAGE_NODES — confirm whether this is still needed.
ONLINE_STORAGE_NODES = 1
FEATURE_TIME = "feature_ts"
CUSTOMER_ENTITY_ID = "customer"
TERMINAL_ENTITY_ID = "terminal"

### Helpers

Define a set of helper functions to run BigQuery query and create features. 

In [None]:
def run_bq_query(sql: str, show: bool = False) -> Union[pd.DataFrame, None]:
    """
    Run a BigQuery query and optionally return its result as a DataFrame.

    The statement is first submitted as a dry run to catch errors before
    anything is executed, then submitted for real and waited on.

    Args:
        sql: SQL query, as a string, to execute in BigQuery
        show: When True, return the query result as a Pandas DataFrame
    Returns:
        DataFrame of results from the query when `show` is True, otherwise None
    Raises:
        Exceptions raised by the BigQuery client (dry run or real job) are not
        caught here and propagate to the caller.
    """

    bq_client = bigquery.Client()

    # Try a dry run before executing the query to catch any errors
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    bq_client.query(sql, job_config=dry_run_config)

    # If the dry run succeeds without errors, proceed to run the query
    query_job = bq_client.query(sql, job_config=bigquery.QueryJobConfig())

    # Block until the job finishes so that callers can rely on its effects
    rows = query_job.result()
    print(f"Finished job_id: {query_job.job_id}")

    if not show:
        # Statements run for their side effects (DDL) have nothing to display
        return None
    return rows.to_arrow().to_pandas()

## Creating Destination Table for ingestion pipeline

In [None]:
# DDL for the ingestion pipeline's destination table: one row per transaction,
# partitioned by the transaction date and clustered by customer.
# NOTE(review): CREATE OR REPLACE recreates the table each time this statement runs,
# so re-running the notebook presumably discards any rows already ingested — confirm.
create_ingestion_tx_records_table = f"""
CREATE OR REPLACE TABLE `{INGESTION_BQ_TRANSACTION_TABLE_URI}`
(
  TX_ID STRING OPTIONS(description="Unique transaction identifier"),
  TX_TS TIMESTAMP OPTIONS(description="Timestamp of the transaction"),
  CUSTOMER_ID STRING OPTIONS(description="Unique customer identifier"),
  TERMINAL_ID STRING OPTIONS(description="Unique terminal identifier"),
  TX_AMOUNT FLOAT64 OPTIONS(description="The monetary value of the transaction")
)
PARTITION BY
  DATE(TX_TS)
CLUSTER BY
  CUSTOMER_ID
OPTIONS (
  description = "A table to store customer transaction data, partitioned by day and clustered by customer."
)"""
# Show the rendered statement (with the table name substituted) before running it.
print(create_ingestion_tx_records_table)

In [None]:
# Execute the DDL; run_bq_query waits for the job to finish before returning.
run_bq_query(create_ingestion_tx_records_table)

## Feature Engineering

### Define customer and terminal-related features for batch feature engineering

In this section, you will create features based on historical customer behaviour and historical terminal activities. These features will be batch-generated using SQL in BigQuery, where the historical data is stored.

The query below will calculate 2 sets of features: 

1. **Customer-related features**, which describe the spending behaviour of each customer within 1-, 7- and 14-day time windows, using the number of transactions and the average amount spent in dollars ($).

2. **Terminal-related features**, which describe the risk of a given terminal being exposed to fraudulent transactions within 1-, 7- and 14-day time windows, using the average number of fraudulent transactions in dollars ($), the number of transactions and a risk index. One thing to note is that you will add some delay to take into account the time that would pass between the time of the transaction and the result of the fraud investigation or customer claim.

You will use the most recent 15 days of transaction data, counting back from the current date, to compute the features.

Below is the schema you should expect to see, after doing the batch feature engineering in BigQuery:

|feature_time           |customer_id| customer batch features   |
|-----------------------|-----------|---------------------------|
|2022-01-01 17:20:15 UTC|1          |(e.g., nb_tx,  avg_tx)     |
|2022-01-02 12:08:40 UTC|2          |(e.g., nb_tx,  avg_tx)     |
|2022-01-03 17:30:48 UTC|3          |(e.g., nb_tx,  avg_tx)     |


|feature_time           |terminal_id| terminal batch features|
|-----------------------|-----------|------------------------|
|2022-01-01 17:20:15 UTC|12345      |(e.g., risk_x_days)     |
|2022-01-02 12:08:40 UTC|26789      |(e.g., risk_x_days)     |
|2022-01-03 17:30:48 UTC|101112     |(e.g., risk_x_days)     |


#### Create the query to create batch features

Date settings to be used:

##### Customer feature table

Customer table SQL query string:

In [None]:
# SQL that (re)creates the customer batch-feature view over the ingestion table.
# For every transaction in the last 15 days it computes, per customer, the number of
# transactions and the average amount over trailing 1-, 7- and 14-day windows
# (86400, 604800 and 1209600 seconds). The "15 days" in the SQL comments does not
# match the 14-day window that is actually computed.
# NOTE(review): the view emits one row per transaction and every row gets the same
# current_timestamp() as feature_timestamp, so a customer has several rows sharing one
# timestamp — confirm that this is what the feature store sync expects.
# NOTE(review): this view reads INGESTION_BQ_TRANSACTION_TABLE_URI, whereas the terminal
# view reads RAW_BQ_TRANSACTION_TABLE_URI — confirm that the difference is intended.
create_customer_batch_features_query = f"""
CREATE OR REPLACE VIEW `{CUSTOMERS_FE_BQ_VIEW_URI}` AS
WITH
  -- query to join labels with features -------------------------------------------------------------------------------------------
  get_raw_table AS (
  SELECT
    raw_tx.TX_TS,
    raw_tx.TX_ID,
    raw_tx.CUSTOMER_ID,
    raw_tx.TERMINAL_ID,
    raw_tx.TX_AMOUNT
  FROM (
    SELECT
      *
    FROM
      `{INGESTION_BQ_TRANSACTION_TABLE_URI}`
    WHERE
      DATE(TX_TS) BETWEEN DATE_SUB(CURRENT_DATETIME, INTERVAL 15 DAY) AND CURRENT_DATETIME
    ) raw_tx),

  -- query to calculate CUSTOMER spending behaviour --------------------------------------------------------------------------------
  get_customer_spending_behaviour AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    TX_AMOUNT,
    
    # calc the number of customer tx over daily windows per customer (1, 7 and 15 days, expressed in seconds)
    COUNT(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 86400 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_1DAY_WINDOW,
    COUNT(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_7DAY_WINDOW,
    COUNT(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_14DAY_WINDOW,
      
    # calc the customer average tx amount over daily windows per customer (1, 7 and 15 days, expressed in seconds, in dollars ($))
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 86400 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW,
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_14DAY_WINDOW,
  FROM get_raw_table)

# Create the table with CUSTOMER and TERMINAL features ----------------------------------------------------------------------------
SELECT
  current_timestamp() as feature_timestamp,
  CUSTOMER_ID AS customer_id,
  CAST(CUSTOMER_ID_NB_TX_1DAY_WINDOW AS INT64) AS customer_id_nb_tx_1day_window,
  CAST(CUSTOMER_ID_NB_TX_7DAY_WINDOW AS INT64) AS customer_id_nb_tx_7day_window,
  CAST(CUSTOMER_ID_NB_TX_14DAY_WINDOW AS INT64) AS customer_id_nb_tx_14day_window,
  CAST(CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_1day_window,
  CAST(CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_7day_window,
  CAST(CUSTOMER_ID_AVG_AMOUNT_14DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_14day_window,
FROM
  get_customer_spending_behaviour
"""

In [None]:
# Show the rendered SQL (with the table and view names substituted) before running it.
print(create_customer_batch_features_query)

##### Run the query 

Run the query to create the customer features view.

In [None]:
# Create (or replace) the customer feature view; nothing is returned because show is left False.
run_bq_query(create_customer_batch_features_query)

##### Inspect the result 

You can query some data rows to validate the result of the query

In [None]:
# Preview up to 10 rows of the customer feature view as a DataFrame.
# NOTE(review): the ingestion table behind this view was (re)created empty earlier in
# this notebook, so this may return no rows until data has been ingested — confirm.
run_bq_query(f"SELECT * FROM `{CUSTOMERS_FE_BQ_VIEW_URI}` LIMIT 10", show=True)

##### Terminal feature table

Terminal table SQL query string:

In [None]:
# SQL that (re)creates the terminal batch-feature view. Raw transactions from the last
# 15 days are left-joined to their fraud labels; per terminal, fraud and transaction
# counts are then taken over a 7-day delay period (604800 s) and over the delay plus
# 1, 7 and 14 days (691200, 1209600 and 1814400 s). Subtracting the delay-period counts
# leaves the counts for the delayed 1/7/14-day windows, and each risk index is
# fraud count / (transaction count + 0.0001); the 0.0001 presumably guards against
# division by zero.
# NOTE(review): the longest window looks back 21 days (1814400 s) but only 15 days of
# rows are selected, so the 14-day features appear to be computed on truncated
# history — confirm whether the date filter should be widened.
# NOTE(review): every row gets the same current_timestamp() as feature_timestamp, so a
# terminal has several rows sharing one timestamp — confirm that this is intended.
create_terminal_batch_features_query = f"""
# query to calculate TERMINAL spending behaviour --------------------------------------------------------------------------------
CREATE OR REPLACE VIEW `{TERMINALS_FE_BQ_VIEW_URI}` AS
WITH
  -- query to join labels with features -------------------------------------------------------------------------------------------
  get_raw_table AS (
  SELECT
    raw_tx.TX_TS,
    raw_tx.TX_ID,
    raw_tx.CUSTOMER_ID,
    raw_tx.TERMINAL_ID,
    raw_tx.TX_AMOUNT,
    raw_lb.TX_FRAUD
  FROM (
    SELECT
      *
    FROM
      `{RAW_BQ_TRANSACTION_TABLE_URI}`
    WHERE
      DATE(TX_TS) BETWEEN DATE_SUB(CURRENT_DATETIME, INTERVAL 15 DAY) AND CURRENT_DATETIME
    ) raw_tx
  LEFT JOIN 
    `{RAW_BQ_LABELS_TABLE_URI}` as raw_lb
  ON raw_tx.TX_ID = raw_lb.TX_ID),

  # query to calculate TERMINAL spending behaviour --------------------------------------------------------------------------------
  get_variables_delay_window AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    
    # calc total amount of fraudulent tx and the total number of tx over the delay period per terminal (7 days - delay, expressed in seconds)
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_DELAY,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS NB_TX_DELAY,
      
    # calc total amount of fraudulent tx and the total number of tx over the delayed window per terminal (window + 7 days - delay, expressed in seconds)
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 691200 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_1_DELAY_WINDOW,
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_7_DELAY_WINDOW,
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1814400 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_14_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 691200 PRECEDING
      AND CURRENT ROW ) AS NB_TX_1_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS NB_TX_7_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1814400 PRECEDING
      AND CURRENT ROW ) AS NB_TX_14_DELAY_WINDOW,
  FROM get_raw_table),

  # query to calculate TERMINAL risk factors ---------------------------------------------------------------------------------------
  get_risk_factors AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    # calculate numerator of risk index
    NB_FRAUD_1_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_1DAY_WINDOW,
    NB_FRAUD_7_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_7DAY_WINDOW,
    NB_FRAUD_14_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_14DAY_WINDOW,
    # calculate denominator of risk index
    NB_TX_1_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_1DAY_WINDOW,
    NB_TX_7_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_7DAY_WINDOW,
    NB_TX_14_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_14DAY_WINDOW,
      FROM
    get_variables_delay_window),

  # query to calculate the TERMINAL risk index -------------------------------------------------------------------------------------
  get_risk_index AS (
    SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    TERMINAL_ID_NB_TX_1DAY_WINDOW,
    TERMINAL_ID_NB_TX_7DAY_WINDOW,
    TERMINAL_ID_NB_TX_14DAY_WINDOW,
    # calculate the risk index
    (TERMINAL_ID_NB_FRAUD_1DAY_WINDOW/(TERMINAL_ID_NB_TX_1DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_1DAY_WINDOW,
    (TERMINAL_ID_NB_FRAUD_7DAY_WINDOW/(TERMINAL_ID_NB_TX_7DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_7DAY_WINDOW,
    (TERMINAL_ID_NB_FRAUD_14DAY_WINDOW/(TERMINAL_ID_NB_TX_14DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_14DAY_WINDOW
    FROM get_risk_factors 
  )

# Create the table with CUSTOMER and TERMINAL features ----------------------------------------------------------------------------
SELECT
  current_timestamp() as feature_timestamp,
  TERMINAL_ID AS terminal_id,
  CAST(TERMINAL_ID_NB_TX_1DAY_WINDOW AS INT64) AS terminal_id_nb_tx_1day_window,
  CAST(TERMINAL_ID_NB_TX_7DAY_WINDOW AS INT64) AS terminal_id_nb_tx_7day_window,
  CAST(TERMINAL_ID_NB_TX_14DAY_WINDOW AS INT64) AS terminal_id_nb_tx_14day_window,
  CAST(TERMINAL_ID_RISK_1DAY_WINDOW AS FLOAT64) AS terminal_id_risk_1day_window,
  CAST(TERMINAL_ID_RISK_7DAY_WINDOW AS FLOAT64) AS terminal_id_risk_7day_window,
  CAST(TERMINAL_ID_RISK_14DAY_WINDOW AS FLOAT64) AS terminal_id_risk_14day_window,
FROM
  get_risk_index
"""

In [None]:
# Show the rendered SQL (with the table and view names substituted) before running it.
print(create_terminal_batch_features_query)

##### Run the query 

Run the query to create the terminal features view.

In [None]:
# Create (or replace) the terminal feature view; nothing is returned because show is left False.
run_bq_query(create_terminal_batch_features_query)

##### Inspect the result 

You can query some data rows to validate the result of the query

In [None]:
# Preview up to 10 rows of the terminal feature view as a DataFrame.
run_bq_query(f"SELECT * FROM `{TERMINALS_FE_BQ_VIEW_URI}` LIMIT 10", show=True)

#### Define the query to initialize the real-time features.

##### Customer feature table

Customer table SQL query string:

In [None]:
# DDL that (re)creates the customer streaming-feature table with transaction-count and
# average-amount columns for 15-, 30- and 60-minute windows. The statement defines the
# schema only; no rows are inserted here.
initiate_real_time_customer_features_query = f"""
CREATE OR REPLACE TABLE `{CUSTOMERS_STREAMING_FE_TABLE_URI}`
(
    customer_id STRING,
    feature_timestamp TIMESTAMP,
    customer_id_nb_tx_15min_window INT64,
    customer_id_nb_tx_30min_window INT64,
    customer_id_nb_tx_60min_window INT64,
    customer_id_avg_amount_15min_window FLOAT64,
    customer_id_avg_amount_30min_window FLOAT64,
    customer_id_avg_amount_60min_window FLOAT64
)
"""

In [None]:
# DDL that (re)creates the terminal streaming-feature table with transaction-count and
# average-amount columns for 15-, 30- and 60-minute windows. The statement defines the
# schema only; no rows are inserted here.
initiate_real_time_terminal_features_query = f"""
CREATE OR REPLACE TABLE `{TERMINALS_STREAMING_FE_TABLE_URI}`
(
    terminal_id STRING,
    feature_timestamp TIMESTAMP,
    terminal_id_nb_tx_15min_window INT64,
    terminal_id_nb_tx_30min_window INT64,
    terminal_id_nb_tx_60min_window INT64,
    terminal_id_avg_amount_15min_window FLOAT64,
    terminal_id_avg_amount_30min_window FLOAT64,
    terminal_id_avg_amount_60min_window FLOAT64
)
"""

#### Run the query above to initialize the real-time features.

In [None]:
# Create (or replace) both streaming-feature tables, one statement at a time;
# run_bq_query waits for each job to finish before the next one is submitted.
for query in [
    initiate_real_time_customer_features_query,
    initiate_real_time_terminal_features_query,
]:
    run_bq_query(query)

#### Inspect BigQuery features tables

In [None]:
# Preview up to 5 rows; the table was just (re)created without inserting any rows,
# so expect an empty result unless something else has written to it.
run_bq_query(f"SELECT * FROM `{CUSTOMERS_STREAMING_FE_TABLE_URI}` LIMIT 5", show=True)

In [None]:
# Preview up to 5 rows; the table was just (re)created without inserting any rows,
# so expect an empty result unless something else has written to it.
run_bq_query(f"SELECT * FROM `{TERMINALS_STREAMING_FE_TABLE_URI}` LIMIT 5", show=True)

Let's look at the final schema of the features table:

### Initialize Vertex AI SDK

Initialize the Vertex AI SDK to get access to Vertex AI services programmatically. 

In [None]:
# Set the SDK-wide project, location and staging bucket.
# NOTE(review): REGION is not assigned anywhere in this notebook; it presumably comes
# from the config that the first code cell executes — confirm.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

## Create featurestore, `fraudfinder_featurestore`

### Set up and start online serving

Now for the exciting part! To serve data in a feature store, you need to do the following:

1. Create an online store cluster to host the data.
    * Create a `FeatureOnlineStore` instance with autoscaling.
1. Define the data (`FeatureView`) to be served by the newly-created instance. This can either map to
    * The BigQuery view that you just created for serving data.
    * The `FeatureGroup` and `Feature` you'll create to host feature metadata.

Bigtable serving latency is affected by the (Bigtable) load. However, when Bigtable is not overloaded, benchmarks show that the expected server-side latency is around 30 ms, measured at around 100 qps. The client-side latency is expected to be more than 5 ms higher than the server-side latency.

### Import libraries

In [None]:
from google.cloud import bigquery
from google.cloud.aiplatform_v1 import (
    FeatureOnlineStoreAdminServiceClient,
    FeatureOnlineStoreServiceClient,
    FeatureRegistryServiceClient,
)
from google.cloud.aiplatform_v1.types import feature as feature_pb2
from google.cloud.aiplatform_v1.types import feature_group as feature_group_pb2
from google.cloud.aiplatform_v1.types import (
    feature_online_store as feature_online_store_pb2,
)
from google.cloud.aiplatform_v1.types import (
    feature_online_store_admin_service as feature_online_store_admin_service_pb2,
)
from google.cloud.aiplatform_v1.types import (
    feature_online_store_service as feature_online_store_service_pb2,
)
from google.cloud.aiplatform_v1.types import (
    feature_registry_service as feature_registry_service_pb2,
)
from google.cloud.aiplatform_v1.types import feature_view as feature_view_pb2
from google.cloud.aiplatform_v1.types import (
    featurestore_service as featurestore_service_pb2,
)
from google.cloud.aiplatform_v1.types import io as io_pb2

### Initialize Admin Service Client

In [None]:
# Regional Vertex AI API host passed to the service clients as their api_endpoint.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

In [None]:
# Client used in this notebook to create and list online stores and to create feature views.
admin_client = FeatureOnlineStoreAdminServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)
# Client used in this notebook to create feature groups and their features.
registry_client = FeatureRegistryServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

### Create online store instance

To create an online store instance, create a `FeatureOnlineStore` instance with autoscaling.

In [None]:
FEATURE_ONLINE_STORE_ID = "fraudfinder_featurestore"

# Bigtable-backed online store whose autoscaling range is pinned to one node
# (min_node_count == max_node_count == 1) with a cpu_utilization_target of 50.
# NOTE(review): ONLINE_STORAGE_NODES is defined in the constants cell but is not used
# here — confirm whether the node counts should come from it.
online_store_config = feature_online_store_pb2.FeatureOnlineStore(
    bigtable=feature_online_store_pb2.FeatureOnlineStore.Bigtable(
        auto_scaling=feature_online_store_pb2.FeatureOnlineStore.Bigtable.AutoScaling(
            min_node_count=1, max_node_count=1, cpu_utilization_target=50
        )
    )
)

# Submit the create request; the returned long-running operation is waited on in a later cell.
# NOTE(review): re-running this cell once the store exists will presumably fail with an
# already-exists error — confirm, and guard the call if the notebook must be re-runnable.
create_store_lro = admin_client.create_feature_online_store(
    feature_online_store_admin_service_pb2.CreateFeatureOnlineStoreRequest(
        parent=f"projects/{PROJECT_ID}/locations/{REGION}",
        feature_online_store_id=FEATURE_ONLINE_STORE_ID,
        feature_online_store=online_store_config,
    )
)

### Verify online store instance creation

After the long-running operation (LRO) is complete, show the result.

> **Note:** This operation might take up to 10 minutes to complete.

In [None]:
# Wait for the LRO to finish and get the LRO result.
# Block until the store-creation operation finishes, then print what it returned.
print(create_store_lro.result())

In [None]:
# Use list to verify the store is created.
# List the online stores in this project and region; as the last expression in the
# cell, the response is shown as the cell output.
admin_client.list_feature_online_stores(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}"
)

### Create feature view instance

After creating a `FeatureOnlineStore` instance, you can define the features to serve with it. To do this, create a `FeatureView` instance, which specifies the following:

* A data source (BigQuery table or view URI or FeatureGroup/features ) synced to the `FeatureOnlineStore` instance for serving.
* The cron schedule to run the sync pipeline.

### Create featureView with FeatureGroups/Feature

> **Note:** If you've already created a feature view with BQ source, skip this section and go to [Verify online store instance creation](#scrollTo=igOmzHxx1C0X).

#### [Optional] Create FeatureGroup/Features

Create a FeatureGroup pointing to the created BigQuery view for the demo. You then create features for each column you'd like to register.


##### Data source preparation guidelines for Feature Registry data source

Note that if you choose to use Feature Registry source, Feature Store only provides the option to support time-series sources for which Feature Store generates latest featureValues.

Use the following guidelines to understand the schema and constraints while creating the BigQuery source:

* The BigQuery table or view *must* have a column with `string` values to use as the (entity) IDs. You'll need to specify that this column is the ID column during `FeatureGroup` creation. Note that the size of each value in this column must be less than 4 KB.
* The BigQuery table or view *must* have a column named `feature_timestamp` with `timestamp` values to use as timestamp column.
* Feature Registry sources are treated as sparse by default i.e. a point in time lookup (BQ.ML_FEATURES_AT_TIME()) to generate latest featureValues per entityId.
* Provide the values for each feature in a separate column. Supported data types are `bool`, `int`, `double`, `string`, `timestamp`, arrays of these data types, and `bytes`. Note that the `timestamp` data type is converted to `int64` during data sync.
* Feature Store validates the schema during `FeatureView`/`FeatureGroup`/`Feature` creation. However, it doesn't revalidate the schema during each data sync. Columns with unsupported data types added after `FeatureView` creation time are ignored.
* The BigQuery table or view must be in either the same region as the online store, or in a multiregion that overlaps with the online store. For example, if the online store is in `us-central`, the BigQuery source can be in `us-central` or in `US`.

#### Define utility method for feature groups creation

In [None]:
def create_fs_feature_group(
    bq_source_uri: str,
    entity_id_column: str,
    feature_group_id: str,
    feature_ids_list: List[str],
):
    """
    Register a BigQuery-backed FeatureGroup and one Feature per listed ID.

    The group is created first and waited on; the feature-creation requests
    are then all submitted before any of them is waited on.

    Args:
        bq_source_uri: BigQuery table or view to use as the source, without
            the `bq://` prefix (it is added here).
        entity_id_column: Name of the source column holding the entity IDs.
        feature_group_id: ID to give the new FeatureGroup.
        feature_ids_list: IDs of the features to create inside the group.
    Returns:
        None. The result of each long-running operation is printed.
    """
    location_path = f"projects/{PROJECT_ID}/locations/{REGION}"

    # Describe the group: its BigQuery source and which column identifies the entity.
    feature_group_config = feature_group_pb2.FeatureGroup(
        big_query=feature_group_pb2.FeatureGroup.BigQuery(
            big_query_source=io_pb2.BigQuerySource(input_uri=f"bq://{bq_source_uri}"),
            entity_id_columns=[entity_id_column],
        )
    )
    create_group_lro = registry_client.create_feature_group(
        feature_registry_service_pb2.CreateFeatureGroupRequest(
            parent=location_path,
            feature_group_id=feature_group_id,
            feature_group=feature_group_config,
        )
    )

    # The features are created inside the group, so wait for it before continuing.
    print(create_group_lro.result())

    # Submit every feature-creation request up front. The loop name is
    # `feature_id` rather than `id` so that the builtin is not shadowed.
    group_path = f"{location_path}/featureGroups/{feature_group_id}"
    create_feature_lros = [
        registry_client.create_feature(
            featurestore_service_pb2.CreateFeatureRequest(
                parent=group_path,
                feature_id=feature_id,
                feature=feature_pb2.Feature(),
            )
        )
        for feature_id in feature_ids_list
    ]

    # Wait for each feature creation to finish and show its result.
    for create_feature_lro in create_feature_lros:
        print(create_feature_lro.result())

In [None]:
CUSTOMER_ID_COLUMN = "customer_id"  # entity_id

CUSTOMER_BATCH_FEATURES_GROUP_ID = "fraudfinder_customers_batch"

# Feature IDs use the same names as the feature columns selected by the customer batch view.
CUSTOMER_BATCH_FEATURE_IDS = [
    "customer_id_nb_tx_14day_window",
    "customer_id_avg_amount_7day_window",
    "customer_id_nb_tx_1day_window",
    "customer_id_avg_amount_1day_window",
    "customer_id_avg_amount_14day_window",
    "customer_id_nb_tx_7day_window",
]

# Creating feature Group for batch for customers
create_fs_feature_group(
    bq_source_uri=CUSTOMERS_FE_BQ_VIEW_URI,
    entity_id_column=CUSTOMER_ID_COLUMN,
    feature_group_id=CUSTOMER_BATCH_FEATURES_GROUP_ID,
    feature_ids_list=CUSTOMER_BATCH_FEATURE_IDS,
)

In [None]:
CUSTOMER_STREAMING_FEATURES_GROUP_ID = "fraudfinder_customers_streaming"
# Feature IDs use the same names as the feature columns of the customer streaming table.
CUSTOMER_STREAMING_FEATURE_IDS = [
    "customer_id_nb_tx_15min_window",
    "customer_id_nb_tx_30min_window",
    "customer_id_nb_tx_60min_window",
    "customer_id_avg_amount_15min_window",
    "customer_id_avg_amount_30min_window",
    "customer_id_avg_amount_60min_window",
]

# Creating feature Group for streaming for customers
create_fs_feature_group(
    bq_source_uri=CUSTOMERS_STREAMING_FE_TABLE_URI,
    entity_id_column=CUSTOMER_ID_COLUMN,
    feature_group_id=CUSTOMER_STREAMING_FEATURES_GROUP_ID,
    feature_ids_list=CUSTOMER_STREAMING_FEATURE_IDS,
)

In [None]:
# Now, create the featureGroup for terminals
TERMINAL_ID_COLUMN = "terminal_id"  # entity_id

TERMINAL_BATCH_FEATURES_GROUP_ID = "fraudfinder_terminals_batch"
# Feature IDs use the same names as the feature columns selected by the terminal batch view.
TERMINAL_BATCH_FEATURE_IDS = [
    "terminal_id_nb_tx_1day_window",
    "terminal_id_nb_tx_7day_window",
    "terminal_id_nb_tx_14day_window",
    "terminal_id_risk_1day_window",
    "terminal_id_risk_7day_window",
    "terminal_id_risk_14day_window",
]

# Creating feature Group for batch for terminals
create_fs_feature_group(
    bq_source_uri=TERMINALS_FE_BQ_VIEW_URI,
    entity_id_column=TERMINAL_ID_COLUMN,
    feature_group_id=TERMINAL_BATCH_FEATURES_GROUP_ID,
    feature_ids_list=TERMINAL_BATCH_FEATURE_IDS,
)

In [None]:
# Now, create the featureGroup for terminals streaming features
TERMINAL_STREAMING_FEATURES_GROUP_ID = "fraudfinder_terminals_streaming"
# Feature IDs use the same names as the feature columns of the terminal streaming table.
TERMINAL_STREAMING_FEATURE_IDS = [
    "terminal_id_nb_tx_15min_window",
    "terminal_id_nb_tx_30min_window",
    "terminal_id_nb_tx_60min_window",
    "terminal_id_avg_amount_15min_window",
    "terminal_id_avg_amount_30min_window",
    "terminal_id_avg_amount_60min_window",
]

# Creating feature Group for streaming for terminals
create_fs_feature_group(
    bq_source_uri=TERMINALS_STREAMING_FE_TABLE_URI,
    entity_id_column=TERMINAL_ID_COLUMN,
    feature_group_id=TERMINAL_STREAMING_FEATURES_GROUP_ID,
    feature_ids_list=TERMINAL_STREAMING_FEATURE_IDS,
)


Create a `FeatureView` instance for the BigQuery view and FeatureGroup/features you created earlier in this tutorial and set the sync time and frequency to 15 min. (*/15 * * * *)

In [None]:
def create_online_fs_view(
    fs_view_id,
    fs_online_store_id,
    feature_group_id,
    feature_ids_list,
    continuous,
    cron_schedule=None,
):
    """Create a FeatureView in an online store and print the operation result.

    The view is sourced from a single feature group of the Feature Registry,
    and the request is sent with ``run_sync_immediately=True``.

    Args:
        fs_view_id: ID to give the new FeatureView.
        fs_online_store_id: ID of the FeatureOnlineStore (under PROJECT_ID and
            REGION) that will hold the view.
        feature_group_id: ID of the feature group backing the view.
        feature_ids_list: IDs of the features from that group to include.
        continuous: If truthy, the view is configured for continuous sync;
            otherwise ``cron_schedule`` is used.
        cron_schedule: Cron expression for the sync; only read when
            ``continuous`` is falsy.

    Returns:
        None. The result of the long-running operation is printed.
    """
    # Describe which feature group, and which of its features, feed the view.
    registry_group = feature_view_pb2.FeatureView.FeatureRegistrySource.FeatureGroup(
        feature_group_id=feature_group_id,
        feature_ids=feature_ids_list,
    )
    registry_source = feature_view_pb2.FeatureView.FeatureRegistrySource(
        feature_groups=[registry_group]
    )

    # Exactly one of the two sync modes is set on the SyncConfig.
    sync_kwargs = {"continuous": True} if continuous else {"cron": cron_schedule}
    sync_settings = feature_view_pb2.FeatureView.SyncConfig(**sync_kwargs)

    create_request = feature_online_store_admin_service_pb2.CreateFeatureViewRequest(
        parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{fs_online_store_id}",
        feature_view_id=fs_view_id,
        run_sync_immediately=True,
        feature_view=feature_view_pb2.FeatureView(
            feature_registry_source=registry_source,
            sync_config=sync_settings,
        ),
    )
    operation = admin_client.create_feature_view(create_request)

    # Block until the long-running operation completes, then show its result.
    print(operation.result())

In [None]:
# Creating featurestore view for Customers Features, batch FE pipeline.
CUSTOMER_BATCH_FEATURE_VIEW_ID = "fv_fraudfinder_customers_batch"

# Cron expression with a time-zone prefix; also reused for the terminals batch view.
CRON_SCHEDULE = "TZ=America/Los_Angeles */15 * * * *"  # Every 15 minutes

# continuous=False makes create_online_fs_view use the cron schedule.
create_online_fs_view(
    fs_view_id=CUSTOMER_BATCH_FEATURE_VIEW_ID,
    fs_online_store_id=FEATURE_ONLINE_STORE_ID,
    feature_group_id=CUSTOMER_BATCH_FEATURES_GROUP_ID,
    feature_ids_list=CUSTOMER_BATCH_FEATURE_IDS,
    continuous=False,
    cron_schedule=CRON_SCHEDULE,
)

In [None]:
# Create the feature view for the customers' features from the streaming FE pipeline.
CUSTOMER_STREAMING_FEATURE_VIEW_ID = "fv_fraudfinder_customers_streaming"

# continuous=True selects continuous sync, so no cron schedule is passed.
create_online_fs_view(
    fs_view_id=CUSTOMER_STREAMING_FEATURE_VIEW_ID,
    fs_online_store_id=FEATURE_ONLINE_STORE_ID,
    feature_group_id=CUSTOMER_STREAMING_FEATURES_GROUP_ID,
    feature_ids_list=CUSTOMER_STREAMING_FEATURE_IDS,
    continuous=True,
)

In [None]:
# Create the feature view for the terminals' features from the batch FE pipeline.
TERMINAL_BATCH_FEATURE_VIEW_ID = "fv_fraudfinder_terminals_batch"

# continuous=False makes create_online_fs_view use CRON_SCHEDULE
# (the same schedule as the customers batch view).
create_online_fs_view(
    fs_view_id=TERMINAL_BATCH_FEATURE_VIEW_ID,
    fs_online_store_id=FEATURE_ONLINE_STORE_ID,
    feature_group_id=TERMINAL_BATCH_FEATURES_GROUP_ID,
    feature_ids_list=TERMINAL_BATCH_FEATURE_IDS,
    continuous=False,
    cron_schedule=CRON_SCHEDULE,
)

In [None]:
# Create the feature view for the terminals' features from the streaming FE pipeline.
TERMINAL_STREAMING_FEATURE_VIEW_ID = "fv_fraudfinder_terminals_streaming"

# continuous=True selects continuous sync, so no cron schedule is passed.
create_online_fs_view(
    fs_view_id=TERMINAL_STREAMING_FEATURE_VIEW_ID,
    fs_online_store_id=FEATURE_ONLINE_STORE_ID,
    feature_group_id=TERMINAL_STREAMING_FEATURES_GROUP_ID,
    feature_ids_list=TERMINAL_STREAMING_FEATURE_IDS,
    continuous=True,
)

Verify that the `FeatureView` instances were created by listing all the feature views within the online store.

In [None]:
# Again, list all feature views under the FEATURE_ONLINE_STORE_ID to confirm
admin_client.list_feature_views(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)

### Start sync manually

The sync pipeline executes according to the schedule specified in the `FeatureView` instance.

To skip the wait and execute the sync pipeline immediately, start the sync manually.

In [None]:
# Manually trigger a sync of the terminals batch feature view and keep the
# response; its `feature_view_sync` field names the sync job to poll.
sync_response = admin_client.sync_feature_view(
    feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{TERMINAL_BATCH_FEATURE_VIEW_ID}"
)

The `sync_response` contains the ID of the sync job.

Use `get_feature_view_sync` to check the status of the job.

In [None]:
import time

# Poll the sync job started above until it reports an end time, then print
# the outcome. Checks happen every 30 seconds.
while True:
    feature_view_sync = admin_client.get_feature_view_sync(
        name=sync_response.feature_view_sync
    )
    # A non-zero end time means the sync run has finished.
    if feature_view_sync.run_time.end_time.seconds > 0:
        # Status code 0 is treated as success; anything else is a failure.
        status = "Succeed" if feature_view_sync.final_status.code == 0 else "Failed"
        print(f"Sync {status} for {feature_view_sync.name}.")
        if feature_view_sync.final_status.code != 0:
            # Surface the reason so a failed sync can be diagnosed from the
            # notebook instead of only showing "Failed".
            print(
                f"Error code {feature_view_sync.final_status.code}: "
                f"{feature_view_sync.final_status.message}"
            )
        break
    print("Sync ongoing, waiting for 30 seconds.")
    time.sleep(30)

In [None]:
# List the sync jobs of the terminals batch feature view.
admin_client.list_feature_view_syncs(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{TERMINAL_BATCH_FEATURE_VIEW_ID}"
)

In [None]:
# Manually trigger a sync of the customers batch feature view.
# Note: this overwrites the `sync_response` from the terminals sync.
sync_response = admin_client.sync_feature_view(
    feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{CUSTOMER_BATCH_FEATURE_VIEW_ID}"
)

In [None]:
import time

# Poll the most recently started sync job (the one named by `sync_response`)
# until it reports an end time, then print the outcome. Checks happen every
# 30 seconds.
while True:
    feature_view_sync = admin_client.get_feature_view_sync(
        name=sync_response.feature_view_sync
    )
    # A non-zero end time means the sync run has finished.
    if feature_view_sync.run_time.end_time.seconds > 0:
        # Status code 0 is treated as success; anything else is a failure.
        status = "Succeed" if feature_view_sync.final_status.code == 0 else "Failed"
        print(f"Sync {status} for {feature_view_sync.name}.")
        if feature_view_sync.final_status.code != 0:
            # Surface the reason so a failed sync can be diagnosed from the
            # notebook instead of only showing "Failed".
            print(
                f"Error code {feature_view_sync.final_status.code}: "
                f"{feature_view_sync.final_status.message}"
            )
        break
    print("Sync ongoing, waiting for 30 seconds.")
    time.sleep(30)

In [None]:
# List the sync jobs of the customers batch feature view.
admin_client.list_feature_view_syncs(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{CUSTOMER_BATCH_FEATURE_VIEW_ID}"
)

### Start online serving

After the data sync is complete, use the `FetchFeatureValues` API to retrieve the data.

In [None]:
# Client used below to fetch feature values; it is pointed at API_ENDPOINT.
data_client = FeatureOnlineStoreServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

The `FeatureView` already defines the features needed for the model (via the BigQuery view in this demo). To fetch the data, submit a `fetch_feature_values` request specifying the `FeatureView` resource path and the ID of the entity.

In [None]:
# Echo the online store and feature view being queried.
print(FEATURE_ONLINE_STORE_ID)
print(CUSTOMER_BATCH_FEATURE_VIEW_ID)

customer_key = "0001071169708317"  # Put known id here

FEATURE_VIEW_FULL_ID = f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{CUSTOMER_BATCH_FEATURE_VIEW_ID}"

# Any failure (building the request, the call itself, or JSON encoding) is
# reported as a single "ERROR: ..." line rather than a traceback.
try:
    fe_data = data_client.fetch_feature_values(
        request=feature_online_store_service_pb2.FetchFeatureValuesRequest(
            feature_view=FEATURE_VIEW_FULL_ID,
            data_key=feature_online_store_service_pb2.FeatureViewDataKey(
                key=customer_key
            ),
            data_format=feature_online_store_service_pb2.FeatureViewDataFormat.PROTO_STRUCT,
        )
    )
    # Copy the returned struct into a plain dict so it can be JSON-encoded.
    features_map = dict(fe_data.proto_struct.items())
    print(json.dumps(features_map))
except Exception as exp:
    print("ERROR: " + str(exp))

### Inspect your feature store in the Vertex AI console

You can also inspect your feature store in the [Vertex AI Feature Store console](https://console.cloud.google.com/vertex-ai/feature-store/online-stores).

### END

Now you can go to the next notebook `03_feature_engineering_streaming_new_fs.ipynb`

## Clean up