In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fraudfinder - Feature engineering (batch)

<table align="left">
  <td>
    <a href="https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?name=Model%20Monitoring&download_url=https%3A%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmaster%2Fnotebooks%2Fcommunity%2Fmodel_monitoring%2Fmodel_monitoring_feature_attribs.ipynb">
       <img src="https://www.gstatic.com/cloud/images/navigation/vertex-ai.svg" alt="Google Cloud Notebooks">Open in Cloud Notebook
    </a>
  </td> 
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/community/model_monitoring/model_monitoring_feature_attribs.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Open in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/community/model_monitoring/model_monitoring_feature_attribs.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

[Fraudfinder](https://github.com/googlecloudplatform/fraudfinder) is a series of labs on how to build a real-time fraud detection system on Google Cloud. Throughout the Fraudfinder labs, you will learn how to read historical bank transaction data stored in a data warehouse, read from a live stream of new transactions, perform exploratory data analysis (EDA), do feature engineering, ingest features into a feature store, train a model using the feature store, register your model in a model registry, evaluate your model, deploy your model to an endpoint, do real-time inference on your model with the feature store, and monitor your model.

### Objective

This notebook shows how to generate new features on bank transactions by customer and terminal, by doing batch feature engineering in SQL with BigQuery. Then, you will create a feature store using Vertex AI Feature Store, and ingest your newly-created features from BigQuery into Vertex AI Feature Store, so that a feature store can become the single source of data for both training and model inference. 

In this notebook, you will learn how to:

- Use BigQuery for feature engineering
- Create a feature store
- Ingest features into the feature store
- Read features from the feature store

### Load configuration settings from the setup notebook

First, set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

In [None]:
# Resolve the active gcloud project; `!cmd` captures the command's output lines.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
# Bucket naming convention used by this notebook: "<project id>-fraudfinder".
BUCKET_NAME = f"{PROJECT_ID}-fraudfinder"
# Fetch the environment file stored in the bucket.
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
# `config.n` is the captured output joined into a single newline-separated string.
print(config.n)
# NOTE(review): exec() runs whatever Python is stored in that bucket object inside this kernel,
# so only run this against a bucket you trust. Later cells use names that no cell here defines
# (e.g. REGION, FEATURESTORE_ID); presumably they come from this exec — confirm.
exec(config.n)

### Import libraries

In [None]:
import os
import sys
import random
import time
import datetime as dt
from datetime import datetime, timedelta
from typing import List, Union
import json

# Data Engineering
import numpy as np
import pandas as pd
from google.cloud import bigquery

# Vertex AI and Vertex AI Feature Store 
from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import Featurestore, EntityType, Feature

### Define constants

In [None]:
# Define the range of transactions for training (Jan 2022).
# The end date hard-codes day 31, which is only valid for 31-day months such as January.
YEAR_MONTH_PREFIX = "2022-01"
DATAPROCESSING_START_DATE = f"{YEAR_MONTH_PREFIX}-01"
DATAPROCESSING_END_DATE = f"{YEAR_MONTH_PREFIX}-31"

# Define BigQuery dataset, tables, and time windows to calculate static behavioral features.
RAW_TABLE_LABELS = "txlabels"
# Prefix of the daily feature tables; create_batch_features appends "_<YYYYMMDD>" to it.
FEATURES_TABLE_NAME = f"{PROJECT_ID}.tx.wide_features_table"
# Wildcard table ("<prefix>_202201*") matching every daily feature table of the month.
# The name is misspelled ("PARTIONED") but later cells reference it as-is.
FEATURES_PARTIONED_TABLE = f"{FEATURES_TABLE_NAME}_{YEAR_MONTH_PREFIX.replace('-', '')}*"

# Define Vertex AI Feature store settings.
# Passed as online_store_fixed_node_count when the featurestore is created.
ONLINE_STORAGE_NODES = 1
# Name of the timestamp column passed as feature_time during ingestion.
FEATURE_TIME = "feature_ts"
CUSTOMER_ENTITY_ID = "customer"
TERMINAL_ENTITY_ID = "terminal"
# Per-entity source tables for ingestion, suffixed with the end date as YYYYMMDD.
CUSTOMERS_TABLE_NAME = f"{PROJECT_ID}.tx.customers_{DATAPROCESSING_END_DATE.replace('-', '')}"
TERMINALS_TABLE_NAME = f"{PROJECT_ID}.tx.terminals_{DATAPROCESSING_END_DATE.replace('-', '')}"

### Helpers

In [None]:
def run_bq_query(sql: str, project: str, region: str, return_df=False) -> Union[pd.DataFrame, None]:
    """
    A helper function to run a BigQuery query.

    The query is first submitted as a dry run, so that an invalid query raises
    before anything is executed, and is then run for real.

    Args:
        sql: BigQuery query to run
        project: id of the project the query job runs in
        region: location the query job runs in
        return_df: return a dataframe or not
    Returns:
        df: BigQuery query result as a pandas DataFrame when `return_df` is
            True, otherwise None
    """

    # Create a BigQuery client bound to the requested project and location,
    # the same way create_batch_features builds its client. Previously both
    # arguments were accepted but silently ignored.
    bq_client = bigquery.Client(project=project, location=region)

    # Try dry run before executing query to catch any errors
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    bq_client.query(sql, job_config=dry_run_config)

    # Proceed to run query and block until it has finished
    query_job = bq_client.query(sql, job_config=bigquery.QueryJobConfig())
    result = query_job.result()
    print(f"Finished job_id: {query_job.job_id}")

    if return_df:
        # Get & return data frame
        return result.to_arrow().to_pandas()
    return None
    
    
def create_batch_features(features_table_name: str,
                          query: str,
                          end_date: str,
                          backfill_interval: int,
                          project_id: str,
                          region: str
                         ) -> List[str]:
    """
    A helper function to create batch features.

    Runs `query` once for each of the `backfill_interval + 1` days ending on
    `end_date` (inclusive) and writes each day's result to its own table named
    `<features_table_name>_<YYYYMMDD>`, replacing the table if it exists.

    Args:
        features_table_name: name (prefix) of the feature tables
        query: query template to create a feature table; every occurrence of
            `@END_DATE_TRAIN` is replaced with the quoted day being processed
        end_date: last day to process, formatted as YYYY-MM-DD
        backfill_interval: number of days before `end_date` to backfill
        project_id: project id
        region: region
    Returns:
        features_table_names: list of the feature tables that were written,
            oldest day first
    """
    # Initialize the backfill starting date.
    backfill_start_date = datetime.strptime(end_date, "%Y-%m-%d") - timedelta(days=backfill_interval)

    # Initialize a list to contain feature table names.
    features_table_names = []

    try:
        # Create a BigQuery client.
        bq_client = bigquery.Client(project=project_id, location=region)

        # For each day in the backfill interval
        for day_offset in range(backfill_interval + 1):

            # Get the backfill date.
            backfill_date = backfill_start_date + timedelta(days=day_offset)
            date_query = backfill_date.strftime("%Y-%m-%d")

            # Create the feature table name.
            destination = f'{features_table_name}_{backfill_date.strftime("%Y%m%d")}'

            # Fill the placeholder into a per-day copy of the template. The
            # template itself must stay untouched: overwriting `query` would
            # remove the placeholder after the first day, and every later
            # table would then be computed with the first day's date.
            daily_query = query.replace("@END_DATE_TRAIN", f"'{date_query}'")

            # Create the query job.
            job_config = bigquery.QueryJobConfig(destination=destination, write_disposition='WRITE_TRUNCATE')

            # Run the query and wait for it to finish.
            job = bq_client.query(daily_query, job_config=job_config)
            _ = job.result()

            # Append the feature table name to the list.
            features_table_names.append(destination)

    except RuntimeError as error:
        # Best effort: report the error and return the tables written so far.
        print(error)

    return features_table_names

### Initialize Vertex AI SDK
Initialize the Vertex AI SDK.

In [None]:
# Set the default project, location and staging bucket for later Vertex AI SDK calls.
# REGION is not defined in any cell shown here; presumably it comes from the exec'd config — confirm.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

## Feature Engineering

### Customer ID and Terminal ID transformations

In this section, you will create features, based on historical customer behaviour and historical terminal activity, which can be later used to train a machine learning model. 

Shown below are some SQL queries used to generate the wide features table you'll want to ingest into feature store. 

The query will calculate 2 sets of features: 

1. Customer features, which describe the spending behaviour of a customer within 1-, 7- and 14-day time windows, using the number of transactions and the average amount spent.

2. Terminal features, which describe the risk of a given terminal being exposed to fraudulent transactions within 1-, 7- and 14-day time windows, using the number of transactions and the fraction of them that were fraudulent. Note that these windows are shifted back by a 7-day delay, to account for the time that passes between a transaction and the outcome of the fraud investigation or customer claim.

You will use one month of transaction data starting from the end of January and going back. 

#### Define the query

In [None]:
# SQL template that builds the wide, per-transaction feature table.
# - Every `@END_DATE_TRAIN` placeholder is replaced with a quoted date by create_batch_features
#   before the query runs; `{RAW_TABLE_LABELS}` is filled in right here by the f-string.
# - Window sizes are given in seconds over UNIX_SECONDS(TX_TS):
#   86400 = 1 day, 604800 = 7 days, 691200 = 8 days, 1209600 = 14 days, 1814400 = 21 days.
# - Terminal features subtract the most recent 7 days (the "delay" period) from windows that are
#   7 days longer (8/14/21 days), leaving 1/7/14-day windows that end 7 days before each transaction.
# - The risk index divides the fraud count by (transaction count + 0.0001) to avoid division by zero.
# - feature_ts combines the substituted date with the wall-clock time at which the query runs.
create_batch_features_query = \
f"""
WITH
  # query to join labels with features -------------------------------------------------------------------------------------------
  get_raw_table AS (
  SELECT
    raw_tx.TX_TS,
    raw_tx.TX_ID,
    raw_tx.CUSTOMER_ID,
    raw_tx.TERMINAL_ID,
    raw_tx.TX_AMOUNT,
    raw_lb.TX_FRAUD
  FROM (
    SELECT
      *
    FROM
      `tx.tx`
    WHERE
      DATE(TX_TS) <= @END_DATE_TRAIN
    ORDER BY
      TX_TS) AS raw_tx
  LEFT JOIN 
    `tx.{RAW_TABLE_LABELS}` as raw_lb
  ON raw_tx.TX_ID = raw_lb.TX_ID),

  # query to calculate CUSTOMER spending behaviour --------------------------------------------------------------------------------
  get_customer_spending_behaviour AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    TX_AMOUNT,
    TX_FRAUD,
    
    # calculate the number of customer transactions over windows
    COUNT(TX_FRAUD) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 86400 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_1DAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_7DAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_14DAY_WINDOW,
      
    # calculate the customer average transactions amount over windows
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 86400 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW,
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_14DAY_WINDOW,
  FROM get_raw_table),

  # query to calculate TERMINAL spending behaviour --------------------------------------------------------------------------------
  get_variables_delay_window AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    
    # calculate total amount and the total number of trasactions over the delay period (7 days - delay)
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_DELAY,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS NB_TX_DELAY,
      
    # calculate total amount and the total number of trasactions over the delayed window (window + 7 days - delay)
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 691200 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_1_DELAY_WINDOW,
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_7_DELAY_WINDOW,
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1814400 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_14_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 691200 PRECEDING
      AND CURRENT ROW ) AS NB_TX_1_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS NB_TX_7_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1814400 PRECEDING
      AND CURRENT ROW ) AS NB_TX_14_DELAY_WINDOW,
  FROM get_raw_table),

  # query to calculate TERMINAL risk factors ---------------------------------------------------------------------------------------
  get_risk_factors AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    # calculate numerator
    NB_FRAUD_1_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_1DAY_WINDOW,
    NB_FRAUD_7_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_7DAY_WINDOW,
    NB_FRAUD_14_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_14DAY_WINDOW,
    # calculate denominator
    NB_TX_1_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_1DAY_WINDOW,
    NB_TX_7_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_7DAY_WINDOW,
    NB_TX_14_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_14DAY_WINDOW,
      FROM
    get_variables_delay_window),

  # query to calculate the TERMINAL risk index -------------------------------------------------------------------------------------
  get_risk_index AS (
    SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    TERMINAL_ID_NB_TX_1DAY_WINDOW,
    TERMINAL_ID_NB_TX_7DAY_WINDOW,
    TERMINAL_ID_NB_TX_14DAY_WINDOW,
    # calculate the risk index
    (TERMINAL_ID_NB_FRAUD_1DAY_WINDOW/(TERMINAL_ID_NB_TX_1DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_1DAY_WINDOW,
    (TERMINAL_ID_NB_FRAUD_7DAY_WINDOW/(TERMINAL_ID_NB_TX_7DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_7DAY_WINDOW,
    (TERMINAL_ID_NB_FRAUD_14DAY_WINDOW/(TERMINAL_ID_NB_TX_14DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_14DAY_WINDOW
    FROM get_risk_factors 
  )

SELECT
  PARSE_TIMESTAMP('%Y-%m-%d %H:%M:%S', CONCAT(@END_DATE_TRAIN, ' ', STRING(TIME_TRUNC(CURRENT_TIME(), SECOND))), 'UTC') AS feature_ts,
  a.CUSTOMER_ID AS customer_id,
  a.TERMINAL_ID AS terminal_id,
  CAST(a.CUSTOMER_ID_NB_TX_1DAY_WINDOW AS INT64) AS customer_id_nb_tx_1day_window,
  CAST(a.CUSTOMER_ID_NB_TX_7DAY_WINDOW AS INT64) AS customer_id_nb_tx_7day_window,
  CAST(a.CUSTOMER_ID_NB_TX_14DAY_WINDOW AS INT64) AS customer_id_nb_tx_14day_window,
  CAST(a.CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_1day_window,
  CAST(a.CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_7day_window,
  CAST(a.CUSTOMER_ID_AVG_AMOUNT_14DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_14day_window,
  CAST(b.TERMINAL_ID_NB_TX_1DAY_WINDOW AS INT64) AS terminal_id_nb_tx_1day_window,
  CAST(b.TERMINAL_ID_NB_TX_7DAY_WINDOW AS INT64) AS terminal_id_nb_tx_7day_window,
  CAST(b.TERMINAL_ID_NB_TX_14DAY_WINDOW AS INT64) AS terminal_id_nb_tx_14day_window,
  CAST(b.TERMINAL_ID_RISK_1DAY_WINDOW AS FLOAT64) AS terminal_id_risk_1day_window,
  CAST(b.TERMINAL_ID_RISK_7DAY_WINDOW AS FLOAT64) AS terminal_id_risk_7day_window,
  CAST(b.TERMINAL_ID_RISK_14DAY_WINDOW AS FLOAT64) AS terminal_id_risk_14day_window,
FROM
  get_customer_spending_behaviour AS a
INNER JOIN
  get_risk_index AS b
ON
  a.TX_TS = b.TX_TS
  AND a.TX_ID = b.TX_ID
  AND a.CUSTOMER_ID = b.CUSTOMER_ID
  AND a.TERMINAL_ID = b.TERMINAL_ID
ORDER BY
  a.TX_TS
"""

#### Run the query above to aggregate and store the wide feature data in a BigQuery table.

In the code below, you will run the SQL query once for each day of a predefined backfill interval in order to backfill the features.

In [None]:
# Number of days before DATAPROCESSING_END_DATE to backfill; create_batch_features processes
# BACKFILL_INTERVAL + 1 days in total, the end date included.
BACKFILL_INTERVAL = 1
# First day of the backfill.
# NOTE(review): create_batch_features derives this date itself and no cell shown here reads
# BACKFILL_START_DATE — it looks informational only; confirm.
BACKFILL_START_DATE = datetime.strptime(DATAPROCESSING_END_DATE, "%Y-%m-%d") - timedelta(days=BACKFILL_INTERVAL)

In [None]:
# Build one feature table per backfill day and keep the names of the tables that were written.
features_table_names = create_batch_features(
    query=create_batch_features_query,
    features_table_name=FEATURES_TABLE_NAME,
    end_date=DATAPROCESSING_END_DATE, 
    backfill_interval=BACKFILL_INTERVAL, 
    project_id=PROJECT_ID, 
    region=REGION)

#### Quick peek at the BigQuery data

In [None]:
# Sample ten rows from the last table in features_table_names.
view_query = f"""SELECT * FROM `{features_table_names[-1]}` LIMIT 10"""
batch_feature_table_df = run_bq_query(view_query, project=PROJECT_ID, region=REGION, return_df=True)

In [None]:
# Display the sampled rows.
batch_feature_table_df

### Initialize real time features 

In order to ingest real-time features, we first initialize them with default values. The following query will:

- Add one column for each real-time feature
- Set 0 as the default value for each of them
- Update all real-time columns of the existing rows with the default value

#### Define the query

In [None]:
# BigQuery script template; `@FEATURE_TABLE` is replaced with a concrete table name before it runs.
# It adds the real-time (15/30/60 minute) feature columns, sets their default to 0, and then
# writes 0 into those columns for every existing row.
# NOTE(review): the f-prefix is not needed (there are no placeholders), and ADD COLUMN presumably
# fails if the columns already exist — confirm before re-running against the same tables.
initiate_real_time_features_query = \
f"""
ALTER TABLE `@FEATURE_TABLE`
ADD COLUMN customer_id_nb_tx_15min_window INT64,
ADD COLUMN customer_id_nb_tx_30min_window INT64,
ADD COLUMN customer_id_nb_tx_60min_window INT64,
ADD COLUMN customer_id_avg_amount_15min_window FLOAT64,
ADD COLUMN customer_id_avg_amount_30min_window FLOAT64,
ADD COLUMN customer_id_avg_amount_60min_window FLOAT64,
ADD COLUMN terminal_id_nb_tx_15min_window INT64,
ADD COLUMN terminal_id_nb_tx_30min_window INT64,
ADD COLUMN terminal_id_nb_tx_60min_window INT64,
ADD COLUMN terminal_id_avg_amount_15min_window FLOAT64,
ADD COLUMN terminal_id_avg_amount_30min_window FLOAT64,
ADD COLUMN terminal_id_avg_amount_60min_window FLOAT64;

ALTER TABLE `@FEATURE_TABLE`
ALTER COLUMN customer_id_nb_tx_15min_window SET DEFAULT 0,
ALTER COLUMN customer_id_nb_tx_30min_window SET DEFAULT 0,
ALTER COLUMN customer_id_nb_tx_60min_window SET DEFAULT 0,
ALTER COLUMN customer_id_avg_amount_15min_window SET DEFAULT 0,
ALTER COLUMN customer_id_avg_amount_30min_window SET DEFAULT 0,
ALTER COLUMN customer_id_avg_amount_60min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_nb_tx_15min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_nb_tx_30min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_nb_tx_60min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_avg_amount_15min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_avg_amount_30min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_avg_amount_60min_window SET DEFAULT 0;

UPDATE `@FEATURE_TABLE`
SET customer_id_nb_tx_15min_window = 0,
    customer_id_nb_tx_30min_window  = 0,
    customer_id_nb_tx_60min_window  = 0, 
    customer_id_avg_amount_15min_window = 0,
    customer_id_avg_amount_30min_window  = 0,
    customer_id_avg_amount_60min_window  = 0,
    terminal_id_nb_tx_15min_window = 0,
    terminal_id_nb_tx_30min_window  = 0,
    terminal_id_nb_tx_60min_window  = 0,
    terminal_id_avg_amount_15min_window = 0,
    terminal_id_avg_amount_30min_window = 0,
    terminal_id_avg_amount_60min_window  = 0
WHERE TRUE; 
"""

### Run the query above to initialize the real-time features.

In [None]:
# Run the initialization script against every daily feature table.
for tbn in features_table_names:
    initialization_query = initiate_real_time_features_query
    # Point the template at the current table.
    initialization_query = initialization_query.replace('@FEATURE_TABLE', tbn)
    run_bq_query(initialization_query, project=PROJECT_ID, region=REGION)

### Quick peek at the BigQuery data

In [None]:
# Sample ten rows from the last table in features_table_names, now including the real-time columns.
view_query = f"""SELECT * FROM `{features_table_names[-1]}` LIMIT 10"""
feature_table_df = run_bq_query(view_query, project=PROJECT_ID, region=REGION, return_df=True)

In [None]:
# Display the sampled rows.
feature_table_df

## Feature Store for feature management

### What is a feature store?

The features generated are great examples of features that we can store in the [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore). This is because:

- The features need to be precalculated.
- We want to share the features. 
- We will use the features for real-time predictions. 

Vertex AI Feature Store provides a centralized repository for organizing, storing, and serving ML features. Using a central featurestore enables an organization to efficiently share, discover, and re-use ML features at scale, which can increase the velocity of developing and deploying new ML applications.

### Why would you want to set it up?

So far you've built and stored features in BigQuery. Now, in order to predict fraud, you want to serve those features in real-time with millisecond scale latency. In particular, when the ML gateway receives a prediction request for a specific transaction (including customer, terminal, and transaction ids), the system needs to fetch the features related to that transaction and pass them as inputs to the model for online prediction. As you can imagine, an analytical data warehouse such as BigQuery is not able to provide low-latency near real-time read operations. 

In 2021, Google Cloud announced Vertex AI, a managed machine learning (ML) platform that allows data science teams to accelerate the deployment and maintenance of ML models. The platform is comprised of several building blocks, including the Vertex AI Feature Store, which provides a managed service for low latency scalable feature serving. It also provides a centralized feature repository with easy APIs to search and discover features, as well as feature monitoring capabilities to track drift and other quality issues. 

Vertex AI Feature Store uses a time series data model to store a series of values for features. This enables Vertex AI Feature Store to maintain feature values as they change over time and to support point-in-time queries of feature values. Feature Store organizes resources hierarchically (`Featurestore -> EntityType -> Feature`) in the following order: 

- **Featurestore**: the resource that contains entity types and features.
    - **EntityType**: under a Featurestore, an EntityType describes a minimal data entry.
        - **Feature**: under an EntityType, a feature is an attribute of the EntityType. 


You must create these resources before you can ingest data into a Feature Store. 

Let's do that now using the **Vertex AI SDK**!

### Create featurestore, `fraud_detection`

A featurestore is the top-level container for entity types, features, and feature values. Typically, an organization creates one shared featurestore for feature ingestion, serving, and sharing across all teams in the organization.

In [None]:
# Create a new featurestore resource; sync=True blocks until the operation has finished.
# FEATURESTORE_ID is not defined in any cell shown here; presumably it comes from the exec'd config — confirm.
# NOTE(review): nothing here handles a featurestore that already exists, so re-running this cell
# presumably fails — confirm.
ff_feature_store = Featurestore.create(featurestore_id=f'{FEATURESTORE_ID}',
                              online_store_fixed_node_count=ONLINE_STORAGE_NODES, 
                              labels={"team": "dataoffice", 
                                      "app" : "fraud_finder"}, 
                              sync=True)

### Create the main entity types and their features

An entity type is a collection of semantically related features. You define your own entity types, based on the concepts that are relevant to your use case. In this case, the Fraud Finder service has the entity types customer and terminal. 

#### Create the ```customer``` entity type 

In [None]:
# Create the `customer` entity type inside the featurestore and wait for it (sync=True).
customer_entity_type = ff_feature_store.create_entity_type(
                                        entity_type_id=CUSTOMER_ENTITY_ID,
                                        description="Customer Entity", 
                                        sync=True)

#### Create features of the ```customer``` entity type

In [None]:
# Feature configs for the `customer` entity type, keyed by feature id.
# Each (feature id, value type, description) row below expands into one config
# dict; every feature carries the same {"status": "passed"} label.
customer_feature_configs = {
    feature_id: {
        "value_type": value_type,
        "description": description,
        "labels": {"status": "passed"},
    }
    for feature_id, value_type, description in [
        # Batch features (day-level windows).
        ("customer_id_nb_tx_1day_window", "INT64", "Number of transactions by the customer in the last day"),
        ("customer_id_nb_tx_7day_window", "INT64", "Number of transactions by the customer in the last 7 days"),
        ("customer_id_nb_tx_14day_window", "INT64", "Number of transactions by the customer in the last 14 days"),
        ("customer_id_avg_amount_1day_window", "DOUBLE", "Average spending amount in the last day"),
        ("customer_id_avg_amount_7day_window", "DOUBLE", "Average spending amount in the last 7 days"),
        ("customer_id_avg_amount_14day_window", "DOUBLE", "Average spending amount in the last 14 days"),
        # Real-time features (minute-level windows).
        ("customer_id_nb_tx_15min_window", "INT64", "Number of transactions by the customer in the last 15 minutes"),
        ("customer_id_nb_tx_30min_window", "INT64", "Number of transactions by the customer in the last 30 minutes"),
        ("customer_id_nb_tx_60min_window", "INT64", "Number of transactions by the customer in the last 60 minutes"),
        ("customer_id_avg_amount_15min_window", "DOUBLE", "Average spending amount in the last 15 minutes"),
        ("customer_id_avg_amount_30min_window", "DOUBLE", "Average spending amount in the last 30 minutes"),
        ("customer_id_avg_amount_60min_window", "DOUBLE", "Average spending amount in the last 60 minutes"),
    ]
}

In [None]:
# Create all customer features in one batch call and wait for it (sync=True).
# Despite its name, `customer_feature_ids` is used later as an object exposing
# `list_features()`, not as a list of ids.
customer_feature_ids = customer_entity_type.batch_create_features(
        feature_configs = customer_feature_configs,
        sync=True)

#### Create the ```terminal``` entity type

In [None]:
# Create the `terminal` entity type inside the featurestore and wait for it (sync=True).
terminal_entity_type = ff_feature_store.create_entity_type(
                        entity_type_id=TERMINAL_ENTITY_ID,
                        description="Terminal Entity", 
                        sync=True)

#### Create features of the ```terminal``` entity type

In [None]:
# Feature configs for the `terminal` entity type, keyed by feature id.
# Each (feature id, value type, description) row below expands into one config
# dict; every feature carries the same {"status": "passed"} label.
terminal_feature_configs = {
    feature_id: {
        "value_type": value_type,
        "description": description,
        "labels": {"status": "passed"},
    }
    for feature_id, value_type, description in [
        # Batch features (day-level windows).
        ("terminal_id_nb_tx_1day_window", "INT64", "Number of transactions by the terminal in the last day"),
        ("terminal_id_nb_tx_7day_window", "INT64", "Number of transactions by the terminal in the 7 days"),
        ("terminal_id_nb_tx_14day_window", "INT64", "Number of transactions by the terminal in the 14 days"),
        ("terminal_id_risk_1day_window", "DOUBLE", "Risk score calculated average number of frauds on the terminal in the last day"),
        ("terminal_id_risk_7day_window", "DOUBLE", "Risk score calculated average number of frauds on the terminal in the last 7 days"),
        ("terminal_id_risk_14day_window", "DOUBLE", "Risk score calculated average number of frauds on the terminal in the last 14 day"),
        # Real-time features (minute-level windows).
        ("terminal_id_nb_tx_15min_window", "INT64", "Number of transactions by the terminal in the last 15 minutes"),
        ("terminal_id_nb_tx_30min_window", "INT64", "Number of transactions by the terminal in the last 30 minutes"),
        ("terminal_id_nb_tx_60min_window", "INT64", "Number of transactions by the terminal in the last 60 minutes"),
        ("terminal_id_avg_amount_15min_window", "DOUBLE", "Average spending amount in the last 15 minutes"),
        ("terminal_id_avg_amount_30min_window", "DOUBLE", "Average spending amount in the last 30 minutes"),
        ("terminal_id_avg_amount_60min_window", "DOUBLE", "Average spending amount in the last 60 minutes"),
    ]
}

In [None]:
# Create all terminal features in one batch call and wait for it (sync=True).
# Despite its name, `terminal_feature_ids` is used later as an object exposing
# `list_features()`, not as a list of ids.
terminal_feature_ids = terminal_entity_type.batch_create_features(
    feature_configs = terminal_feature_configs,
    sync=True)

#### A quick look in the console

Let's see what Vertex AI Feature Store looks like in the [console](https://console.cloud.google.com/vertex-ai/features).

### Import feature values 

Now we need to import the actual feature values before we can use them for online/offline use.

About **Source Data format and Layout**:

- The feature store [supports source data](https://cloud.google.com/vertex-ai/docs/featurestore/source-data) from BigQuery tables or Avro and CSV files on Google Cloud Storage.
- Each imported entity *must* have an ID.
- Each entity can *optionally* have a timestamp, specifying when the feature values were generated.

We will now decompose the wide features table (constructed earlier in this notebook) into two sub-tables: customers and terminals. Then we will import feature values from those tables into our feature store.

#### Prepare tables to import

Now it's time to import the data into our Feature Store.

In [None]:
# Define the queries that select the relevant columns for each entity type.
# Both read from the wildcard table, i.e. from every daily feature table of the month.

sql_queries_feature_store = []


# Customer entity table: timestamp, customer id and the customer_* feature columns.
customers_sql_query = f"""
CREATE OR REPLACE TABLE
  `{CUSTOMERS_TABLE_NAME}` AS
SELECT feature_ts, customer_id, customer_id_nb_tx_1day_window,
customer_id_nb_tx_7day_window, customer_id_nb_tx_14day_window,
customer_id_avg_amount_1day_window, customer_id_avg_amount_7day_window,
customer_id_avg_amount_14day_window, customer_id_nb_tx_15min_window,
customer_id_nb_tx_30min_window, customer_id_nb_tx_60min_window,
customer_id_avg_amount_15min_window, customer_id_avg_amount_30min_window,
customer_id_avg_amount_60min_window
FROM `{FEATURES_PARTIONED_TABLE}`
ORDER BY feature_ts
"""

sql_queries_feature_store.append(customers_sql_query)


# Terminal entity table: timestamp, terminal id and the terminal_* feature columns.
terminals_sql_query = f"""
CREATE OR REPLACE TABLE
  `{TERMINALS_TABLE_NAME}` AS
SELECT feature_ts, terminal_id, terminal_id_nb_tx_1day_window,
terminal_id_nb_tx_7day_window, terminal_id_nb_tx_14day_window,
terminal_id_risk_1day_window,terminal_id_risk_7day_window,
terminal_id_risk_14day_window, terminal_id_nb_tx_15min_window,
terminal_id_nb_tx_30min_window, terminal_id_nb_tx_60min_window,
terminal_id_avg_amount_15min_window, terminal_id_avg_amount_30min_window,
terminal_id_avg_amount_60min_window
FROM `{FEATURES_PARTIONED_TABLE}`
ORDER BY feature_ts
"""

sql_queries_feature_store.append(terminals_sql_query)

In [None]:
# Create (or replace) the customers and terminals tables, one query at a time.
for sql_query in sql_queries_feature_store:
    run_bq_query(sql_query, project=PROJECT_ID, region=REGION)

#### Quick peek at the BigQuery customer and terminal entity tables

In [None]:
# Sample ten rows of the customers table and display them.
view_query = f"""SELECT * FROM `{CUSTOMERS_TABLE_NAME}` LIMIT 10"""
customer_table_df = run_bq_query(view_query, project=PROJECT_ID, region=REGION, return_df=True)
customer_table_df

In [None]:
# Sample ten rows of the terminals table and display them.
view_query = f"""SELECT * FROM `{TERMINALS_TABLE_NAME}` LIMIT 10"""
terminal_table_df = run_bq_query(view_query, project=PROJECT_ID, region=REGION, return_df=True)
terminal_table_df

#### Import customers

In the following section, you will import customers features into your feature store.

In [None]:
# Ids of the features to ingest: the `name` of every feature returned by list_features().
CUSTOMERS_FEATURES_IDS = [feature.name for feature in customer_feature_ids.list_features()]
# BigQuery source table, written as a bq:// URI.
CUSTOMER_BQ_SOURCE_URI = f"bq://{CUSTOMERS_TABLE_NAME}"
# Column of the source table passed as the entity id field.
CUSTOMER_ENTITY_ID_FIELD = "customer_id"

In [None]:
# Start ingesting customer feature values from the BigQuery table into the featurestore.
# sync=False: the ingestion runs asynchronously, so it may still be in progress when later cells run.
customer_entity_type.ingest_from_bq(
    feature_ids=CUSTOMERS_FEATURES_IDS,
    feature_time=FEATURE_TIME,
    bq_source_uri=CUSTOMER_BQ_SOURCE_URI,
    entity_id_field=CUSTOMER_ENTITY_ID_FIELD,
    disable_online_serving=False,
    worker_count=10,
    sync=False
    )

#### Monitor the `Customer` features ingestion job in the console.


You can go to the [Feature Store Console](https://console.cloud.google.com/vertex-ai/ingestion-jobs) to view your ingestion job. 

#### Import terminals
In the following section, you will import terminal features into your feature store.

In [None]:
# Re-assigns TERMINAL_ENTITY_ID to the same value it was given in the constants cell.
TERMINAL_ENTITY_ID = "terminal"
# Ids of the features to ingest: the `name` of every feature returned by list_features().
TERMINALS_FEATURES_IDS = [feature.name for feature in terminal_feature_ids.list_features()]
# BigQuery source table, written as a bq:// URI.
TERMINALS_BQ_SOURCE_URI = f"bq://{TERMINALS_TABLE_NAME}"
# Column of the source table passed as the entity id field.
TERMINALS_ENTITY_ID_FIELD = "terminal_id"

In [None]:
# Start ingesting terminal feature values from the BigQuery table into the featurestore.
# sync=False: the ingestion runs asynchronously, so it may still be in progress when later cells run.
terminal_entity_type.ingest_from_bq(
    feature_ids=TERMINALS_FEATURES_IDS,
    feature_time=FEATURE_TIME,
    bq_source_uri=TERMINALS_BQ_SOURCE_URI,
    entity_id_field=TERMINALS_ENTITY_ID_FIELD,
    disable_online_serving=False,
    worker_count=10,
    sync=False
)

#### Monitor the ingestion jobs in the console.

The ingestion jobs you just created run asynchronously and should take several minutes to complete. Please monitor them in the [console](https://console.cloud.google.com/vertex-ai/ingestion-jobs).


### Search for feature values 
In this section, you'll run a search query on your feature store to validate that some data was ingested, as expected.

In [None]:
# Read the customer features for three hard-coded customer ids.
# NOTE(review): the ingestion jobs were started with sync=False, so this presumably returns
# missing values if it runs before they finish — confirm.
customer_aggregated_features = customer_entity_type.read(
      entity_ids=["5830444124423549", "5469689693941771", "1361459972478769"],
      feature_ids=CUSTOMERS_FEATURES_IDS)

In [None]:
# Display the feature values that were read.
customer_aggregated_features

## (DO NOT RUN) Cleaning up

In [None]:
# ff_feature_store.delete(sync=True, force=True)