In [1]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fraudfinder - Feature Engineering (batch)

<table align="left">
  <td>
    <a href="https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?&download_url=https://github.com/GoogleCloudPlatform/fraudfinder/raw/main/02_feature_engineering_batch.ipynb">
       <img src="https://www.gstatic.com/cloud/images/navigation/vertex-ai.svg" alt="Google Cloud Notebooks">Open in Cloud Notebook
    </a>
  </td> 
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/fraudfinder/blob/main/02_feature_engineering_batch.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Open in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/fraudfinder/blob/main/02_feature_engineering_batch.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

[Fraudfinder](https://github.com/googlecloudplatform/fraudfinder) is a series of labs on how to build a real-time fraud detection system on Google Cloud. Throughout the Fraudfinder labs, you will learn how to read historical bank transaction data stored in data warehouse, read from a live stream of new transactions, perform exploratory data analysis (EDA), do feature engineering, ingest features into a feature store, train a model using feature store, register your model in a model registry, evaluate your model, deploy your model to an endpoint, do real-time inference on your model with feature store, and monitor your model.

### Objective

As you engineer features for model training, it's important to consider how the features are computed when making predictions with new data. For online predictions, you may have features that can be pre-computed via _batch feature engineering_. You may also features that need to be computed on-the-fly via _streaming-based feature engineering_. For these Fraudfinder labs, for computing features based on the last n _days_, you will use _batch_ feature engineering in BigQuery; for computing features based on the last n _minutes_, you will use _streaming-based_ feature engineering using Dataflow.

This notebook shows how to generate new features on bank transactions by customer and terminal over the last n days, by doing batch feature engineering in SQL with BigQuery. Then, you will create a feature store using Vertex AI Feature Store, and ingest your newly-created features from BigQuery into Vertex AI Feature Store, so that a feature store can become the single source of data for both training and model inference. 

You will also create some placeholder values for streaming-based feature engineering, which is covered in the next notebook, `03_feature_engineering_streaming.ipynb`.

This lab uses the following Google Cloud services and resources:

- [Vertex AI](https://cloud.google.com/vertex-ai/)
- [BigQuery](https://cloud.google.com/bigquery/)


Steps performed in this notebook:

- Build customer and terminal-related features
- Create Feature store, entities and features
- Ingest feature values in Feature store from BigQuery table
- Read features from the feature store

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* BigQuery

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Load configuration settings from the setup notebook

Set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

In [2]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = f"{PROJECT_ID}-fraudfinder"
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
print(config.n)
exec(config.n)


BUCKET_NAME          = "fraudfinder-359112-fraudfinder"
PROJECT              = "fraudfinder-359112"
REGION               = "us-central1"
ID                   = "z0umx"
FEATURESTORE_ID      = "fraudfinder_z0umx"
MODEL_NAME           = "fraudfinder_logreg_model"
ENDPOINT_NAME        = "fraudfinder_logreg_endpoint"
TRAINING_DS_SIZE     = "1000"



### Import libraries

In [3]:
# General
import datetime as dt
import json
import os
import random
import sys
import time
from datetime import datetime, timedelta
from typing import List, Union

# Data Engineering
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)

# Vertex AI and Vertex AI Feature Store
from google.cloud import aiplatform as vertex_ai
from google.cloud import bigquery
from google.cloud.aiplatform import EntityType, Feature, Featurestore

### Define constants

In [4]:
# Define the date range of transactions for feature engineering (last 31 days up until yesterday)
YESTERDAY = datetime.today() - timedelta(days=1)
YEAR_MONTH_PREFIX = YESTERDAY.strftime("%Y-%m")
DATAPROCESSING_START_DATE = (YESTERDAY - timedelta(days=10)).strftime("%Y-%m-%d")
DATAPROCESSING_END_DATE = YESTERDAY.strftime("%Y-%m-%d")

# Define BiqQuery dataset and tables to calculate features.
RAW_TABLE_LABELS = "txlabels"
FEATURES_TABLE_NAME = f"{PROJECT_ID}.tx.wide_features_table"
FEATURES_PARTIONED_TABLE = (
    f"{FEATURES_TABLE_NAME}_{YEAR_MONTH_PREFIX.replace('-', '')}*"
)

# Define Vertex AI Feature store settings.
ONLINE_STORAGE_NODES = 1
FEATURE_TIME = "feature_ts"
CUSTOMER_ENTITY_ID = "customer"
TERMINAL_ENTITY_ID = "terminal"
CUSTOMERS_TABLE_NAME = (
    f"{PROJECT_ID}.tx.customers_{DATAPROCESSING_END_DATE.replace('-', '')}"
)
TERMINALS_TABLE_NAME = (
    f"{PROJECT_ID}.tx.terminals_{DATAPROCESSING_END_DATE.replace('-', '')}"
)

### Helpers

Define a set of helper functions to run BigQuery query and create features. 

In [5]:
def run_bq_query(sql: str) -> Union[str, pd.DataFrame]:
    """
    Run a BigQuery query and return the job ID or result as a DataFrame
    Args:
        sql: SQL query, as a string, to execute in BigQuery
    Returns:
        df: DataFrame of results from query,  or error, if any
    """

    bq_client = bigquery.Client()

    # Try dry run before executing query to catch any errors
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    bq_client.query(sql, job_config=job_config)

    # If dry run succeeds without errors, proceed to run query
    job_config = bigquery.QueryJobConfig()
    client_result = bq_client.query(sql, job_config=job_config)

    job_id = client_result.job_id

    # Wait for query/job to finish running. then get & return data frame
    df = client_result.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")
    return df


def create_batch_features(
    features_table_name: str,
    query: str,
    start_date: str,
    backward_ndays: int,
    project_id: str,
    region: str,
) -> List[str]:
    """
    A helper function to create n daily feature tables over a date interval, 
    where n stands for the number of days in the backward interval.
    Then return names of feature tables in a list.
    Args:
        features_table_name: name of the feature table
        query: query to create the feature table
        start_date: the date to start creating features
        backward_ndays: number of days going backward from start date
        project_id: project id
        region: region
    Returns:
        features_table_names: list of feature tables
    """
    # Initialize the initial date (start minus backward_interval).
    initial_date = datetime.strptime(start_date, "%Y-%m-%d") - timedelta(
        days=backward_ndays
    )

    # Initialize a list to contain names of feature tables.
    features_table_names = []

    try:
        # Create a BigQuery client.
        bq_client = bigquery.Client(project=project_id, location=region)

        # For each day in the date interval
        for _ in range(backward_ndays + 1):
            
            # Create the name of destination feature table.
            destination = (
                f'{features_table_name}_{initial_date.strftime("%Y%m%d")}'
            )

            # Format the day date for the query.
            date_query = initial_date.strftime("%Y-%m-%d")

            # Create the query.
            query = query.replace("@END_DATE_TRAIN", f"'{date_query}'")

            # Create the query job.
            job_config = bigquery.QueryJobConfig(
                destination=destination, write_disposition="WRITE_TRUNCATE"
            )

            # Run the query to create daily feature table.
            job = bq_client.query(query, job_config=job_config)
            _ = job.result()
            
            print(f"Created BQ table: {destination}")

            # Append the feature table name to the list.
            features_table_names.append(destination)

            # Increment the date interval.
            initial_date += timedelta(days=1)

    except RuntimeError as error:
        print(error)

    return features_table_names

## Feature Engineering

### Define customer and terminal-related features for batch feature engineering

In this section, you will create features, based on historical customer behaviour and historical terminal activities. This features will be batch-generated using SQL in BigQuery, where the historical data is stored.

The query below will calculate 2 sets of features: 

1. **Customer-related features**: which describes the spending behaviour of customer within 1, 7 and 15 days time windows using number of transactions and average amount spent in dollars ($)

2. **Terminal-related features** which describes the risk of a given terminal to be exposed to fraudulent transactions within 1, 7 and 15 days using average number of fraudulent transactions in dollars ($), the number of transactions and risk index. One thing to note is that you will add some delay to take into account time that would pass between the time of transaction and the result of fraud investigation or customer claim.

You will use one month of transaction data starting from the end of January and going back to compute the features.

Below is the schema you should expect to see, after doing the batch feature engineering in BigQuery:

|feature_time           |customer_id|terminal_id| customer batch features   | terminal batch features|
|-----------------------|-----------|-----------|---------------------------|------------------------|
|2022-01-01 17:20:15 UTC|1          |12345      |(e.g., nb_tx,  avg_tx)     |(e.g., risk_x_days)     |
|2022-01-02 12:08:40 UTC|2          |26789      |(e.g., nb_tx,  avg_tx)     |(e.g., risk_x_days)     |
|2022-01-03 17:30:48 UTC|3          |101112     |(e.g., nb_tx,  avg_tx)     |(e.g., risk_x_days)     |


#### Create the query to create batch features

Date settings to be used:

In [6]:
print(f"""
DATAPROCESSING_START_DATE: {DATAPROCESSING_START_DATE}
DATAPROCESSING_END_DATE: {DATAPROCESSING_END_DATE}
""")


DATAPROCESSING_START_DATE: 2022-10-14
DATAPROCESSING_END_DATE: 2022-10-24



SQL query string:

In [7]:
create_batch_features_query = f"""
WITH
  -- query to join labels with features -------------------------------------------------------------------------------------------
  get_raw_table AS (
  SELECT
    raw_tx.TX_TS,
    raw_tx.TX_ID,
    raw_tx.CUSTOMER_ID,
    raw_tx.TERMINAL_ID,
    raw_tx.TX_AMOUNT,
    raw_lb.TX_FRAUD
  FROM (
    SELECT
      *
    FROM
      `tx.tx`
    WHERE
      DATE(TX_TS) BETWEEN DATE_SUB(@END_DATE_TRAIN, INTERVAL 15 DAY) AND @END_DATE_TRAIN
    ) raw_tx
  LEFT JOIN 
    `tx.{RAW_TABLE_LABELS}` as raw_lb
  ON raw_tx.TX_ID = raw_lb.TX_ID),

  -- query to calculate CUSTOMER spending behaviour --------------------------------------------------------------------------------
  get_customer_spending_behaviour AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    TX_AMOUNT,
    TX_FRAUD,
    
    # calc the number of customer tx over daily windows per customer (1, 7 and 15 days, expressed in seconds)
    COUNT(TX_FRAUD) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 86400 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_1DAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_7DAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_NB_TX_14DAY_WINDOW,
      
    # calc the customer average tx amount over daily windows per customer (1, 7 and 15 days, expressed in seconds, in dollars ($))
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 86400 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW,
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,
    AVG(TX_AMOUNT) OVER (PARTITION BY CUSTOMER_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS CUSTOMER_ID_AVG_AMOUNT_14DAY_WINDOW,
  FROM get_raw_table),

  # query to calculate TERMINAL spending behaviour --------------------------------------------------------------------------------
  get_variables_delay_window AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    
    # calc total amount of fraudulent tx and the total number of tx over the delay period per terminal (7 days - delay, expressed in seconds)
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_DELAY,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 604800 PRECEDING
      AND CURRENT ROW ) AS NB_TX_DELAY,
      
    # calc total amount of fraudulent tx and the total number of tx over the delayed window per terminal (window + 7 days - delay, expressed in seconds)
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 691200 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_1_DELAY_WINDOW,
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_7_DELAY_WINDOW,
    SUM(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1814400 PRECEDING
      AND CURRENT ROW ) AS NB_FRAUD_14_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 691200 PRECEDING
      AND CURRENT ROW ) AS NB_TX_1_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1209600 PRECEDING
      AND CURRENT ROW ) AS NB_TX_7_DELAY_WINDOW,
    COUNT(TX_FRAUD) OVER (PARTITION BY TERMINAL_ID ORDER BY UNIX_SECONDS(TX_TS) ASC RANGE BETWEEN 1814400 PRECEDING
      AND CURRENT ROW ) AS NB_TX_14_DELAY_WINDOW,
  FROM get_raw_table),

  # query to calculate TERMINAL risk factors ---------------------------------------------------------------------------------------
  get_risk_factors AS (
  SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    # calculate numerator of risk index
    NB_FRAUD_1_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_1DAY_WINDOW,
    NB_FRAUD_7_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_7DAY_WINDOW,
    NB_FRAUD_14_DELAY_WINDOW - NB_FRAUD_DELAY AS TERMINAL_ID_NB_FRAUD_14DAY_WINDOW,
    # calculate denominator of risk index
    NB_TX_1_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_1DAY_WINDOW,
    NB_TX_7_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_7DAY_WINDOW,
    NB_TX_14_DELAY_WINDOW - NB_TX_DELAY AS TERMINAL_ID_NB_TX_14DAY_WINDOW,
      FROM
    get_variables_delay_window),

  # query to calculate the TERMINAL risk index -------------------------------------------------------------------------------------
  get_risk_index AS (
    SELECT
    TX_TS,
    TX_ID,
    CUSTOMER_ID,
    TERMINAL_ID,
    TERMINAL_ID_NB_TX_1DAY_WINDOW,
    TERMINAL_ID_NB_TX_7DAY_WINDOW,
    TERMINAL_ID_NB_TX_14DAY_WINDOW,
    # calculate the risk index
    (TERMINAL_ID_NB_FRAUD_1DAY_WINDOW/(TERMINAL_ID_NB_TX_1DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_1DAY_WINDOW,
    (TERMINAL_ID_NB_FRAUD_7DAY_WINDOW/(TERMINAL_ID_NB_TX_7DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_7DAY_WINDOW,
    (TERMINAL_ID_NB_FRAUD_14DAY_WINDOW/(TERMINAL_ID_NB_TX_14DAY_WINDOW+0.0001)) AS TERMINAL_ID_RISK_14DAY_WINDOW
    FROM get_risk_factors 
  )

# Create the table with CUSTOMER and TERMINAL features ----------------------------------------------------------------------------
SELECT
  PARSE_TIMESTAMP('%Y-%m-%d %H:%M:%S', CONCAT(@END_DATE_TRAIN, ' ', STRING(TIME_TRUNC(CURRENT_TIME(), SECOND))), 'UTC') AS feature_ts,
  a.CUSTOMER_ID AS customer_id,
  a.TERMINAL_ID AS terminal_id,
  CAST(a.CUSTOMER_ID_NB_TX_1DAY_WINDOW AS INT64) AS customer_id_nb_tx_1day_window,
  CAST(a.CUSTOMER_ID_NB_TX_7DAY_WINDOW AS INT64) AS customer_id_nb_tx_7day_window,
  CAST(a.CUSTOMER_ID_NB_TX_14DAY_WINDOW AS INT64) AS customer_id_nb_tx_14day_window,
  CAST(a.CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_1day_window,
  CAST(a.CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_7day_window,
  CAST(a.CUSTOMER_ID_AVG_AMOUNT_14DAY_WINDOW AS FLOAT64) AS customer_id_avg_amount_14day_window,
  CAST(b.TERMINAL_ID_NB_TX_1DAY_WINDOW AS INT64) AS terminal_id_nb_tx_1day_window,
  CAST(b.TERMINAL_ID_NB_TX_7DAY_WINDOW AS INT64) AS terminal_id_nb_tx_7day_window,
  CAST(b.TERMINAL_ID_NB_TX_14DAY_WINDOW AS INT64) AS terminal_id_nb_tx_14day_window,
  CAST(b.TERMINAL_ID_RISK_1DAY_WINDOW AS FLOAT64) AS terminal_id_risk_1day_window,
  CAST(b.TERMINAL_ID_RISK_7DAY_WINDOW AS FLOAT64) AS terminal_id_risk_7day_window,
  CAST(b.TERMINAL_ID_RISK_14DAY_WINDOW AS FLOAT64) AS terminal_id_risk_14day_window,
FROM
  get_customer_spending_behaviour AS a
INNER JOIN
  get_risk_index AS b
ON
  a.TX_TS = b.TX_TS
  AND a.TX_ID = b.TX_ID
  AND a.CUSTOMER_ID = b.CUSTOMER_ID
  AND a.TERMINAL_ID = b.TERMINAL_ID
"""

#### Create batch features table in a BigQuery by running the above query.

Now, you create features by iterating the SQL query to calculate terminal and customer features. 

In particular, you set the `BACKWARD_NDAYS` to calculate (backfill) your features and simulate the feature engineering process over several days. 

This interval is the number of days you want to go back from the end date to start feature engineering. Then, you create (backfill) your features by iterating the SQL query n times where n is the length of `BACKWARD_NDAYS` using the `create_batch_features` function.

For example, let's take the number of transactions per customer over the last day aka `customer_id_nb_tx_1day_window`. 

When `BACKWARD_NDAYS=1`, you will

1. Go back to yesterday
1. Calculate `customer_id_nb_tx_1day_window` using transactions until yesterday
1. Move ahead to today
1. Calculate `customer_id_nb_tx_1day_window` using transactions until today

With this process, you will get a more realistic representation of real-world features in the fraud detection scenario. 


In [8]:
BACKWARD_NDAYS = 1

The next cell may take a few seconds to run:

In [9]:
print(f"""
Batch-generating features for:
{[datetime.strftime(YESTERDAY - timedelta(days=x), "%Y-%m-%d") for x in range(BACKWARD_NDAYS + 1)]}
""")

features_table_names = create_batch_features(
    query=create_batch_features_query,
    features_table_name=FEATURES_TABLE_NAME,
    start_date=DATAPROCESSING_END_DATE,
    backward_ndays=BACKWARD_NDAYS,
    project_id=PROJECT_ID,
    region=REGION,
)


Batch-generating features for:
['2022-10-24', '2022-10-23']

Created BQ table: fraudfinder-359112.tx.wide_features_table_20221023
Created BQ table: fraudfinder-359112.tx.wide_features_table_20221024


#### Inspect the BigQuery features table 

You can query some data rows to validate the result of the query. 

In [10]:
run_bq_query(
    f"SELECT * FROM `{features_table_names[-1]}` LIMIT 10"
)

Finished job_id: 52d1a355-f501-4455-acdd-b475b9b1d5ef


Unnamed: 0,feature_ts,customer_id,terminal_id,customer_id_nb_tx_1day_window,customer_id_nb_tx_7day_window,customer_id_nb_tx_14day_window,customer_id_avg_amount_1day_window,customer_id_avg_amount_7day_window,customer_id_avg_amount_14day_window,terminal_id_nb_tx_1day_window,terminal_id_nb_tx_7day_window,terminal_id_nb_tx_14day_window,terminal_id_risk_1day_window,terminal_id_risk_7day_window,terminal_id_risk_14day_window
0,2022-10-23 09:43:03+00:00,4100480335116242,9048159,15,116,116,66.037333,121.684655,121.684655,0,0,0,0.0,0.0,0.0
1,2022-10-23 09:43:03+00:00,6208260461647471,18121189,22,54,54,301.284091,314.707037,314.707037,0,0,0,0.0,0.0,0.0
2,2022-10-23 09:43:03+00:00,7861785097277308,77821795,40,40,40,342.15675,342.15675,342.15675,0,0,0,0.0,0.0,0.0
3,2022-10-23 09:43:03+00:00,7861785097277308,28661956,46,46,46,344.456087,344.456087,344.456087,0,0,0,0.0,0.0,0.0
4,2022-10-23 09:43:03+00:00,2703847605482615,70189296,17,78,78,263.582353,192.412949,192.412949,0,0,0,0.0,0.0,0.0
5,2022-10-23 09:43:03+00:00,7650162650839187,74894711,13,127,127,105.746923,154.562047,154.562047,0,0,0,0.0,0.0,0.0
6,2022-10-23 09:43:03+00:00,1194713379481049,87403126,2,42,42,24.35,53.140476,53.140476,0,0,0,0.0,0.0,0.0
7,2022-10-23 09:43:03+00:00,5241938888236861,2953136,12,58,58,39.591667,45.373966,45.373966,0,0,0,0.0,0.0,0.0
8,2022-10-23 09:43:03+00:00,861713745482445,17245758,4,35,35,60.3725,64.814857,64.814857,0,0,0,0.0,0.0,0.0
9,2022-10-23 09:43:03+00:00,1556107125114863,45084793,6,81,81,275.161667,221.127901,221.127901,0,0,0,0.0,0.0,0.0


### Define customer and terminal-related features for _real-time_ feature engineering

To make more accurate predictions, you can also create real-time features to inspect, for example, the most recent minutes of activity for both customers and terminals.

In order to have these features available for training and inference, we first need to make sure they are defined in a BigQuery table as placeholders to be ingested into Vertex AI Feature Store.

In the query below, you initialize two sets of features for real-time feature engineering: 

1. Customer features which describes the spending behaviour of customer within 15, 30 and 60 minutes time windows using number of transactions and average amount spent in dollars ($)

2. Terminal features which describes the risk of a given terminal to be exposed to fraudulent transactions within 15, 30 and 60 minutes using average number of fraudulent transactions in dollars ($) and the number of transactions. 

To do so, you will:

- Add one column for each real time feature
- Set 0 as default values for each of them
- Update all real-time columns with default values

Then you will create the actual values for real-time feature engineering in the next `03_feature_engineering_streaming.ipynb` notebook. 

#### Define the query to initialize the real-time features.

In [11]:
initiate_real_time_features_query = f"""
ALTER TABLE `@FEATURE_TABLE`
ADD COLUMN customer_id_nb_tx_15min_window INT64,
ADD COLUMN customer_id_nb_tx_30min_window INT64,
ADD COLUMN customer_id_nb_tx_60min_window INT64,
ADD COLUMN customer_id_avg_amount_15min_window FLOAT64,
ADD COLUMN customer_id_avg_amount_30min_window FLOAT64,
ADD COLUMN customer_id_avg_amount_60min_window FLOAT64,
ADD COLUMN terminal_id_nb_tx_15min_window INT64,
ADD COLUMN terminal_id_nb_tx_30min_window INT64,
ADD COLUMN terminal_id_nb_tx_60min_window INT64,
ADD COLUMN terminal_id_avg_amount_15min_window FLOAT64,
ADD COLUMN terminal_id_avg_amount_30min_window FLOAT64,
ADD COLUMN terminal_id_avg_amount_60min_window FLOAT64;

ALTER TABLE `@FEATURE_TABLE`
ALTER COLUMN customer_id_nb_tx_15min_window SET DEFAULT 0,
ALTER COLUMN customer_id_nb_tx_30min_window SET DEFAULT 0,
ALTER COLUMN customer_id_nb_tx_60min_window SET DEFAULT 0,
ALTER COLUMN customer_id_avg_amount_15min_window SET DEFAULT 0,
ALTER COLUMN customer_id_avg_amount_30min_window SET DEFAULT 0,
ALTER COLUMN customer_id_avg_amount_60min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_nb_tx_15min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_nb_tx_30min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_nb_tx_60min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_avg_amount_15min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_avg_amount_30min_window SET DEFAULT 0,
ALTER COLUMN terminal_id_avg_amount_60min_window SET DEFAULT 0;

UPDATE `@FEATURE_TABLE`
SET customer_id_nb_tx_15min_window = 0,
    customer_id_nb_tx_30min_window  = 0,
    customer_id_nb_tx_60min_window  = 0, 
    customer_id_avg_amount_15min_window = 0,
    customer_id_avg_amount_30min_window  = 0,
    customer_id_avg_amount_60min_window  = 0,
    terminal_id_nb_tx_15min_window = 0,
    terminal_id_nb_tx_30min_window  = 0,
    terminal_id_nb_tx_60min_window  = 0,
    terminal_id_avg_amount_15min_window = 0,
    terminal_id_avg_amount_30min_window = 0,
    terminal_id_avg_amount_60min_window  = 0
WHERE TRUE; 
"""

#### Run the query above to initialize the real-time features.

In [12]:
for tbl_name in features_table_names:
    print(f"Creating table: {tbl_name}")
    updated_query = initiate_real_time_features_query.replace("@FEATURE_TABLE", tbl_name)
    run_bq_query(updated_query)

Creating table: fraudfinder-359112.tx.wide_features_table_20221023
Finished job_id: 38d6bf78-5d40-4804-8f52-7d663ad833a1
Creating table: fraudfinder-359112.tx.wide_features_table_20221024
Finished job_id: 1a2dea6c-8400-4b8d-9c60-160dfac0409b


#### Inspect BigQuery features table 

In [13]:
run_bq_query(
    f"SELECT * FROM `{features_table_names[-1]}` LIMIT 10"
)

Finished job_id: 33e0f26e-8a16-4307-9194-83557c080e69


Unnamed: 0,feature_ts,customer_id,terminal_id,customer_id_nb_tx_1day_window,customer_id_nb_tx_7day_window,customer_id_nb_tx_14day_window,customer_id_avg_amount_1day_window,customer_id_avg_amount_7day_window,customer_id_avg_amount_14day_window,terminal_id_nb_tx_1day_window,terminal_id_nb_tx_7day_window,terminal_id_nb_tx_14day_window,terminal_id_risk_1day_window,terminal_id_risk_7day_window,terminal_id_risk_14day_window,customer_id_nb_tx_15min_window,customer_id_nb_tx_30min_window,customer_id_nb_tx_60min_window,customer_id_avg_amount_15min_window,customer_id_avg_amount_30min_window,customer_id_avg_amount_60min_window,terminal_id_nb_tx_15min_window,terminal_id_nb_tx_30min_window,terminal_id_nb_tx_60min_window,terminal_id_avg_amount_15min_window,terminal_id_avg_amount_30min_window,terminal_id_avg_amount_60min_window
0,2022-10-23 09:43:03+00:00,8983650167659586,6762932,4,37,37,33.31,30.522973,30.522973,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
1,2022-10-23 09:43:03+00:00,851303074695441,76034865,9,67,67,58.28,48.238507,48.238507,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
2,2022-10-23 09:43:03+00:00,3902096097891560,19177649,39,119,119,23.438718,63.229412,63.229412,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
3,2022-10-23 09:43:03+00:00,7147543723503013,44791470,40,178,178,50.76875,45.598146,45.598146,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
4,2022-10-23 09:43:03+00:00,6939219066831238,6175818,39,70,70,302.805641,311.012571,311.012571,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
5,2022-10-23 09:43:03+00:00,9253994821759995,21387618,6,142,142,131.053333,180.326761,180.326761,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
6,2022-10-23 09:43:03+00:00,6944952662219942,42544086,45,48,48,411.722444,391.522708,391.522708,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
7,2022-10-23 09:43:03+00:00,5312140590146806,33464910,17,49,49,46.445882,55.685714,55.685714,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
8,2022-10-23 09:43:03+00:00,7861785097277308,57959389,18,110,110,74.712222,300.358818,300.358818,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0
9,2022-10-23 09:43:03+00:00,7650162650839187,10481336,15,129,129,100.820667,153.232403,153.232403,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0


Let's look at the final schema of the features table:

In [14]:
run_bq_query(f"""
SELECT column_name, data_type
FROM tx.INFORMATION_SCHEMA.COLUMNS
WHERE table_name = '{features_table_names[-1].split(".")[-1]}'
""")

Finished job_id: 02f1f687-178f-4fc3-a380-a49fb71bde0b


Unnamed: 0,column_name,data_type
0,feature_ts,TIMESTAMP
1,customer_id,STRING
2,terminal_id,STRING
3,customer_id_nb_tx_1day_window,INT64
4,customer_id_nb_tx_7day_window,INT64
5,customer_id_nb_tx_14day_window,INT64
6,customer_id_avg_amount_1day_window,FLOAT64
7,customer_id_avg_amount_7day_window,FLOAT64
8,customer_id_avg_amount_14day_window,FLOAT64
9,terminal_id_nb_tx_1day_window,INT64


## Feature store for feature management

### What is a feature store?

The features generated are great examples of features that we can store the [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore). This is because:

- The features are needed for real-time prediction
- feature values in a feature store can be used for both training and serving
- if needed, features can be shared with other use cases beyond fraud detection

Vertex AI Feature Store provides a centralized repository for organizing, storing, and serving ML features. Using a central featurestore enables an organization to efficiently share, discover, and re-use ML features at scale, which can increase the velocity of developing and deploying new ML applications.

### Why would you like to set up it?

So far you've built and stored features in BigQuery. 

Now, in order to predict fraud, you want to serve those features in real-time with millisecond scale latency. In particular, when the ML gateway receives a prediction request for a specific transaction (including customer, terminal, and transaction ids), the system needs to fetch the features related to that transaction and pass them as inputs to the model for online prediction. As you can imagine, an analytical data warehouse such as BigQuery is not able to provide low-latency near real-time read operations. 

Vertex AI Feature Store provides a managed service for low latency scalable feature serving. It also provides a centralized feature repository with easy APIs to search and discover features, as well as feature monitoring capabilities to track drift and other quality issues. 

Vertex AI Feature Store uses a time series data model to store a series of values for features, which enables Vertex AI Feature Store to maintain feature values as they change over time and to support point-in-time queries of feature values. Feature Store organizes resources hierarchically (`Featurestore -> EntityType -> Feature`) in the following order: 

- **Featurestore**: the resource to contains entities and features.
    - **EntityType**: under a Featurestore, an EntityType describes an minimal data entry.
        - **Feature**: under an EntityType, a feature is an attribute of the EntityType. 


You must create these resources before you can ingest data into a Feature Store. 

In the follow section, you will use create a feature store using Vertex AI Feature Store, and ingest data into it to be used later for training and model inference.

### Initialize Vertex AI SDK

Initialize the Vertex AI SDK to get access to Vertex AI services programmatically. 

In [15]:
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

### Create featurestore, `fraudfinder_<ID>`

A featurestore is the top-level container for entity types, features, and feature values. Typically, an organization creates one shared featurestore for feature ingestion, serving, and sharing across all teams in the organization.

Below you create a `featurestore` resources with different labels. 

In [16]:
# Try to create a new featurestore resource
ff_feature_store = Featurestore.create(
    featurestore_id=f"{FEATURESTORE_ID}",
    online_store_fixed_node_count=ONLINE_STORAGE_NODES,
    labels={"team": "cymbal_bank", "app": "fraudfinder"},
    sync=True,
)

Creating Featurestore
Create Featurestore backing LRO: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/operations/7588531630856208384
Featurestore created. Resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx
To use this Featurestore in another session:
featurestore = aiplatform.Featurestore('projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx')


### Create the main entity types and their features

An entity type is a collection of semantically related features. You define your own entity types, based on the concepts that are relevant to your use case. 

In this case, you create `customer` and `transaction` entity types. 

#### Create the ```customer``` entity type 

In [17]:
customer_entity_type = ff_feature_store.create_entity_type(
    entity_type_id=CUSTOMER_ENTITY_ID, description="Customer Entity", sync=True
)

Creating EntityType
Create EntityType backing LRO: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer/operations/1254271531517739008
EntityType created. Resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer
To use this EntityType in another session:
entity_type = aiplatform.EntityType('projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer')


#### Create features of the ```customer``` entity type

In [18]:
customer_feature_configs = {
    "customer_id_nb_tx_1day_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the customer in the last day",
        "labels": {"status": "passed"},
    },
    "customer_id_nb_tx_7day_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the customer in the last 7 days",
        "labels": {"status": "passed"},
    },
    "customer_id_nb_tx_14day_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the customer in the last 14 days",
        "labels": {"status": "passed"},
    },
    "customer_id_avg_amount_1day_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last day",
        "labels": {"status": "passed"},
    },
    "customer_id_avg_amount_7day_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 7 days",
        "labels": {"status": "passed"},
    },
    "customer_id_avg_amount_14day_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 14 days",
        "labels": {"status": "passed"},
    },
    "customer_id_nb_tx_15min_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the customer in the last 15 minutes",
        "labels": {"status": "passed"},
    },
    "customer_id_nb_tx_30min_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the customer in the last 30 minutes",
        "labels": {"status": "passed"},
    },
    "customer_id_nb_tx_60min_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the customer in the last 60 minutes",
        "labels": {"status": "passed"},
    },
    "customer_id_avg_amount_15min_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 15 minutes",
        "labels": {"status": "passed"},
    },
    "customer_id_avg_amount_30min_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 30 minutes",
        "labels": {"status": "passed"},
    },
    "customer_id_avg_amount_60min_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 60 minutes",
        "labels": {"status": "passed"},
    },
}

In [19]:
customer_feature_ids = customer_entity_type.batch_create_features(
    feature_configs=customer_feature_configs, sync=True
)

Batch creating features EntityType entityType: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer
Batch create Features EntityType entityType backing LRO: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/operations/5845691351621959680
EntityType entityType Batch created features. Resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer


#### Create the ```terminal``` entity type

In [20]:
terminal_entity_type = ff_feature_store.create_entity_type(
    entity_type_id=TERMINAL_ENTITY_ID, description="Terminal Entity", sync=True
)

Creating EntityType
Create EntityType backing LRO: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal/operations/1569523505433673728
EntityType created. Resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal
To use this EntityType in another session:
entity_type = aiplatform.EntityType('projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal')


#### Create features of the ```terminal``` entity type

In [21]:
terminal_feature_configs = {
    "terminal_id_nb_tx_1day_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the terminal in the last day",
        "labels": {"status": "passed"},
    },
    "terminal_id_nb_tx_7day_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the terminal in the 7 days",
        "labels": {"status": "passed"},
    },
    "terminal_id_nb_tx_14day_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the terminal in the 14 days",
        "labels": {"status": "passed"},
    },
    "terminal_id_risk_1day_window": {
        "value_type": "DOUBLE",
        "description": "Risk score calculated average number of frauds on the terminal in the last day",
        "labels": {"status": "passed"},
    },
    "terminal_id_risk_7day_window": {
        "value_type": "DOUBLE",
        "description": "Risk score calculated average number of frauds on the terminal in the last 7 days",
        "labels": {"status": "passed"},
    },
    "terminal_id_risk_14day_window": {
        "value_type": "DOUBLE",
        "description": "Risk score calculated average number of frauds on the terminal in the last 14 day",
        "labels": {"status": "passed"},
    },
    "terminal_id_nb_tx_15min_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the terminal in the last 15 minutes",
        "labels": {"status": "passed"},
    },
    "terminal_id_nb_tx_30min_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the terminal in the last 30 minutes",
        "labels": {"status": "passed"},
    },
    "terminal_id_nb_tx_60min_window": {
        "value_type": "INT64",
        "description": "Number of transactions by the terminal in the last 60 minutes",
        "labels": {"status": "passed"},
    },
    "terminal_id_avg_amount_15min_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 15 minutes",
        "labels": {"status": "passed"},
    },
    "terminal_id_avg_amount_30min_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 30 minutes",
        "labels": {"status": "passed"},
    },
    "terminal_id_avg_amount_60min_window": {
        "value_type": "DOUBLE",
        "description": "Average spending amount in the last 60 minutes",
        "labels": {"status": "passed"},
    },
}

In [22]:
terminal_feature_ids = terminal_entity_type.batch_create_features(
    feature_configs=terminal_feature_configs, sync=True
)

Batch creating features EntityType entityType: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal
Batch create Features EntityType entityType backing LRO: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/operations/5163396008075329536
EntityType entityType Batch created features. Resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal


### Inspect your feature store in the Vertex AI console

You can also inspect your feature store in the [Vertex AI Feature Store console](https://console.cloud.google.com/vertex-ai/features)

### Ingest feature values in Vertex AI Feature Store

Now we need to ingest the actual feature values you created in BigQuery into the Vertex AI Feature Store.

To ingest features values in Vertex AI Feature Store, you need to check the following requirements related to **Source Data format and Layout**:

- Features values have to [be stored](https://cloud.google.com/vertex-ai/docs/featurestore/source-data) in BigQuery tables or Avro and CSV files on Google Cloud Storage.
- Each imported feature entity *must* have an ID.
- Each feature entity can *optionally* have a timestamp, to specifying when the feature values are generated.

For simplicity, you now create customers and terminals tables starting from the feature table you constructed earlier in this notebook.

In the following queries, you select batch and realtime features both for customers and terminals. 

Finally you will ingest feature values from those BigQuery tables into Vertex AI Feature Store.

#### Create queries to ingest feature values in Vertex AI Feature Store

In [23]:
# Define queries to select relevant columns for each entities

sql_queries_for_feature_store_entities = []

customers_sql_query = f"""
CREATE OR REPLACE TABLE
  `{CUSTOMERS_TABLE_NAME}` AS
SELECT feature_ts, customer_id, customer_id_nb_tx_1day_window,
customer_id_nb_tx_7day_window, customer_id_nb_tx_14day_window,
customer_id_avg_amount_1day_window, customer_id_avg_amount_7day_window,
customer_id_avg_amount_14day_window, customer_id_nb_tx_15min_window,
customer_id_nb_tx_30min_window, customer_id_nb_tx_60min_window,
customer_id_avg_amount_15min_window, customer_id_avg_amount_30min_window,
customer_id_avg_amount_60min_window
FROM `{FEATURES_PARTIONED_TABLE}`;
"""

sql_queries_for_feature_store_entities.append(customers_sql_query)

terminals_sql_query = f"""
CREATE OR REPLACE TABLE
  `{TERMINALS_TABLE_NAME}` AS
SELECT feature_ts, terminal_id, terminal_id_nb_tx_1day_window,
terminal_id_nb_tx_7day_window, terminal_id_nb_tx_14day_window,
terminal_id_risk_1day_window,terminal_id_risk_7day_window,
terminal_id_risk_14day_window, terminal_id_nb_tx_15min_window,
terminal_id_nb_tx_30min_window, terminal_id_nb_tx_60min_window,
terminal_id_avg_amount_15min_window, terminal_id_avg_amount_30min_window,
terminal_id_avg_amount_60min_window
FROM `{FEATURES_PARTIONED_TABLE}`;
"""

sql_queries_for_feature_store_entities.append(terminals_sql_query)

#### Run the query above to create customers and terminals feature values tables

In [24]:
for sql_query in sql_queries_for_feature_store_entities:
    run_bq_query(sql_query)

Finished job_id: ec11c0e9-934b-4f91-8266-5dfc868b4195
Finished job_id: 0174239a-9119-44c7-a853-a460ac854ea9


#### Inspect the BigQuery customer and terminal entity tables

In [25]:
run_bq_query(
    f"SELECT * FROM `{CUSTOMERS_TABLE_NAME}` LIMIT 10"
)

Finished job_id: 25b92dc4-b69e-4aa7-b9a5-76cbabed9110


Unnamed: 0,feature_ts,customer_id,customer_id_nb_tx_1day_window,customer_id_nb_tx_7day_window,customer_id_nb_tx_14day_window,customer_id_avg_amount_1day_window,customer_id_avg_amount_7day_window,customer_id_avg_amount_14day_window,customer_id_nb_tx_15min_window,customer_id_nb_tx_30min_window,customer_id_nb_tx_60min_window,customer_id_avg_amount_15min_window,customer_id_avg_amount_30min_window,customer_id_avg_amount_60min_window
0,2022-10-23 09:43:03+00:00,5090795046287155,1,1,1,80.28,80.28,80.28,0,0,0,0.0,0.0,0.0
1,2022-10-23 09:43:03+00:00,2805636636517471,1,1,1,67.23,67.23,67.23,0,0,0,0.0,0.0,0.0
2,2022-10-23 09:43:03+00:00,5533486567478465,1,1,1,32.8,32.8,32.8,0,0,0,0.0,0.0,0.0
3,2022-10-23 09:43:03+00:00,8460617210167899,1,1,1,11.13,11.13,11.13,0,0,0,0.0,0.0,0.0
4,2022-10-23 09:43:03+00:00,548139265099145,1,1,1,76.54,76.54,76.54,0,0,0,0.0,0.0,0.0
5,2022-10-23 09:43:03+00:00,5698547930681804,1,1,1,21.59,21.59,21.59,0,0,0,0.0,0.0,0.0
6,2022-10-23 09:43:03+00:00,4624159904240573,1,1,1,49.58,49.58,49.58,0,0,0,0.0,0.0,0.0
7,2022-10-23 09:43:03+00:00,4618490453079178,1,1,1,26.09,26.09,26.09,0,0,0,0.0,0.0,0.0
8,2022-10-23 09:43:03+00:00,301734323388056,1,1,1,88.98,88.98,88.98,0,0,0,0.0,0.0,0.0
9,2022-10-23 09:43:03+00:00,1975163517046804,1,1,1,48.83,48.83,48.83,0,0,0,0.0,0.0,0.0


In [26]:
run_bq_query(
    f"SELECT * FROM `{TERMINALS_TABLE_NAME}` LIMIT 10"
)

Finished job_id: 4f41b947-7ed7-474b-b647-df1512959d6e


Unnamed: 0,feature_ts,terminal_id,terminal_id_nb_tx_1day_window,terminal_id_nb_tx_7day_window,terminal_id_nb_tx_14day_window,terminal_id_risk_1day_window,terminal_id_risk_7day_window,terminal_id_risk_14day_window,terminal_id_nb_tx_15min_window,terminal_id_nb_tx_30min_window,terminal_id_nb_tx_60min_window,terminal_id_avg_amount_15min_window,terminal_id_avg_amount_30min_window,terminal_id_avg_amount_60min_window
0,2022-10-23 09:43:03+00:00,36933918,37,222,258,0.027027,0.045045,0.042636,0,0,0,0.0,0.0,0.0
1,2022-10-23 09:43:03+00:00,53692739,4,4,4,0.249994,0.249994,0.249994,0,0,0,0.0,0.0,0.0
2,2022-10-23 09:43:03+00:00,59062665,7,7,7,0.142855,0.142855,0.142855,0,0,0,0.0,0.0,0.0
3,2022-10-23 09:43:03+00:00,13812436,8,10,10,0.124998,0.099999,0.099999,0,0,0,0.0,0.0,0.0
4,2022-10-23 09:43:03+00:00,55564660,3,3,3,0.333322,0.333322,0.333322,0,0,0,0.0,0.0,0.0
5,2022-10-23 09:43:03+00:00,94705593,4,4,4,0.499988,0.499988,0.499988,0,0,0,0.0,0.0,0.0
6,2022-10-23 09:43:03+00:00,77302003,7,7,7,0.142855,0.142855,0.142855,0,0,0,0.0,0.0,0.0
7,2022-10-23 09:43:03+00:00,21387618,1,1,1,0.9999,0.9999,0.9999,0,0,0,0.0,0.0,0.0
8,2022-10-23 09:43:03+00:00,7925259,7,7,7,0.142855,0.142855,0.142855,0,0,0,0.0,0.0,0.0
9,2022-10-23 09:43:03+00:00,17373952,7,7,7,0.142855,0.142855,0.142855,0,0,0,0.0,0.0,0.0


#### Ingest customer feature values into `customers` entity in Vertex AI Feature Store 

In the following section, you will import customer feature values into your feature store.

In [27]:
CUSTOMERS_FEATURES_IDS = [
    feature.name for feature in customer_feature_ids.list_features()
]
CUSTOMER_BQ_SOURCE_URI = f"bq://{CUSTOMERS_TABLE_NAME}"
CUSTOMER_ENTITY_ID_FIELD = "customer_id"

In [28]:
customer_entity_type.ingest_from_bq(
    feature_ids=CUSTOMERS_FEATURES_IDS,
    feature_time=FEATURE_TIME,
    bq_source_uri=CUSTOMER_BQ_SOURCE_URI,
    entity_id_field=CUSTOMER_ENTITY_ID_FIELD,
    disable_online_serving=False,
    worker_count=10,
    sync=False,
)

Importing EntityType feature values: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer


<google.cloud.aiplatform.featurestore.entity_type.EntityType object at 0x7f33150f5590> 
resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer

#### Monitor the `customer` features ingestion job in the console.


You can go to the [Feature Store Console](https://console.cloud.google.com/vertex-ai/ingestion-jobs) to view your ingestion job. 

#### Ingest terminal feature values into `terminal` entity in Vertex AI Feature Store 

In the following section, you will import terminal feature values into your feature store.

In [29]:
TERMINAL_ENTITY_ID = "terminal"
TERMINALS_FEATURES_IDS = [
    feature.name for feature in terminal_feature_ids.list_features()
]
TERMINALS_BQ_SOURCE_URI = f"bq://{TERMINALS_TABLE_NAME}"
TERMINALS_ENTITY_ID_FIELD = "terminal_id"

In [30]:
terminal_entity_type.ingest_from_bq(
    feature_ids=TERMINALS_FEATURES_IDS,
    feature_time=FEATURE_TIME,
    bq_source_uri=TERMINALS_BQ_SOURCE_URI,
    entity_id_field=TERMINALS_ENTITY_ID_FIELD,
    disable_online_serving=False,
    worker_count=10,
    sync=False,
)

Importing EntityType feature values: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal


<google.cloud.aiplatform.featurestore.entity_type.EntityType object at 0x7f3314f46310> 
resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal

Import EntityType feature values backing LRO: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer/operations/7554754633650929664
Import EntityType feature values backing LRO: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal/operations/3560114540731432960
EntityType feature values imported. Resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/customer
EntityType feature values imported. Resource name: projects/897147055262/locations/us-central1/featurestores/fraudfinder_z0umx/entityTypes/terminal


#### Monitor the ingestion jobs in the console.

The ingestion jobs you just created run asynchronously and they should take several minutes to complete. Please monitoring them in the [console](https://console.cloud.google.com/vertex-ai/ingestion-jobs).


### Search for feature values 
In this section, you'll run a search query on your feature store to validate that some data was ingested, as expected.

In [None]:
customer_aggregated_features = customer_entity_type.read(
    entity_ids=["5830444124423549", "5469689693941771", "1361459972478769"],
    feature_ids=CUSTOMERS_FEATURES_IDS,
)

In [None]:
customer_aggregated_features

### END

Now you can go to the next notebook `03_feature_engineering_streaming.ipynb`

## (DO NOT RUN) Clean up

In [None]:
# ff_feature_store.delete(sync=True, force=True)