In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 2 : experimentation: get started with Feature Store

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage2/get_started_vertex_feature_store.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/ai/platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage2/get_started_vertex_feature_store.ipynb">
      Open in Google Cloud Notebooks
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 2 : experimentation: get started with Feature Store.

### Dataset

The dataset used for this tutorial is the Movie Recommendations dataset. The version of the dataset you will use in this tutorial is stored in a public Cloud Storage bucket, in Avro format.

The dataset is used to predict whether a person will watch a movie.

### Objective

In this tutorial, you learn how to use `Vertex AI Feature Store` for training and prediction with `Vertex AI`.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Feature Store`

The steps performed include:

- Creating a Vertex AI `Featurestore` resource.
    - Creating `EntityType` resources for the `Featurestore` resource.
    - Creating `Feature` resources for each `EntityType` resource.
- Import feature values (entity data items) into `Featurestore` resource.
- Perform online serving from a `Featurestore` resource.
- Perform batch serving from a `Featurestore` resource.

## Installations

Install *one time* the packages for executing the MLOps notebooks.

In [None]:
# Set ONCE_ONLY to True the first time you run this notebook to install the
# packages below; leave it False on later runs to skip the installs.
# NOTE(review): USER_FLAG is not assigned anywhere in the visible notebook --
# confirm where it is expected to come from before relying on it.
ONCE_ONLY = False
if ONCE_ONLY:
    # TensorFlow and its companion libraries are pinned to fixed versions.
    ! pip3 install -U tensorflow==2.5 $USER_FLAG
    ! pip3 install -U tensorflow-data-validation==1.2 $USER_FLAG
    ! pip3 install -U tensorflow-transform==1.2 $USER_FLAG
    ! pip3 install -U tensorflow-io==0.18 $USER_FLAG
    # The remaining packages are upgraded without a version pin.
    ! pip3 install --upgrade google-cloud-aiplatform[tensorboard] $USER_FLAG
    ! pip3 install --upgrade google-cloud-pipeline-components $USER_FLAG
    ! pip3 install --upgrade google-cloud-bigquery $USER_FLAG
    ! pip3 install --upgrade google-cloud-logging $USER_FLAG
    ! pip3 install --upgrade apache-beam[gcp] $USER_FLAG
    ! pip3 install --upgrade pyarrow $USER_FLAG
    ! pip3 install --upgrade cloudml-hypertune $USER_FLAG
    ! pip3 install --upgrade kfp $USER_FLAG

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# When IS_TESTING is set to a non-empty value the kernel is left running.
if not os.environ.get("IS_TESTING"):
    import IPython

    # Restart the kernel so that newly installed packages can be found.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
# Replace the placeholder with your Google Cloud project ID. If it is left
# unchanged, the next cell tries to read the project ID from gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the active project for subsequent gcloud commands.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used to build the API endpoint and the parent resource path below.
REGION = "us-central1"  # @param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (YYYYmmddHHMMSS) appended to resource names for uniqueness.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

### Set up variables

Next, set up some variables used throughout the tutorial.

### Import libraries and define constants

#### Import Vertex SDK

Import the Vertex SDK into your Python environment.

In [None]:
import time

from google.cloud.aiplatform import gapic as aip

#### Import BigQuery

Import the BigQuery package into your Python environment.

In [None]:
from google.cloud import bigquery

### Create BigQuery client

Create the BigQuery client.

In [None]:
# Bind the client to PROJECT_ID explicitly so it does not depend on a default
# project being discoverable from the environment.
bqclient = bigquery.Client(project=PROJECT_ID)

#### Vertex AI constants

Set up the following constants for Vertex AI:

- `API_ENDPOINT`: The Vertex AI API service endpoint for `FeatureStore` services.

In [None]:
# Regional Vertex AI API endpoint that the service clients connect to
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# Parent resource path (project + location) used for Featurestore requests
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

## Set up clients

The Vertex SDK works as a client/server model. On your side (the Python script) you will create a client that sends requests and receives responses from the Vertex AI server.

You will use different clients in this tutorial for different steps in the workflow. So set them all up upfront.

- Feature Store Service for creating a feature store.
- Feature Store Serving Service for serving from a feature store.

In [None]:
# Every service client in this notebook targets the same regional endpoint.
client_options = {"api_endpoint": API_ENDPOINT}


def create_feature_store_client():
    """Return a FeaturestoreServiceClient bound to the shared client options."""
    return aip.FeaturestoreServiceClient(client_options=client_options)


def create_feature_store_serving_client():
    """Return a FeaturestoreOnlineServingServiceClient bound to the shared client options."""
    return aip.FeaturestoreOnlineServingServiceClient(client_options=client_options)


# Clients are created once up front and looked up by key in later cells.
clients = {
    "feature_store": create_feature_store_client(),
    "feature_store_serving": create_feature_store_serving_client(),
}

# Each item is a (key, client) pair.
for client in clients.items():
    print(client)

## Introduction to Vertex AI Feature Store

Let's assume you have a recommendation model that predicts a coupon to print on the back of a cash register receipt. Now, if that model was trained only on single transaction instances (what was bought and how much), then (in the past) you used an Apriori algorithm.

But now we have historical data on the customer (say it's indexed by credit card number). Like total purchases to date, average purchase per transaction, frequency of purchase by product category, etc. We use this "enriched data" to train a recommender system.

Now it's time to do a live prediction. You get a transaction from the cash register, but all it has is the credit card number and this transaction. It does not have the enriched data the model needs. During serving, the credit card number is used as an index to Feature Store to get the enriched data needed for the model.

Next problem. Let's say the enriched data the model was trained on was timestamp June 1. This transaction is June 15. Assume that the user has made other transactions between June 1 and 15, and the enriched data has been continuously updated in Feature Store. But the model was trained on June 1st data. FeatureStore knows the version number and serves the June 1 version to the model (not the current June 15); otherwise, if you used June 15 data you have training-serving skew.

Next problem, data drift. Things change, suddenly one day everybody is buying toilet paper! There is a significant change in the distribution of the current stored enriched data from the distribution that the deployed model was trained on. FeatureStore can detect changes/thresholds in distribution changes and trigger a notification for retraining the model.

Learn more about [Vertex AI Feature Store API](https://cloud.google.com/vertex-ai/docs/featurestore)

## Vertex AI Feature Store data model

Vertex AI Feature Store organizes data with the following 3 important hierarchical concepts:

        Featurestore -> EntityType -> Feature

- `Featurestore`: the place to store your features
- `EntityType`: under a `Featurestore`, an `EntityType` describes an object to be modeled, real one or virtual one.
- `Feature`: under an `EntityType`, a `Feature` describes an attribute of the `EntityType`

Learn more about [Vertex AI Feature Store data model](https://cloud.google.com/vertex-ai/docs/featurestore/concepts).

In the movie prediction dataset, you create a `Featurestore` resource called movies. This `Featurestore` resource has 2 entity types:
- `users`: The entity type has the `age`, `gender`, and `liked_genres` features.
- `movies`: The entity type has the `title`, `genres`, and `average_rating` features.

## Create a `Featurestore` resource

First, you create a `Featurestore` for the dataset using the `create_featurestore()` method, with the following parameters:

- `parent`: The base portion of the fully qualified resource identifier (`projects/<project>/locations/<location>`)
- `featurestore_id`: The name of the feature store.
- `featurestore`: Configuration settings for the feature store.
    - `online_serving_config`: Configuration settings for online serving from the feature store.

Note, this is a long-running-operation (LRO), so you do a `response.result()` to block on the operation completing.

In [None]:
from google.cloud.aiplatform_v1.types import featurestore, featurestore_service

# ID of the Featurestore resource (the last segment of its resource path).
# The session timestamp is appended so that re-running the notebook, or
# several users sharing one project, do not collide on an existing ID.
FEATURESTORE_NAME = "movies_" + TIMESTAMP

response = clients["feature_store"].create_featurestore(
    featurestore_service.CreateFeaturestoreRequest(
        parent=PARENT,
        featurestore_id=FEATURESTORE_NAME,
        featurestore=featurestore.Featurestore(
            # Online serving is configured with a fixed node count of 1.
            online_serving_config=featurestore.Featurestore.OnlineServingConfig(
                fixed_node_count=1
            )
        ),
    )
)

# Creation is a long-running operation; block until it completes.
response.result()

### List your `Featurestore` resources

You can get a list of all your `Featurestore` resources in your project using the `list_featurestores()` method, with the following parameters:

- `parent`: The base portion of the fully qualified resource identifier (`projects/<project>/locations/<location>`)

In [None]:
featurestores = clients["feature_store"].list_featurestores(parent=PARENT)

# The loop name deliberately differs from `featurestore`, the types module
# imported earlier in this notebook, so that module is not rebound.
for featurestore_resource in featurestores:
    print(featurestore_resource)

### Get a `Featurestore` resource

You can get a specified `Featurestore` resource in your project using the `get_featurestore()` method, with the following parameters:

- `name`: The fully qualified resource identifier for the `Featurestore` resource.

In [None]:
# Fully qualified resource name of the Featurestore; reused by later cells.
resource_name = clients["feature_store"].featurestore_path(
    PROJECT_ID, REGION, FEATURESTORE_NAME
)
print(resource_name)

# Stored under its own name so that `featurestore`, the types module imported
# earlier in this notebook, is not rebound.
featurestore_resource = clients["feature_store"].get_featurestore(name=resource_name)
print(featurestore_resource)

## Create entity types for your `Featurestore` resource

Next, you create the `EntityType` resources for your `Featurestore` resource using the `create_entity_type()` method, with the following parameters:

- `parent`: The fully qualified resource identifier for the `Featurestore` resource.
- `entity_type_id`: The name of the `EntityType` resource.
- `entity_type`: Configuration settings for the `EntityType` resource.

In [None]:
from google.cloud.aiplatform_v1.types import entity_type

# (entity type ID, description) pairs for the entity types to create.
entity_type_definitions = (
    ("users", "Users descrip"),
    ("movies", "Movies descrip"),
)

for name, description in entity_type_definitions:
    create_request = featurestore_service.CreateEntityTypeRequest(
        parent=resource_name,
        entity_type_id=name,
        entity_type=entity_type.EntityType(description=description),
    )
    response = clients["feature_store"].create_entity_type(create_request)

    # Wait for this create operation to finish before starting the next one.
    response.result()

### Add `Feature` resources for your `EntityType` resources

Next, you create the `Feature` resources for each of the `EntityType` resources in your `Featurestore` resource using the `create_feature()` method, with the following parameters:

- `parent`: The fully qualified resource identifier for the `EntityType` resource.
- `feature_id`: The name of the `Feature` resource.
- `feature`: The configuration settings for the `Feature` resource.

In [None]:
from google.cloud.aiplatform_v1.types import feature


def create_features(featurestore_name, entity_name, features):
    """Create Feature resources under one EntityType, one at a time.

    Args:
        featurestore_name: ID of the Featurestore that owns the entity type.
        entity_name: ID of the EntityType that receives the features.
        features: Iterable of (feature ID, description, value type) triples.
    """
    service = clients["feature_store"]
    entity_type_name = service.entity_type_path(
        PROJECT_ID, REGION, featurestore_name, entity_name
    )
    for feature_id, description, value_type in features:
        operation = service.create_feature(
            parent=entity_type_name,
            feature_id=feature_id,
            feature=feature.Feature(value_type=value_type, description=description),
        )

        # Block until this feature has been created.
        operation.result()


# Shorthand for the enum of feature value types.
value_types = feature.Feature.ValueType

# Features of the "users" entity type: (feature ID, description, value type).
create_features(
    featurestore_name=FEATURESTORE_NAME,
    entity_name="users",
    features=[
        ("age", "Age descrip", value_types.INT64),
        ("gender", "Gender descrip", value_types.STRING),
        ("liked_genres", "Genres descrip", value_types.STRING_ARRAY),
    ],
)

# Features of the "movies" entity type.
create_features(
    featurestore_name=FEATURESTORE_NAME,
    entity_name="movies",
    features=[
        ("title", "Title descrip", value_types.STRING),
        ("genres", "Genres descrip", value_types.STRING),
        ("average_rating", "Ave descrip", value_types.DOUBLE),
    ],
)

### Search all `Feature` resources in your `Featurestore` resources

You can get a list of all `Feature` resources in your `Featurestore` resources using the method `search_features()`, with the following parameters:

- `location`: The base portion of the fully qualified resource identifier (`projects/<project>/locations/<location>`)

In [None]:
features = clients["feature_store"].search_features(location=PARENT)

# Print each matching item rather than the whole result object once per
# iteration. The loop name also avoids rebinding the `feature` types module
# imported earlier in this notebook.
for found_feature in features:
    print(found_feature)

### Search `Feature` resources using a query filter

You can narrow your search of `Feature` resources by specifying a `query` filter.

In [None]:
# Search by name
features = clients["feature_store"].search_features(
    featurestore_service.SearchFeaturesRequest(
        location=PARENT, query="feature_id:title"
    )
)

print("By Name")
# Print each matching item, not the whole result object once per iteration.
# The loop name avoids rebinding the `feature` types module imported earlier.
for found_feature in features:
    print(found_feature)

# Search by data type
features = clients["feature_store"].search_features(
    featurestore_service.SearchFeaturesRequest(
        location=PARENT, query="value_type=DOUBLE"
    )
)

print("By Data Type")
for found_feature in features:
    print(found_feature)

In [None]:
# CSV file passed as the read instances of the batch serving request.
IMPORT_FILE = (
    "gs://cloud-samples-data/vertex-ai/feature-store/datasets/movie_prediction.csv"
)
# Avro files with the feature values to import, keyed by entity type ID.
# NOTE(review): the "movies" file is in a different bucket
# (cloud-samples-data-us-central1) than the other two paths -- confirm this
# is intentional.
FS_ENTITIES = {
    "users": "gs://cloud-samples-data/vertex-ai/feature-store/datasets/users.avro",
    "movies": "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/movies.avro",
}

## Import the feature data into your `Featurestore` resource

Next, you import the feature data for your `Featurestore` resource. Once imported, you can use these feature values for online and offline (batch) serving.

### Data layout

Each imported `EntityType` resource data must have an ID; also, each `EntityType` resource data item can optionally have a timestamp, specifying when the feature values were generated.

When importing, specify the following in your request:

- Data source format: BigQuery Table/Avro/CSV
- Data source URL
- Destination: featurestore/entity types/features to be imported

The feature values for the movies dataset are in Avro format. The Avro schemas are as follows:

**Users entity**:

```
schema = {
  "type": "record",
  "name": "User",
  "fields": [
      {
       "name":"user_id",
       "type":["null","string"]
      },
      {
       "name":"age",
       "type":["null","long"]
      },
      {
       "name":"gender",
       "type":["null","string"]
      },
      {
       "name":"liked_genres",
       "type":{"type":"array","items":"string"}
      },
      {
       "name":"update_time",
       "type":["null",{"type":"long","logicalType":"timestamp-micros"}]
      },
  ]
 }
 ```

**Movies entity**:

```
schema = {
 "type": "record",
 "name": "Movie",
 "fields": [
     {
      "name":"movie_id",
      "type":["null","string"]
     },
     {
      "name":"average_rating",
      "type":["null","double"]
     },
     {
      "name":"title",
      "type":["null","string"]
     },
     {
      "name":"genres",
      "type":["null","string"]
     },
     {
      "name":"update_time",
      "type":["null",{"type":"long","logicalType":"timestamp-micros"}]
     },
 ]
}
```

### Importing the feature values

You import the feature values for the `EntityType` resources using the `import_feature_values()` method, with the following parameters:

- `entity_type`: The fully qualified resource identifier for the `EntityType` resource.
- The location of the feature values, one of:
    - `avro_source`: The Cloud Storage location of the feature values in Avro format.
    - `csv_source`: The Cloud Storage location of the feature values in CSV format.
    - `bigquery_source`: The BigQuery table for the feature values.
- `entity_id_field`: The source column for the unique ID for each entity data item.
- `feature_specs`: The source columns for the features to import into the `EntityType` resource.
- `feature_time_field`: The source column for the timestamp of each entity data item.
- `worker_count`: The number of parallel workers to read in and update the feature values in the `EntityType` resource.

In [None]:
from google.cloud.aiplatform_v1.types import io as io

FeatureSpec = featurestore_service.ImportFeatureValuesRequest.FeatureSpec

# One entry per entity type, imported in this order:
# (entity type ID, ID column in the Avro file, feature IDs to import).
import_plan = [
    ("users", "user_id", ["age", "gender", "liked_genres"]),
    ("movies", "movie_id", ["title", "genres", "average_rating"]),
]

for entity_type_id, id_column, feature_ids in import_plan:
    request = featurestore_service.ImportFeatureValuesRequest(
        entity_type=clients["feature_store"].entity_type_path(
            PROJECT_ID, REGION, FEATURESTORE_NAME, entity_type_id
        ),
        # Source
        avro_source=io.AvroSource(
            gcs_source=io.GcsSource(uris=[FS_ENTITIES[entity_type_id]])
        ),
        entity_id_field=id_column,
        # Features
        feature_specs=[FeatureSpec(id=feature_id) for feature_id in feature_ids],
        feature_time_field="update_time",
        worker_count=1,
    )

    # Wait for each import to finish before starting the next one.
    response = clients["feature_store"].import_feature_values(request)
    response.result()

## Vertex AI Feature Store serving

The Vertex AI Feature Store service provides the following two services for serving features from a `Featurestore` resource:

- Online serving - low-latency serving of small batches of features (prediction).

- Batch serving - high-throughput serving of large batches of features (training and prediction).

In [None]:
from google.cloud.aiplatform_v1.types import (FeatureSelector, IdMatcher,
                                              featurestore_online_service)


def serve_features(featurestore_name, entity_name, features, id):
    """Read selected feature values of a single entity via online serving.

    Args:
        featurestore_name: ID of the Featurestore to read from.
        entity_name: ID of the EntityType the entity belongs to.
        features: Feature IDs to select.
        id: ID of the entity whose feature values are read.

    Returns:
        The response of the online serving client's read_feature_values() call.
    """
    selector = FeatureSelector(id_matcher=IdMatcher(ids=features))
    # Fetch from the following feature store/entity type
    entity_type_name = clients["feature_store"].entity_type_path(
        PROJECT_ID, REGION, featurestore_name, entity_name
    )
    read_request = featurestore_online_service.ReadFeatureValuesRequest(
        entity_type=entity_type_name,
        entity_id=id,
        feature_selector=selector,
    )
    return clients["feature_store_serving"].read_feature_values(read_request)


# Read three features of the "users" entity with ID "alice".
features = serve_features(
    featurestore_name=FEATURESTORE_NAME,
    entity_name="users",
    features=["age", "gender", "liked_genres"],
    id="alice",
)

print(features)

### Multiple entity data items

You serve features for multiple entity data items using the `streaming_read_feature_values()` method with the following parameters:

- `entity_type`: The fully qualified resource identifier for the `EntityType` resource.
- `feature_selector`: The features to serve from the corresponding `EntityType` resource.
- `entity_ids`: The unique IDs of the data items to serve the corresponding features.

In [None]:
from google.cloud.aiplatform_v1.types import (FeatureSelector, IdMatcher,
                                              featurestore_online_service)


def serve_streaming_features(featurestore_name, entity_name, features, ids):
    """Read selected feature values of several entities via streaming online serving.

    Args:
        featurestore_name: ID of the Featurestore to read from.
        entity_name: ID of the EntityType the entities belong to.
        features: Feature IDs to select.
        ids: IDs of the entities whose feature values are read.

    Returns:
        The result of the online serving client's
        streaming_read_feature_values() call, which the caller iterates over.
    """
    selector = FeatureSelector(id_matcher=IdMatcher(ids=features))
    # Fetch from the following feature store/entity type
    entity_type_name = clients["feature_store"].entity_type_path(
        PROJECT_ID, REGION, featurestore_name, entity_name
    )
    streaming_request = featurestore_online_service.StreamingReadFeatureValuesRequest(
        entity_type=entity_type_name,
        entity_ids=ids,
        feature_selector=selector,
    )
    return clients["feature_store_serving"].streaming_read_feature_values(
        streaming_request
    )


# Read the same three features for the "users" entities "alice" and "bob".
features = serve_streaming_features(
    FEATURESTORE_NAME, "users", ["age", "gender", "liked_genres"], ["alice", "bob"]
)

# The loop name deliberately differs from `feature`, the types module imported
# earlier in this notebook, so that module is not rebound.
for streamed_item in features:
    print(streamed_item)

## Batch Serving

The Vertex AI Feature Store batch serving service is optimized for serving large batches of features in real-time with high-throughput, typically for training a model or batch prediction.

### Output dataset

For batch serving, you use a BigQuery table for the output. First, you must create the BigQuery dataset that will hold this output table.

In [None]:
# Output dataset
DESTINATION_DATASET = f"movies_predictions_{TIMESTAMP}"

# Output table.
DESTINATION_TABLE = "training_data"  # @param {type:"string"}

DESTINATION_TABLE_URI = f"bq://{PROJECT_ID}.{DESTINATION_DATASET}.{DESTINATION_TABLE}"

dataset_id = f"{PROJECT_ID}.{DESTINATION_DATASET}"
dataset = bigquery.Dataset(dataset_id)
dataset = bqclient.create_dataset(dataset)
print("Created dataset:", dataset_id)

### Batch Read Feature Values

Assemble the request, which specifies the following information:

*   Where is the label data, i.e., Table 1.
*   Which features are read, i.e., the column names in Table 2.

The output is stored in a BigQuery table.

In [None]:
EntityTypeSpec = featurestore_service.BatchReadFeatureValuesRequest.EntityTypeSpec

# Read the 'age', 'gender' and 'liked_genres' features from the 'users' entity
# (use "*" as the only ID to select all features within an entity type).
users_spec = EntityTypeSpec(
    entity_type_id="users",
    feature_selector=FeatureSelector(
        id_matcher=IdMatcher(ids=["age", "gender", "liked_genres"])
    ),
)

# Read the 'average_rating' and 'genres' feature values of the 'movies' entity
movies_spec = EntityTypeSpec(
    entity_type_id="movies",
    feature_selector=FeatureSelector(
        id_matcher=IdMatcher(ids=["average_rating", "genres"])
    ),
)

request = featurestore_service.BatchReadFeatureValuesRequest(
    # featurestore info
    featurestore=clients["feature_store"].featurestore_path(
        PROJECT_ID, REGION, FEATURESTORE_NAME
    ),
    # URL for the label data (the read instances CSV file)
    csv_read_instances=io.CsvSource(gcs_source=io.GcsSource(uris=[IMPORT_FILE])),
    # Output goes to the BigQuery table URI defined earlier
    destination=featurestore_service.FeatureValueDestination(
        bigquery_destination=io.BigQueryDestination(output_uri=DESTINATION_TABLE_URI)
    ),
    entity_type_specs=[users_spec, movies_spec],
)

response = clients["feature_store"].batch_read_feature_values(request)

# Block until the batch read operation completes.
response.result()

### Delete a BigQuery dataset

Use the method `delete_dataset()` to delete a BigQuery dataset along with all its tables, by setting the parameter `delete_contents` to `True`.

In [None]:
# Clean up the output dataset; delete_contents=True also removes its tables.
bqclient.delete_dataset(dataset, delete_contents=True)

### Delete a `Featurestore` resource

You can delete a specified `Featurestore` resource using the `delete_featurestore()` method, with the following parameters:

- `name`: The fully qualified resource identifier for the `Featurestore` resource.
- `force`: Forces deletion of the `Featurestore` resource when non-empty.

In [None]:
# Deletion is started as an operation, like the create/import calls above;
# wait on it so the cell only finishes once the delete has completed.
response = clients["feature_store"].delete_featurestore(name=resource_name, force=True)
response.result()