In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using Vertex AI Feature Store with Pandas Dataframe

<table align="left">
    <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/sdk-feature-store-pandas.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
    
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/sdk-feature-store-pandas.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 
        Run in Colab
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/feature_store/sdk-feature-store-pandas.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

## Overview

This notebook introduces Pandas support for Feature Store using the Vertex AI SDK. For prerequisites and an introduction to the Vertex AI SDK and its native Feature Store support, please go through this [Colab notebook](https://colab.sandbox.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/feature_store/sdk-feature-store.ipynb). 

Learn more about [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore).

### Objective

In this notebook, you learn how to use `Vertex AI Feature Store` with Pandas DataFrames.

This tutorial uses the following Google Cloud ML services and resources:

- Vertex AI Feature Store

The steps performed include:

- Ingest Feature values from Pandas DataFrame into Feature Store's Entity types.
- Read Entity feature values from Online Feature Store into Pandas DataFrame.
- Batch serve feature values from your Feature Store into Pandas DataFrame.

You also learn how Vertex AI Feature Store can be useful in the below scenarios:

- Online serving with updated feature values.
- Point-in-time correctness to fetch feature values for training.

### Dataset

This tutorial is a part of the Feature Store tutorial notebooks. It uses a movie recommendation dataset as an example for demonstrating various functionalities of Feature Store. The original task is to train a model to predict if a user is going to watch a movie, and serve the model online.

### Costs 

This tutorial uses billable components of Google Cloud:

* Vertex AI

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Before you begin

### Install additional packages

To run this notebook, you need to install the following packages for Python.

In [None]:
! pip install --quiet --upgrade google-cloud-aiplatform \
                                google-cloud-bigquery \
                                google-cloud-bigquery-storage \
                                avro \
                                pyarrow \
                                pandas

### Colab only: Uncomment the following cell to restart the kernel

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Placeholder value: replace it with the ID of the Google Cloud project to use.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id as the active project of the gcloud CLI
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Location handed to the Vertex AI SDK and used when creating the bucket.
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage URI of the working bucket; the project ID is interpolated into the name.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Make the bucket (`mb`) at BUCKET_URI, located (`-l`) in REGION
! gsutil mb -l $REGION $BUCKET_URI

### Import libraries

In [None]:
import datetime

import pandas as pd
from avro.datafile import DataFileReader
from avro.io import DatumReader
from google.cloud import aiplatform

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and region.

In [None]:
# Configure the SDK with the project and region chosen in the cells above
aiplatform.init(project=PROJECT_ID, location=REGION)

## Create a Feature Store

The method to create a Feature Store in Vertex AI returns a
[long-running operation](https://google.aip.dev/151) (LRO). An LRO starts an asynchronous job. LROs are returned for other API methods too, such as updating or deleting a featurestore. 

Running the code cell below creates a featurestore and prints the process' logs.

In [None]:
# Create a featurestore with ID "movie_predictions" and an online store
# fixed node count of 1; the returned object is reused by later cells.
movie_predictions_feature_store = aiplatform.Featurestore.create(
    featurestore_id="movie_predictions", online_store_fixed_node_count=1
)

## Create Entity types

Entity types can be created within the Featurestore class. Below, you create the `Users` entity type and `Movies` entity type. Process logs are printed in the output for each cell.

In [None]:
# Create the "users" entity type on the featurestore created above
users_entity_type = movie_predictions_feature_store.create_entity_type(
    entity_type_id="users",
    description="Users entity",
)

In [None]:
# Create the "movies" entity type on the featurestore created above
movies_entity_type = movie_predictions_feature_store.create_entity_type(
    entity_type_id="movies",
    description="Movies entity",
)

## Create Features
Features can be created within each entity type. Add defined features to the `Users` entity type and `Movies` entity type by using the following methods.

### Add features using *create_feature* method
Provide the feature information like id, type and description to the `create_feature` method of entity type.

In [None]:
# Create the age, gender and liked_genres features on the users entity type.
# Each tuple is (feature_id, value_type, description); the features are
# created in the order listed and each one is bound to its own variable.
(
    users_feature_age,
    users_feature_gender,
    users_feature_liked_genres,
) = (
    users_entity_type.create_feature(
        feature_id=feature_id,
        value_type=value_type,
        description=description,
    )
    for feature_id, value_type, description in (
        ("age", "INT64", "User age"),
        ("gender", "STRING", "User gender"),
        ("liked_genres", "STRING_ARRAY", "An array of genres this user liked"),
    )
)

### Add features using batch method
You can also create features using a config map in a dictionary format and the `batch_create_features` method. This way, you can add multiple features at once. 

Below, you define and create *title*, *genres* and *average_rating* features using the batch method.

In [None]:
# Map each feature ID to its settings. The keyword names passed to `dict(...)`
# become the "value_type" / "description" keys of every per-feature config.
movies_feature_configs = dict(
    title=dict(value_type="STRING", description="The title of the movie"),
    genres=dict(value_type="STRING", description="The genre of the movie"),
    average_rating=dict(
        value_type="DOUBLE",
        description="The average rating for the movie, range is [1.0-5.0]",
    ),
)

# Create all of the configured movie features with a single batch call
movie_features = movies_entity_type.batch_create_features(
    feature_configs=movies_feature_configs
)

## Ingest Feature values into Entity types from dataframes

You need to ingest feature values into the entity type that contains the features so that you can later `read` (online) or `batch serve` (offline) those values from the entity type. 

In this step, you learn how to ingest feature values from a Pandas dataframe into an entity type. You can also import feature values from BigQuery or Google Cloud Storage.

### Get data from source

Define the public data sources for users and movies and copy them locally into *avro* files.

In [None]:
# Public Cloud Storage locations of the sample users and movies datasets
GCS_USERS_AVRO_URI = (
    "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/users.avro"
)
GCS_MOVIES_AVRO_URI = (
    "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/movies.avro"
)

# Local file names (relative to the notebook's working directory) for the copies
USERS_AVRO_FN = "users.avro"
MOVIES_AVRO_FN = "movies.avro"

# Copy both Avro files from Cloud Storage to the local files named above
! gsutil cp $GCS_USERS_AVRO_URI $USERS_AVRO_FN
! gsutil cp $GCS_MOVIES_AVRO_URI $MOVIES_AVRO_FN

### Load data from avro files 

Load users and movies data from avro files into Pandas dataframes.

In [None]:
# Define a class for reading the avro data
class AvroReader:
    """Load every record of a local Avro container file into a Pandas DataFrame.

    The file is opened when the reader is constructed and is closed as soon as
    its records have been read, so no file handle is left open after the first
    ``to_dataframe`` call.
    """

    def __init__(self, data_file):
        """Open ``data_file`` for reading.

        Args:
            data_file: Path of the Avro file to read.
        """
        # Records are cached by the first to_dataframe() call.
        self._records = None
        avro_file = open(data_file, "rb")
        try:
            self.avro_reader = DataFileReader(avro_file, DatumReader())
        except Exception:
            # Don't leak the handle if the reader cannot be constructed.
            avro_file.close()
            raise

    def to_dataframe(self):
        """Return the file's records as a new DataFrame, one row per record.

        The records are read once and cached, so the method can be called
        repeatedly (for example when a cell is re-run) and yields the same
        data each time.
        """
        if self._records is None:
            try:
                self._records = list(self.avro_reader)
            finally:
                # Release the file opened in __init__, even if reading fails.
                self.avro_reader.close()
        return pd.DataFrame.from_records(data=self._records)

In [None]:
# Load users data from the local avro file into a dataframe and print it
users_avro_reader = AvroReader(data_file=USERS_AVRO_FN)
users_source_df = users_avro_reader.to_dataframe()
print(users_source_df)

In [None]:
# Load movies data from the local avro file into a dataframe and print it
movies_avro_reader = AvroReader(data_file=MOVIES_AVRO_FN)
movies_source_df = movies_avro_reader.to_dataframe()
print(movies_source_df)

### Ingest Feature values into Entity types

Load the feature values into `users` entity type providing the id fields and time field.

In [None]:
# Ingest the three user features from the dataframe. `feature_time` and
# `entity_id_field` are given as column names of `users_source_df`.
users_entity_type.ingest_from_df(
    feature_ids=["age", "gender", "liked_genres"],
    feature_time="update_time",
    df_source=users_source_df,
    entity_id_field="user_id",
)

Load the feature values into the `movies` entity type, providing the ID field and the time field.

In [None]:
# Ingest the three movie features from the dataframe. `feature_time` and
# `entity_id_field` are given as column names of `movies_source_df`.
movies_entity_type.ingest_from_df(
    feature_ids=["average_rating", "title", "genres"],
    feature_time="update_time",
    df_source=movies_source_df,
    entity_id_field="movie_id",
)

## Read/serve Entity's feature values online from Feature Store

Feature Store allows [online serving](https://cloud.google.com/vertex-ai/docs/featurestore/serving-online)
which lets you read feature values for small batches of entities. It works well when you want to read values of selected features from an entity or multiple entities in an entity type.

### Read feature values for users

In [None]:
# Online read for five user entities. No `feature_ids` are passed here;
# presumably the SDK then returns every feature of the entity type — confirm.
users_read_df = users_entity_type.read(
    entity_ids=["dave", "alice", "charlie", "bob", "eve"],
)
print(users_read_df)

### Read feature values for movies

In [None]:
# Online read of three named features for four movie entities. The result is
# kept in `movies_read_df` so it can be compared after the later update.
movies_read_df = movies_entity_type.read(
    entity_ids=["movie_01", "movie_02", "movie_03", "movie_04"],
    feature_ids=["title", "genres", "average_rating"],
)
print(movies_read_df)

## Batch serve feature values from Feature Store

Batch Serving is used to fetch a large batch of feature values for high-throughput, and is typically used for training a model or batch prediction. In this section, you learn how to prepare training examples by using the Feature Store's batch serve function.

### Read instances from source file

Define the source file and destination file. 

In [None]:
# Source: public CSV of read instances. Destination: local file name for its copy.
GCS_READ_INSTANCES_CSV_URI = "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/movie_prediction.csv"
READ_INSTANCES_CSV_FN = "data.csv"

Copy the instances from the source file to the destination file locally.

In [None]:
# Copy the read-instances CSV from Cloud Storage to the local file
! gsutil cp $GCS_READ_INSTANCES_CSV_URI $READ_INSTANCES_CSV_FN

### Load the instances

Load the instances from CSV file into a Pandas dataframe.

In [None]:
# Read the local CSV with pandas defaults (no date parsing is requested here)
read_instances_df = pd.read_csv(READ_INSTANCES_CSV_FN)
print(read_instances_df)

### Change the data type

Convert the `timestamp` column from the dtype inferred by `pd.read_csv` (typically `object`, i.e. plain strings) to `datetime64[ns]`.

In [None]:
# Convert the timestamp column to a datetime dtype and show the dtype before
# and after. The unit is spelled out ("datetime64[ns]") because pandas 2.x
# raises a TypeError for the unit-less "datetime64" alias in `astype`.
print("before: ", read_instances_df["timestamp"].dtype)
read_instances_df = read_instances_df.astype({"timestamp": "datetime64[ns]"})
print("after:  ", read_instances_df["timestamp"].dtype)

### Batch serve feature values from Feature Store

Serve the batch response to a dataframe and display the data.

In [None]:
# Batch serve the listed features of both entity types for the read instances.
# `serving_feature_ids` maps each entity type ID to the feature IDs to serve.
movie_predictions_df = movie_predictions_feature_store.batch_serve_to_df(
    serving_feature_ids={
        "users": ["age", "gender", "liked_genres"],
        "movies": ["title", "average_rating", "genres"],
    },
    read_instances_df=read_instances_df,
)
# Bare last expression: the notebook displays the resulting dataframe
movie_predictions_df

## Read the latest feature values

In Feature Store, you access the latest or the last available feature values unless a specific time is provided. Now, you test this feature by ingesting new data to the entity types and reading it from the Feature Store.

### Ingest updated feature values

Now, you update the feature values by running the following cell. 

**Note:** For comparison, you can try printing the feature values read from the entity types earlier (those in `movies_read_df` variable). 

In [None]:
# Create a dataframe for the new data: new average ratings for two movies.
# It has no time column, unlike the source dataframes ingested earlier.
update_movies_df = pd.DataFrame(
    data=[["movie_03", 4.3], ["movie_04", 4.8]],
    columns=["movie_id", "average_rating"],
)

# Ingest the new data from the dataframe. A single timestamp is passed as
# `feature_time` here instead of a column name.
# NOTE(review): `datetime.datetime.now()` is a naive local-time value —
# confirm how the SDK interprets its timezone.
movies_entity_type.ingest_from_df(
    feature_ids=["average_rating"],
    feature_time=datetime.datetime.now(),
    df_source=update_movies_df,
    entity_id_field="movie_id",
)

### Fetch the latest feature values

Reading from the entity type gives you the updated feature values from the latest ingestion.

In [None]:
# Same entities and features as the earlier movies read, stored under a new
# name so the earlier `movies_read_df` stays available for comparison.
update_movies_read_df = movies_entity_type.read(
    entity_ids=["movie_01", "movie_02", "movie_03", "movie_04"],
    feature_ids=["title", "genres", "average_rating"],
)
print(update_movies_read_df)

## Point-in-time correctness

Vertex AI Feature Store captures feature values for a feature at a specific point in time. In case there are missing values in your past data, you can backfill them using batch serving.

### Missing data
Recall that the response from the earlier batch serve has some missing data in it.

In [None]:
# Print the dataframe returned by the first batch serve
print(movie_predictions_df)

### Backfill/correct point-in-time data

Impute the missing data based on the timestamps.

In [None]:
# Impute the users data: one row of feature values for "bob", with the
# timestamp at which those values apply.
backfill_users_df = pd.DataFrame(
    data=[["bob", 34, "Male", ["Drama"], "2020-02-13 09:35:15"]],
    columns=["user_id", "age", "gender", "liked_genres", "update_time"],
)
# Parse the timestamp string into a datetime column. The unit is explicit
# because pandas 2.x raises a TypeError for the unit-less "datetime64" alias.
backfill_users_df = backfill_users_df.astype({"update_time": "datetime64[ns]"})
print(backfill_users_df)

In [None]:
# Impute the movies data: one row of feature values for "movie_04", with the
# timestamp at which those values apply.
backfill_movies_df = pd.DataFrame(
    data=[["movie_04", 4.2, "The Dark Knight", "Action", "2020-02-13 09:35:15"]],
    columns=["movie_id", "average_rating", "title", "genres", "update_time"],
)
# Parse the timestamp string into a datetime column. The unit is explicit
# because pandas 2.x raises a TypeError for the unit-less "datetime64" alias.
backfill_movies_df = backfill_movies_df.astype({"update_time": "datetime64[ns]"})
print(backfill_movies_df)

### Ingest the backfilled/corrected data

Ingest the imputed point-in-time data from dataframe to the entity types in feature store.

In [None]:
# Ingest the backfilled users data; the `update_time` column supplies the
# feature timestamp of the row.
users_entity_type.ingest_from_df(
    feature_ids=["age", "gender", "liked_genres"],
    feature_time="update_time",
    df_source=backfill_users_df,
    entity_id_field="user_id",
)

In [None]:
# Ingest the backfilled movies data; the `update_time` column supplies the
# feature timestamp of the row.
movies_entity_type.ingest_from_df(
    feature_ids=["average_rating", "title", "genres"],
    feature_time="update_time",
    df_source=backfill_movies_df,
    entity_id_field="movie_id",
)

### Fetch the latest data
Batch serve the latest ingested data with backfill/correction to a dataframe to ensure the feature store is updated. 

In [None]:
# Repeat the earlier batch serve with the same features and read instances,
# now that the backfilled rows have been ingested.
backfill_movie_predictions_df = movie_predictions_feature_store.batch_serve_to_df(
    serving_feature_ids={
        "users": ["age", "gender", "liked_genres"],
        "movies": ["title", "average_rating", "genres"],
    },
    read_instances_df=read_instances_df,
)
print(backfill_movie_predictions_df)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

In [None]:
# Delete the feature store.
# NOTE(review): `force=True` presumably lets the delete proceed even though the
# featurestore still contains entity types and features — confirm in the SDK docs.
movie_predictions_feature_store.delete(force=True)