In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
    <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/sdk-feature-store-pandas.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
    
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/sdk-feature-store-pandas.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 
        Run in Colab
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/sdk-feature-store-pandas.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

## Overview

This notebook introduces the Pandas support for Vertex AI Feature Store in the Vertex AI SDK. For prerequisites and an introduction to the Vertex AI SDK and its native Feature Store support, see this [Colab notebook](https://colab.sandbox.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/feature_store/sdk-feature-store.ipynb).

### Dataset

This tutorial uses a movie recommendation dataset as an example throughout all the notebooks including this one. The original task is to train a model to predict if a user is going to watch a movie and serve the model online.

### Objective

In this notebook, you learn how to:

- Ingest Feature values from Pandas DataFrame into Feature Store's Entity types.
- Read Entity Feature values from Online Feature Store into Pandas DataFrame.
- Batch serve Feature values from your Feature Store into Pandas DataFrame.

You also learn how Vertex AI Feature Store can be useful in the below scenarios:

- Online serving with updated feature values.
- Point-in-time correctness to fetch feature values for training.

### Costs 

This tutorial uses billable components of Google Cloud:

* Vertex AI

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Before you begin

### Install additional packages

To run this notebook, you need to install the following packages for Python.

In [None]:
import os

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"
    
! pip install -U {USER_FLAG} --upgrade google-cloud-aiplatform \
                                        google-cloud-bigquery \
                                        google-cloud-bigquery-storage \
                                        avro \
                                        pyarrow \
                                        pandas -q

### Restart the kernel

After you install the packages, you need to restart the notebook kernel so that it can find the packages.

In [None]:
import os

# Restart the kernel after the installs so the new packages can be found.
# The restart is skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

1. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
# Placeholder value; when it is left unchanged, the next cell falls back to
# the project configured in gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make the Cloud SDK use the selected project for the commands in this notebook.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

# Use us-central1 when the placeholder above was left untouched.
REGION = "us-central1" if REGION == "[your-region]" else REGION

### Authenticate your Google Cloud account

**If you are using Google Cloud Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via OAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click **Create**. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
import os
import sys

# Authenticate your Google Cloud account for the calls made in this notebook.
# In Colab this runs the interactive login flow; when running locally it sets
# the GOOGLE_APPLICATION_CREDENTIALS environment variable instead.

# Evaluated again in this cell: the kernel restart earlier in the notebook
# discards previously defined variables. The presence of this file is used to
# detect the Google Cloud Notebook product.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Google Cloud Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account. Skipped when the IS_TESTING environment variable is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Import libraries and define constants

In [None]:
import datetime

import pandas as pd
from google.cloud import aiplatform

# Initialize the Vertex AI SDK with the project and region chosen above.
aiplatform.init(
    location=REGION,
    project=PROJECT_ID,
)

## Create Feature Store Resources

### Create Feature Store

The method to create a Feature Store returns a
[long-running operation](https://google.aip.dev/151) (LRO). An LRO starts an asynchronous job. LROs are returned for other API
methods too, such as updating or deleting a featurestore. Running the code cell creates a featurestore and prints the process logs.

In [None]:
# Create the "movie_predictions" featurestore with one fixed online store node.
movie_predictions_feature_store = aiplatform.Featurestore.create(
    online_store_fixed_node_count=1,
    featurestore_id="movie_predictions",
)

### Create Entity Types

Entity types can be created within the Featurestore class. Below, you create the `Users` entity type and `Movies` entity type. Process logs are printed in the output for each cell.

In [None]:
# Entity type that holds the per-user features.
users_entity_type = movie_predictions_feature_store.create_entity_type(
    description="Users entity",
    entity_type_id="users",
)

In [None]:
# Entity type that holds the per-movie features.
movies_entity_type = movie_predictions_feature_store.create_entity_type(
    description="Movies entity",
    entity_type_id="movies",
)

### Create Features
Features can be created within each entity type. Add defining features to the `Users` entity type and `Movies` entity type by using the following methods.

In [None]:
# Create the three features of the `users` entity type one at a time.

# Integer feature: the user's age.
users_feature_age = users_entity_type.create_feature(
    description="User age",
    value_type="INT64",
    feature_id="age",
)

# String feature: the user's gender.
users_feature_gender = users_entity_type.create_feature(
    description="User gender",
    value_type="STRING",
    feature_id="gender",
)

# String-array feature: the genres the user liked.
users_feature_liked_genres = users_entity_type.create_feature(
    description="An array of genres this user liked",
    value_type="STRING_ARRAY",
    feature_id="liked_genres",
)

In [None]:
# Feature ID -> configuration (value type and description) for the features
# that are batch-created on the `movies` entity type.
movies_feature_configs = {
    "title": dict(value_type="STRING", description="The title of the movie"),
    "genres": dict(value_type="STRING", description="The genre of the movie"),
    "average_rating": dict(
        value_type="DOUBLE",
        description="The average rating for the movie, range is [1.0-5.0]",
    ),
}

In [None]:
# Create all `movies` features in one call from the configuration mapping
# defined in the previous cell.
movie_features = movies_entity_type.batch_create_features(
    feature_configs=movies_feature_configs,
)

## Ingest Feature Values into Entity Type from a Pandas DataFrame

You need to ingest feature values into your entity type containing the features, so you can later `read` (online) or `batch serve` (offline) the feature values from the entity type. In this step, you will learn how to ingest feature values from a Pandas DataFrame into an entity type. You can also import feature values from BigQuery or Google Cloud Storage.


#### Get data from source files

In [None]:
# Cloud Storage locations of the sample Avro files.
GCS_USERS_AVRO_URI = (
    "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/"
    "users.avro"
)
GCS_MOVIES_AVRO_URI = (
    "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/"
    "movies.avro"
)

# Local file names the Avro files are copied to.
USERS_AVRO_FN, MOVIES_AVRO_FN = "users.avro", "movies.avro"

In [None]:
# Copy the sample Avro files from Cloud Storage to the local file names.
! gsutil cp $GCS_USERS_AVRO_URI $USERS_AVRO_FN
! gsutil cp $GCS_MOVIES_AVRO_URI $MOVIES_AVRO_FN

#### Load Avro Files into Pandas DataFrames

In [None]:
from avro.datafile import DataFileReader
from avro.io import DatumReader


class AvroReader:
    """Loads the records of a local Avro file into a pandas DataFrame.

    The file is opened when the reader is constructed and closed once
    `to_dataframe` has consumed it, so an instance should be used for a
    single `to_dataframe` call; create a new instance to read the file again.
    """

    def __init__(self, data_file):
        """Open `data_file` and wrap it in an Avro `DataFileReader`.

        Args:
            data_file: Path of the Avro file to read.
        """
        avro_file = open(data_file, "rb")
        try:
            self.avro_reader = DataFileReader(avro_file, DatumReader())
        except Exception:
            # Do not leak the file handle when the reader cannot be constructed.
            avro_file.close()
            raise

    def to_dataframe(self):
        """Read every record from the file and return them as a DataFrame.

        Returns:
            A pandas DataFrame built from the records, one row per record.
        """
        try:
            records = list(self.avro_reader)
        finally:
            # Release the file opened in __init__, even if reading fails
            # part-way through.
            self.avro_reader.close()
        return pd.DataFrame.from_records(data=records)

In [None]:
# Load the local users Avro file into a DataFrame and print it.
users_avro_reader = AvroReader(data_file=USERS_AVRO_FN)
users_source_df = users_avro_reader.to_dataframe()
print(users_source_df)

In [None]:
# Load the local movies Avro file into a DataFrame and print it.
movies_avro_reader = AvroReader(data_file=MOVIES_AVRO_FN)
movies_source_df = movies_avro_reader.to_dataframe()
print(movies_source_df)

#### Ingest Feature Values into _Users_ Entity Type

In [None]:
# Ingest the user feature values from the DataFrame. Entity IDs are taken from
# the `user_id` column and feature timestamps from the `update_time` column.
users_entity_type.ingest_from_df(
    df_source=users_source_df,
    entity_id_field="user_id",
    feature_time="update_time",
    feature_ids=["age", "gender", "liked_genres"],
)

#### Ingest Feature Values into _Movies_ Entity Type

In [None]:
# Ingest the movie feature values from the DataFrame. Entity IDs are taken from
# the `movie_id` column and feature timestamps from the `update_time` column.
movies_entity_type.ingest_from_df(
    df_source=movies_source_df,
    entity_id_field="movie_id",
    feature_time="update_time",
    feature_ids=["average_rating", "title", "genres"],
)

## Read/Online Serve Entity's Feature Values from Vertex AI Online Feature Store

Feature Store allows [online serving](https://cloud.google.com/vertex-ai/docs/featurestore/serving-online)
which lets you read feature values for small batches of entities. It works well when you want to read values of selected features from an entity or multiple entities in an entity type.

In [None]:
# Online-read the feature values of five users; no feature_ids are passed here.
users_read_df = users_entity_type.read(
    entity_ids=["dave", "alice", "charlie", "bob", "eve"],
)
print(users_read_df)

In [None]:
# Online-read three selected features for four movies and print the result.
movies_read_df = movies_entity_type.read(
    feature_ids=["title", "genres", "average_rating"],
    entity_ids=["movie_01", "movie_02", "movie_03", "movie_04"],
)
print(movies_read_df)

## Batch Serve Feature Values from Vertex AI Feature Store

Batch Serving is used to fetch a large batch of feature values for high-throughput, and is typically used for training a model or batch prediction. In this section, you learn how to prepare training examples by using the Feature Store's batch serve function.

#### Read instances from source file

In [None]:
# Cloud Storage location of the read-instances CSV and its local file name.
GCS_READ_INSTANCES_CSV_URI = (
    "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/"
    "movie_prediction.csv"
)
READ_INSTANCES_CSV_FN = "data.csv"

In [None]:
# Copy the read-instances CSV from Cloud Storage to the local file name.
! gsutil cp $GCS_READ_INSTANCES_CSV_URI $READ_INSTANCES_CSV_FN

#### Load CSV file into a Pandas DataFrame

In [None]:
# Load the read instances from the local CSV file and print them.
read_instances_df = pd.read_csv(READ_INSTANCES_CSV_FN)
print(read_instances_df)

#### Change the dtype of the `timestamp` column to `datetime64`

In [None]:
# Convert the `timestamp` column to datetime64[ns]. The unit is spelled out
# because pandas 2.x rejects the unit-less "datetime64" alias in astype.
print("before: ", read_instances_df["timestamp"].dtype)
read_instances_df = read_instances_df.astype({"timestamp": "datetime64[ns]"})
print("after:  ", read_instances_df["timestamp"].dtype)

#### Batch Serve Feature Values from Movie Predictions Feature Store

In [None]:
# Batch serve the listed `users` and `movies` features for the rows of
# read_instances_df into a DataFrame, then display it.
movie_predictions_df = movie_predictions_feature_store.batch_serve_to_df(
    read_instances_df=read_instances_df,
    serving_feature_ids={
        "users": ["age", "gender", "liked_genres"],
        "movies": ["title", "average_rating", "genres"],
    },
)
movie_predictions_df

## Read the Updated Feature Values

#### Feature Values from last ingestion
Recall that the earlier read from the entity type shows the feature values from the last ingestion.

In [None]:
# Show the movie feature values again, as read before the update below.
print(movies_read_df)

#### Ingest updated Feature Values

In [None]:
# New average ratings for two movies, built column by column.
update_movies_df = pd.DataFrame(
    {
        "movie_id": ["movie_03", "movie_04"],
        "average_rating": [4.3, 4.8],
    }
)
print(update_movies_df)

In [None]:
# Ingest the updated ratings, using one timestamp for all rows instead of a
# timestamp column.
# NOTE(review): datetime.now() is a naive local time — confirm the SDK
# interprets it as intended when the kernel is not running in UTC.
movies_entity_type.ingest_from_df(
    df_source=update_movies_df,
    entity_id_field="movie_id",
    feature_time=datetime.datetime.now(),
    feature_ids=["average_rating"],
)

#### Latest Feature Values
Reading from the entity type now shows the updated feature values from the latest ingestion.

In [None]:
# Online-read the same movies and features again, after the update ingestion.
update_movies_read_df = movies_entity_type.read(
    feature_ids=["title", "genres", "average_rating"],
    entity_ids=["movie_01", "movie_02", "movie_03", "movie_04"],
)
print(update_movies_read_df)

## Point-in-Time Correctness

#### Missing data
Recall that the batch serve result from the last ingestion has some missing data in it.

In [None]:
# Show the earlier batch serve result again, before the backfill below.
print(movie_predictions_df)

#### Backfill/Correct point-in-time data

In [None]:
# One backfill row for user "bob", with an explicit `update_time`.
backfill_users_df = pd.DataFrame(
    data=[["bob", 34, "Male", ["Drama"], "2020-02-13 09:35:15"]],
    columns=["user_id", "age", "gender", "liked_genres", "update_time"],
)
# Convert `update_time` from string to datetime64[ns]. The unit is spelled out
# because pandas 2.x rejects the unit-less "datetime64" alias in astype.
backfill_users_df = backfill_users_df.astype({"update_time": "datetime64[ns]"})
print(backfill_users_df)

In [None]:
# One backfill row for "movie_04", with an explicit `update_time`.
backfill_movies_df = pd.DataFrame(
    data=[["movie_04", 4.2, "The Dark Knight", "Action", "2020-02-13 09:35:15"]],
    columns=["movie_id", "average_rating", "title", "genres", "update_time"],
)
# Convert `update_time` from string to datetime64[ns]. The unit is spelled out
# because pandas 2.x rejects the unit-less "datetime64" alias in astype.
backfill_movies_df = backfill_movies_df.astype({"update_time": "datetime64[ns]"})
print(backfill_movies_df)

#### Ingest backfilled/corrected point-in-time data from dataframe

In [None]:
# Ingest the backfilled user row, timestamped by its `update_time` column.
users_entity_type.ingest_from_df(
    df_source=backfill_users_df,
    entity_id_field="user_id",
    feature_time="update_time",
    feature_ids=["age", "gender", "liked_genres"],
)

In [None]:
# Ingest the backfilled movie row, timestamped by its `update_time` column.
movies_entity_type.ingest_from_df(
    df_source=backfill_movies_df,
    entity_id_field="movie_id",
    feature_time="update_time",
    feature_ids=["average_rating", "title", "genres"],
)

#### Latest ingestion with imputed missing data
Batch Serve from the latest ingestion with backfill/correction has reduced missing data.

In [None]:
# Batch serve the same features for the same read instances again, now that
# the backfilled rows have been ingested, and print the result.
backfill_movie_predictions_df = movie_predictions_feature_store.batch_serve_to_df(
    read_instances_df=read_instances_df,
    serving_feature_ids={
        "users": ["age", "gender", "liked_genres"],
        "movies": ["title", "average_rating", "genres"],
    },
)
print(backfill_movie_predictions_df)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

You can also keep the project but delete the featurestore by running the code below:

In [None]:
# Delete the featurestore created in this notebook.
# NOTE(review): force=True presumably also removes the entity types and
# features it contains — confirm against the SDK documentation.
movie_predictions_feature_store.delete(force=True)