In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Feature Monitoring in Vertex AI Feature Store

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/feature_monitoring_with_feature_registry.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Ffeature_store%2Ffeature_monitoring_with_feature_registry.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/feature_store/feature_monitoring_with_feature_registry.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/feature_monitoring_with_feature_registry.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

In this tutorial, you will learn how to use the Vertex AI SDK for Python to monitor feature data in Vertex AI Feature Store.

This tutorial uses the following Google Cloud ML services and resources:

* Vertex AI Feature Store
* BigQuery

The steps performed include the following:

* Set up BigQuery data
* Set up Feature Registry
* Set up FeatureMonitors and execute FeatureMonitorJobs to observe feature stats and detect drift
* Clean up

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform bigframes

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only act when running on Colab (detected through the loaded `google.colab`
# module); in any other environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    import IPython

    # Ask the kernel to shut down; the True argument is passed as the restart flag.
    IPython.Application.instance().kernel.do_shutdown(True)

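### Restart kernel (Workbench only)

The cell above restarts the runtime only on Google Colab. On Vertex AI Workbench, restart the kernel yourself (**Kernel > Restart Kernel**) so that the newly installed packages are used.

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>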


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# The interactive login is only performed when the notebook runs on Colab
# (detected through the loaded `google.colab` module).
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Grant instance service account permissions (Workbench only)

Grant the service account of your Workbench instance (in the format `xxx-compute@developer.gserviceaccount.com`) the following IAM roles:
*   BigQuery Admin
*   Vertex AI Feature Store Admin


### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
import vertexai

# Replace the placeholder with your own project ID before running this cell.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize the Vertex AI SDK with the project and region chosen above.
vertexai.init(location=LOCATION, project=PROJECT_ID)

### Imports and IDs

In [None]:
import bigframes
import bigframes.pandas
import pandas as pd
from google.cloud import bigquery
from vertexai.resources.preview.feature_store import (Feature, FeatureGroup,
                                                      FeatureMonitor)
from vertexai.resources.preview.feature_store import utils as fs_utils

The following variables set BigQuery and Feature Group resources that will be
used or created. If you'd like to use your own data source (CSV), please adjust
`DATA_SOURCE`.

In [None]:
# BigQuery dataset and table that hold the feature data for this tutorial.
BQ_DATASET_ID = "fhfv_dataset_unique"  # @param {type:"string"}
BQ_TABLE_ID = "fhfv_table_unique"  # @param {type:"string"}
# Fully qualified table ID in `project.dataset.table` form.
BQ_TABLE_URI = f"{PROJECT_ID}.{BQ_DATASET_ID}.{BQ_TABLE_ID}"

# ID of the feature group used throughout this tutorial.
FEATURE_GROUP_ID = "fg_feature_monitoring_tutorial"  # @param {type:"string"}

# CSV file in Cloud Storage that provides the feature data loaded into the table.
DATA_SOURCE = "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/movie_prediction.csv"

## Create BigQuery table containing feature data

First we'll use BigQuery DataFrames to load in our CSV data source. Then we'll
rename the `timestamp` column to `feature_timestamp` to support usage as a
BigQuery source in Feature Registry.

In [None]:
# Open a BigQuery DataFrames session bound to the tutorial's project and region.
bq_options = bigframes.BigQueryOptions(project=PROJECT_ID, location=LOCATION)
session = bigframes.connect(bq_options)

# Load the CSV, convert `timestamp` to UTC datetimes, then expose that column
# under the name `feature_timestamp`.
df = session.read_csv(DATA_SOURCE)
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
df = df.rename(columns={"timestamp": "feature_timestamp"})

Let's preview the data we'll write to the table.

In [None]:
# Display the first rows of the DataFrame as this cell's output.
df.head()

And finally we'll write the DataFrame to the target BigQuery table.

In [None]:
# if_exists="replace" overwrites the table when it already exists, so re-running
# this cell resets the table to the contents of `df`.
df.to_gbq(BQ_TABLE_URI, if_exists="replace")

## Create feature registry resources

Create a feature group backed by the BigQuery table created above.

In [None]:
# Describe the BigQuery table that backs the feature group; the `users` column
# is used as the entity ID column.
bq_source = fs_utils.FeatureGroupBigQuerySource(
    uri=f"bq://{BQ_TABLE_URI}",
    entity_id_columns=["users"],
)
fg: FeatureGroup = FeatureGroup.create(FEATURE_GROUP_ID, bq_source)

In [None]:
# A FeatureGroup that already exists can be looked up directly by its ID
# instead of being created again.
fg = FeatureGroup(FEATURE_GROUP_ID)
print(fg)

Create the `movies` feature which corresponds to the `movies` column in the
recently created BigQuery table.

In [None]:
# Register a feature named "movies" in the feature group `fg`.
movies_feature: Feature = fg.create_feature("movies")

## Setup Feature Monitoring

### Create Feature Monitor

In [None]:
FEATURE_MONITOR_ID = "vertex_sdk_fm_cron"  # @param {type:"string"}

# (feature name, drift threshold) pairs that the monitor watches.
monitored_features = [("movies", 0.1)]
# Cron expression for the schedule: minute 0 of every hour.
hourly_cron = "0 * * * *"

fm: FeatureMonitor = fg.create_feature_monitor(
    name=FEATURE_MONITOR_ID,
    feature_selection_configs=monitored_features,
    schedule_config=hourly_cron,
)

List Feature Monitors created in the Feature Group

In [None]:
# Fetch every FeatureMonitor that belongs to the feature group `fg`.
fms: list[FeatureMonitor] = fg.list_feature_monitors()
print(fms)

Get the FeatureMonitor and its properties

In [None]:
# Look the monitor up by ID, then show the resource and the two settings it
# was created with.
fm = fg.get_feature_monitor(FEATURE_MONITOR_ID)
print(fm)

selection_configs = fm.feature_selection_configs
print("feature selection configs: (feature and it's drift threshold):", selection_configs)

cron_schedule = fm.schedule_config
print("schedule config in cron string: ", cron_schedule)

### Execute a FeatureMonitorJob

A FeatureMonitorJob can be executed in two ways:
1. Automatically, at the times defined by the `schedule_config` of the FeatureMonitor.
2. Manually, on demand. The following sections trigger monitor jobs manually to observe stats and drift.

Stats are generated from the snapshot of the data taken when the FeatureMonitorJob executes.

Manually execute a FeatureMonitorJob as follows:

In [None]:
# Trigger a FeatureMonitorJob on demand rather than waiting for the schedule.
fmj = fm.create_feature_monitor_job()

In [None]:
# Print the job object returned by create_feature_monitor_job().
print(fmj)

#### Observe Feature Stats in FeatureMonitorJob

Get the FeatureMonitorJob and observe its `feature_stats_and_anomalies`. The `feature_stats` field refers to the TensorFlow Metadata proto [FeatureNameStatistics](https://www.tensorflow.org/tfx/tf_metadata/api_docs/python/tfmd/proto/statistics_pb2/FeatureNameStatistics).

In [None]:
# Note: feature_stats_and_anomalies may take from a few seconds to a few
# minutes to appear after the job is created, so poll until it is populated.
import time

# Stop polling after this long so the cell raises a clear error instead of
# hanging forever when the job never reports any stats.
STATS_TIMEOUT_SECONDS = 1800
STATS_POLL_INTERVAL_SECONDS = 5

deadline = time.monotonic() + STATS_TIMEOUT_SECONDS
while True:
    fmj_get = fm.get_feature_monitor_job(fmj.name)
    job_stats = fmj_get.feature_stats_and_anomalies
    # A missing or empty value means the stats are not available yet.
    if job_stats is not None and len(job_stats) > 0:
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"FeatureMonitorJob {fmj.name} reported no feature stats within "
            f"{STATS_TIMEOUT_SECONDS} seconds."
        )
    time.sleep(STATS_POLL_INTERVAL_SECONDS)
print(fmj_get)
print(fmj_get.feature_stats_and_anomalies)

At this point only one job has been executed, so no drift is detected.

In [None]:
# One entry per monitored feature: print its ID, drift score and drift flag.
for stats_entry in fmj_get.feature_stats_and_anomalies:
    print("feature: ", stats_entry.feature_id)
    print("drift score: ", stats_entry.distribution_deviation)
    print("drift detected: ", stats_entry.drift_detected)

#### Get Feature Stats in Feature

In [None]:
# Fetch the `movies` feature, requesting its stats with latest_stats_count=5.
feature_movie = fg.get_feature("movies", latest_stats_count=5)
print(feature_movie)

# Only one job has run so far, so no drift is expected yet.
for job_stats in feature_movie.feature_stats_and_anomalies:
    print("feature monitor job id: ", job_stats.feature_monitor_job_id)
    print("drift score: ", job_stats.distribution_deviation)
    print("drift detected: ", job_stats.drift_detected)

Full feature_stats_and_anomalies in feature

In [None]:
# Print the complete feature_stats_and_anomalies value attached to the feature.
print(feature_movie.feature_stats_and_anomalies)

### Detect drift

Drift happens when the data in the Feature Offline Store (the BigQuery source) changes over time. Every FeatureMonitorJob calculates drift by comparing the data snapshot of the new job with the data snapshot of the previous job.

Algorithm to calculate drift score:
* For Categorical type: [L-infinity](https://en.wikipedia.org/wiki/Chebyshev_distance) distance.
* For Numerical type: [Jensen–Shannon divergence](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence)

In this tutorial, you append additional rows to the BigQuery table to simulate a data change.


In [None]:
from io import StringIO

# Extra rows, in CSV form, used to simulate a change in the source data.
new_rows_csv = """users,movies,timestamp
"new_1","action_1",2024-08-15T08:28:14Z
"new_2","drama_2",2024-09-15T08:28:14Z
"new_3","romance_3",2024-10-15T08:28:14Z
"new_4","science_fiction_4",2024-11-15T09:29:16Z
"new_5","comedy_5",2024-12-11T07:27:19Z
"""

# Read the rows through the BigQuery DataFrames session (not plain pandas) and
# apply the same timestamp conversion and column rename as for the original data.
df_new = session.read_csv(StringIO(new_rows_csv))
df_new["timestamp"] = pd.to_datetime(df_new["timestamp"], utc=True)
df_new = df_new.rename(columns={"timestamp": "feature_timestamp"})
df_new.head()

In [None]:
# Append the new rows to the BigQuery table.
# Note: if_exists="append" adds the rows on every execution, so running this
# cell more than once inserts them again.
df_new.to_gbq(BQ_TABLE_URI, if_exists="append")

In [None]:
# Trigger a second job on demand, after the table has been changed; the
# description marks it as the drift-detection run.
fmj_new = fm.create_feature_monitor_job(description="new job test drift detection")

List FeatureMonitorJobs, all jobs including the new one are shown

In [None]:
# List the FeatureMonitorJobs that exist under the FeatureMonitor `fm`.
fmjs = fm.list_feature_monitor_jobs()
print(fmjs)

Observe drift in Feature Monitor Job

In [None]:
import time  # imported here as well so this cell does not rely on an earlier cell

# Stop polling after this long so the cell raises a clear error instead of
# hanging forever when the job never reports any stats.
STATS_TIMEOUT_SECONDS = 1800
STATS_POLL_INTERVAL_SECONDS = 5

deadline = time.monotonic() + STATS_TIMEOUT_SECONDS
while True:
    fmj_with_drift = fm.get_feature_monitor_job(fmj_new.name)
    drift_stats = fmj_with_drift.feature_stats_and_anomalies
    # A missing or empty value means the stats are not available yet.
    if drift_stats is not None and len(drift_stats) > 0:
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"FeatureMonitorJob {fmj_new.name} reported no feature stats within "
            f"{STATS_TIMEOUT_SECONDS} seconds."
        )
    time.sleep(STATS_POLL_INTERVAL_SECONDS)
print(fmj_with_drift)
# One entry per monitored feature: print its ID, drift score and drift flag.
for feature_stats_and_anomalies in fmj_with_drift.feature_stats_and_anomalies:
    print("feature: ", feature_stats_and_anomalies.feature_id)
    print(
        "drift score (distribution_deviation): ",
        feature_stats_and_anomalies.distribution_deviation,
    )
    print("drift detected: ", feature_stats_and_anomalies.drift_detected)

Observe the full statistics and drift

In [None]:
# Print the complete feature_stats_and_anomalies value of the second job.
print(fmj_with_drift.feature_stats_and_anomalies)

Observe drift in Feature

In [None]:
# Fetch the `movies` feature again, requesting its stats with latest_stats_count=5.
feature_movie = fg.get_feature("movies", latest_stats_count=5)
print(feature_movie)

# Stats from both jobs are expected here: one without drift, one with drift.
for job_stats in feature_movie.feature_stats_and_anomalies:
    print("feature monitor job id: ", job_stats.feature_monitor_job_id)
    print("drift score: ", job_stats.distribution_deviation)
    print("drift detected: ", job_stats.drift_detected)

## Cleaning up

### Delete feature monitor, feature and feature group

In [None]:
# Delete the FeatureMonitor. All FeatureMonitorJobs created under it are
# deleted automatically, but the stats are kept under the Feature.
fm.delete()

In [None]:
# Delete the Feature. All stats stored under the Feature are deleted
# automatically with it.
movies_feature.delete()

In [None]:
# Delete the FeatureGroup, after its FeatureMonitor and Feature were deleted
# in the cells above.
fg.delete()

### Delete BigQuery dataset and table

In [None]:
# Pass the project explicitly: without it the client falls back to the
# environment's default project, which may be unset (for example on Colab) or
# different from PROJECT_ID.
client = bigquery.Client(project=PROJECT_ID)

In [None]:
# Remove the tutorial's table; BQ_TABLE_URI is already the fully qualified table ID.
client.delete_table(BQ_TABLE_URI)

In [None]:
# Delete the dataset itself. delete_contents is left at its default, so
# BigQuery rejects the request if the dataset still contains tables.
client.delete_dataset(f"{PROJECT_ID}.{BQ_DATASET_ID}")