In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 3 : formalization: get started with BigQuery and TFDV pipeline components

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_bq_tfdv_pipeline_components.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_bq_tfdv_pipeline_components.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_bq_tfdv_pipeline_components.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 3 : formalization: get started with BigQuery and TFDV pipeline components.

### Dataset

The dataset used for this tutorial is the GSOD dataset from [BigQuery public datasets](https://cloud.google.com/bigquery/public-data). In this version of the dataset, you use only the fields year, month and day to predict the value of the mean daily temperature (mean_temp).

### Objective

In this tutorial, you learn how to build lightweight Python components for BigQuery and TensorFlow Data Validation.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Pipelines`
- `Vertex AI Datasets`
- `BigQuery`

The steps performed include:

- Build and execute a pipeline component for creating a Vertex AI Tabular Dataset from a BigQuery table.
- Build and execute a pipeline component for generating TFDV statistics and schema from a Vertex AI Tabular Dataset.
- Execute a Vertex AI pipeline.

### Costs 


This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage


Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installations

Install *one time* the packages for executing the MLOps notebooks.

In [None]:
import os

# The presence of this metadata file is used to detect the Google Cloud
# Notebook product, which has specific installation requirements.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user';
# in every other environment no extra pip flag is passed.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""

In [None]:
# Install/upgrade the packages imported by this notebook. $USER_FLAG expands
# to "--user" on Google Cloud Notebooks and to an empty string elsewhere.
# NOTE(review): none of these packages is version-pinned, so the versions
# installed depend on when the cell is run.
! pip3 install -U tensorflow $USER_FLAG
! pip3 install --upgrade google-cloud-aiplatform[tensorboard] $USER_FLAG
! pip3 install --upgrade kfp $USER_FLAG

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Restart the kernel after the installs so the new packages can be found.
# The restart is skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    from IPython import Application

    Application.instance().kernel.do_shutdown(True)

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
import os

# Google Cloud project ID. Leave the placeholder (or an empty value) to have
# the next cell look the ID up with gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the project configured in gcloud.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region setting for this notebook; the placeholder value is replaced by the
# default "us-central1".
REGION = "[your-region]"  # @param {type:"string"}
if REGION == "[your-region]":
    REGION = "us-central1"

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (YYYYMMDDHHMMSS, from datetime.now()) appended to resource
# names to avoid collisions between users.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

### Authenticate your Google Cloud account

**If you are using Google Cloud Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via OAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
import os
import sys

# Authenticate to Google Cloud according to the environment the notebook is
# running in: nothing is done on Google Cloud Notebooks, Colab uses its
# interactive auth helper, and any other environment uses a service account
# key file.
#
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# The Google Cloud Notebook product has specific requirements; the presence of
# this metadata file is used to detect it.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Google Cloud Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    # Colab is detected by its module already being loaded in this process.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account. This branch is skipped when the IS_TESTING environment variable
    # is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
# Cloud Storage bucket URI. Leave the placeholder (or an empty value) to have
# the next cell generate a name.
BUCKET_URI = "gs://[your-bucket-name]"  # @param {type:"string"}

In [None]:
# When no bucket was entered above, derive a bucket name from the project ID
# and the session timestamp.
if BUCKET_URI in ("", None, "gs://[your-bucket-name]"):
    BUCKET_URI = "".join(["gs://", PROJECT_ID, "aip-", TIMESTAMP])

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in the location given by REGION.
! gsutil mb -l $REGION $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents to confirm that the bucket is accessible.
! gsutil ls -al $BUCKET_URI

#### Service Account

**If you don't know your service account**, try to get your service account using the `gcloud` command by executing the second cell below.

*Note:* The code for automatically finding your service account works on a user-managed Vertex AI Workbench notebook. If you are using a fully-managed notebook or Colab, you will need to manually enter your service account.

In [None]:
# Service account e-mail. Leave the placeholder (or an empty value) to have
# the next cell try to look it up with gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your GCP project id from gcloud
    shell_output = !gcloud auth list 2>/dev/null
    SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()
    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step -- you only need to run these once per service account.

In [None]:
# Grant the service account the object creator role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Grant the service account the object viewer role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Set up variables

Next, set up some variables used throughout the tutorial.

### Import libraries and define constants

In [None]:
import google.cloud.aiplatform as aip

#### Import TensorFlow

Import the TensorFlow package into your Python environment.

In [None]:
import tensorflow as tf

In [None]:
from typing import NamedTuple

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Set the project and staging bucket used by the Vertex AI SDK in this session.
aip.init(
    project=PROJECT_ID,
    staging_bucket=BUCKET_URI,
)

## Pipeline components with BigQuery and Dataflow

### An anatomy of a pipeline component

Let's dive a bit into how pipeline components are executed. First, each component is containerized. That is, each component has its own:

- container image
- installation requirements
- (optional) hardware requirements

The above affects the amount of time/resources required to provision the component. For example, if each component in the pipeline had a different machine requirement, then a machine would have to be provisioned for each component. Even if the machine type is the same, if each component had a different container image, then a new container image would have to be provisioned for each component.

In other words, the efficiency of the pipeline is affected by the amount of provisioning.

Additionally, since each component runs in a container with its own memory space, there are performance issues relating to the amount of data moved across the container boundaries -- i.e., marshalling. To marshal data, the data has to be serialized and written to a volume storage, where the next component can access and de-serialize the data. Simple data types like integers, floats, strings, and small dictionaries can be efficiently marshalled. However, you want to avoid marshalling large in-memory objects.

### Construction of data pipeline components

Both BigQuery and Dataflow deal with data, and more importantly large amounts of data. As a result, you need to carefully consider the construction of the pipeline, so that you are not marshalling large amounts of in-memory data.

For example, consider a task that consists of reading a million records into an in-memory pandas dataframe, and then the in-memory data is processed for statistics. You could write this as two components: one component creates the dataframe, and the other processes it. Sounds good, nice and modular and the first component is likely reusable. Bad choice though.

If you did construct the components this way, the first component would have to write the dataframe to a disk, and the second component would then have to read it back from disk. Very inefficient. If you need a large in-memory object, one should only create it in the same component where it is used. In this example, one would create a single component to create and process the dataframe.

Let's now consider Vertex AI resources like datasets, models and endpoints. These resources have a physical manifestation which may include a combination of data and binary files. The Vertex AI resource object is not the actual files, but an in-memory wrapper. The resource object consists of properties and methods, and file data is not read into memory until a property/method needs it.

Thus, for efficiency purposes, Vertex AI was designed with reference identifiers. One can load these resource wrappers via the resource identifier. Thus, when creating or otherwise referencing resource objects between components, one passes the resource identifier(s) between components.

#### Location of BigQuery training data.

Now set the variable `IMPORT_FILE` to the location of the data table in BigQuery.

In [None]:
# BigQuery table holding the training data, without the URI scheme ...
BQ_TABLE = "bigquery-public-data.samples.gsod"
# ... and the same table written as a "bq://" URI.
IMPORT_FILE = "bq://" + BQ_TABLE

### BigQuery components

First, you build a component `create_dataset_bq` to create a Vertex AI dataset from a BigQuery table. The component will return the resource identifier for the created Vertex AI dataset. Next, you build two downstream components:

- `get_dataset_source`: Using the returned resource identifier, load the dataset resource object and get/return the dataset input source.
- `get_column_names`: Using the returned resource identifier, load the dataset resource object and get/return the dataset column names.

In [None]:
@component(packages_to_install=["google-cloud-aiplatform"])
def create_dataset_bq(bq_table: str, display_name: str, project: str) -> str:
    """Create a Vertex AI tabular dataset from a BigQuery table.

    Args:
        bq_table: BigQuery table path without the "bq://" scheme; the scheme
            is prepended here.
        display_name: Display name given to the new dataset.
        project: Project passed to the dataset creation call.

    Returns:
        The resource name of the created dataset.
    """
    # The SDK is imported inside the function body, next to the code that
    # uses it, rather than relying on the notebook's own imports.
    from google.cloud import aiplatform

    source_uri = "bq://" + bq_table
    created = aiplatform.TabularDataset.create(
        display_name=display_name, bq_source=source_uri, project=project
    )
    return created.resource_name


@component(packages_to_install=["google-cloud-aiplatform"])
def get_dataset_source(dataset_id: str) -> str:
    """Return the input source recorded for a Vertex AI tabular dataset.

    Args:
        dataset_id: Identifier used to load the dataset resource.

    Returns:
        For a dataset with a "gcsSource" input config, a JSON-encoded list of
        the source URIs; otherwise the "bigquerySource" URI string.
    """
    import json

    import google.cloud.aiplatform as aip

    dataset = aip.TabularDataset(dataset_id)
    input_config = dataset.gca_resource.metadata["inputConfig"]
    if "gcsSource" in input_config.keys():
        # The declared return type is str, so the list of URIs is serialized
        # to a JSON string instead of being returned as a raw list.
        return json.dumps(list(input_config["gcsSource"]["uri"]))
    return input_config["bigquerySource"]["uri"]


@component(packages_to_install=["google-cloud-aiplatform"])
def get_column_names(dataset_id: str) -> list:
    """Return the column names of a Vertex AI tabular dataset.

    Args:
        dataset_id: Identifier used to load the dataset resource.

    Returns:
        The `column_names` value of the loaded dataset.
    """
    from google.cloud import aiplatform

    tabular_dataset = aiplatform.TabularDataset(dataset_id)
    return tabular_dataset.column_names


# Cloud Storage location under which this pipeline's artifacts are written.
PIPELINE_ROOT = "{}/pipeline_root/dataset_bq".format(BUCKET_URI)


@dsl.pipeline(
    name="dataset-bq",
    description="Vertex Dataset from BQ Table",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    bq_table: str = BQ_TABLE, display_name: str = "example", project: str = PROJECT_ID
):
    """Create a dataset from a BigQuery table, then read back its source and columns.

    Args:
        bq_table: BigQuery table path without the "bq://" scheme.
        display_name: Display name for the created dataset.
        project: Project in which the dataset is created.
    """
    # Components are called with keyword arguments: this works with both the
    # v1 and v2 KFP SDKs, whereas positional arguments are rejected by v2.
    create_op = create_dataset_bq(
        bq_table=bq_table, display_name=display_name, project=project
    )

    # Both downstream tasks consume the dataset identifier returned by the
    # creation task.
    _ = get_dataset_source(dataset_id=create_op.output)

    _ = get_column_names(dataset_id=create_op.output)


# Compile the pipeline function into a local JSON pipeline definition file.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="dataset_bq.json")

# NOTE: this rebinds `pipeline` from the pipeline function defined above to
# the pipeline job object; later cells rely on `pipeline` being the job.
pipeline = aip.PipelineJob(
    display_name="dataset_bq",
    template_path="dataset_bq.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

# Execute the pipeline job.
pipeline.run()

# Remove the local compiled definition file.
! rm dataset_bq.json

### View the pipeline execution results

Next, view the results -- i.e., artifacts that are passed by each component.

In [None]:
import json

# Second "/"-separated segment of the job's resource name; used below as a
# path component of the pipeline output location.
# NOTE(review): presumably the name has the form "projects/<number>/..." -- confirm.
PROJECT_NUMBER = pipeline.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)


def print_pipeline_output(job, output_task_name):
    JOB_ID = job.name
    print(JOB_ID)
    for _ in range(len(job.gca_resource.job_detail.task_details)):
        TASK_ID = job.gca_resource.job_detail.task_details[_].task_id
        EXECUTE_OUTPUT = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/executor_output.json"
        )
        GCP_RESOURCES = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/gcp_resources"
        )
        if tf.io.gfile.exists(EXECUTE_OUTPUT):
            ! gsutil cat $EXECUTE_OUTPUT
            break
        elif tf.io.gfile.exists(GCP_RESOURCES):
            ! gsutil cat $GCP_RESOURCES
            break

    return EXECUTE_OUTPUT


# Show the output of the dataset-creation task and keep the path of its
# executor_output.json file.
print("create_dataset_bq")
artifacts = print_pipeline_output(pipeline, "create-dataset-bq")
# Read that file back and take the value stored under
# parameters/Output/stringValue; later cells use it as `dataset_id`.
# NOTE(review): only the first output line is parsed, which assumes the whole
# JSON document is on one line -- confirm.
output = !gsutil cat $artifacts
val = json.loads(output[0])
dataset_id = val["parameters"]["Output"]["stringValue"]
print("\n\n")

# Show the output of the two downstream tasks; the returned paths are not
# used further in this cell.
print("get_dataset_source")
artifacts = print_pipeline_output(pipeline, "get-dataset-source")
print("\n\n")

print("get_column_names")
artifacts = print_pipeline_output(pipeline, "get-column-names")

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
# Delete the pipeline job (`pipeline` is the job object created above).
pipeline.delete()

### Build TFDV component for dataset statistics

Next, you build a component that will use the TensorFlow Data Validation package to produce dataset statistics and schema from the Vertex AI dataset you created, with the following parameters:

- `dataset_id`: The resource ID of the Vertex AI dataset.
- `label`: The label column for the dataset.
- `bucket`: The bucket to write the statistics and schema data

The statistics and schema are large memory objects that may be reused downstream by other components. For this purpose, the component directly writes the data to a Cloud Storage bucket, and then returns the Cloud Storage locations of the statistics and schema file as output artifacts.

In [None]:
@component(
    packages_to_install=[
        "google-cloud-aiplatform",
        "google-cloud-bigquery",
        "tensorflow-data-validation==1.2",
        "tensorflow==2.5",
    ]
)
def statistics(
    dataset_id: str, label: str, bucket: str
) -> NamedTuple("Outputs", [("stats", str), ("schema", str)]):  # Return parameters
    """Generate TFDV statistics and an inferred schema for a tabular dataset.

    For a dataset with a "gcsSource" input config, statistics are computed
    from the first source CSV file. Otherwise up to 10000 rows of a fixed set
    of columns (station_number, year, month, day, mean_temp) are read from the
    BigQuery source into a dataframe and statistics are computed from that.

    Args:
        dataset_id: Identifier used to load the dataset resource.
        label: Name of the label feature passed to the statistics options.
        bucket: Location prefix under which "stats.txt" and "schema.txt" are
            written.

    Returns:
        A (stats, schema) pair holding the locations of the written
        statistics file and schema file.
    """
    import google.cloud.aiplatform as aip
    import tensorflow_data_validation as tfdv
    from google.cloud import bigquery

    input_config = aip.TabularDataset(dataset_id).gca_resource.metadata["inputConfig"]

    # The same statistics options are used for both kinds of source.
    options = tfdv.StatsOptions(label_feature=label, num_top_values=50)

    if "gcsSource" in input_config.keys():
        csv_files = list(input_config["gcsSource"]["uri"])
        # Only the first listed file is used.
        stats = tfdv.generate_statistics_from_csv(
            data_location=csv_files[0], stats_options=options
        )
    else:
        bq_uri = input_config["bigquerySource"]["uri"]
        # Drop the leading "bq://" scheme to get the plain table path.
        table_path = bq_uri[len("bq://") :]
        table_ref = bigquery.TableReference.from_string(table_path)

        # Columns (name, BigQuery type) read from the table.
        wanted_columns = [
            ("station_number", "STRING"),
            ("year", "INTEGER"),
            ("month", "INTEGER"),
            ("day", "INTEGER"),
            ("mean_temp", "FLOAT"),
        ]
        row_iterator = bigquery.Client().list_rows(
            table_ref,
            selected_fields=[
                bigquery.SchemaField(column, column_type)
                for column, column_type in wanted_columns
            ],
            max_results=10000,
        )
        stats = tfdv.generate_statistics_from_dataframe(
            dataframe=row_iterator.to_dataframe(), stats_options=options
        )

    # Write the statistics first, then infer the schema from them and write
    # it alongside.
    stats_file = bucket + "/stats.txt"
    tfdv.write_stats_text(output_path=stats_file, stats=stats)

    schema_file = bucket + "/schema.txt"
    tfdv.write_schema_text(
        output_path=schema_file, schema=tfdv.infer_schema(statistics=stats)
    )

    return (stats_file, schema_file)


# Cloud Storage location under which this pipeline's artifacts are written.
# NOTE: this replaces the PIPELINE_ROOT value set for the previous pipeline.
PIPELINE_ROOT = "{}/pipeline_root/dataset_stats".format(BUCKET_URI)


@dsl.pipeline(
    name="dataset-stats", description="Dataset statistics", pipeline_root=PIPELINE_ROOT
)
def pipeline(dataset_id: str, label: str, bucket: str):
    """Run the `statistics` component on a dataset.

    Args:
        dataset_id: Identifier of the dataset to analyze.
        label: Name of the label column.
        bucket: Location prefix under which the statistics and schema files
            are written.
    """
    # Keyword arguments are used because they work with both the v1 and v2
    # KFP SDKs, whereas positional arguments are rejected by v2.
    _ = statistics(dataset_id=dataset_id, label=label, bucket=bucket)


# Compile the pipeline function into a local JSON pipeline definition file.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="dataset_stats.json")

# NOTE: this rebinds `pipeline` from the pipeline function defined above to
# the pipeline job object; later cells rely on `pipeline` being the job.
pipeline = aip.PipelineJob(
    display_name="dataset_stats",
    template_path="dataset_stats.json",
    pipeline_root=PIPELINE_ROOT,
    # `dataset_id` comes from the results of the earlier dataset pipeline.
    parameter_values={
        "dataset_id": dataset_id,
        "label": "mean_temp",
        "bucket": BUCKET_URI,
    },
)

# Execute the pipeline job.
pipeline.run()

# Remove the local compiled definition file (no error if it is missing).
!rm -f dataset_stats.json

### View the pipeline execution results

Next, view the results -- i.e., the location of the statistics and schema artifacts.

In [None]:
# Show the output of the statistics task and keep the path of its
# executor_output.json file.
artifacts = print_pipeline_output(pipeline, "statistics")
# Read that file back and extract the two string outputs of the component.
# NOTE(review): only the first output line is parsed, which assumes the whole
# JSON document is on one line -- confirm.
output = !gsutil cat $artifacts
val = json.loads(output[0])
schema_location = val["parameters"]["schema"]["stringValue"]
stats_location = val["parameters"]["stats"]["stringValue"]

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
# Delete the pipeline job (`pipeline` is the job object created above).
pipeline.delete()

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Vertex AI dataset
- Cloud Storage Bucket

In [None]:
# Set this to true only if you'd like to delete your bucket
delete_bucket = False

# Create reference to Vertex AI dataset created in pipeline
dataset = aip.TabularDataset(dataset_id)

# delete Vertex AI dataset
dataset.delete()

if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI