In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 3 : formalization: get started with Kubeflow Pipelines

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage3/get_started_with_kubeflow_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/ai/platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage3/get_started_with_kubeflow_pipelines.ipynb">
      Open in Google Cloud Notebooks
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 3 : formalization: get started with Kubeflow Pipelines.

### Objective

In this tutorial, you learn how to use `Kubeflow Pipelines`.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Pipelines`

The steps performed include:

- Building KFP lightweight Python function components.
- Assembling and compiling KFP components into a pipeline.
- Executing a KFP pipeline using Vertex AI Pipelines.
- Building sequential, parallel, multiple output components.
- Building control flow into pipelines.

## Installations

Install the packages for executing the MLOps notebooks. You only need to do this *one time*.

In [None]:
ONCE_ONLY = False
if ONCE_ONLY:
    ! pip3 install -U tensorflow==2.5 $USER_FLAG
    ! pip3 install -U tensorflow-data-validation==1.2 $USER_FLAG
    ! pip3 install -U tensorflow-transform==1.2 $USER_FLAG
    ! pip3 install -U tensorflow-io==0.18 $USER_FLAG
    ! pip3 install --upgrade google-cloud-aiplatform[tensorboard] $USER_FLAG
    ! pip3 install --upgrade google-cloud-bigquery $USER_FLAG
    ! pip3 install --upgrade google-cloud-logging $USER_FLAG
    ! pip3 install --upgrade apache-beam[gcp] $USER_FLAG
    ! pip3 install --upgrade pyarrow $USER_FLAG
    ! pip3 install --upgrade cloudml-hypertune $USER_FLAG
    ! pip3 install --upgrade kfp $USER_FLAG

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Restart the kernel so the freshly installed packages can be found,
# unless IS_TESTING is set in the environment.
if not os.getenv("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
REGION = 'us-central1'  # @param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
BUCKET_NAME = "gs://[your-bucket-name]"  # @param {type:"string"}

In [None]:
# When no bucket name was supplied (unset or still the placeholder), generate one
# from the project ID and the session timestamp.
if BUCKET_NAME in (None, "", "gs://[your-bucket-name]"):
    BUCKET_NAME = "".join(["gs://", PROJECT_ID, "aip-", TIMESTAMP])

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket named BUCKET_NAME, passing REGION as its location (-l).
# Skip this cell if the bucket already exists.
! gsutil mb -l $REGION $BUCKET_NAME

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
! gsutil ls -al $BUCKET_NAME

#### Service Account

**If you don't know your service account**, try to get your service account using `gcloud` command by executing the second cell below.

In [None]:
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
if SERVICE_ACCOUNT == "" or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == "[your-service-account]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud auth list 2>/dev/null
    SERVICE_ACCOUNT = shell_output[2].strip()
    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step -- you only need to run these once per service account.

In [None]:
# Grant the service account the storage.objectCreator role on the bucket (write pipeline artifacts).
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_NAME

# Grant the service account the storage.objectViewer role on the bucket (read pipeline artifacts).
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_NAME

### Set up variables

Next, set up some variables used throughout the tutorial.

### Import libraries and define constants

In [None]:
import google.cloud.aiplatform as aip

#### Import TensorFlow

Import the TensorFlow package into your Python environment.

In [None]:
import tensorflow as tf

In [None]:
from typing import NamedTuple

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Pass REGION explicitly so the SDK targets the same region the bucket was created in,
# instead of falling back to the SDK's default location when REGION has been changed.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

## Pipeline introduction

Vertex AI Pipelines lets you orchestrate your machine learning (ML) workflows in a serverless manner. Pipelines are re-usable, and their executions and artifact generation can be tracked by Vertex AI Experiments and Vertex AI ML Metadata. With pipelines, you do the following:

    1. Design the pipeline workflow.
    2. Compile the pipeline.
    3. Schedule the pipeline for execution (or run it now).
    4. Get the pipeline results.

Pipelines are designed using a domain-specific language (DSL). Vertex AI Pipelines supports both the KFP DSL and the TFX DSL for designing pipelines.

In addition to designing components, you can use a wide variety of pre-built Google Cloud Pipeline Components for Vertex AI services.

Learn more about [Building a pipeline](https://cloud.google.com/vertex-ai/docs/pipelines/build-pipeline)

## Basic pipeline introduction

This demonstrates the basics of constructing and executing a pipeline. You do the following:

1. Design a simple Python function based component to output the input string.
2. Construct a pipeline that uses the component.
3. Compile the pipeline.
4. Execute the pipeline.

### Design hello world component

To create a KFP component from a Python function, you add the KFP DSL decorator `@component` to the function. In this example, the decorator takes the following parameters:

- `output_component_file`: (optional) write the component description to a YAML file such that the component is portable.
- `base_image`: (optional): The interpreter for executing the Python function. By default it is Python 3.7

In [None]:
# Lightweight Python function component that prints its input string and returns it.
# The decorator writes the component description to hello_world.yaml and sets
# python:3.9 as the base image.
@component(output_component_file="hello_world.yaml", base_image="python:3.9")
def hello_world(text: str) -> str:
    print(text)
    return text

# Show the generated component description.
! cat hello_world.yaml

### Design the hello world pipeline

Next, you design the pipeline for running the hello world component. A pipeline is specified as a Python function with the KFP DSL decorator `@dsl.pipeline`, with the following parameters:

- `name`: Name of the pipeline.
- `description`: Description of the pipeline.
- `pipeline_root`: The artifact repository where KFP stores a pipeline's artifacts.

In [None]:
# Artifact location used by the hello world pipeline.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/hello_world"


@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="hello-world",
    description="A simple intro pipeline",
)
def pipeline(text: str = "hi there"):
    # The pipeline consists of a single task: the hello_world component applied to `text`.
    hello_world(text)

### Compile the hello world pipeline

Once the design of the pipeline is completed, the next step is to compile it. The pipeline definition is compiled into a JSON formatted file, which is transportable and can be interpreted by both KFP and Vertex AI Pipelines.

You compile the pipeline with the method Compiler().compile(), with the following parameters:

- `pipeline_func`: The corresponding DSL function that defines the pipeline.
- `package_path`: The JSON file to write the transportable compiled pipeline to.

In [None]:
# Compile the pipeline function into the JSON definition hello_world.json.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="hello_world.json"
)

# Show the compiled pipeline definition.
! cat hello_world.json

### Execute the hello world pipeline

Now that the pipeline is compiled, you can execute it as follows:

- Create a Vertex AI PipelineJob, with the following parameters:
    - `display_name`: The human readable name for the job.
    - `template_path`: The compiled JSON pipeline definition.
    - `pipeline_root`: Where to write output artifacts to.

Click on the generated link below `INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:` to see your run in the Cloud Console.

In [None]:
# Create the job from the compiled definition.
# NOTE: this rebinds the name `pipeline` from the pipeline function to the job object,
# so the compile cell cannot be re-run afterwards without re-running the definition cell.
pipeline = aip.PipelineJob(
    display_name="hello_world",
    template_path="hello_world.json",
    pipeline_root=PIPELINE_ROOT,
)

# Execute the job.
pipeline.run()

# Remove the local compiled definition once the job has been run.
! rm hello_world.json

### View the hello world pipeline execution results

In [None]:
# Take the second '/'-separated segment of the job's resource name as the project number
# (presumably the name has the form projects/<number>/... — confirm).
PROJECT_NUMBER = pipeline.gca_resource.name.split('/')[1]
print(PROJECT_NUMBER)

def print_pipeline_output(job, output_task_name):
    JOB_ID = job.name
    print(JOB_ID)
    for _ in range(len(job.gca_resource.job_detail.task_details)):
        TASK_ID = job.gca_resource.job_detail.task_details[_].task_id
        EXECUTE_OUTPUT = PIPELINE_ROOT + '/' + PROJECT_NUMBER + '/' + JOB_ID + '/' + output_task_name + '_' + str(TASK_ID) + '/executor_output.json'
        if tf.io.gfile.exists(EXECUTE_OUTPUT):
            ! gsutil cat $EXECUTE_OUTPUT
            break

    return EXECUTE_OUTPUT

print_pipeline_output(pipeline, 'hello-world')

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Load a component from YAML definition

By storing the component definition, you can share and reuse the component by loading the component from its corresponding YAML file definition:

    hello_world_op = components.load_component_from_file('./hello_world.yaml')

You can also use the load_component_from_url method, if your component YAML file is stored online, such as if in a git repo.

In [None]:
from kfp import components

PIPELINE_ROOT = "{}/pipeline_root/hello_world-v2".format(BUCKET_NAME)

hello_world_op = components.load_component_from_file('./hello_world.yaml')

@dsl.pipeline(
    name="hello-world-v2",
    description="A simple intro pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline(text: str = "hi there"):
    hellow_world_task = hello_world_op(text)

compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="hello_world-v2.json"
)

pipeline = aip.PipelineJob(
    display_name="hello_world-v2",
    template_path="hello_world-v2.json",
    pipeline_root=PIPELINE_ROOT,
)

pipeline.run()

! rm hello_world-v2.json hello_world.yaml

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Package dependencies

Each component is assembled and executed within its own container. If a component has a dependency on one or more Python packages, you specify installing the packages with the parameter `packages_to_install`.

In [None]:
# Component that returns the mean of a list of values using numpy.
# numpy is requested through packages_to_install and imported inside the function body.
@component(packages_to_install=["numpy"])
def numpy_mean(values: list) -> float:
    import numpy as np
    return np.mean(values)

# Artifact location used by this pipeline.
PIPELINE_ROOT = "{}/pipeline_root/numpy_mean".format(BUCKET_NAME)

# Pipeline with a single task: numpy_mean applied to the `values` parameter.
@dsl.pipeline(
    name="numpy",
    description="A simple intro pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline(values: list = [2,3]):
    numpy_task = numpy_mean(values)

# Compile the pipeline function into numpy_mean.json.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="numpy_mean.json"
)

# Create the job from the compiled definition; this rebinds `pipeline` to the job object.
pipeline = aip.PipelineJob(
    display_name="numpy_mean",
    template_path="numpy_mean.json",
    pipeline_root=PIPELINE_ROOT,
)

pipeline.run()

# Print the executor output of the numpy-mean task.
print_pipeline_output(pipeline, 'numpy-mean')

# Remove the local compiled definition.
! rm numpy_mean.json

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

## Sequential tasks in pipeline

Next, you design and execute a pipeline with sequential tasks. In this example, the first task adds two integers and the second task divides the result (output) of the add task by 2.

*Note:* The output from the add task is referenced by the property `output`.

In [None]:
# Artifact location used by this pipeline.
PIPELINE_ROOT = "{}/pipeline_root/add_div2".format(BUCKET_NAME)

# Component that returns the sum of two integers; its description is written to add.yaml.
@component(output_component_file="add.yaml", base_image="python:3.9")
def add(v1: int, v2: int) -> int:
    return v1 + v2

# Component that returns its input floor-divided by 2; its description is written to div2.yaml.
@component(output_component_file="div2.yaml", base_image="python:3.9")
def div_by_2(v: int) -> int:
    return v // 2

# Two sequential tasks: div_by_2 takes the output of the add task as its input.
@dsl.pipeline(
    name="add-div2",
    description="A simple intro pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline(v1: int = 4, v2: int = 5):
    add_task = add(v1, v2)
    div2_task = div_by_2(add_task.output)

# Compile the pipeline function into add_div2.json.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="add_div2.json"
)

# Create the job from the compiled definition; this rebinds `pipeline` to the job object.
pipeline = aip.PipelineJob(
    display_name="add_div2",
    template_path="add_div2.json",
    pipeline_root=PIPELINE_ROOT,
)

pipeline.run()

# Print the executor output of the div-by-2 task.
print_pipeline_output(pipeline, 'div-by-2')

# Remove the component YAML files and the local compiled definition.
! rm add.yaml div2.yaml add_div2.json

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Multiple output pipeline

Next, you design and execute a pipeline where a first component has multiple outputs, which are then used as inputs to the next component. To distinguish between the outputs when they are used as inputs to the next component, you do the following:

1. Set the function return type to `NamedTuple`.
2. In NamedTuple, specify a name and type for each output, in the specified order.
3. In subsequent component, refer to the named output when using it as input.

In [None]:
# Artifact location used by this pipeline.
PIPELINE_ROOT = "{}/pipeline_root/multi_output".format(BUCKET_NAME)

# Component with two named string outputs: output_1 is text1 followed by a space,
# output_2 is text2 unchanged.
@component()
def multi_output(
    text1: str,
    text2: str
) -> NamedTuple(
    "Outputs",
    [
        ("output_1", str),  # Return parameters
        ("output_2", str),
    ],
):
    output_1 = text1 + ' '
    output_2 = text2
    return (output_1, output_2)

# Component that returns the concatenation of its two string inputs.
@component()
def concat(
    text1: str,
    text2: str
) -> str:
    return text1 + text2

# The concat task consumes the two outputs of the multi_output task, selected by name.
@dsl.pipeline(
    name="multi-output",
    description="A simple intro pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline(text1: str = "hello", text2: str = "world"):
    multi_output_task = multi_output(text1, text2)
    concat_task = concat(
        multi_output_task.outputs["output_1"],
        multi_output_task.outputs["output_2"],
    )

# Compile the pipeline function into multi_output.json.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="multi_output.json"
)

# Create the job from the compiled definition; this rebinds `pipeline` to the job object.
pipeline = aip.PipelineJob(
    display_name="multi-output",
    template_path="multi_output.json",
    pipeline_root=PIPELINE_ROOT,
)

pipeline.run()

# Print the executor output of the concat task.
print_pipeline_output(pipeline, 'concat')

# Remove the local compiled definition.
! rm multi_output.json

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

## Parallel tasks in component

Next, you design and execute a pipeline with parallel tasks. In this example, one parallel task adds up a list of integers and another subtracts them. Note that the compiler knows these two tasks can be run in parallel, because the input of each task does not depend on the output of the other.

Finally, the add task waits on the two parallel tasks to complete, and then adds together the two outputs.

In [None]:
PIPELINE_ROOT = "{}/pipeline_root/parallel".format(BUCKET_NAME)

# Component that adds up a list of integers.
@component()
def add_list(values: list) -> int:
    # Add the elements themselves. Adding 1 per element would return the
    # length of the list rather than its total.
    return sum(values)

# Component that subtracts every integer in a list from zero.
@component()
def sub_list(values: list) -> int:
    # Subtract the elements themselves. Subtracting 1 per element would return
    # the negated length of the list rather than the negated total.
    ret = 0
    for value in values:
        ret -= value
    return ret

# Component that returns the sum of two integers.
# NOTE: this redefines the `add` component from the sequential example, with
# parameters named value1/value2 instead of v1/v2.
@component()
def add(value1: int, value2: int) -> int:
    return value1 + value2

# add_list and sub_list both take only the pipeline parameter `values` as input;
# the add task takes the outputs of both.
@dsl.pipeline(
    name="parallel",
    description="A simple intro pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline(values: list = [1, 2, 3]):
    add_list_task = add_list(values)
    sub_list_task = sub_list(values)
    add_task = add(add_list_task.output, sub_list_task.output)

# Compile the pipeline function into parallel.json.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="parallel.json"
)

# Create the job from the compiled definition; this rebinds `pipeline` to the job object.
pipeline = aip.PipelineJob(
    display_name="parallel",
    template_path="parallel.json",
    pipeline_root=PIPELINE_ROOT,
)

pipeline.run()

# Print the executor output of the add task.
print_pipeline_output(pipeline, 'add')

# Remove the local compiled definition.
! rm parallel.json

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

## Control flow in pipeline

While Python control statements, e.g., if/else, for, can be used in a component, they cannot be used in the pipeline function. Each task in the pipeline function runs as a node in a graph. Thus a control flow statement also has to run as a graph node. To support this, KFP provides a set of DSL statements that implement control flow as a graph node.

### dsl.ParallelFor

The statement `dsl.ParallelFor()` implements a for loop, where each iteration in the for loop runs in parallel.

In [None]:
# Artifact location used by this pipeline.
PIPELINE_ROOT = "{}/pipeline_root/parallel_for".format(BUCKET_NAME)

# Component that returns twice its integer input.
@component()
def double(val: int) -> int:
    return val * 2

# Component that returns its integer input unchanged.
# Declared with @component() and without a space before the parameter list,
# matching the other components in this notebook.
@component()
def echo(val: int) -> int:
    return val

# For each item of `values`, double it and pass the result to echo;
# dsl.ParallelFor expresses the loop over the list.
@dsl.pipeline(
    name="parallel-for",
    description="A simple intro pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline(values: list = [1, 2, 3]):
    with dsl.ParallelFor(values) as item:
        output = double(item).output
        echo_task = echo(output)

# Compile the pipeline function into parallel_for.json.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="parallel_for.json"
)

# Create the job from the compiled definition; this rebinds `pipeline` to the job object.
pipeline = aip.PipelineJob(
    display_name="parallel-for",
    template_path="parallel_for.json",
    pipeline_root=PIPELINE_ROOT,
)

pipeline.run()

# Print the executor output of an echo task.
print_pipeline_output(pipeline, 'echo')

# Remove the local compiled definition.
! rm parallel_for.json

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### dsl.Condition

The statement `dsl.Condition()` implements an `if` statement. There is no support for an `else` or `elif` statement. You use a separate `dsl.Condition()` for each value you want to test for. For example, if the output from a task is `True` or `False`, you will have two `dsl.Condition()` statements, one for True and one for False.

In [None]:
# Component that returns a random integer, either 0 or 1.
@component()
def flip() -> int:
    import random
    return random.randint(0, 1)

# Component that prints "heads" and returns True.
@component()
def heads() -> bool:
    print("heads")
    return True

# Component that prints "tails" and returns False.
@component()
def tails() -> bool:
    print("tails")
    return False

# Give the condition example its own artifact location. Without this assignment the
# pipeline, the job and print_pipeline_output() would keep using whatever
# PIPELINE_ROOT the previously executed example left behind.
PIPELINE_ROOT = "{}/pipeline_root/condition".format(BUCKET_NAME)

@dsl.pipeline(
    name="condition",
    description="A simple intro pipeline",
    pipeline_root=PIPELINE_ROOT
)
def pipeline():
    flip_task = flip()
    # One dsl.Condition per outcome of the flip task: 1 runs heads, 0 runs tails.
    with dsl.Condition(flip_task.output == 1, name="true_clause"):
        heads()
    with dsl.Condition(flip_task.output == 0, name="false_clause"):
        tails()

# Compile the pipeline function into condition.json.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="condition.json"
)

# Create the job from the compiled definition; this rebinds `pipeline` to the job object.
pipeline = aip.PipelineJob(
    display_name="condition",
    template_path="condition.json",
    pipeline_root=PIPELINE_ROOT,
)

pipeline.run()

# Print the executor output of the flip task.
print_pipeline_output(pipeline, 'flip')

# Remove the local compiled definition.
! rm condition.json

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

## Errata

### Caching in pipeline components

When running a pipeline with Vertex AI Pipelines, the outcome state of each task is cached. With caching, if the pipeline is run again and the compiled definition of the task and its state have not changed, the cached output is used instead of running the task again.

To override caching, i.e., to force the task to run, you set the parameter `enable_caching` to `False` when creating the Vertex AI Pipeline job.

```
pipeline = aip.PipelineJob(
    display_name="example",
    template_path="example.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False
)
```

### Asynchronous execution of pipeline

When running a pipeline with the method `run()`, the pipeline is run synchronously. To run asynchronously, you use the method `submit()`. Once the job has started, your Python script can continue to execute. Then, when you need to block execution until the job completes, use the method `wait()`.

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Dataset
- Pipeline
- Model
- Endpoint
- AutoML Training Job
- Batch Job
- Custom Job
- Hyperparameter Tuning Job
- Cloud Storage Bucket

In [None]:
delete_all = True

if delete_all:
    # Delete the dataset using the Vertex dataset object
    try:
        if 'dataset' in globals():
            dataset.delete()
    except Exception as e:
        print(e)

    # Delete the model using the Vertex model object
    try:
        if 'model' in globals():
            model.delete()
    except Exception as e:
        print(e)

    # Delete the endpoint using the Vertex endpoint object
    try:
        if 'endpoint' in globals():
            endpoint.delete()
    except Exception as e:
        print(e)

    # Delete the AutoML or Pipeline training job
    try:
        if 'dag' in globals():
            dag.delete()
    except Exception as e:
        print(e)

    # Delete the custom training job
    try:
        if 'job' in globals():
            job.delete()
    except Exception as e:
        print(e)

    # Delete the batch prediction job using the Vertex batch prediction object
    try:
        if 'batch_predict_job' in globals():
            batch_predict_job.delete()
    except Exception as e:
        print(e)

    # Delete the hyperparameter tuning job using the Vertex hyperparameter tuning object
    try:
        if 'hpt_job' in globals():
            hpt_job.delete()
    except Exception as e:
        print(e)

    if 'BUCKET_NAME' in globals():
        ! gsutil rm -r $BUCKET_NAME