In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 2 : experimentation

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage2/mlops_experimentation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/ai/platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage2/mlops_experimentation.ipynb">
      Open in Google Cloud Notebooks
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 2 : experimentation.

### Dataset

The dataset used for this tutorial is the [Chicago Taxi](https://www.kaggle.com/chicago/chicago-taxi-trips-bq) dataset. The version of the dataset you will use in this tutorial is stored in a public BigQuery table. The trained model predicts whether someone would leave a tip for a taxi fare.

### Objective

In this tutorial, you create an MLOps stage 2: experimentation process.

This tutorial uses the following Vertex AI services:

- `Vertex Datasets`
- `Vertex AutoML`
- `Vertex Training`
- `Vertex TensorBoard`

The steps performed include:

- Review the `Dataset` resource created during stage 1.
- Train an AutoML tabular binary classifier model in the background.
- Construct a custom training job for the `Dataset` resource.
- TODO: Tune the hyperparameters of the custom model (not yet covered in this notebook).
- Train the custom model.
- Evaluate the custom model.
- TODO: Track the custom training with Vertex TensorBoard (not yet covered in this notebook).
- Wait for AutoML training job to complete.
- Evaluate the AutoML model.

## Installation

Install the latest version of Vertex SDK for Python.

In [None]:
import os


# Google Cloud Notebook
if os.path.exists("/opt/deeplearning/metadata/env_version"):
    USER_FLAG = '--user'
else:
    USER_FLAG = ''

! pip3 install --upgrade google-cloud-aiplatform $USER_FLAG

In [None]:
if os.environ["IS_TESTING"]:
    ! pip3 install --upgrade tensorflow $USER_FLAG

Install the latest GA version of *TensorFlow Transform* library as well.

In [None]:
# Install/upgrade TensorFlow Transform, passing the same USER_FLAG as the
# other pip installs in this notebook.
! pip3 install -U tensorflow-transform $USER_FLAG

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Outside automated testing (IS_TESTING unset or empty), shut the kernel down
# with restart=True so the freshly installed packages can be found.
if not os.getenv("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
# Google Cloud project ID. If the placeholder is left unchanged, the next cell
# tries to read the project from the local gcloud configuration instead.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the active project for subsequent gcloud commands.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
# Region used for the Vertex SDK initialization and for the regional API
# endpoint built later in this notebook.
REGION = 'us-central1'  # @param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp in YYYYMMDDHHMMSS form; appended to the names of the
# resources created in this notebook to avoid collisions.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

### Set up variables

Next, set up some variables used throughout the tutorial.
### Import libraries and define constants

In [None]:
import google.cloud.aiplatform as aip

#### Import TensorFlow

Import the TensorFlow package into your Python environment.

In [None]:
import tensorflow as tf

#### Import TensorFlow Transform

Import the TensorFlow Transform (TFT) package into your Python environment.

In [None]:
import tensorflow_transform as tft

### Initialize Vertex SDK for Python

Initialize the Vertex SDK for Python for your project and region.

In [None]:
# Set the project and region used by the Vertex SDK calls in this notebook.
aip.init(project=PROJECT_ID, location=REGION)

### Retrieve the dataset from stage 1

Next, retrieve the dataset you created during stage 1 with the helper function `find_dataset()`. This helper function finds all the datasets whose display name matches the specified prefix and import format (e.g., bq). Finally it sorts the matches by create time and returns the latest version.

In [None]:
def find_dataset(display_name_prefix, import_format):
    """Find the most recently created tabular dataset for a prefix and source type.

    Args:
      display_name_prefix: prefix the dataset's display name must start with.
      import_format: "bq" to match datasets with a BigQuery source, or "csv"
        to match datasets with a Cloud Storage source.
    Returns:
      The matching dataset with the latest `create_time`, or `None` when no
      dataset matches (including when `import_format` is not "bq" or "csv").
    """
    # Key under metadata.inputConfig that identifies each import format.
    source_keys = {"bq": "bigquerySource", "csv": "gcsSource"}
    source_key = source_keys.get(import_format)
    if source_key is None:
        return None

    latest = None
    for candidate in aip.TabularDataset.list():
        if not candidate.display_name.startswith(display_name_prefix):
            continue
        try:
            source = candidate.to_dict()["metadata"]["inputConfig"][source_key]
        except (KeyError, TypeError):
            # This dataset has no input config of the requested kind.
            continue
        if not source:
            continue
        # Strict comparison keeps the first dataset seen when times are equal.
        if latest is None or candidate.create_time > latest.create_time:
            latest = candidate

    # Previously the loop variable leaked here, so a non-matching dataset
    # could be returned when nothing matched.
    return latest

# Look up the dataset whose display name starts with "Chicago Taxi" and that
# was imported from BigQuery; it is passed to the AutoML training job below.
dataset = find_dataset("Chicago Taxi", "bq")

print(dataset)

### Create and run training pipeline

To train an AutoML model, you perform two steps: 1) create a training pipeline, and 2) run the pipeline.

#### Create training pipeline

An AutoML training pipeline is created with the `AutoMLTabularTrainingJob` class, with the following parameters:

- `display_name`: The human readable name for the `TrainingJob` resource.
- `optimization_prediction_type`: The type of task to train the model for.
  - `classification`: A tabular classification model.
  - `regression`: A tabular regression model.
- `column_transformations`: (Optional): Transformations to apply to the input columns
- `optimization_objective`: The optimization objective to minimize or maximize.
  - binary classification:
    - `minimize-log-loss`
    - `maximize-au-roc`
    - `maximize-au-prc`
    - `maximize-precision-at-recall`
    - `maximize-recall-at-precision`
  - multi-class classification:
    - `minimize-log-loss`
  - regression:
    - `minimize-rmse`
    - `minimize-mae`
    - `minimize-rmsle`

The instantiated object is the DAG (directed acyclic graph) for the training pipeline.

In [None]:
# Define the AutoML tabular training job (the pipeline DAG): a classification
# task optimized for log loss. Training starts when `run` is called on it.
dag = aip.AutoMLTabularTrainingJob(
    optimization_objective="minimize-log-loss",
    optimization_prediction_type="classification",
    display_name=f"chicago_{TIMESTAMP}",
)

print(dag)

#### Run the training pipeline

Next, you run the DAG to start the training job by invoking the method `run`, with the following parameters:

- `dataset`: The `Dataset` resource to train the model.
- `model_display_name`: The human readable name for the trained model.
- `training_fraction_split`: The percentage of the dataset to use for training.
- `test_fraction_split`: The percentage of the dataset to use for test (holdout data).
- `validation_fraction_split`: The percentage of the dataset to use for validation.
- `target_column`: The name of the column to train as the label.
- `budget_milli_node_hours`: (optional) Maximum training time specified in units of milli node hours (1000 = 1 node hour).
- `disable_early_stopping`: If `True`, the entire budget is used. If `False`, training may be completed before using the entire budget if the service believes it cannot further improve on the model objective measurements.

The `run` method when completed returns the `Model` resource.

The execution of the training pipeline will take up to 180 minutes.

In [None]:
# Start the AutoML training job with `sync=False`; the notebook waits for it
# later with `wait()`.
async_model = dag.run(
    dataset=dataset,
    target_column="tip_bin",
    model_display_name=f"chicago_{TIMESTAMP}",
    # 80% training / 10% validation / 10% test.
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    budget_milli_node_hours=8000,
    disable_early_stopping=False,
    sync=False,
)

### Retrieve the dataset from stage 1

Next, retrieve the dataset you created during stage 1 with the helper function `find_dataset()`. This helper function finds all the datasets whose display name matches the specified prefix and import format (e.g., bq). Finally it sorts the matches by create time and returns the latest version.

In [None]:
# NOTE(review): this helper is also defined in the AutoML section earlier in
# the notebook; consider defining it once and reusing it.
def find_dataset(display_name_prefix, import_format):
    """Find the most recently created tabular dataset for a prefix and source type.

    Args:
      display_name_prefix: prefix the dataset's display name must start with.
      import_format: "bq" to match datasets with a BigQuery source, or "csv"
        to match datasets with a Cloud Storage source.
    Returns:
      The matching dataset with the latest `create_time`, or `None` when no
      dataset matches (including when `import_format` is not "bq" or "csv").
    """
    # Key under metadata.inputConfig that identifies each import format.
    source_keys = {"bq": "bigquerySource", "csv": "gcsSource"}
    source_key = source_keys.get(import_format)
    if source_key is None:
        return None

    latest = None
    for candidate in aip.TabularDataset.list():
        if not candidate.display_name.startswith(display_name_prefix):
            continue
        try:
            source = candidate.to_dict()["metadata"]["inputConfig"][source_key]
        except (KeyError, TypeError):
            # This dataset has no input config of the requested kind.
            continue
        if not source:
            continue
        # Strict comparison keeps the first dataset seen when times are equal.
        if latest is None or candidate.create_time > latest.create_time:
            latest = candidate

    # Previously the loop variable leaked here, so a non-matching dataset
    # could be returned when nothing matched.
    return latest

# Rebind `dataset` to the "Chicago Taxi" dataset imported from Cloud Storage
# (CSV); its labels are read next to locate the user metadata.
dataset = find_dataset("Chicago Taxi", "csv")

print(dataset)

### Load dataset's user metadata

Load the user metadata for the dataset.

In [None]:
import json

# The dataset's `user_metadata` label gives the Cloud Storage location (minus
# the gs:// scheme) of the folder holding metadata.jsonl, which is read as a
# single JSON document.
with tf.io.gfile.GFile(
    'gs://' + dataset.labels['user_metadata'] + "/metadata.jsonl", "r"
) as metadata_file:
    metadata = json.loads(metadata_file.read())

print(metadata)

### Create experiment for tracking training related metadata

Setup tracking the parameters (inputs) and metrics (results) for each experiment:

- `aip.init()` - Create an experiment instance
- `aip.start_run()` - Track a specific run within the experiment.

Learn more about [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction).

In [None]:
# Create a per-session experiment and open its first run; the parameters
# logged later with `aip.log_params` are recorded against this run.
EXPERIMENT_NAME = f"chicago-{TIMESTAMP}"
aip.init(experiment=EXPERIMENT_NAME)
aip.start_run("run-1")

### Create the input layer for custom model

Next, you create the input layer for your custom tabular model, based on the data types of each feature.

In [None]:
from tensorflow.keras.layers import Input

def create_model_inputs(numeric_features=None, categorical_features=None):
    """Create one Keras `Input` per feature, keyed by feature name.

    Args:
      numeric_features: names of the features declared as `tf.float32`
        inputs; `None` (the default) means no numeric features.
      categorical_features: names of the features declared as `tf.int64`
        inputs; `None` (the default) means no categorical features.
    Returns:
      A dictionary mapping each feature name to its `Input` (declared with
      `shape=[]`). A name listed in both groups ends up with the categorical
      (`tf.int64`) input, because that group is processed last.
    """
    inputs = {}
    # `or ()` makes the documented None defaults usable instead of raising
    # TypeError when iterated.
    for feature_name in numeric_features or ():
        inputs[feature_name] = Input(name=feature_name, shape=[], dtype=tf.float32)
    for feature_name in categorical_features or ():
        inputs[feature_name] = Input(name=feature_name, shape=[], dtype=tf.int64)

    return inputs

In [None]:
# Build the model inputs from the feature name lists stored in the dataset's
# user metadata.
input_layers = create_model_inputs(numeric_features=metadata['numeric_features'],
                                   categorical_features=metadata['categorical_features']
                                  )

print(input_layers)

### Create the binary classifier custom model

Next, you create your binary classifier custom tabular model.

In [None]:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Concatenate, experimental

def create_binary_classifier(
    input_layers, tft_output, hyperparams, numeric_features, categorical_features
):
    """Assemble the feed-forward Keras model that outputs a single logit.

    Args:
      input_layers: dictionary mapping feature names to Keras inputs.
      tft_output: object whose `vocabulary_size_by_name()` gives the vocabulary
        size of each categorical feature.
      hyperparams: dictionary whose "hidden_units" entry lists the width of
        each hidden `Dense` layer, in order.
      numeric_features: names of the inputs that get a trailing dimension
        added and are otherwise passed through.
      categorical_features: names of the inputs encoded with `CategoryEncoding`
        in "binary" output mode.
    Returns:
      A `Model` from `input_layers` to the output of the "logits" layer.
    """
    encoded_features = []
    for feature_name, feature_input in input_layers.items():
        if feature_name in categorical_features:
            encoder = experimental.preprocessing.CategoryEncoding(
                max_tokens=tft_output.vocabulary_size_by_name(feature_name),
                output_mode="binary",
                name=f"{feature_name}_onehot",
            )
            encoded_features.append(encoder(feature_input))
        elif feature_name in numeric_features:
            encoded_features.append(tf.expand_dims(feature_input, -1))
        # Inputs named in neither feature list are not connected to the network.

    combined = Concatenate(name="combines_inputs")(encoded_features)

    hidden_stack = [
        Dense(units, activation="relu") for units in hyperparams["hidden_units"]
    ]
    feedforward_network = Sequential(hidden_stack, name="feedforward_network")

    # No activation on the last layer: the model emits raw logits.
    logits = Dense(units=1, name="logits")(feedforward_network(combined))

    return Model(inputs=input_layers, outputs=[logits])

In [None]:
# Load the TensorFlow Transform output from the location recorded in the
# dataset's user metadata; it provides the vocabulary sizes used by the model.
TRANSFORM_ARTIFACTS_DIR = metadata['transform_artifacts_dir']
tft_output = tft.TFTransformOutput(TRANSFORM_ARTIFACTS_DIR)

# Two hidden layers of 128 and 64 units; logged as parameters of the run.
# NOTE(review): 'hidden_units' is a list - confirm aip.log_params accepts
# non-scalar parameter values.
hyperparams = {'hidden_units': [ 128, 64 ] }
aip.log_params(hyperparams)

model = create_binary_classifier(input_layers, tft_output, hyperparams,
                                 numeric_features=metadata['numeric_features'],
                                 categorical_features=metadata['categorical_features']
                                )

model.summary()

#### Visualize the model architecture

Next, visualize the architecture of the custom model.

In [None]:
# Draw the custom model as a graph, annotated with shapes and dtypes.
tf.keras.utils.plot_model(
    model,
    show_shapes=True,
    show_dtype=True
)

### Construct the training package

#### Package layout

Before you start training, you will look at how a Python package is assembled for a custom training job. When unarchived, the package contains the following directory/file layout.

- PKG-INFO
- README.md
- setup.cfg
- setup.py
- trainer
  - \_\_init\_\_.py
  - task.py
  - other Python scripts

The files `setup.cfg` and `setup.py` are the instructions for installing the package into the operating environment of the Docker image.

The file `trainer/task.py` is the Python script for executing the custom training job.

In [None]:
# Make folder for Python training script
# (any existing `custom` folder is removed first, so re-running starts clean)
! rm -rf custom
! mkdir custom

# Add package information
! touch custom/README.md

# Each file below is built as a Python string and written with `echo`.
setup_cfg = "[egg_info]\n\ntag_build =\n\ntag_date = 0"
! echo "$setup_cfg" > custom/setup.cfg

# NOTE(review): install_requires pins tensorflow_datasets, which the trainer
# modules written later in this notebook do not import - confirm it is needed.
setup_py = "import setuptools\n\nsetuptools.setup(\n\n    install_requires=[\n\n        'tensorflow_datasets==1.3.0',\n\n    ],\n\n    packages=setuptools.find_packages())"
! echo "$setup_py" > custom/setup.py

pkg_info = "Metadata-Version: 1.0\n\nName: Chicago Taxi tabular binary classifier\n\nVersion: 0.0.0\n\nSummary: Demostration training script\n\nHome-page: www.google.com\n\nAuthor: Google\n\nAuthor-email: aferlitsch@google.com\n\nLicense: Public\n\nDescription: Demo\n\nPlatform: Vertex"
! echo "$pkg_info" > custom/PKG-INFO

# Make the training subfolder
# (the empty __init__.py marks `trainer` as a Python package)
! mkdir custom/trainer
! touch custom/trainer/__init__.py

#### Get feature specification for the preprocessed data

Next, create the feature specification for the preprocessed data.

In [None]:
# Feature specification of the transformed data, taken from the TensorFlow
# Transform output; it is used below to parse the stored records.
transform_feature_spec = tft_output.transformed_feature_spec()
print(transform_feature_spec)

### Load the transformed data into a tf.data.Dataset

Next, you load the gzip-compressed TFRecords from Cloud Storage into a `tf.data.Dataset` generator. These functions are re-used when training the custom model using `Vertex Training`, so you save them to the Python training package.

In [None]:
%%writefile custom/trainer/data.py

import tensorflow as tf

def _gzip_reader_fn(filenames):
    """Builds a TFRecord reader for GZIP-compressed record files."""
    reader = tf.data.TFRecordDataset(filenames, compression_type="GZIP")
    return reader


def get_dataset(file_pattern, feature_spec, label_column, batch_size=200):
    """Builds the batched (features, label) dataset used for tuning/training.

    Args:
      file_pattern: pattern matching the input TFRecord files.
      feature_spec: dictionary of feature specifications used to parse records.
      label_column: name of the feature that is split off as the label.
      batch_size: number of consecutive elements combined into a single batch.
    Returns:
      The dataset created by `make_batched_features_dataset`: a single pass
      over the data (`num_epochs=1`) with the final partial batch dropped.
    """
    return tf.data.experimental.make_batched_features_dataset(
        reader=_gzip_reader_fn,
        file_pattern=file_pattern,
        features=feature_spec,
        label_key=label_column,
        batch_size=batch_size,
        num_epochs=1,
        drop_final_batch=True,
    )

In [None]:
from custom.trainer import data

TRANSFORMED_DATA_PREFIX = metadata["transformed_data_prefix"]
LABEL_COLUMN = metadata["label_column"]

# File patterns for the gzip'ed record files of each data split.
train_data_file_pattern = f"{TRANSFORMED_DATA_PREFIX}/train/data-*.gz"
val_data_file_pattern = f"{TRANSFORMED_DATA_PREFIX}/val/data-*.gz"
test_data_file_pattern = f"{TRANSFORMED_DATA_PREFIX}/test/data-*.gz"

# Peek at a single batch of three training examples: print every feature's
# dtype and values, followed by the labels.
sample_batches = data.get_dataset(
    train_data_file_pattern, transform_feature_spec, LABEL_COLUMN, batch_size=3
).take(1)
for input_features, target in sample_batches:
    for key, values in input_features.items():
        print(f"{key} {values.dtype}: {values.numpy().tolist()}")
    print(f"target: {target.numpy().tolist()}")

## Train the model

Next, you write a training script for the custom model and use it to train the model.

### Create training script

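Write the `compile()` and `train()` functions to `custom/trainer/train.py`, so they are saved in the Python training package.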

In [None]:
%%writefile custom/trainer/train.py

from custom.trainer import data
import tensorflow as tf
import logging

def compile(model, hyperparams):
    """Compile the model for binary classification on logits.

    Args:
      model: the Keras model to compile.
      hyperparams: dictionary whose "learning_rate" entry configures Adam.
    Returns:
      The same model object, compiled with the Adam optimizer, binary
      cross-entropy loss on logits, and a binary accuracy metric.
    """
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hyperparams["learning_rate"]
        ),
        # The model outputs raw logits, hence from_logits=True.
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")],
    )
    return model


def train(
    model,
    hyperparams,
    train_data_dir,
    val_data_dir,
    label_column,
    transformed_feature_spec,
    log_dir=None
):
    """Fit the compiled model on the training data and validate each epoch.

    Args:
      model: a compiled Keras model.
      hyperparams: dictionary providing "batch_size" and "num_epochs".
      train_data_dir: file pattern of the training records.
      val_data_dir: file pattern of the validation records.
      label_column: name of the label feature.
      transformed_feature_spec: feature specification used to parse records.
      log_dir: optional directory for TensorBoard logs; when empty or `None`
        no TensorBoard callback is attached.
    Returns:
      The history object returned by `model.fit`.
    """
    batch_size = hyperparams["batch_size"]

    train_dataset = data.get_dataset(
        train_data_dir,
        transformed_feature_spec,
        label_column,
        batch_size=batch_size,
    )

    val_dataset = data.get_dataset(
        val_data_dir,
        transformed_feature_spec,
        label_column,
        batch_size=batch_size,
    )

    # `log_dir` used to be accepted but ignored; write TensorBoard logs there
    # when the caller provides a location.
    callbacks = []
    if log_dir:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=log_dir))

    logging.info("Model training started...")
    history = model.fit(
            train_dataset,
            epochs=hyperparams["num_epochs"],
            validation_data=val_dataset,
            callbacks=callbacks
    )

    logging.info("Model training completed.")
    return history


### Train the model locally

Next, test the training package locally, by training with just a few epochs:

- `num_epochs`: The number of epochs to pass to the training package.
- `compile()`: Compile the model for training.
- `train()`: Train the model.

In [None]:
import logging
from custom.trainer import train

logging.getLogger().setLevel(logging.INFO)

# Settings for the short local trial run, added to the existing hyperparameters.
hyperparams.update(learning_rate=0.001, num_epochs=5, batch_size=512)

aip.log_params(hyperparams)
train.compile(model, hyperparams)
# The last argument is the log directory; None is passed for this local run.
train.train(
    model,
    hyperparams,
    train_data_file_pattern,
    val_data_file_pattern,
    LABEL_COLUMN,
    transform_feature_spec,
    None,
)

In [None]:
# From here on `model` refers to the AutoML model returned by `dag.run`; the
# custom Keras model previously bound to this name is no longer reachable.
model = async_model

### Wait for completion of AutoML training job

Next, wait for the AutoML training job to complete. Alternatively, one can set the parameter `sync` to `True` in the `run()` method to block until the AutoML training job is completed.

In [None]:
# Block until the AutoML training job started earlier with sync=False is done.
model.wait()

## Review model evaluation scores
After your model has finished training, you can review the evaluation scores for it.

First, you need to get a reference to the new model. As with datasets, you can either use the reference to the model variable you created when you trained the model or you can list all of the models in your project.

In [None]:
# Get model resource ID
models = aip.Model.list(filter='display_name=chicago_' + TIMESTAMP)

# Get a reference to the Model Service client
client_options = {"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
model_service_client = aip.gapic.ModelServiceClient(
    client_options=client_options
)

model_evaluations = model_service_client.list_model_evaluations(parent=models[0].resource_name)
model_evaluation = list(model_evaluations)[0]
print(model_evaluation)

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Dataset
- Pipeline
- Model
- Endpoint
- AutoML Training Job
- Batch Job
- Custom Job
- Hyperparameter Tuning Job
- Cloud Storage Bucket

In [None]:
delete_all = True

if delete_all:
    # Delete the dataset using the Vertex dataset object
    try:
        if 'dataset' in globals():
            dataset.delete()
    except Exception as e:
        print(e)

    # Delete the model using the Vertex model object
    try:
        if 'model' in globals():
            model.delete()
    except Exception as e:
        print(e)

    # Delete the endpoint using the Vertex endpoint object
    try:
        if 'endpoint' in globals():
            endpoint.delete()
    except Exception as e:
        print(e)

    # Delete the AutoML or Pipeline trainig job
    try:
        if 'dag' in globals():
            dag.delete()
    except Exception as e:
        print(e)

    # Delete the custom trainig job
    try:
        if 'job' in globals():
            job.delete()
    except Exception as e:
        print(e)

    # Delete the batch prediction job using the Vertex batch prediction object
    try:
        if 'batch_predict_job' in globals():
            batch_predict_job.delete()
    except Exception as e:
        print(e)

    # Delete the hyperparameter tuning job using the Vertex hyperparameter tuning object
    try:
        if 'hpt_job' in globals():
            hpt_job.delete()
    except Exception as e:
        print(e)

    if 'BUCKET_NAME' in globals():
        ! gsutil rm -r $BUCKET_NAME