In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 2 : experimentation: get started with Logging and Vertex Experiments

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage2/get_started_vertex_experiments.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/ai/platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage2/get_started_vertex_experiments.ipynb">
      Open in Google Cloud Notebooks
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 2 : experimentation: get started with Logging and Vertex Experiments.

### Objective

In this tutorial, you learn how to use Python logging and `Vertex Experiments` when training with `Vertex AI`.

This tutorial uses the following Google Cloud ML services:

- `Vertex Experiments`
- `Vertex ML Metadata`

The steps performed include:

- Use Python logging to log training configuration/results locally.
- Use Google Cloud Logging to log training configuration/results in cloud storage.
- Create a Vertex `Experiment` resource.
- Instantiate an experiment run.
- Log parameters for the run.
- Log metrics for the run.
- Display the logged experiment run.

### Recommendations

When doing E2E MLOps on Google Cloud, the following are best practices for logging data when experimenting with, or formally training, a model.

#### Python Logging

Use Python's logging package when doing ad-hoc training locally.

#### Cloud Logging

Use `Google Cloud Logging` when doing training on the cloud.

#### Experiments

Use Vertex Experiments in conjunction with logging when doing experiments to compare results for different experiment configurations.

## Installations

Install *one time* the packages for executing the MLOps notebooks.

In [None]:
# One-time package installation, gated by ONCE_ONLY: set it to True for the first
# run of the notebook, then back to False so later runs skip the installs.
ONCE_ONLY = False
if ONCE_ONLY:
    # NOTE(review): USER_FLAG is not assigned in any cell visible in this notebook;
    # presumably it is meant to hold an optional pip flag (e.g. "--user") — confirm.
    # The TensorFlow family packages are pinned to specific versions.
    ! pip3 install -U tensorflow==2.5 $USER_FLAG
    ! pip3 install -U tensorflow-data-validation==1.2 $USER_FLAG
    ! pip3 install -U tensorflow-transform==1.2 $USER_FLAG
    ! pip3 install -U tensorflow-io==0.18 $USER_FLAG
    # The remaining packages are upgraded without a version pin.
    ! pip3 install --upgrade google-cloud-aiplatform[tensorboard] $USER_FLAG
    ! pip3 install --upgrade google-cloud-bigquery $USER_FLAG
    ! pip3 install --upgrade google-cloud-logging $USER_FLAG
    ! pip3 install --upgrade apache-beam[gcp] $USER_FLAG
    ! pip3 install --upgrade pyarrow $USER_FLAG
    ! pip3 install --upgrade cloudml-hypertune $USER_FLAG

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Restart the kernel so freshly installed packages become importable.
# The restart is skipped when the IS_TESTING environment variable is set to a
# non-empty value.
if not os.environ.get("IS_TESTING"):
    import IPython

    # do_shutdown(True) asks the running kernel to shut down and restart.
    IPython.Application.instance().kernel.do_shutdown(True)

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
# Google Cloud project ID. Leave the placeholder as-is to have the following cell
# look the project up from the active gcloud configuration.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the active project of the local gcloud configuration.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
# Region used for Vertex AI operations throughout the rest of this notebook.
REGION = "us-central1"  # @param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (local time, YYYYMMDDHHMMSS) appended to resource names
# to avoid collisions between users sharing a project.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")

### Set up variables

Next, set up some variables used throughout the tutorial.
### Import libraries and define constants

In [None]:
import google.cloud.aiplatform as aip

#### Import logging

Import the logging package into your Python environment.

In [None]:
import logging

### Initialize Vertex SDK for Python

Initialize the Vertex SDK for Python for your project and region.

In [None]:
# Initialize the Vertex SDK with the project and region chosen above.
aip.init(project=PROJECT_ID, location=REGION)

## Python Logging

The Python logging package is widely used for logging within Python scripts. Commonly used features:

- Set logging levels.
- Send log output to console.
- Send log output to a file.

### Logging Levels

The logging levels, in order from least to most severe, are listed below. Setting a level outputs messages at that level and at every more severe level:

1. Debugging
2. Informational
3. Warnings
4. Errors

By default, the logging level is set to warning level.

### Logging output to console

By default, the Python logging package outputs to the console. Note, in the example the informational and debug log messages are not outputted since the default logging level is set to warning.

In [None]:
def logging_examples():
    """Emit one sample message at each of four severities via the logging module.

    The messages are sent, in order, at info, warning, error and debug level using
    the module-level logging functions. Which of them actually appear depends on
    the logging level and handlers configured at the time of the call.
    """
    samples = (
        (logging.info, "Model training started..."),
        (logging.warning, "Using older version of package ..."),
        (logging.error, "Training was terminated ..."),
        (logging.debug, "Hyperparameters were ..."),
    )
    for emit, message in samples:
        emit(message)


logging_examples()

### Setting logging level

To set the logging level, you get the logger using `getLogger()`. You can have multiple loggers. When `getLogger()` is called w/o arguments it gets the default logger, named ROOT. With the logger, you set the logging level with the method `setLevel()`.

In [None]:
# getLogger() with no name returns the root logger. Lowering its level to DEBUG
# lets all four messages emitted by logging_examples() pass the level check.
logging.getLogger().setLevel(logging.DEBUG)

logging_examples()

### Clearing handlers

At times, you may desire to reconfigure your logging. A common practice in this case is to first remove all existing logging handles for a fresh start.

In [None]:
# Iterate over a snapshot of the handler list, because removeHandler() mutates
# logging.root.handlers while we loop.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)

### Output to a local file

You can preserve your logging output to a file that is local to where the Python script is running with the method `basicConfig()`, with the following parameters:

- `filename`: The file path to the local file to write the log output to.
- `level`: Sets the level of logging that is written to the logging file.

*Note:* You cannot use a Cloud Storage bucket as the output file.

In [None]:
# Send root-logger output at DEBUG level and above to the local file mylog.log.
# NOTE(review): basicConfig() is a no-op when the root logger already has
# handlers, which is presumably why the previous cell removes them — confirm.
logging.basicConfig(filename="mylog.log", level=logging.DEBUG)

logging_examples()

# Display the contents of the log file that was just written.
! cat mylog.log

## Logging with Google Cloud Logging

You can preserve and retrieve your logging output to `Google Cloud Logging` service. Commonly used features:

- Set logging levels.
- Send log output to storage.
- Retrieve log output from storage.

### Logging Levels

The logging levels, in order from least to most severe, are listed below. Setting a level outputs messages at that level and at every more severe level:

1. Debugging
2. Informational
3. Warnings
4. Errors

By default, the logging level is set to warning level.

### Configuring and storing log data

To use the `Google Cloud Logging` service, you do the following steps:

1. Create a client to the service.
2. Obtain a handler for the service.
3. Create a logger instance and set logging level.
4. Attach the service handler to the logger instance.

Learn more about [Logging client libraries](https://cloud.google.com/logging/docs/reference/libraries).

In [None]:
import google.cloud.logging
from google.cloud.logging.handlers import CloudLoggingHandler

# Connect to the Cloud Logging service
cl_client = google.cloud.logging.Client()
handler = CloudLoggingHandler(cl_client, name="mylog")

# Create a logger instance and logging level
cloud_logger = logging.getLogger("cloudLogger")
cloud_logger.setLevel(logging.INFO)

# logging.getLogger("cloudLogger") returns the same logger object on every call,
# so re-running this cell would otherwise stack another Cloud Logging handler on
# it and every later message would be written more than once. Detach any Cloud
# Logging handler left over from a previous run before attaching the new one.
for attached in list(cloud_logger.handlers):
    if isinstance(attached, CloudLoggingHandler):
        cloud_logger.removeHandler(attached)

# Attach the service handler to the logger instance.
cloud_logger.addHandler(handler)

# Log something
cloud_logger.error("bad news")

### Logging output

Logging output at specific levels uses the same methods, and method names, as Python logging, except that you use your instance of the cloud logger in place of `logging`.

In [None]:
# Emit one message per severity through the cloud logger.
# The cell that creates cloud_logger sets its level to INFO, so the debug message
# below is below the threshold and is not expected to be recorded.
cloud_logger.info("Model training started...")
cloud_logger.warning("Using older version of package ...")
cloud_logger.error("Training was terminated ...")
cloud_logger.debug("Hyperparameters were ...")

### Get logging entries

To get the logged output, you:

1. Retrieve the log handle to the service.
2. Using the handle call the method `list_entries()`
3. Iterate through the entries.

In [None]:
# Get the service-side handle for the log named "mylog".
logger = cl_client.logger("mylog")

# Print each stored entry as "* <ISO timestamp>: <severity>: <payload>".
for entry in logger.list_entries():
    print(
        "* {}: {}: {}".format(
            entry.timestamp.isoformat(), entry.severity, entry.payload
        )
    )

## Logging with Vertex Experiments and Vertex ML Metadata

You can log results related to training experiments with `Vertex Experiments` and `ML Metadata`:

- Preserve results of an experiment.
- Track multiple runs -- i.e., training runs -- within an experiment.
- Track parameters (configuration) and metrics (results).
- Retrieve and display the logged output.

Learn more about [Experiments](https://cloud.google.com/vertex-ai/docs/experiments/).

### Create experiment for tracking training related metadata

Setup tracking the parameters (configuration) and metrics (results) for each experiment:

- `aip.init()` - Create an experiment instance
- `aip.start_run()` - Track a specific run within the experiment.

Learn more about [Introduction to Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction).

In [None]:
# Suffix the experiment name with the session timestamp so that it is unique
# per notebook session.
EXPERIMENT_NAME = "example-" + TIMESTAMP
# Register the experiment with the SDK, then start a run named "run-1" in it.
aip.init(experiment=EXPERIMENT_NAME)
aip.start_run("run-1")

### Log parameters for the experiment

Typically, an experiment is associated with a specific dataset and model architecture. Within an experiment, you may have multiple training runs, where each run tries a different configuration. As examples:

- Dataset split
- Dataset sampling and boosting
- Depth and width of layers
- Hyperparameters

These configuration settings are referred to as parameters, whose key/value pairs you store using the method `log_params()`.

In [None]:
# Example training configuration recorded as parameters of the current run.
hyperparams = {
    "epochs": 100,
    "batch_size": 32,
    "learning_rate": 0.01,
}
aip.log_params(hyperparams)

### Log metrics for the experiment

At the completion, or termination, of a run within an experiment, you can log results that you use to compare runs. As examples:

- Evaluation metrics
- Hyperparameter search selection
- Time to train the model
- Early stop trigger

These results are referred to as metrics, whose key/value pairs you store using the method `log_metrics()`.

In [None]:
# Example results recorded as metrics of the current run.
metrics = {
    "test_acc": 98.7,
    "train_acc": 99.3,
}
aip.log_metrics(metrics)

### Get the experiment results

Next, you use the experiment name as a parameter to the method `get_experiment_df()` to get the results of the experiment as a pandas dataframe.

In [None]:
# Fetch the logged runs of this session's experiment as a pandas dataframe.
experiment_df = aip.get_experiment_df(EXPERIMENT_NAME)

# Keep only rows belonging to this experiment. The experiment was created as
# "example-" + TIMESTAMP, so the filter must use EXPERIMENT_NAME; comparing
# against the bare literal "example" never matches and yields an empty frame.
experiment_df = experiment_df[experiment_df.experiment_name == EXPERIMENT_NAME]

# Transpose so each parameter/metric is shown on its own row.
experiment_df.T

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Dataset
- Pipeline
- Model
- Endpoint
- AutoML Training Job
- Batch Job
- Custom Job
- Hyperparameter Tuning Job
- Cloud Storage Bucket

In [None]:
delete_all = True

if delete_all:
    # Delete the dataset using the Vertex dataset object
    try:
        if "dataset" in globals():
            dataset.delete()
    except Exception as e:
        print(e)

    # Delete the model using the Vertex model object
    try:
        if "model" in globals():
            model.delete()
    except Exception as e:
        print(e)

    # Delete the endpoint using the Vertex endpoint object
    try:
        if "endpoint" in globals():
            endpoint.delete()
    except Exception as e:
        print(e)

    # Delete the AutoML or Pipeline training job
    try:
        if "dag" in globals():
            dag.delete()
    except Exception as e:
        print(e)

    # Delete the custom training job
    try:
        if "job" in globals():
            job.delete()
    except Exception as e:
        print(e)

    # Delete the batch prediction job using the Vertex batch prediction object
    try:
        if "batch_predict_job" in globals():
            batch_predict_job.delete()
    except Exception as e:
        print(e)

    # Delete the hyperparameter tuning job using the Vertex hyperparameter tuning object
    try:
        if "hpt_job" in globals():
            hpt_job.delete()
    except Exception as e:
        print(e)

    if "BUCKET_NAME" in globals():
        ! gsutil rm -r $BUCKET_NAME