In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/experiments/comparing_local_trained_models.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/experiments/comparing_local_trained_models.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/experiments/comparing_local_trained_models.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

# Compare trained and evaluated model experiments using Vertex AI Experiments

## Overview

As a data scientist, you probably start by running model experiments locally in a notebook. Depending on the framework you use, you need to track parameters, training time series, and evaluation metrics so that you can explain the modeling approach you choose.

### Objective

In this tutorial, you learn how to use Vertex AI Experiments to compare and evaluate model experiments.

This tutorial uses the following Google Cloud ML services and resources:

- Vertex AI Workbench
- Vertex AI Experiments

The steps performed include:

- log the model parameters
- log the loss and metrics on every epoch to TensorBoard
- log the evaluation metrics
- compare two experiments

All of these steps are tracked in Vertex AI Experiments for a recurrent neural network (RNN) trained for sentiment analysis.

### Dataset

The dataset is the [IMDB Large Movie Review dataset](https://www.tensorflow.org/datasets/catalog/imdb_reviews), loaded from TensorFlow Datasets as `imdb_reviews`. It consists of movie reviews labeled for binary sentiment classification.


### Costs 

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Set up your local development environment

**If you are using Colab or Vertex AI Workbench Notebooks**, your environment already meets
all the requirements to run this notebook. You can skip this step.

**Otherwise**, make sure your environment meets this notebook's requirements.
You need the following:

* The Google Cloud SDK
* Git
* Python 3
* virtualenv
* Jupyter notebook running in a virtual environment with Python 3

The Google Cloud guide to [Setting up a Python development
environment](https://cloud.google.com/python/setup) and the [Jupyter
installation guide](https://jupyter.org/install) provide detailed instructions
for meeting these requirements. The following steps provide a condensed set of
instructions:

1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)

1. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)

1. [Install
   virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)
   and create a virtual environment that uses Python 3. Activate the virtual environment.

1. To install Jupyter, run `pip3 install jupyter` on the
command-line in a terminal shell.

1. To launch Jupyter, run `jupyter notebook` on the command-line in a terminal shell.

1. Open this notebook in the Jupyter Notebook Dashboard.

### Install additional packages

Install additional package dependencies not installed in your notebook environment, such as TensorFlow, TensorFlow Datasets and Vertex AI SDK. Use the latest major GA version of each package.

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install {USER_FLAG} --upgrade tensorflow==2.8.0 tensorflow_datasets==4.5.2 -q
! pip3 install --user --force-reinstall 'google-cloud-aiplatform>=1.15' -q

### Restart the kernel

After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
# Restart the kernel so the freshly installed packages can be imported.
import os

# The restart is skipped when IS_TESTING is set to a non-empty value.
if not os.environ.get("IS_TESTING"):
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable APIs](https://console.cloud.google.com/flows/enableapi?apiid=cloudresourcemanager.googleapis.com,aiplatform.googleapis.com).

1. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Otherwise, set your project ID here.

In [None]:
# Fall back to a manually entered project ID when none was detected.
if not (PROJECT_ID != "" and PROJECT_ID is not None):
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
# Make the Cloud SDK use PROJECT_ID as its active project.
!gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

# Default to us-central1 while the placeholder is still in place.
REGION = "us-central1" if REGION == "[your-region]" else REGION

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append it onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) appended to resource names.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")

### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type each of
the following roles into the filter box and select it:

    *   Storage Admin
    *   Storage Object Admin
    *   Service Account User
    *   Vertex AI Administrator


5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code.
# Workbench is recognized by the env_version metadata file or by the
# DL_ANACONDA_HOME environment variable.
IS_COLAB = "google.colab" in sys.modules
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    # NOTE(review): this repeats the expression already stored in IS_COLAB.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account. The branch is skipped when IS_TESTING is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}
# Cloud Storage URI form of the bucket name.
BUCKET_URI = "gs://" + BUCKET_NAME

In [None]:
# Derive a bucket name from the project ID and the session timestamp when no
# real name was supplied.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = "".join([PROJECT_ID, "aip-", TIMESTAMP])
    BUCKET_URI = "gs://{}".format(BUCKET_NAME)

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in REGION under PROJECT_ID.
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents to confirm that the bucket is accessible.
! gsutil ls -al $BUCKET_URI

### Import libraries

In [None]:
# General
import logging

# Create a logger named "logger" and configure logging at INFO level.
logger = logging.getLogger("logger")
logging.basicConfig(level=logging.INFO)

# Vertex AI
from google.cloud import aiplatform as vertex_ai

### Define constants

In [None]:
# Training
DATA_DIR = "data"  # local directory created before training and removed at cleanup
LOG_DIR = "logs"  # default log_dir of the TensorBoard callback used by train()

# Experiments
TASK = "classification"
MODEL_TYPE = "rnn"
# The experiment name combines project, task, model type and session timestamp.
EXPERIMENT_NAME = f"{PROJECT_ID}-{TASK}-{MODEL_TYPE}-{TIMESTAMP}"

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Pass REGION explicitly so Vertex AI resources are created in the region
# selected earlier in this notebook (the same one used for the staging bucket)
# rather than in the SDK's default location.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

### Create TensorBoard instance using Vertex AI TensorBoard

You can upload your TensorBoard logs by first creating a TensorBoard instance.

Learn more about [TensorBoard overview](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview).

In [None]:
# Create the Vertex AI TensorBoard instance; it is attached to the experiment in the next cell.
vertex_ai_tb = vertex_ai.Tensorboard.create()

In [None]:
# Select the experiment and associate the TensorBoard instance with it.
vertex_ai.init(experiment=EXPERIMENT_NAME, experiment_tensorboard=vertex_ai_tb)

## Training with Vertex AI Experiments

Vertex AI enables users to track the steps (for example, preprocessing, training) of an experiment run, and track inputs (for example, algorithm, parameters, datasets) and outputs (for example, models, checkpoints, metrics) of those steps. 

Below is an example of how to track experiments while training a recurrent neural network for sentiment analysis.

To simplify the code, the following helper functions cover these steps:

- Collect training data
- Create text encoder
- Build a bidirectional LSTM as the baseline model
- Build a stacked bidirectional LSTM as the challenger model
- Train the model

In [None]:
# Create the local data and log directories with mode 777 (no error if they exist).
!mkdir -p -m 777 $DATA_DIR $LOG_DIR

In [None]:
import tensorflow_datasets as tfds

tfds.disable_progress_bar()
import tensorflow as tf


# Helpers ----------------------------------------------------------------------
def get_dataset(data_dir, buffer_size, batch_size):
    """
    Loads the "imdb_reviews" dataset and returns batched train/test pipelines
    Args:
        data_dir: The directory passed to tfds.load as data_dir
        buffer_size: The buffer size used to shuffle the training data
        batch_size: The batch size applied to both datasets
    Returns:
        tuple: (train_dataset, test_dataset) tf.data.Dataset objects; only the
            training split is shuffled, both are batched and prefetched
    """

    splits = tfds.load(
        "imdb_reviews",
        data_dir=data_dir,
        download=True,
        as_supervised=True,
        with_info=False,
    )

    shuffled_train = splits["train"].shuffle(buffer_size)
    train_dataset = shuffled_train.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_dataset = splits["test"].batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return train_dataset, test_dataset


def get_encoder(train_dataset, vocab_size):
    """
    Builds a TextVectorization layer adapted to the training text
    Args:
        train_dataset: A tf.data.Dataset yielding (text, label) elements
        vocab_size: The value passed as max_tokens to TextVectorization
    Returns:
        TextVectorization: The adapted TextVectorization layer
    """

    def _text_only(text, label):
        # Drop the label so the layer is adapted on the text alone
        return text

    encoder = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
    text_dataset = train_dataset.map(_text_only, num_parallel_calls=tf.data.AUTOTUNE)
    encoder.adapt(text_dataset)
    return encoder


def get_baseline_model(encoder, model_params):
    """
    Builds and compiles the baseline model: a single bidirectional LSTM
    Args:
        encoder: A TextVectorization object used as the first layer
        model_params: A dictionary with the Adam settings "learning_rate",
            "beta_1", "beta_2" and "epsilon"
    Returns:
        tf.keras.Model: The compiled tf.keras.Sequential model
    """

    layers = tf.keras.layers

    model = tf.keras.Sequential(
        [
            encoder,
            layers.Embedding(
                input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True
            ),
            layers.Bidirectional(layers.LSTM(64)),
            layers.Dense(64, activation="relu"),
            # Single-unit output without activation; the loss uses from_logits=True
            layers.Dense(1),
        ]
    )

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    adam = tf.keras.optimizers.Adam(
        learning_rate=model_params["learning_rate"],
        beta_1=model_params["beta_1"],
        beta_2=model_params["beta_2"],
        epsilon=model_params["epsilon"],
    )
    model.compile(loss=loss_fn, optimizer=adam, metrics=["accuracy"])
    return model


def get_stacked_model(encoder, model_params):
    """
    Builds and compiles the challenger model: two stacked bidirectional LSTMs
    Args:
        encoder: A TextVectorization object used as the first layer
        model_params: A dictionary with the Adam settings "learning_rate",
            "beta_1", "beta_2" and "epsilon"
    Returns:
        tf.keras.Model: The compiled tf.keras.Sequential model
    """

    layers = tf.keras.layers

    model = tf.keras.Sequential(
        [
            encoder,
            layers.Embedding(
                input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True
            ),
            # return_sequences=True so the next LSTM receives the whole sequence
            layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
            layers.Bidirectional(layers.LSTM(32)),
            layers.Dense(64, activation="relu"),
            layers.Dropout(0.5),
            # Single-unit output without activation; the loss uses from_logits=True
            layers.Dense(1),
        ]
    )

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    adam = tf.keras.optimizers.Adam(
        learning_rate=model_params["learning_rate"],
        beta_1=model_params["beta_1"],
        beta_2=model_params["beta_2"],
        epsilon=model_params["epsilon"],
    )
    model.compile(loss=loss_fn, optimizer=adam, metrics=["accuracy"])
    return model


def train(train_dataset, test_dataset, model, epochs, steps, log_dir=LOG_DIR):
    """
    Fits the model and writes TensorBoard logs while training
    Args:
        train_dataset: A tf.data.Dataset object containing the training data
        test_dataset: A tf.data.Dataset object used as validation data
        model: A tf.keras.Model object for the model
        epochs: The number of epochs
        steps: The number of validation steps
        log_dir: The location of tf training logs
    Returns:
        The history object returned by model.fit
    """
    fit_options = {
        "validation_data": test_dataset,
        "epochs": epochs,
        "validation_steps": steps,
        "callbacks": [tf.keras.callbacks.TensorBoard(log_dir=log_dir)],
    }
    return model.fit(train_dataset, **fit_options)

### Define a baseline

As part of the experimentation phase, you define a baseline model to serve as a performance benchmark; in this case it is a recurrent neural network (RNN). You then try to improve the ML solution with a more complex training configuration that produces better results.

#### Run experiment and evaluate experiment runs using `with` statement

This step takes approximately **10 minutes** to finish. It covers the following steps:

- Initialize an experiment run
- Log the parameters associated to training data
- Log the parameters of the encoder
- Log the parameters of the model
- Train the model
- Log the metrics for each epoch
- Log the overall training metrics

In [None]:
# Experiment Settings ----------------------------------------------------------
RUN_ID_1 = "run-1"
BUFFER_SIZE = 10000
BATCH_SIZE = 64
VOCAB_SIZE = 1000
EPOCHS = 2
STEPS = 20
ROLE = "baseline"
# Adam optimizer settings
LR = 1e-4
B_1 = 0.9
B_2 = 0.999
EPS = 1e-07

# Initialize the experiment
logging.info("Initialize the experiment.")
# The run is used as a context manager, so no explicit end_run() call is needed.
with vertex_ai.start_run(RUN_ID_1) as run:

    # Get the training and testing datasets
    logging.info("Get the training and testing datasets.")
    data_params = {"buffer_size": BUFFER_SIZE, "batch_size": BATCH_SIZE}
    # Use the shared DATA_DIR constant so the download location matches the
    # directory created before training and removed during cleanup.
    train_dataset, test_dataset = get_dataset(
        DATA_DIR,
        buffer_size=data_params["buffer_size"],
        batch_size=data_params["batch_size"],
    )
    run.log_params(data_params)

    # Get the encoder
    logging.info("Get the encoder.")
    encoder_params = {"vocab_size": VOCAB_SIZE}
    encoder = get_encoder(
        train_dataset=train_dataset, vocab_size=encoder_params["vocab_size"]
    )
    run.log_params(encoder_params)

    # Get the model
    logging.info("Get the model.")
    run.log_params({"role": ROLE})
    model_params = {"learning_rate": LR, "beta_1": B_1, "beta_2": B_2, "epsilon": EPS}
    model = get_baseline_model(encoder=encoder, model_params=model_params)
    run.log_params(model_params)

    # Train the model
    logging.info("Train the model.")
    history = train(
        model=model,
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        epochs=EPOCHS,
        steps=STEPS,
    )

    # Log the fit settings recorded in the history, then one time-series
    # entry of training loss/accuracy per epoch.
    run.log_params(history.params)
    for idx in range(0, history.params["epochs"]):
        run.log_time_series_metrics(
            {
                "train_loss": history.history["loss"][idx],
                "train_accuracy": history.history["accuracy"][idx],
            }
        )

    # Evaluate model
    logging.info("Evaluate model.")
    test_loss, test_accuracy = model.evaluate(test_dataset)
    run.log_metrics({"test_loss": test_loss, "test_accuracy": test_accuracy})

    # Exit the experiment
    logging.info("Exit the run.")

#### Evaluate the baseline model

In [None]:
# Get experiment
logging.info("Get experiment status.")
# Fetch the experiment data as a DataFrame and display it transposed.
experiment_df = vertex_ai.get_experiment_df()
experiment_df.T

In [None]:
# Get time series metrics
logging.info("Get time series metrics.")
# `run` is the run-1 object bound by the `with` statement in the training cell.
ts_runs_df = run.get_time_series_data_frame()
ts_runs_df

### Make experimental improvement to the model

As the challenger model, you add an LSTM layer to the baseline model. This step takes approximately **10 minutes** to finish.


#### Run experiment and evaluate experiment runs with `experiment` instance


In [None]:
# Experiment Settings ----------------------------------------------------------
RUN_ID_2 = "run-2"
ROLE = "stacked"

# Initialize the experiment
logger.info("Initialize the experiment.")
# Started without `with`, so the run must be closed explicitly with end_run().
vertex_ai.start_run(RUN_ID_2)

# Get the model
logging.info("Get the model.")
# Log through the module-level API so the role is recorded on the active run.
# `run` still refers to the finished run-1 object, and logging through it would
# overwrite the baseline's role instead of tagging this run.
vertex_ai.log_params({"role": ROLE})
# The encoder, datasets and optimizer settings from the baseline cell are reused.
model = get_stacked_model(encoder=encoder, model_params=model_params)
vertex_ai.log_params(model_params)

# Train the model
logging.info("Train the model.")
history = train(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    epochs=EPOCHS,
    steps=STEPS,
)
# Log the fit settings recorded in the history, then one time-series entry of
# training loss/accuracy per epoch.
vertex_ai.log_params(history.params)
for idx in range(0, history.params["epochs"]):
    vertex_ai.log_time_series_metrics(
        {
            "train_loss": history.history["loss"][idx],
            "train_accuracy": history.history["accuracy"][idx],
        }
    )

# Evaluate model
logging.info("Evaluate model.")
test_loss, test_accuracy = model.evaluate(test_dataset)
vertex_ai.log_metrics({"test_loss": test_loss, "test_accuracy": test_accuracy})

# Exit the experiment
logging.info("Exit the experiment.")
vertex_ai.end_run()

#### Compare the baseline with the new model implementation

In [None]:
# Get experiment
logging.info("Get experiment status.")
# Fetch the experiment data as a DataFrame and display it transposed.
experiment_df = vertex_ai.get_experiment_df()
experiment_df.T

In [None]:
# Get time series metrics
exp_run = vertex_ai.ExperimentRun(RUN_ID_2, experiment=EXPERIMENT_NAME)
logging.info("Get time series metrics.")
ts_runs_df = exp_run.get_time_series_data_frame()
ts_runs_df

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:
# Delete experiment
exp = vertex_ai.Experiment(EXPERIMENT_NAME)
# Request deletion of the backing TensorBoard runs together with the experiment.
exp.delete(delete_backing_tensorboard_runs=True)

# Delete Tensorboard
vertex_ai_tb.delete()

# Delete Cloud Storage objects that were created
# Set to False to keep the bucket; when IS_TESTING is set it is removed regardless.
delete_bucket = True

if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -rf {BUCKET_URI}

# Remove the local data and log directories.
!rm -Rf $DATA_DIR $LOG_DIR

In [None]:
# Remove the local data and log directories (same command as the end of the previous cell).
!rm -Rf $DATA_DIR $LOG_DIR