In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Feedback or issues?

Let us know if you have any [feedback or questions](https://forms.gle/hXDnv1T4WanMwTi79). If you provide an email address, we will follow up with you.

# Tracking and visualizing the parameters and metrics of custom training jobs using the Model Builder SDK

To use this Jupyter notebook, copy the notebook to an AI Platform (Unified) Notebooks instance with TensorFlow installed and open it. You can run each step, or cell, and see its results. To run a cell, use Shift+Enter. Jupyter automatically displays the return value of the last line in each cell. For more information about running notebooks in AI Platform (Unified) Notebooks, see the [AI Platform (Unified) Notebook guide](https://cloud.google.com/ai-platform-unified/docs/general/notebooks).


This notebook demonstrates how to track metrics and parameters for AI Platform (Unified) custom training jobs, and how to perform detailed analysis using this data.



Note: you might incur charges for training, prediction, storage or usage of other GCP products in connection with testing this SDK.

## Install the Model Builder SDK

Use the instructions in this section to install the Model Builder SDK.

After the SDK has been installed, the kernel is automatically restarted.

In [None]:
%%capture
# The %%capture magic on the first line hides this cell's output, including pip's log.
# Remove any installed SDK, then install the dev-test branch build from GitHub.
!pip3 uninstall -y google-cloud-aiplatform
!pip3 install --user git+https://github.com/googleapis/python-aiplatform.git@dev-test
import IPython

# Restart the kernel so the newly installed SDK is the one that gets imported.
# Run the remaining cells only after the kernel has come back up.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

## Enter Your Project and GCS Bucket

Enter your Project Id in the cell below. Then run the cell to make sure the Cloud SDK uses the right project for all the commands in this notebook.

In [None]:
# Project used for every command in this notebook -- change it to your own test project.
MY_PROJECT = "YOUR PROJECT ID"

# Staging bucket; it should be in the same region as the AI Platform (Unified) service (ucaip).
MY_STAGING_BUCKET = "your-bucket-name"

# Experiment name handed to the SDK -- change it to your desired experiment name.
MY_EXPERIMENT = "your-experiment-name"

In [None]:
import sys

# Authenticate interactively only when the notebook is running inside Colab
# (detected by the google.colab module already being loaded); in any other
# environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

## Initialize the Model Builder SDK and set an _experiment_

Initialize the *client* for AI Platform (Unified).

In [None]:
from google.cloud import aiplatform

# Configure the SDK once with the project, staging bucket and experiment
# chosen in the configuration cell.
aiplatform.init(
    project=MY_PROJECT,
    staging_bucket=MY_STAGING_BUCKET,
    experiment=MY_EXPERIMENT,
)

## Tracking parameters and metrics in AI Platform (Unified) custom training jobs

This example uses the Abalone Dataset. For more information about this dataset please visit: https://archive.ics.uci.edu/ml/datasets/abalone

In [None]:
!wget https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv
!gsutil cp abalone_train.csv {MY_STAGING_BUCKET}/data/

gcs_csv_path = f"{MY_STAGING_BUCKET}/data/abalone_train.csv"

### Create a managed tabular dataset from a CSV

A managed dataset can be used to create an AutoML model or a custom model.

In [None]:
# Create the managed tabular dataset from the CSV uploaded to Cloud Storage.
ds = aiplatform.TabularDataset.create(
    gcs_source=[gcs_csv_path],
    display_name="abalone",
)

# Last expression of the cell: display the new dataset's resource name.
ds.resource_name

### Write the training script

Run the following cell to create the training script that is used in the sample custom training job.

In [None]:
%%writefile training_script.py

import pandas as pd
import argparse
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--num_units', dest='num_units',
                    default=64, type=int,
                    help='Number of unit for first layer.')
args = parser.parse_args()
# uncomment and bump up replica_count for distributed training
# strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
# tf.distribute.experimental_set_strategy(strategy)

col_names = ["Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Age"]
target = "Age"

def aip_data_to_dataframe(wild_card_path):
    """Load every CSV file matching ``wild_card_path`` into a single DataFrame.

    Matching files are enumerated with ``tf.data.Dataset.list_files``. Each one
    is read with ``col_names`` as its column names, and the resulting frames
    are concatenated in the order they were listed.
    """
    frames = []
    for file_tensor in tf.data.Dataset.list_files([wild_card_path]):
        # Each listed element is a string tensor; turn it into a plain str path.
        csv_path = file_tensor.numpy().decode()
        frames.append(pd.read_csv(csv_path, names=col_names))
    return pd.concat(frames)

def get_features_and_labels(df):
    """Split ``df`` into a feature array and a label array.

    The label is the column named by the module-level ``target``; every other
    column is a feature. Returns ``(features, labels)`` as NumPy arrays.
    """
    labels = df[target].values
    features = df.drop(columns=[target]).values
    return features, labels

def data_prep(wild_card_path):
    """Return the ``(features, labels)`` pair for the CSV files at ``wild_card_path``."""
    frame = aip_data_to_dataframe(wild_card_path)
    return get_features_and_labels(frame)


# Two stacked dense layers: one sized by --num_units, then a single-output layer.
model = tf.keras.Sequential(
    [
        layers.Dense(args.num_units),
        layers.Dense(1),
    ]
)
model.compile(optimizer="adam", loss="mse")

# The locations of the data splits are read from AIP_* environment variables.
train_features, train_labels = data_prep(os.environ["AIP_TRAINING_DATA_URI"])
validation_split = data_prep(os.environ["AIP_VALIDATION_DATA_URI"])
model.fit(
    train_features,
    train_labels,
    epochs=args.epochs,
    validation_data=validation_split,
)

# Report the evaluation result on the test split.
test_features, test_labels = data_prep(os.environ["AIP_TEST_DATA_URI"])
print(model.evaluate(test_features, test_labels))

# save as AI Platform Managed model
tf.saved_model.save(model, os.environ["AIP_MODEL_DIR"])

### Launch a custom training job and track its training parameters on AI Platform (Unified) ML Metadata

In [None]:
# Container images: the first runs the training script, the second serves the trained model.
TRAINING_CONTAINER_URI = "gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest"
SERVING_CONTAINER_URI = "gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest"

job = aiplatform.CustomTrainingJob(
    display_name="train-abalone-dist-1-replica",
    script_path="training_script.py",
    container_uri=TRAINING_CONTAINER_URI,
    model_serving_container_image_uri=SERVING_CONTAINER_URI,
    # Extra package for the training environment.
    # NOTE(review): presumably needed so pandas can read gs:// paths -- confirm.
    requirements=["gcsfs==0.7.1"],
)

Start a new experiment run to track training parameters and start the training job. Note that this operation will take around 10 mins.

In [None]:
# Start the experiment run that the logged parameters are attached to.
aiplatform.start_run("custom-training-run-1")  # Change this to your desired run name

# Log the hyperparameters, then build the script's command-line flags from the
# same dict so the tracked values and the values actually used cannot drift.
parameters = {"epochs": 10, "num_units": 64}
aiplatform.log_params(parameters)

training_args = [f"--{name}={value}" for name, value in parameters.items()]

model = job.run(
    ds,
    args=training_args,
    replica_count=1,
    model_display_name="abalone-model",
)

### Deploy Model and calculate prediction metrics

Deploy the model to Google Cloud. This operation will take 10-20 mins.

In [None]:
# Deploy the trained model on an n1-standard-4 machine; the returned endpoint
# is what the online prediction cell calls.
endpoint = model.deploy(machine_type="n1-standard-4")

Once the model is deployed, perform online prediction using the `abalone_test` dataset and calculate prediction metrics.

Prepare the prediction dataset.

In [None]:
import pandas as pd
from tensorflow.python.keras.utils import data_utils


def read_data(uri):
    """Download the CSV at ``uri`` and load it into a DataFrame.

    The file is fetched through ``data_utils.get_file`` and parsed as a
    header-less, comma-separated file whose columns are given the abalone
    column names.

    Args:
        uri: URL of the CSV file to download.

    Returns:
        A ``pandas.DataFrame`` with the seven feature columns followed by "Age".
    """
    import os
    from urllib.parse import urlparse

    # Cache the download under the remote file's own name. The previous fixed
    # name ("auto-mpg.data") meant that a file already cached under that name,
    # or a call with a different URI, would return the wrong data.
    cache_name = os.path.basename(urlparse(uri).path) or "abalone_data.csv"
    dataset_path = data_utils.get_file(cache_name, uri)
    col_names = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Age",
    ]
    dataset = pd.read_csv(
        dataset_path,
        names=col_names,
        na_values="?",  # treat "?" cells as missing values
        comment="\t",  # ignore anything after a tab on a line
        sep=",",
        skipinitialspace=True,
    )
    return dataset


def get_features_and_labels(df):
    """Split ``df`` into a feature array and an "Age" label array.

    Returns ``(features, labels)`` as NumPy arrays: ``features`` holds every
    column except "Age", and ``labels`` holds the "Age" column.
    """
    label_column = "Age"
    labels = df[label_column].values
    features = df.drop(columns=[label_column]).values
    return features, labels


# Public URL of the abalone test CSV used for online prediction.
ABALONE_TEST_URI = (
    "https://storage.googleapis.com/download.tensorflow.org/data/abalone_test.csv"
)

# Download and parse the file, then split it into features and labels.
test_frame = read_data(ABALONE_TEST_URI)
test_dataset, test_labels = get_features_and_labels(test_frame)

Perform online prediction.

In [None]:
# Send the test features to the deployed endpoint as plain nested lists.
prediction = endpoint.predict(test_dataset.tolist())
# Last expression of the cell: display the prediction response.
prediction

Calculate and track prediction evaluation metrics.

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score the endpoint's predictions against the true labels.
predicted_values = prediction.predictions
mse = mean_squared_error(test_labels, predicted_values)
mae = mean_absolute_error(test_labels, predicted_values)

# Log both metrics through the SDK so they are tracked with the experiment.
aiplatform.log_metrics({"mse": mse, "mae": mae})

### Extract all parameters and metrics created during this experiment.

In [None]:
# Last expression of the cell: display the experiment's tracked parameters
# and metrics as returned by the SDK.
aiplatform.get_experiment_df()

### View data in the Cloud Console

Parameters and metrics can also be viewed in the Cloud Console. 


In [None]:
# Build the Cloud Console link for this project's experiments, then print a
# heading and the link on separate lines.
experiments_url = f"https://console.cloud.google.com/ai/platform/experiments/experiments?folder=&organizationId=&project={MY_PROJECT}"
print("AI Platform (Unified) Experiments:", experiments_url, sep="\n")