In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Feedback or issues?

For any feedback or questions, please open an [issue](https://github.com/googleapis/python-aiplatform/issues).

# Vertex SDK for Python: Custom Training using Python Package, Managed Text Dataset, and TF-Serving Container Example
To use this Jupyter notebook, copy the notebook to a Google Cloud Notebooks instance with TensorFlow installed and open it. You can run each step, or cell, and see its results. To run a cell, use Shift+Enter. Jupyter automatically displays the return value of the last line in each cell. For more information about running notebooks in Google Cloud Notebooks, see the [Google Cloud Notebook guide](https://cloud.google.com/vertex-ai/docs/general/notebooks).

This notebook demonstrates how to create a Custom Model using Custom Python Package Training with a Vertex AI Dataset, and how to serve the model using a TensorFlow Serving container for online prediction and batch prediction. It requires you to provide a bucket where the dataset will be stored.

Note: you may incur charges for training, prediction, storage or usage of other GCP products in connection with testing this SDK.

### Install Vertex SDK for Python


After the SDK installation the kernel will be automatically restarted.

In [None]:
# Reinstall the Vertex SDK for Python (no version is pinned here).
!pip3 uninstall -y google-cloud-aiplatform
!pip3 install google-cloud-aiplatform
import IPython

# Restart the kernel so the newly installed package is the one that gets imported.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

### Enter Your Project and GCS Bucket

Enter your Project Id in the cell below. Then run the cell to make sure the Cloud SDK uses the right project for all the commands in this notebook.

In [None]:
# Replace both placeholders with your own values before running this cell.
MY_PROJECT = "YOUR PROJECT ID"
MY_STAGING_BUCKET = "gs://YOUR BUCKET"  # bucket should be in the same region as Vertex AI

### Set Your Application Name, Task Name, and Directories.


In [None]:
# Names that identify this sample; the local working directories are derived from them.
APP_NAME = "keras-text-class-stack-overflow-tag"
TASK_TYPE = "mbsdk_custom-py-pkg-training"
TASK_NAME = "_".join([TASK_TYPE, APP_NAME])

# Local working locations, relative to the notebook's current directory.
TASK_DIR = "./" + TASK_NAME
DATA_DIR = TASK_DIR + "/data"

# Echo the derived values, one per line.
print(
    f"Task Name:      {TASK_NAME}",
    f"Task Directory: {TASK_DIR}",
    f"Data Directory: {DATA_DIR}",
    sep="\n",
)

### Set a GCS Prefix

Set a prefix if you want to centralize all input and output files under a single GCS location.

In [None]:
# Bare bucket name for the Cloud Storage client: drop a leading "gs://" scheme
# if present, and any trailing "/". A value entered without the scheme is
# accepted as-is instead of raising.
BUCKET_NAME = (
    MY_STAGING_BUCKET[len("gs://"):]
    if MY_STAGING_BUCKET.startswith("gs://")
    else MY_STAGING_BUCKET
).rstrip("/")
# Object-name prefix under which this notebook keeps its data and outputs.
GCS_PREFIX = f"{TASK_TYPE}/{APP_NAME}"

print(f"Bucket Name:    {BUCKET_NAME}")
print(f"GCS Prefix:     {GCS_PREFIX}")

# Stack Overflow Data
We download the Stack Overflow data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz and then create a Vertex AI managed text dataset from it.

The Stack Overflow Data is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/ 

For more information about this dataset please visit: https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow


### Utility Functions to Download Data and Prepare CSV Files for Creating Vertex AI Managed Dataset

In [None]:
import csv
import os

from google.cloud import storage
from tensorflow.keras import utils


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket, without the ``gs://`` scheme.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket.

    Returns:
        The ``gs://`` URI of the uploaded object.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)

    # GCS URIs always use "/" as the separator, so format the URI explicitly
    # rather than with os.path.join, which follows the local OS path rules.
    return f"gs://{bucket_name}/{destination_blob_name}"


def download_data(data_dir):
    """Fetch and unpack the Stack Overflow archive, caching it in ``data_dir``.

    Args:
        data_dir: Local directory used as the download cache; it is created
            when it does not exist yet.

    Returns:
        The directory containing the path returned by ``utils.get_file``.
    """
    archive_url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # cache_subdir="" keeps the download directly inside data_dir.
    fetched_path = utils.get_file(
        "stack_overflow_16k.tar.gz",
        archive_url,
        untar=True,
        cache_dir=data_dir,
        cache_subdir="",
    )

    return os.path.dirname(fetched_path)


def upload_train_data_to_gcs(train_data_dir, bucket_name, destination_blob_prefix):
    """Build ``train.csv`` from the training text files and upload it to GCS.

    Every ``*.txt`` file found while walking ``train_data_dir`` contributes one
    row ``(text, class_name)``: ``text`` is the file's first line (stripped of
    surrounding whitespace and double quotes) and ``class_name`` is the name of
    the directory that contains the file. Empty files are skipped.

    Args:
        train_data_dir: Local directory that is walked recursively for text files.
        bucket_name: Destination bucket name, without the ``gs://`` scheme.
        destination_blob_prefix: Object-name prefix for the uploaded CSV.

    Returns:
        The value returned by ``upload_blob`` for the uploaded ``train.csv``.
    """
    # The CSV is written next to the training directory, i.e. in its parent.
    # This uses the argument rather than a notebook-level global.
    train_data_fn = os.path.join(
        os.path.dirname(os.path.normpath(train_data_dir)), "train.csv"
    )

    # `with` closes the CSV even if reading one of the inputs fails;
    # newline="" lets the csv writer control line endings itself.
    with open(train_data_fn, "w", encoding="utf8", newline="") as fp:
        writer = csv.writer(
            fp, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL, lineterminator="\n"
        )

        for root, _, files in os.walk(train_data_dir):
            # The directory holding a text file names its class.
            class_name = os.path.basename(root)
            for file in files:
                if not file.endswith(".txt"):
                    continue
                with open(os.path.join(root, file), "r", encoding="utf8") as f:
                    lines = [x.strip().strip('"') for x in f]
                if not lines:
                    # Nothing to label in an empty file.
                    continue
                writer.writerow((lines[0], class_name))

    train_gcs_url = upload_blob(
        bucket_name, train_data_fn, os.path.join(destination_blob_prefix, "train.csv")
    )

    return train_gcs_url

### Download Data

In [None]:
# Download the archive into DATA_DIR and keep the directory the data ended up in.
data_dir = download_data(DATA_DIR)
print(f"Data is downloaded to: {data_dir}")

In [None]:
# List the top-level contents of the downloaded data directory.
!ls $data_dir

In [None]:
# List the contents of the train directory.
!ls $data_dir/train

### Prepare CSV Files for Creating Managed Dataset

#### Create CSV Files using Data Content

In [None]:
# Build train.csv from the local training files and upload it under GCS_PREFIX/data.
gcs_source_train_url = upload_train_data_to_gcs(
    train_data_dir=os.path.join(data_dir, "train"),
    bucket_name=BUCKET_NAME,
    destination_blob_prefix=f"{GCS_PREFIX}/data",
)

print(f"Train data content is loaded to {gcs_source_train_url}")

In [None]:
# Confirm that the uploaded CSV is present in the bucket.
!gsutil ls gs://$BUCKET_NAME/$GCS_PREFIX/data

# Create Custom Training Python Package

Before you can perform custom training with a pre-built container, you must create a [Python Source Distribution](https://docs.python.org/3/distutils/sourcedist.html) that contains your training application and upload it to a Cloud Storage bucket that your Google Cloud project can access.

We will create a directory and write all of our package build artifacts into that folder.

In [None]:
# Directory of the `trainer` Python package inside the task directory.
PYTHON_PACKAGE_APPLICATION_DIR = f"{TASK_NAME}/trainer"

# Create the package directory and an empty __init__.py so `trainer` is importable as a package.
!mkdir -p $PYTHON_PACKAGE_APPLICATION_DIR
!touch $PYTHON_PACKAGE_APPLICATION_DIR/__init__.py

### Write the Training Script

In [None]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/task.py


import os
import argparse

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import json
import tqdm

# Passed as `max_tokens` to the TextVectorization layer; the Embedding layer is sized VOCAB_SIZE + 1.
VOCAB_SIZE = 10000
# Passed as `output_sequence_length` to the TextVectorization layer.
MAX_SEQUENCE_LENGTH = 250

def str2bool(v):
  """Convert a command-line value to a boolean.

  Booleans are returned unchanged. Strings are compared case-insensitively
  against the accepted spellings of true and false.

  Raises:
    argparse.ArgumentTypeError: if the string is not a recognised spelling.
  """
  if isinstance(v, bool):
    return v

  truthy = {'yes', 'true', 't', 'y', '1'}
  falsy = {'no', 'false', 'f', 'n', '0'}

  normalized = v.lower()
  if normalized in truthy:
    return True
  if normalized in falsy:
    return False
  raise argparse.ArgumentTypeError('Boolean value expected.')

def build_model(num_classes, loss, optimizer, metrics, vectorize_layer):
  """Assemble and compile the text classification model.

  Args:
    num_classes: Width of the final Dense layer.
    loss: Loss passed to ``compile``.
    optimizer: Optimizer passed to ``compile``.
    metrics: Metrics passed to ``compile``.
    vectorize_layer: Layer placed first in the stack, ahead of the embedding.

  Returns:
    The compiled ``tf.keras.Sequential`` model, which ends in a softmax activation.
  """
  layer_stack = [
      vectorize_layer,
      # The embedding has VOCAB_SIZE + 1 rows; index 0 is treated as padding (mask_zero=True).
      layers.Embedding(VOCAB_SIZE + 1, 64, mask_zero=True),
      layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
      layers.GlobalMaxPooling1D(),
      layers.Dense(num_classes),
      layers.Activation('softmax'),
  ]

  classifier = tf.keras.Sequential(layer_stack)
  classifier.compile(loss=loss, optimizer=optimizer, metrics=metrics)
  return classifier

def get_string_labels(predicted_scores_batch, class_names):
  """Return, for each row of scores, the class name at the highest-scoring index."""
  best_class_ids = tf.argmax(predicted_scores_batch, axis=1)
  return tf.gather(class_names, best_class_ids)

def predict(export_model, class_names, inputs):
  """Run ``export_model.predict`` on ``inputs`` and translate the scores to class names."""
  return get_string_labels(export_model.predict(inputs), class_names)

def parse_args():
  """Parse the training application's command-line flags.

  Returns:
    The ``argparse.Namespace`` with ``epochs``, ``batch_size``, ``model_dir``,
    ``data_dir``, ``test_run`` and ``model_version``. ``model_dir`` defaults to
    the ``AIP_MODEL_DIR`` environment variable.
  """
  cli = argparse.ArgumentParser(
      description='Keras Text Classification on Stack Overflow Questions')

  # (flag, add_argument keyword arguments), registered in this order.
  flag_specs = (
      ('--epochs', dict(default=25, type=int, help='number of training epochs')),
      ('--batch-size', dict(default=16, type=int, help='mini-batch size')),
      ('--model-dir', dict(default=os.getenv('AIP_MODEL_DIR'), type=str, help='model directory')),
      ('--data-dir', dict(default='./data', type=str, help='data directory')),
      ('--test-run', dict(default=False, type=str2bool, help='test run the training application, i.e. 1 epoch for training using sample dataset')),
      ('--model-version', dict(default=1, type=int, help='model version')),
  )
  for flag, options in flag_specs:
    cli.add_argument(flag, **options)

  return cli.parse_args()

def load_aip_dataset(aip_data_uri_pattern, batch_size, class_names, test_run, shuffle=True, seed=42):
  """Load the JSON Lines files matching a pattern into a batched dataset.

  Each line of each matched file is parsed as JSON. Its ``textContent`` value
  is opened with ``tf.io.gfile.GFile`` and read as the example text, and the
  ``displayName`` of its first classification annotation is the label.

  Args:
    aip_data_uri_pattern: Glob pattern of the JSON Lines files to read.
    batch_size: Batch size of the returned dataset.
    class_names: Ordered class names; a label's index in this list is one-hot encoded.
    test_run: When true, only the first line of every matched file is used.
    shuffle: Whether to shuffle with a buffer of ``batch_size * 8`` elements.
    seed: Seed for the shuffle.

  Returns:
    A ``tf.data.Dataset`` of ``(text, one_hot_label)`` batches with a
    ``class_names`` attribute attached.
  """
  num_classes = len(class_names)
  class_indices = {name: index for index, name in enumerate(class_names)}

  data_file_urls = []
  labels = []
  for aip_data_uri in tqdm.tqdm(tf.io.gfile.glob(pattern=aip_data_uri_pattern)):
    with tf.io.gfile.GFile(name=aip_data_uri, mode='r') as gfile:
      for raw_line in gfile.readlines():
        record = json.loads(raw_line)
        data_file_urls.append(record['textContent'])
        display_name = record['classificationAnnotations'][0]['displayName']
        labels.append(class_indices[display_name])
        if test_run:
          # Test runs keep only the first record of each matched file.
          break

  data = []
  for data_file_url in tqdm.tqdm(data_file_urls):
    with tf.io.gfile.GFile(name=data_file_url, mode='r') as gf:
      data.append(gf.read())

  print(f' data files count: {len(data_file_urls)}')
  print(f' data count: {len(data)}')
  print(f' labels count: {len(labels)}')

  text_ds = tf.data.Dataset.from_tensor_slices(data)
  label_ds = tf.data.Dataset.from_tensor_slices(labels).map(
      lambda x: tf.one_hot(x, num_classes))
  dataset = tf.data.Dataset.zip((text_ds, label_ds))

  if shuffle:
    # Shuffle locally at each iteration
    dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed)
  dataset = dataset.batch(batch_size)

  # Users may need to reference `class_names`.
  dataset.class_names = class_names
  return dataset

def main():
  """Train, evaluate and export the Stack Overflow tag classifier.

  Hyper-parameters come from the command line and the dataset locations from
  the ``AIP_*`` environment variables. The function trains the model, prints
  training/validation/test metrics and predictions for two sample questions,
  then saves the model under ``<model-dir>/<model-version>`` and prints the
  name of the saved model's serving input.

  Raises:
    ValueError: if neither ``--model-dir`` nor ``AIP_MODEL_DIR`` provides a
      model directory.
  """

  args = parse_args()

  # Fail before any training work is done rather than at export time.
  if not args.model_dir:
    raise ValueError(
        'No model directory: pass --model-dir or set AIP_MODEL_DIR.')

  class_names = ['csharp', 'java', 'javascript', 'python']
  class_indices = dict(zip(class_names, range(len(class_names))))
  num_classes = len(class_names)
  print(f' class names: {class_names}')
  print(f' class indices: {class_indices}')
  print(f' num classes: {num_classes}')

  # A test run trains for a single epoch regardless of --epochs.
  epochs = 1 if args.test_run else args.epochs

  aip_model_dir = os.environ.get('AIP_MODEL_DIR')
  aip_data_format = os.environ.get('AIP_DATA_FORMAT')
  aip_training_data_uri = os.environ.get('AIP_TRAINING_DATA_URI')
  aip_validation_data_uri = os.environ.get('AIP_VALIDATION_DATA_URI')
  aip_test_data_uri = os.environ.get('AIP_TEST_DATA_URI')

  print(f"aip_model_dir: {aip_model_dir}")
  print(f"aip_data_format: {aip_data_format}")
  print(f"aip_training_data_uri: {aip_training_data_uri}")
  print(f"aip_validation_data_uri: {aip_validation_data_uri}")
  print(f"aip_test_data_uri: {aip_test_data_uri}")

  print('Loading AIP dataset')
  train_ds = load_aip_dataset(
      aip_training_data_uri, args.batch_size, class_names, args.test_run)
  print('AIP training dataset is loaded')
  # Validation and test data are evaluated one example per batch.
  val_ds = load_aip_dataset(
      aip_validation_data_uri, 1, class_names, args.test_run)
  print('AIP validation dataset is loaded')
  test_ds = load_aip_dataset(
      aip_test_data_uri, 1, class_names, args.test_run)
  print('AIP test dataset is loaded')

  vectorize_layer = TextVectorization(
      max_tokens=VOCAB_SIZE,
      output_mode='int',
      output_sequence_length=MAX_SEQUENCE_LENGTH)

  # Build the vocabulary from the training texts only (labels are dropped).
  train_text = train_ds.map(lambda text, labels: text)
  vectorize_layer.adapt(train_text)
  print('The vectorize_layer is adapted')


  print('Build model')
  optimizer = 'adam'
  metrics = ['accuracy']

  # build_model ends with a softmax activation, so the model outputs
  # probabilities, not logits. from_logits must therefore be False; with True
  # the loss would apply softmax a second time.
  model = build_model(
      num_classes, losses.CategoricalCrossentropy(from_logits=False), optimizer, metrics, vectorize_layer)

  fit_result = model.fit(train_ds, validation_data=val_ds, epochs=epochs)
  history = fit_result.history

  print('Training accuracy: {acc}, loss: {loss}'.format(
      acc=history['accuracy'][-1], loss=history['loss'][-1]))
  print('Validation accuracy: {acc}, loss: {loss}'.format(
      acc=history['val_accuracy'][-1], loss=history['val_loss'][-1]))

  loss, accuracy = model.evaluate(test_ds)
  print('Test accuracy: {acc}, loss: {loss}'.format(
      acc=accuracy, loss=loss))

  inputs = [
      "how do I extract keys from a dict into a list?",  # python
      "debug public static void main(string[] args) {...}",  # java
  ]
  predicted_labels = predict(model, class_names, inputs)
  # `question` avoids shadowing the builtin `input`.
  for question, label in zip(inputs, predicted_labels):
    print(f'Question: {question}')
    print(f'Predicted label: {label.numpy()}')

  # The model is saved in a sub-directory named after the version number.
  model_export_path = os.path.join(args.model_dir, str(args.model_version))
  model.save(model_export_path)
  print(f'Model version {args.model_version} is exported to {args.model_dir}')

  # Reload the export to report the input name of its default serving signature.
  loaded = tf.saved_model.load(model_export_path)
  input_name = list(loaded.signatures['serving_default'].structured_input_signature[1].keys())[0]
  print(f'Serving function input: {input_name}')

  return

# Run the training application only when this module is executed as a script.
if __name__ == '__main__':
  main()


### Build Package

In [None]:
%%writefile {TASK_DIR}/setup.py

from setuptools import find_packages
from setuptools import setup

setup(
    name='trainer',
    version='0.1',
    packages=find_packages(),
    # trainer/task.py imports tqdm; declaring it lets pip install it when the
    # training image does not already provide it. TensorFlow is expected to
    # come from the training container and is intentionally not listed.
    install_requires=['tqdm'],
    include_package_data=True,
    description='My training application.'
)

In [None]:
# List the task directory to check that setup.py and the trainer package are in place.
!ls $TASK_DIR

In [None]:
# Build a gzipped-tar source distribution from the task directory.
!cd $TASK_DIR && python3 setup.py sdist --formats=gztar

In [None]:
# Verify that the source distribution archive was created.
!ls -ltr $TASK_DIR/dist/trainer-0.1.tar.gz

### Upload the Package to GCS

In [None]:
# Object name for the package in the bucket, and the local archive to upload.
destination_blob_name = f"custom-training-python-package/{APP_NAME}/trainer-0.1.tar.gz"
source_file_name = f"{TASK_DIR}/dist/trainer-0.1.tar.gz"

python_package_gcs_uri = upload_blob(
    BUCKET_NAME, source_file_name, destination_blob_name
)
# Module run as the training entry point: task.py inside the `trainer` package.
python_module_name = "trainer.task"

print(f"Custom Training Python Package is uploaded to: {python_package_gcs_uri}")

# Create TensorFlow Serving Container

Download the TensorFlow Serving Docker image.

In [None]:
# Pull the `latest` TensorFlow Serving image.
!docker pull tensorflow/serving:latest

Create a tag for registering the image and register the image with Cloud Container Registry (gcr.io).

In [None]:
# Image URI under this project's gcr.io registry to which the serving image is pushed.
TF_SERVING_CONTAINER_IMAGE_URI = f"gcr.io/{MY_PROJECT}/tf-serving"

In [None]:
# Tag the pulled image with the project's registry URI, then push it.
!docker tag tensorflow/serving $TF_SERVING_CONTAINER_IMAGE_URI
!docker push $TF_SERVING_CONTAINER_IMAGE_URI

# Run Custom Python Package Training with Managed Text Dataset

## Initialize Vertex SDK for Python

Initialize the *client* for Vertex AI.

In [None]:
from google.cloud import aiplatform

# Set the project and staging bucket used by the SDK calls that follow.
aiplatform.init(project=MY_PROJECT, staging_bucket=MY_STAGING_BUCKET)

## Create a Dataset on Vertex AI
We will now create a Vertex AI text dataset using the previously prepared csv files. Choose one of the options below. 

In [None]:
# Display name for the managed dataset and the CSV uploaded earlier as its source.
dataset_display_name = f"temp-{APP_NAME}-content"
gcs_source = gcs_source_train_url

#### Option 1: Create a Dataset with CSV File

In [None]:
# Create the text dataset and import the CSV as single-label classification data.
# sync=False: this cell does not block on dataset creation.
dataset = aiplatform.TextDataset.create(
    display_name=dataset_display_name,
    gcs_source=gcs_source,
    import_schema_uri=aiplatform.schema.dataset.ioformat.text.single_label_classification,
    sync=False,
)

#### Option 2: Create a Dataset, then Import CSV File

```
dataset = aiplatform.TextDataset.create(
    display_name=dataset_display_name,
)
dataset.import_data(
    gcs_source=gcs_source, 
    import_schema_uri=aiplatform.schema.dataset.ioformat.text.single_label_classification,
    sync=False
)
```

#### Option 3: Retrieve a Dataset on Vertex AI
If you have previously created a Dataset on Vertex AI, you can retrieve the dataset using the `dataset_name`.

```
dataset_name = 'YOUR DATASET NAME'

dataset = aiplatform.TextDataset(dataset_name)
dataset.resource_name
```

## Launch a Training Job and Create a Model on Vertex AI

We will now train a model with the python package we just built.

### Configure a Training Job

In [None]:
# The served model is named after the application; the name appears in the serving args and routes.
MODEL_NAME = APP_NAME
# Pre-built TensorFlow 2.3 CPU training image in which the Python package is run.
PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI = (
    "gcr.io/cloud-aiplatform/training/tf-cpu.2-3:latest"
)

You will need to specify the python package that was built and uploaded to GCS, the module name of the python package, the pre-built training container image uri for training, and in this example, we are using TensorFlow serving container for prediction.

In [None]:
# Training runs the uploaded Python package in the pre-built training image;
# the resulting model is served by the TF Serving image pushed earlier.
job = aiplatform.CustomPythonPackageTrainingJob(
    display_name=f"temp_{TASK_NAME}_tf-serving",
    python_package_gcs_uri=python_package_gcs_uri,
    python_module_name=python_module_name,
    container_uri=PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI,
    model_serving_container_image_uri=TF_SERVING_CONTAINER_IMAGE_URI,
    model_serving_container_command=["/usr/bin/tensorflow_model_server"],
    model_serving_container_args=[
        f"--model_name={MODEL_NAME}",
        # NOTE(review): $(AIP_STORAGE_URI) is presumably expanded by Vertex AI to the model artifact location — confirm.
        "--model_base_path=$(AIP_STORAGE_URI)",
        "--rest_api_port=8080",
        "--port=8500",
        # 31540000 seconds is roughly one year — presumably to avoid re-polling the model path; confirm.
        "--file_system_poll_wait_seconds=31540000",
    ],
    # Both routes embed MODEL_NAME, matching the --model_name argument above.
    model_serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
    model_serving_container_health_route=f"/v1/models/{MODEL_NAME}",
)

### Run the Training Job

In [None]:
# Start training on the managed dataset; `args` is forwarded to trainer.task's command line.
# sync=False: this cell does not block; the next cell waits for completion.
model = job.run(
    dataset=dataset,
    annotation_schema_uri=aiplatform.schema.dataset.annotation.text.classification,
    args=["--epochs", "50"],
    replica_count=1,
    model_display_name=f"temp_{TASK_NAME}_tf-serving",
    sync=False,
)

In [None]:
# Block until the training job launched with sync=False has produced the model.
model.wait()

# Deploy a Model and Create an Endpoint on Vertex AI

Deploy your model, then wait until the model FINISHES deployment before proceeding to prediction.

In [None]:
# Deploy the model; sync=False means this cell does not block on the deployment.
endpoint = model.deploy(machine_type="n1-standard-4", sync=False)

In [None]:
# Block until the deployment started above has finished.
endpoint.wait()

## Predict on the Endpoint

In [None]:
# Class names in the same order as the list used by the training script.
class_names = ["csharp", "java", "javascript", "python"]
class_ids = range(len(class_names))

# Lookup tables in both directions: name -> id and id -> name.
class_indices = {name: idx for name, idx in zip(class_names, class_ids)}
class_maps = {idx: name for idx, name in zip(class_ids, class_names)}

print(
    f"Class Indices: {class_indices}",
    f"Class Maps:    {class_maps}",
    sep="\n",
)

In [None]:
# Sample questions for online prediction; the trailing comments give the expected tag.
text_inputs = [
    "how do I extract keys from a dict into a list?",  # python
    "debug public static void main(string[] args) {...}",  # java
]

In [None]:
import numpy as np

# Each instance is sent as a one-element list holding the question text.
predictions = endpoint.predict(instances=[[text] for text in text_inputs])
for text, predicted_scores in zip(text_inputs, predictions.predictions):
    # The index of the highest score selects the predicted class.
    class_id = np.argmax(predicted_scores)
    class_name = class_maps[class_id]
    print(f"Question: {text}")
    print(f"Predicted Tag: {class_name}\n")

# Batch Prediction Job on the Model

In [None]:
import json

import tensorflow as tf


def upload_test_data_to_gcs(
    test_data_dir, test_gcs_url, input_name="text_vectorization_input"
):
    """Write the test texts as JSON Lines to ``test_gcs_url``.

    Every ``*.txt`` file found while walking ``test_data_dir`` contributes one
    line of the form ``{input_name: [text]}``, where ``text`` is the file's
    first line stripped of surrounding whitespace and double quotes. Empty
    files are skipped.

    Args:
        test_data_dir: Local directory that is walked recursively for text files.
        test_gcs_url: Destination path, opened for writing with ``tf.io.gfile.GFile``.
        input_name: Key under which each text is written. It should match the
            model's serving input name; the default keeps the previously
            hard-coded value.
    """
    with tf.io.gfile.GFile(test_gcs_url, "w") as gf:
        for root, _, files in os.walk(test_data_dir):
            for file in files:
                if not file.endswith(".txt"):
                    continue
                with open(os.path.join(root, file), "r", encoding="utf8") as f:
                    lines = [x.strip().strip('"') for x in f]
                if not lines:
                    # An empty file has no text to predict on.
                    continue
                gf.write(json.dumps({input_name: [lines[0]]}))
                gf.write("\n")

In [None]:
# Destination of the batch prediction input file, stored next to the training CSV.
gcs_source_test_url = f"gs://{BUCKET_NAME}/{GCS_PREFIX}/data/test.json"
upload_test_data_to_gcs(
    test_data_dir=os.path.join(data_dir, "test"), test_gcs_url=gcs_source_test_url
)

print(f"Test data content is loaded to {gcs_source_test_url}")

In [None]:
# Confirm that the batch prediction input file exists in the bucket.
!gsutil ls $gcs_source_test_url

In [None]:
# Start a batch prediction job over the uploaded test file; results go under GCS_PREFIX/batch_prediction.
# sync=False: this cell does not block; the next cell waits for the job.
batch_predict_job = model.batch_predict(
    job_display_name=f"temp_{TASK_NAME}_tf-serving",
    gcs_source=gcs_source_test_url,
    gcs_destination_prefix=f"gs://{BUCKET_NAME}/{GCS_PREFIX}/batch_prediction",
    machine_type="n1-standard-4",
    sync=False,
)

In [None]:
# Block until the batch prediction job has finished, then iterate over its outputs.
batch_predict_job.wait()
bp_iter_outputs = batch_predict_job.iter_outputs()

# Sort the output object names into error-stat files and result files by base name.
prediction_errors_stats = []
prediction_results = []
for blob in bp_iter_outputs:
    blob_base_name = blob.name.rsplit("/", 1)[-1]
    if blob_base_name.startswith("prediction.errors_stats"):
        prediction_errors_stats.append(blob.name)
    if blob_base_name.startswith("prediction.results"):
        prediction_results.append(blob.name)

In [None]:
# Collect [question, predicted tag] pairs from every result file.
tags = list()
for prediction_result in prediction_results:
    gfile_name = f"gs://{bp_iter_outputs.bucket.name}/{prediction_result}"
    with tf.io.gfile.GFile(name=gfile_name, mode="r") as gfile:
        for line in gfile.readlines():
            # Each line is a JSON object holding the submitted instance and its prediction.
            line = json.loads(line)
            # The key matches the input name used when the test file was written.
            text = line["instance"]["text_vectorization_input"][0]
            prediction = line["prediction"]
            # The index of the highest score selects the predicted class.
            class_id = np.argmax(prediction)
            class_name = class_maps[class_id]
            tags.append([text, class_name])

In [None]:
import pandas as pd

# Tabulate the predictions and preview the first rows.
tags_df = pd.DataFrame(tags, columns=["question", "tag"])
tags_df.head()

In [None]:
# Count how many test questions were assigned to each tag.
tags_df["tag"].value_counts()