In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 1 : data management: get started with Vertex datasets

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage1/get_started_vertex_datasets.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/ai/platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/official/automl/ml_ops_stage1/get_started_vertex_datasets.ipynb">
      Open in Google Cloud Notebooks
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 1 : data management: get started with Vertex datasets.

### Objective

In this tutorial, you learn how to use `Vertex Dataset` for training with `Vertex AI`.

This tutorial uses the following Google Cloud ML services:

- `Vertex Datasets`
- `BigQuery Datasets`

The steps performed include:

- Create a Vertex `Dataset` resource for:
    - image data
    - text data
    - video data
    - tabular data
    - forecasting data


- Search `Dataset` resources using a filter.
- Read a sample of a `BigQuery` dataset into a dataframe.
- Generate statistics and data schema using TensorFlow Data Validation from the samples in the dataframe.
- Detect anomalies in new data using TensorFlow Data Validation.
- Generate a TFRecord feature specification using TensorFlow Transform from the data schema.
- Export a dataset and convert to TFRecords.

### Recommendations

When doing E2E MLOps on Google Cloud, use the following best practices with Vertex Datasets:

- Use CSV index file format for image data
- Use CSV index file format for text data:
    - For short text strings, embed the text string in the CSV file.
    - For long text strings, place text in referenced text file.


- Use JSON index file format for video data
- For tabular data:
    - For small datasets use CSV index file format.
    - For large datasets use BigQuery table.


- Use `filter` and `order_by` parameters in the `list()` methods to find the latest versions of datasets.

- When custom training with a `Vertex Dataset`:
    - tabular data :
        - Use the CSV index file or BigQuery table reference.
        - Create a tf.data.Dataset generator from the CSV index file/BigQuery table.
    - image/video data:
        - Export the data to a JSONL index file.
        - Using the index file, convert the images/videos and labels to TFRecords.
        - Create a tf.data.Dataset generator from the TFRecords.
    - text data:
        - If text strings are embedded:
            - Convert to CSV file.
            - Create a tf.data.Dataset generator from the CSV index file.
        - If text strings are in text files:
            - Using the JSON index file, convert the text files and labels to TFRecords.
            - Create a tf.data.Dataset from the TFRecords.

## Installation

Install the latest version of Vertex SDK for Python.

In [None]:
import os


# Google Cloud Notebook: detected by the presence of its environment metadata
# file, in which case packages are installed with pip's --user flag.
USER_FLAG = '--user' if os.path.exists("/opt/deeplearning/metadata/env_version") else ''

! pip3 install --upgrade google-cloud-aiplatform $USER_FLAG

Install the latest GA version of *TensorFlow Data Validation* library as well.

In [None]:
! pip3 install -U tensorflow-data-validation $USER_FLAG

Install the latest GA version of *TensorFlow Transform* library as well.

In [None]:
! pip3 install -U tensorflow-transform $USER_FLAG

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# A non-empty IS_TESTING environment variable disables the automatic restart.
if not os.environ.get("IS_TESTING"):
    import IPython

    # Automatically restart kernel after installs (do_shutdown(True) requests
    # a restart rather than a plain shutdown).
    IPython.Application.instance().kernel.do_shutdown(True)

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
REGION = 'us-central1'  # @param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you submit a custom training job using the Vertex SDK, you upload a Python package
containing your training code to a Cloud Storage bucket. Vertex AI runs
the code from this package. In this tutorial, Vertex AI also saves the
trained model that results from your job in the same bucket. You can then
create an `Endpoint` resource based on this output in order to serve
online predictions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
BUCKET_NAME = "gs://[your-bucket-name]"  # @param {type:"string"}

In [None]:
# When no bucket name was supplied (empty, None, or the placeholder), derive
# one from the project ID and the session timestamp.
if BUCKET_NAME in ("", None, "gs://[your-bucket-name]"):
    BUCKET_NAME = f"gs://{PROJECT_ID}aip-{TIMESTAMP}"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
! gsutil mb -l $REGION $BUCKET_NAME

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
! gsutil ls -al $BUCKET_NAME

### Set up variables

Next, set up some variables used throughout the tutorial.
### Import libraries and define constants

In [None]:
import google.cloud.aiplatform as aip

#### Import BigQuery

Import the BigQuery package into your Python environment.

In [None]:
from google.cloud import bigquery

#### Import TFDV

Import the TensorFlow Data Validation (TFDV) package into your Python environment.

In [None]:
import tensorflow_data_validation as tfdv

#### Import TensorFlow Transform

Import the TensorFlow Transform (TFT) package into your Python environment.

In [None]:
import tensorflow_transform as tft

### Initialize Vertex SDK for Python

Initialize the Vertex SDK for Python for your project and corresponding bucket.

In [None]:
aip.init(project=PROJECT_ID, location=REGION)

### Create BigQuery client

Create the BigQuery client.

In [None]:
bqclient = bigquery.Client()

## Vertex Datasets

Vertex `Datasets` are the means for managing your datasets within Vertex AI services. Vertex Datasets are also referred to as `Dataset` resources. There are five types of `Dataset` resources, specific to the data type:

- `ImageDataset`: image data
- `TabularDataset`: tabular (structured) data
- `TextDataset`: text (natural language) data
- `VideoDataset`: video data
- `TimeSeriesDataset`: forecasting data

A Vertex `Dataset` provides the following capabilities:

- A unique internal identifier for automatic (programmatic) processes.
- A user-specified identifier (display name) for interactive processes.
- Compatible with AutoML training.
- Exporting dataset for custom training.
- Dataset search capability.
- Creation/update timestamps.
- Statistics

Learn more about [All dataset documentation](https://cloud.google.com/vertex-ai/docs/datasets/datasets)

In [None]:
IMPORT_FILE = 'gs://cloud-samples-data/vision/automl_classification/flowers/all_data_v2.csv'

### Create the Dataset

Next, create the `Dataset` resource using the `create` method for the `ImageDataset` class, which takes the following parameters:

- `display_name`: The human readable name for the `Dataset` resource.
- `gcs_source`: A list of one or more dataset index files to import the data items into the `Dataset` resource.
- `import_schema_uri`: The data labeling schema for the data items:
  - `single_label`: Binary and multi-class classification
  - `multi_label`: Multi-label multi-class classification
  - `bounding_box`: Object detection
  - `image_segmentation`: Segmentation

Learn more about [ImageDataset](https://cloud.google.com/vertex-ai/docs/datasets/prepare-image).

In [None]:
# Create an image Dataset from the index file(s) in IMPORT_FILE, using the
# single-label classification import schema. The display name carries the
# session timestamp.
dataset = aip.ImageDataset.create(
    display_name=f"example_{TIMESTAMP}",
    gcs_source=[IMPORT_FILE],
    import_schema_uri=aip.schema.dataset.ioformat.image.single_label_classification,
)

# Fully qualified identifier of the new Dataset resource
print(dataset.resource_name)

In [None]:
IMPORT_FILE = 'gs://automl-video-demo-data/hmdb_split1_5classes_train_inf.csv'

### Create the Dataset

Next, create the `Dataset` resource using the `create` method for the `VideoDataset` class, which takes the following parameters:

- `display_name`: The human readable name for the `Dataset` resource.
- `gcs_source`: A list of one or more dataset index files to import the data items into the `Dataset` resource.
- `import_schema_uri`: The data labeling schema for the data items.
  - `classification`: Binary and multi-class classification
  - `object_tracking`: Object tracking
  - `action_recognition`: Action recognition

Learn more about [VideoDataset](https://cloud.google.com/vertex-ai/docs/datasets/prepare-video).

In [None]:
# Create a video Dataset from the index file(s) in IMPORT_FILE, using the
# classification import schema. The display name carries the session timestamp.
dataset = aip.VideoDataset.create(
    display_name=f"example_{TIMESTAMP}",
    gcs_source=[IMPORT_FILE],
    import_schema_uri=aip.schema.dataset.ioformat.video.classification,
)

# Fully qualified identifier of the new Dataset resource
print(dataset.resource_name)

In [None]:
IMPORT_FILE = 'gs://cloud-ml-data/NL-classification/happiness.csv'

### Create the Dataset

Next, create the `Dataset` resource using the `create` method for the `TextDataset` class, which takes the following parameters:

- `display_name`: The human readable name for the `Dataset` resource.
- `gcs_source`: A list of one or more dataset index files to import the data items into the `Dataset` resource.
- `import_schema_uri`: The data labeling schema for the data items.
  - `single_label`: Binary and multi-class classification
  - `multi_label`: Multi-label multi-class classification
  - `sentiment`: Sentiment analysis
  - `extraction`: Entity extraction

Learn more about [TextDataset](https://cloud.google.com/vertex-ai/docs/datasets/prepare-text).

In [None]:
# Create a text Dataset from the index file(s) in IMPORT_FILE, using the
# single-label classification import schema. The display name carries the
# session timestamp.
dataset = aip.TextDataset.create(
    display_name=f"example_{TIMESTAMP}",
    gcs_source=[IMPORT_FILE],
    import_schema_uri=aip.schema.dataset.ioformat.text.single_label_classification,
)

# Fully qualified identifier of the new Dataset resource
print(dataset.resource_name)

In [None]:
IMPORT_FILE = "bq://bigquery-public-data.samples.gsod"
BQ_TABLE = 'bigquery-public-data.samples.gsod'

### Create the Dataset

#### BigQuery input data

Next, create the `Dataset` resource using the `create` method for the `TabularDataset` class, which takes the following parameters:

- `display_name`: The human readable name for the `Dataset` resource.
- `bq_source`: The URI of the BigQuery table (`bq://project.dataset.table`) to import the data items from into the `Dataset` resource. Alternatively, use `gcs_source` to pass a list of one or more CSV files.

Learn more about [TabularDataset from CSV files](https://cloud.google.com/vertex-ai/docs/datasets/create-dataset-api#aiplatform_create_dataset_tabular_gcs_sample-python)

In [None]:
# Create a tabular Dataset backed by the BigQuery table in IMPORT_FILE.
# Unlike gcs_source, bq_source takes a single "bq://..." URI string rather
# than a list, so the URI is passed as-is.
dataset = aip.TabularDataset.create(
    display_name="example" + "_" + TIMESTAMP,
    bq_source=IMPORT_FILE,
)

# Fully qualified identifier of the new Dataset resource
print(dataset.resource_name)

In [None]:
IMPORT_FILE = 'gs://cloud-samples-data/ai-platform/covid/bigquery-public-covid-nyt-us-counties-train.csv'

### Create the Dataset

Next, create the `Dataset` resource using the `create` method for the `TimeSeriesDataset` class, which takes the following parameters:

- `display_name`: The human readable name for the `Dataset` resource.
- `gcs_source`: A list of one or more dataset index files to import the data items into the `Dataset` resource.
- `bq_source`: Alternatively, import data items from a BigQuery table into the `Dataset` resource.

Learn more about [TimeSeriesDataset](https://cloud.google.com/vertex-ai/docs/datasets/prepare-tabular).

In [None]:
# Create a forecasting (time series) Dataset from the CSV file(s) in
# IMPORT_FILE. The display name carries the session timestamp.
dataset = aip.TimeSeriesDataset.create(
    display_name=f"example_{TIMESTAMP}",
    gcs_source=[IMPORT_FILE],
)

# Fully qualified identifier of the new Dataset resource
print(dataset.resource_name)

## Vertex Dataset properties and methods

The following are the `Dataset` methods:

- `list()`: List instances of a `Dataset` resource.
- `import_data()`: Import additional data into a `Dataset` resource.
- `export_data()`: Export dataset index file for custom training.
- `delete()`: Delete the dataset.
- `update()`: Not implemented yet.

Get more information on each method, by executing in Python: help(method_name)

The following are the `Dataset` properties:

- `name`: The internal unique identifier.
- `resource_name`: The fully qualified internal unique identifier.
- `display_name`: The human assigned identifier.
- `create_time`: The timestamp when the dataset was created.
- `update_time`: The timestamp when the dataset was last updated.
- `metadata_schema_uri`: The data labeling schema.

### List datasets

The `list()` method returns all datasets, as a list, of the corresponding data type -- e.g., Tabular.

In [None]:
# List every tabular Dataset and print its internal identifier.
# The loop uses its own variable: iterating with `dataset` would overwrite the
# handle to the Dataset created earlier, which later cells still use.
datasets = aip.TabularDataset.list()
for tabular_dataset in datasets:
    print(tabular_dataset.name)

### List datasets matching a filter

The `list()` method supports returning only datasets that match a `filter`. For example, all datasets where the `display_name` matches the specified display name:
```
    list(filter='display_name=my_display_name')
```

When the search matches multiple datasets, you can sort the list based on the dataset properties. For example, sort by creation time in descending order, so that the first dataset is the latest:

```
    list(filter='display_name=my_display_name', order_by='create_time desc')
```

In [None]:
# Find the tabular Datasets created in this session by display name.
# order_by sorts ascending by default, so "desc" is required for the newest
# Dataset to come first and for datasets[0] to really be the latest one.
datasets = aip.TabularDataset.list(
    filter=f'display_name="example_{TIMESTAMP}"',
    order_by='create_time desc',
)
latest_dataset = datasets[0]
print(latest_dataset)

## TensorFlow Data Validation

The TensorFlow Data Validation (TFDV) package is used in conjunction with Vertex and BigQuery datasets for:

- Generating dataset statistics.
- Generating data schema for data validation.
- Detecting anomalies in new data using the data schema.
- Generating feature specifications for data conversion to TFRecords.

Learn more about [TensorFlow Data Validation: Get Started](https://www.tensorflow.org/tfx/data_validation/get_started)

### Read the BigQuery dataset into a pandas dataframe

Next, you read a sample of the dataset into a pandas dataframe using BigQuery `list_rows()` and `to_dataframe()` method, as follows:

- `list_rows()`: Performs a query on the specified table and returns a row iterator to the query results. Optionally specify:
 - `selected_fields`: Subset of fields (columns) to return.
 - `max_results`: The maximum number of rows to return. Same as SQL LIMIT command.


- `rows.to_dataframe()`: Invokes the row iterator and reads in the data into a pandas dataframe.

Learn more about [Loading BigQuery table into a dataframe](https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas)

In [None]:
# Download a table.
table = bigquery.TableReference.from_string(
    "bigquery-public-data.samples.gsod"
)

rows = bqclient.list_rows(
    table,
    max_results=500,
    selected_fields=[
        bigquery.SchemaField("station_number", "STRING"),
        bigquery.SchemaField("year", "INTEGER"),
        bigquery.SchemaField("month", "INTEGER"),
        bigquery.SchemaField("day", "INTEGER"),
        bigquery.SchemaField("mean_temp", "FLOAT"),
    ]

)

dataframe = rows.to_dataframe()
print(dataframe.head())

###  Generate dataset statistics

#### Dataframe input data

Generate statistics on the dataset with the TensorFlow Data Validation (TFDV) package. Use the `generate_statistics_from_dataframe()` method, with the following parameters:

- `dataframe`: The dataset in an in-memory pandas dataframe.
- `stats_options`: The selected statistics options:
  - `label_feature`: The column which is the label to predict.
  - `sample_rate`: The sampling rate. If specified, statistics is computed over the sample.
  - `num_top_values`: number of most frequent feature values to keep for string features.

Learn about [TensorFlow Data Validation (TFDV)](https://www.tensorflow.org/tfx/data_validation/get_started).

In [None]:
# Statistics options: label column, sampling rate, and how many of the most
# frequent values to keep for string features.
stats_options = tfdv.StatsOptions(
    label_feature='mean_temp',
    sample_rate=1,
    num_top_values=50,
)

# Compute the statistics over the in-memory dataframe
stats = tfdv.generate_statistics_from_dataframe(
    dataframe=dataframe,
    stats_options=stats_options,
)

print(stats)

### Visualize dataset statistics

A visualization of the dataset statistics can be displayed using the TFDV `visualize_statistics()` method.

In [None]:
tfdv.visualize_statistics(stats)

###  Generate the raw data schema

Generate the data schema on the dataset with the TensorFlow Data Validation (TFDV) package. Use the `infer_schema()` method, with the following parameters:

- `statistics`: The statistics generated by TFDV.

In [None]:
schema = tfdv.infer_schema(statistics=stats)
print(schema)

### Detect anomalies in additional data

When additional data is available for a dataset, you can check for anomalies between the new and previous data using the TFDV `validate_statistics` method.

The accompanying code example mimics new data by getting the next slice of data (2nd 500 rows) and including an additional field into a dataframe and generating statistics for the new data.

The `validate_statistics()` method is called with the following parameters:

- `statistics`: The statistics for the new data.
- `schema`: The data schema for the previous data.

In [None]:
# Download a table.
table = bigquery.TableReference.from_string(
    "bigquery-public-data.samples.gsod"
)

rows = bqclient.list_rows(
    table,
    max_results=500,
    start_index=500,
    selected_fields=[
        bigquery.SchemaField("station_number", "STRING"),
        bigquery.SchemaField("year", "INTEGER"),
        bigquery.SchemaField("month", "INTEGER"),
        bigquery.SchemaField("day", "INTEGER"),
        bigquery.SchemaField("mean_temp", "FLOAT"),
        bigquery.SchemaField("num_mean_temp_samples", "INTEGER"),
    ]

)

dataframe = rows.to_dataframe()

new_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=dataframe,
    stats_options=tfdv.StatsOptions(
        label_feature='mean_temp',
        sample_rate=1,
        num_top_values=50
    )
)

anomalies = tfdv.validate_statistics(statistics=new_stats, schema=schema)
print(anomalies.anomaly_info)

### Generate the feature specification

Generate the feature specification, compatible with TFRecords, on the dataset with the TensorFlow Transform (TFT) package. Use the `schema_as_feature_spec()` method, with the following parameters:

- `schema`: The data schema generated by TFDV.

In [None]:
# Convert the TFDV data schema into a feature specification
schema_utils = tft.tf_metadata.schema_utils
feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec

print(feature_spec)

### Export `Text Dataset` to pandas dataframe.

The property `gca_resource.metadata['inputConfig']['gcsSource']['uri']` contains the list of the one or more imported CSV files.

To create a dataframe from multiple CSV sources, you read each CSV file and concatenate the dataframes together.

In [None]:
# pandas is not imported anywhere else in this notebook, so import it here.
import pandas as pd

# The CSV file(s) that were imported into the Dataset
all_files = dataset.gca_resource.metadata['inputConfig']['gcsSource']['uri']

# Read each CSV file and concatenate the results into one dataframe
df = pd.concat(pd.read_csv(f) for f in all_files)

# Call head() -- printing `df.head` would only show the bound method
print(df.head())

### Export dataset index

Next, you export the dataset index to a JSONL file which will then be used by your custom training job to get the data and corresponding labels for training your model. The JSONL index file is generated by calling the export_data() method, with the following parameters:
 - `output_dir`: The Cloud Storage bucket to write the JSONL dataset index file to.

In [None]:
# Find the image Datasets created in this session by display name.
# order_by sorts ascending by default, so "desc" is required for the newest
# Dataset to come first and for datasets[0] to really be the latest one.
datasets = aip.ImageDataset.list(
    filter=f'display_name="example_{TIMESTAMP}"',
    order_by='create_time desc',
)
latest_dataset = datasets[0]
print(latest_dataset)

In [None]:
dataset = latest_dataset

In [None]:
# Export the dataset index to the `exported` folder of the Cloud Storage
# bucket; the value returned by export_data() is kept for the following cells.
EXPORTED_DIR = f'{BUCKET_NAME}/exported'
exported_files = dataset.export_data(output_dir=EXPORTED_DIR)

# List what was written to the export location
! gsutil ls $EXPORTED_DIR

#### Quick peek at your exported dataset index file

Take a quick peek at the contents of the exported dataset index file. When `export_data()` completed, the method returned a list of the paths to the exported dataset index files.

You get the path to the exported dataset index file (`exported_files[0]`) and then display the first ten JSON objects in the file -- i.e., data items.

The JSONL format for each data item is:

    { "imageGcsUri": path_to_the_image, "classificationAnnotation": { "displayName": label } }

In [None]:
jsonl_index = exported_files[0]

! gsutil cat $jsonl_index | head

#### Reading the index file

You will need to add code to your custom training Python script to read the exported dataset index, so that you can generate training batches for custom training your model.

Below is an example of how you might read the exported dataset index file:

1. Use Tensorflow's Cloud Storage file methods to open the file (`tf.io.gfile.GFile()`) and read all the lines (`f.readlines()`), where each line is a data item represented as a JSONL object.

2. For each line in the file, convert the line to a JSON object (`json.loads()`).

3. Extract the path to the image (`['imageGcsUri']`) and label (`['classificationAnnotation']['displayName']`).

In [None]:
import json

import tensorflow as tf

# Read all lines of the exported index; each line is one JSON-encoded data item
with tf.io.gfile.GFile(jsonl_index, 'r') as f:
    export_data_items = f.readlines()

# Show at most the first ten data items. Slicing (instead of indexing 0..9)
# keeps this working when the index holds fewer than ten entries.
for line in export_data_items[:10]:
    j = json.loads(line)
    print(j['imageGcsUri'], j['classificationAnnotation']['displayName'])

#### Create TFRecords

Next, you need to create a feeder mechanism to feed data from the dataset index file to the model you will train. There are many ways to construct a feeder. This tutorial covers two options, both using TFRecords:

    1. Storing the image data as raw uncompressed image data (1 byte per pixel).
    2. Storing the image data as preprocessed data -- machine learning ready --  (4 bytes per pixel).

These two methods demonstrate a trade-off between disk storage and compute time. In both cases, we do a prepass over the image data to cache the data into a form that will accelerate the training time for the model. But, by caching we are both using disk space and increasing I/O traffic from the disk to the compute device -- e.g., CPU, GPU, TPU.

In the raw uncompressed format, you are minimizing the size on disk and I/O traffic for the cache data, but have the overhead that on each epoch, the preprocessing of the image data has to be repeated. In the preprocessed format, you are minimizing the compute time by preprocessing once and caching the preprocessed data -- i.e., machine learning ready. The amount of data on disk will be four times the size when training as Float32, and you are increasing by the same amount disk space and I/O traffic from the disk to the compute engine.

The helper functions `TFExampleImageUncompressed` and `TFExampleImagePreprocessed` both take the parameters:

- `path`: The Cloud Storage path to the image file.
- `label`: The corresponding label for the image file.
- `shape`: The (H,W) input shape to resize the image. If `None`, no resizing occurs.

The helper function `TFExampleImagePreprocessed` has an additional parameter:

- `dtype`: The floating point representation after the pixel data has been normalized. By default, it is set to 32-bit float (np.float32). If you are using NVIDIA GPUs or TPUs you can alternatively train in 16-bit float, by setting `dtype = np.float16`. There are two benefits to training with 16-bit float, when it does not affect the accuracy or number of epochs:

    1. Each matrix multiply operation is 4 times faster than the 32-bit equivalent -- albeit the model weights need to be stored as 16-bit as well.
    2. The disk space and I/O bandwidth is reduced by 1/2.

Let's look a bit deeper into the functions for creating `TFRecord` training data. First, `TFRecord` is a serialized binary encoding of the training data. As an encoding, one needs to specify a schema for how the fields are encoded, which is then used later to decode the data when feeding training data to your model.

The schema is defined as an instance of `tf.train.Example` per data item in the training data. Each instance of `tf.train.Example` consists of a sequence of fields, each defined as a key/value pair. In the helper functions, the key entries are:

- `image`: The encoded raw image data.
- `label`: The label assigned to the image.
- `shape`: The shape of the image when decoded.

The value for each key/value pair is an instance of `tf.train.Feature`, where:

- `bytes_list`: the data to encode is a byte string.
- `int64_list`: the data to encode is an array of one or more integer values.
- `float_list`: the data to encode is an array of one or more floating point values.

In [None]:
import numpy as np


def TFExampleImageUncompressed(path, label, shape=None):
    """Encode an image as raw (decoded, uncompressed) 8-bit pixel data.

    Args:
        path: The Cloud Storage path to the image file.
        label: The integer label for the image.
        shape: The (H, W) size to resize the image to. If None, no resizing
            occurs.

    Returns:
        A tf.train.Example with the features:
            image: the raw pixel bytes of the decoded image,
            label: the label,
            shape: the first three dimensions of the encoded image.
    """

    # read in (and uncompress) the image
    with tf.io.gfile.GFile(path, 'rb') as f:
        data = f.read()
    image = tf.io.decode_image(data)

    if shape:
        # tf.image.resize() returns float32 pixel values. Round and cast back
        # to uint8 so the record keeps the same raw 8-bit layout whether or
        # not the image was resized.
        image = tf.image.resize(image, shape)
        image = tf.saturate_cast(tf.round(image), tf.uint8)
    image = image.numpy()
    shape = image.shape

    # make the record
    # (ndarray.tobytes() replaces the deprecated ndarray.tostring() alias)
    return tf.train.Example(features=tf.train.Features(feature={
        'image': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[image.tobytes()])),
        'label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[label])),
        'shape': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[shape[0], shape[1], shape[2]])),
    }))

def TFExampleImagePreprocessed(path, label, shape=None, dtype=np.float32):
    """Encode an image as normalized floating point pixel data.

    The image is read with TensorFlow's file and image ops, the same way as in
    TFExampleImageUncompressed. The previous implementation called
    cv2.imread(), which is not imported in this notebook and cannot open
    Cloud Storage (gs://) paths.

    Args:
        path: The Cloud Storage path to the image file.
        label: The integer label for the image.
        shape: The (H, W) size to resize the image to. If None, no resizing
            occurs.
        dtype: The floating point type of the stored pixel data.

    Returns:
        A tf.train.Example with the features:
            image: the bytes of the pixel data, scaled by 1/255 and stored
                as `dtype`,
            label: the label,
            shape: the first three dimensions of the encoded image.
    """

    # read in (uncompress) the image and normalize the pixel data
    with tf.io.gfile.GFile(path, 'rb') as f:
        data = f.read()
    image = tf.cast(tf.io.decode_image(data), tf.float32) / 255.0

    if shape:
        image = tf.image.resize(image, shape)

    # Convert to NumPy on both paths (resized or not) and apply the requested
    # dtype last, because tf.image.resize() returns float32.
    image = image.numpy().astype(dtype)
    shape = image.shape

    # make the record
    # (ndarray.tobytes() replaces the deprecated ndarray.tostring() alias)
    return tf.train.Example(features=tf.train.Features(feature={
        'image': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[image.tobytes()])),
        'label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[label])),
        'shape': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[shape[0], shape[1], shape[2]])),
    }))

#### Write training data to TFRecord file

Next, you will create a single TFRecord for all the training data specified in the exported dataset index:

- Specify the cache method by setting the variable `CACHE` to either `TFExampleImageUncompressed` or `TFExampleImagePreprocessed`.
- Convert class names from the dataset to integer labels, using `cls2label`.
- Read in the data item list from the exported dataset index file -- `tf.io.gfile.GFile(jsonl_index, 'r')`.
- Set the Cloud Storage location to store the cached TFRecord file -- `GCS_TFRECORD_URI`.
- Generate the cached data using `tf.io.TFRecordWriter(gcs_tfrecord_uri)` for each data item in the exported dataset index.
 - Extract the Cloud Storage path and class name - `json.loads(data_item)`
 - Convert class name to integer label - `label = cls2label[cls]`
 - Encode the data item - `example = CACHE(image, label)`
 - Write the encoded data item to the TFRecord file - `writer.write(example.SerializeToString())`

This may take about 20 minutes.

In [None]:
# Select TFRecord method of encoding
CACHE = TFExampleImageUncompressed  # [ TFExampleImageUncompressed, TFExampleImagePreprocessed]

# Map class names to integer labels (position in the tuple is the label)
cls2label = {
    class_name: class_index
    for class_index, class_name in enumerate(
        ('daisy', 'dandelion', 'roses', 'sunflowers', 'tulips')
    )
}

# Read in each example from exported dataset index
with tf.io.gfile.GFile(jsonl_index, 'r') as f:
    data = f.readlines()

# The path to the TFRecord cached file.
GCS_TFRECORD_URI = f'{BUCKET_NAME}/flowers.tfrecord'

# Create the TFRecord cached file
with tf.io.TFRecordWriter(GCS_TFRECORD_URI) as writer:
    n = 0
    for n, data_item in enumerate(data, start=1):
        j = json.loads(data_item)
        image = j['imageGcsUri']
        cls = j['classificationAnnotation']['displayName']
        label = cls2label[cls]
        # Encode the data item (resized to 128x128) and append it to the file
        example = CACHE(image, label, shape=(128, 128))
        writer.write(example.SerializeToString())
        # Report progress every ten items
        if n % 10 == 0:
            print(n, image)

listing = ! gsutil ls -la $GCS_TFRECORD_URI
print("TFRecord File", listing)

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Dataset
- Pipeline
- Model
- Endpoint
- AutoML Training Job
- Batch Job
- Custom Job
- Hyperparameter Tuning Job
- Cloud Storage Bucket

In [None]:
delete_all = True

if delete_all:
    # Delete the dataset using the Vertex dataset object
    try:
        if 'dataset' in globals():
            dataset.delete()
    except Exception as e:
        print(e)

    # Delete the model using the Vertex model object
    try:
        if 'model' in globals():
            model.delete()
    except Exception as e:
        print(e)

    # Delete the endpoint using the Vertex endpoint object
    try:
        if 'endpoint' in globals():
            endpoint.delete()
    except Exception as e:
        print(e)

    # Delete the AutoML or Pipeline trainig job
    try:
        if 'dag' in globals():
            dag.delete()
    except Exception as e:
        print(e)

    # Delete the custom trainig job
    try:
        if 'job' in globals():
            job.delete()
    except Exception as e:
        print(e)

    # Delete the batch prediction job using the Vertex batch prediction object
    try:
        if 'batch_predict_job' in globals():
            batch_predict_job.delete()
    except Exception as e:
        print(e)

    # Delete the hyperparameter tuning job using the Vertex hyperparameter tuning object
    try:
        if 'hpt_job' in globals():
            hpt_job.delete()
    except Exception as e:
        print(e)

    if 'BUCKET_NAME' in globals():
        ! gsutil rm -r $BUCKET_NAME