In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Tabular Workflows: TabNet Pipeline

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/official/tabular_workflows/tabnet_on_vertex_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/official/tabular_workflows/tabnet_on_vertex_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/official/tabular_workflows/tabnet_on_vertex_pipelines.ipynb">
        <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview

This notebook showcases how to run the TabNet algorithm using Vertex AI Tabular Workflows.


### Dataset

The dataset you will be using is the [Safe Driver Prediction](https://www.kaggle.com/competitions/porto-seguro-safe-driver-prediction/data?select=train.csv) dataset for predicting the probability of an auto insurance policy holder filing a claim for a given incident.

### Objective

In this tutorial, you will create two classification models using [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) downloaded from [Google Cloud Pipeline Components](https://cloud.google.com/vertex-ai/docs/pipelines/components-introduction) (GCPC). These pipelines will be Vertex AI Tabular Workflow TabNet pipelines which are maintained by Google. The tutorial first covers how to create a TabNet CustomJob pipeline, and then a HyperparameterTuningJob pipeline.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Set up your local development environment

**If you are using Colab or Vertex AI Workbench Notebooks**, your environment already meets all the requirements to run this notebook. You can skip this step.

**Otherwise**, make sure your environment meets this notebook's requirements. You need the following:

- The Google Cloud SDK
- Python 3
- virtualenv
- Jupyter notebook running in a virtual environment with Python 3

The Google Cloud guide to [Setting up a Python development environment](https://cloud.google.com/python/setup) and the [Jupyter installation guide](https://jupyter.org/install) provide detailed instructions for meeting these requirements. The following steps provide a condensed set of instructions:

1. [Install and initialize the SDK](https://cloud.google.com/sdk/docs/).

2. [Install Python 3](https://cloud.google.com/python/setup#installing_python).

3. [Install virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv) and create a virtual environment that uses Python 3.  Activate the virtual environment.

4. To install Jupyter, run `pip3 install jupyter` on the command-line in a terminal shell.

5. To launch Jupyter, run `jupyter notebook` on the command-line in a terminal shell.

6. Open this notebook in the Jupyter Notebook Dashboard.

## Install additional packages

Install the latest version of the Google Cloud Pipeline Components (GCPC) SDK.

In [None]:
# Depending on the environment, this might throw a
# pip dependency resolver error. Please ignore it.
!pip install -U google-cloud-pipeline-components -q

### Restart the kernel
Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.


**Note: Once this cell has finished running, continue on. You do not need to re-run any of the cells above.**


In [None]:
import os

# The restart is skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    # Restart the kernel so the packages installed above can be imported.
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

## Before you begin

### GPU runtime

This tutorial does not require a GPU runtime.

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [Enable the following APIs: Vertex AI APIs, Dataflow APIs, Compute Engine APIs, and Cloud Storage.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,dataflow.googleapis.com,compute_component,storage-component.googleapis.com)

4. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

5. Enter your project ID in the cell below. Then run the  cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$`.

## Notes about service account and permission

**By default no configuration is required.** If you run into any permission-related issue, make sure the service accounts below have the required roles:

|Service account email|Description|Roles|
|---|---|---|
|PROJECT_NUMBER-compute@developer.gserviceaccount.com|Compute Engine default service account|Dataflow Admin, Dataflow Worker, Storage Admin, BigQuery Admin, Vertex AI User|
|service-PROJECT_NUMBER@gcp-sa-aiplatform.iam.gserviceaccount.com|AI Platform Service Agent|Vertex AI Service Agent|


1. Go to https://console.cloud.google.com/iam-admin/iam.
2. Check the "Include Google-provided role grants" checkbox.
3. Find the above emails.
4. Grant the corresponding roles.

### Using data source from a different project
- For the BQ data source, grant both service accounts the "BigQuery Data Viewer" role.
- For the CSV data source, grant both service accounts the "Storage Object Viewer" role.


### Set your project ID

Set your project ID below. If you don't know your project ID, leave the field blank and the following cells may be able to find it. Optionally, you may also set a service account in the cell below.

In [None]:
# Google Cloud project ID used throughout the notebook. If the placeholder is
# left unchanged, the following cell tries to read the project from gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make the gcloud CLI use PROJECT_ID for the shell commands in this notebook.
! gcloud config set project $PROJECT_ID

### Region
You may change the `REGION` variable, which is used for Vertex AI operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
# Region used for the bucket and for all Vertex AI operations in this notebook.
REGION = "[your-region]"  # @param {type: "string"}

# The placeholder is not a valid location; fall back to a supported region so
# the bucket creation and `aiplatform.init` calls below do not fail.
if not REGION or REGION == "[your-region]":
    REGION = "us-central1"

### Authenticate your Google Cloud account

**If you are using Google Cloud Notebooks**, your environment is already authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

- In the Cloud Console, go to the [Create service account key](https://console.cloud.google.com/apis/credentials/serviceaccountkey) page.

- **Click Create service account**.

- In the **Service account name** field, enter a name, and click **Create**.

- In the **Grant this service account access to project** section, click the Role drop-down list. Type "Vertex" into the filter box, and select **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

- Click Create. A JSON file that contains your key downloads to your local environment.

- Enter the path to your service account key as the GOOGLE_APPLICATION_CREDENTIALS variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If you are running this notebook locally, replace the empty string below with
# the path to your service account key and run this cell to authenticate your
# GCP account. Alternatively, you may edit this notebook to authenticate using
# gcloud.
GOOGLE_APPLICATION_CREDENTIALS = ""  # @param {type:"string"}

# If on Google Cloud Notebook, then don't execute this code
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # Export the variable only when a key path was actually provided: exporting
    # an empty or placeholder value would point the Google auth libraries at a
    # key file that does not exist.
    elif not os.getenv("IS_TESTING") and GOOGLE_APPLICATION_CREDENTIALS:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

All training related files (TF model checkpoint, TensorBoard file, etc) will be saved to the GCS bucket. The pipeline will not clean up the files since some of them might be useful for you, **please make sure to clean up the files**. For easy cleanup, you can set [GCS bucket level TTL](https://cloud.google.com/storage/docs/lifecycle).

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [None]:
# Cloud Storage bucket that receives the files written by this notebook.
BUCKET_URI = "gs://[your-bucket-name]"  # @param {type:"string"}
# When True, the next cell creates a new, uniquely named bucket and overwrites
# BUCKET_URI with it.
GENERATE_BUCKET_URI = True  # @param {type:"boolean"}

Create the bucket if it doesn't already exist.

In [None]:
import uuid

if GENERATE_BUCKET_URI:
    bucket_name = "gs://test-{}".format(uuid.uuid4())
    !gsutil mb -p {PROJECT_ID} -l {REGION} {bucket_name}

    # set GCS bucket object TTL to 7 days
    !echo '{"rule":[{"action": {"type": "Delete"},"condition": {"age": 7}}]}' > gcs_lifecycle.tmp
    !gsutil lifecycle set gcs_lifecycle.tmp {bucket_name}
    !rm gcs_lifecycle.tmp

    BUCKET_URI = bucket_name
    print(f"changed BUCKET_URI to {BUCKET_URI} due to GENERATE_BUCKET_URI is True")

if BUCKET_URI == "" or BUCKET_URI is None or BUCKET_URI == "gs://[your-bucket-name]":
    BUCKET_URI = "gs://" + PROJECT_ID + "aip-" + uuid.uuid4()

! gsutil ls -b $BUCKET_URI || gsutil mb -l $DATA_REGION $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket's contents to confirm that it is accessible.
! gsutil ls -al $BUCKET_URI

## Import libraries and define constants

In [None]:
# Import required modules
import json
import os
import uuid
from typing import Any, Dict, List, Optional, Tuple

from google.cloud import aiplatform, storage
from google_cloud_pipeline_components.experimental.automl.tabular import \
    utils as automl_tabular_utils

## Initialize Vertex SDK for Python

Initialize the Vertex SDK for Python for your project.

In [None]:
# Set the project and region used by the Vertex AI SDK calls in this notebook.
aiplatform.init(project=PROJECT_ID, location=REGION)

### Define helper functions

In [None]:
def get_model_artifacts_path(task_details: List[Dict[str, Any]], task_name: str) -> str:
    """Return the URI of the unmanaged container model produced by a task.

    Args:
        task_details: Task details of a pipeline run.
        task_name: Name of the task whose model artifact is wanted.

    Returns:
        The URI of the first ``unmanaged_container_model`` output artifact of
        the matching task.
    """
    model_output = get_task_detail(task_details, task_name).outputs[
        "unmanaged_container_model"
    ]
    first_artifact = model_output.artifacts[0]
    return first_artifact.uri


def get_model_uri(task_details: List[Dict[str, Any]]) -> str:
    """Build the Cloud Console URL of the model uploaded by the pipeline.

    Args:
        task_details: Task details of a pipeline run; must contain a task named
            ``model-upload``.

    Returns:
        A console.cloud.google.com URL for the uploaded model, built from the
        notebook-level ``REGION`` and ``PROJECT_ID`` settings.
    """
    task = get_task_detail(task_details, "model-upload")
    # in format https://<location>-aiplatform.googleapis.com/v1/projects/<project_number>/locations/<location>/models/<model_id>
    model_id = task.outputs["model"].artifacts[0].uri.split("/")[-1]
    # Use the REGION / PROJECT_ID globals defined earlier in the notebook; the
    # GCP_REGION / GCP_PROJECT names used previously are never defined.
    return f"https://console.cloud.google.com/vertex-ai/locations/{REGION}/models/{model_id}?project={PROJECT_ID}"


def get_bucket_name_and_path(uri: str) -> Tuple[str, str]:
    """Split a ``gs://`` URI into its bucket name and object path.

    Args:
        uri: A Cloud Storage URI such as ``gs://bucket/dir/file.json``.

    Returns:
        A ``(bucket_name, path)`` tuple; ``path`` is empty when the URI names
        only a bucket.

    Raises:
        ValueError: If ``uri`` does not start with ``gs://``.
    """
    prefix = "gs://"
    # Reject anything else explicitly instead of blindly dropping the first
    # five characters of the input.
    if not uri.startswith(prefix):
        raise ValueError(f"Expected a URI starting with {prefix!r}, got {uri!r}")
    bucket_name, _, path = uri[len(prefix) :].partition("/")
    return bucket_name, path


def download_from_gcs(uri: str) -> bytes:
    """Download the contents of a Cloud Storage object.

    Args:
        uri: The ``gs://`` URI of the object to download.

    Returns:
        The raw object contents as bytes.
    """
    bucket_name, path = get_bucket_name_and_path(uri)
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    # download_as_string() is a deprecated alias of download_as_bytes().
    return blob.download_as_bytes()


def write_to_gcs(uri: str, content: str) -> None:
    """Upload a string to a Cloud Storage object.

    Args:
        uri: The ``gs://`` URI of the object to write.
        content: The text to store in the object.
    """
    bucket_name, path = get_bucket_name_and_path(uri)
    # Pass the project explicitly, as download_from_gcs does, so the client
    # does not depend on a default project being discoverable in the
    # environment.
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    blob.upload_from_string(content)


def get_task_detail(task_details: List[Any], task_name: str) -> Optional[Any]:
    """Find the task detail with the given name.

    Args:
        task_details: Task detail objects of a pipeline run; each one is
            expected to expose a ``task_name`` attribute.
        task_name: The task name to look for.

    Returns:
        The first task detail whose ``task_name`` equals ``task_name``, or
        ``None`` when no task matches.
    """
    return next(
        (detail for detail in task_details if detail.task_name == task_name),
        None,
    )


def get_evaluation_metrics(
    task_details: List[Dict[str, Any]],
) -> bytes:
    """Download the evaluation metrics written by the pipeline.

    Args:
        task_details: Task details of a pipeline run; must contain a task named
            ``model-evaluation``.

    Returns:
        The raw contents of the first ``evaluation_metrics`` output artifact of
        that task, as returned by ``download_from_gcs``.
    """
    evaluation_task = get_task_detail(task_details, "model-evaluation")
    return download_from_gcs(
        evaluation_task.outputs["evaluation_metrics"].artifacts[0].uri
    )

## Define training specification

### Training inputs

#### The following parameters can be set:

##### Pipeline parameters

- `project`: The GCP project that runs the pipeline components.
- `location`: The GCP region that runs the pipeline components.
- `root_dir`: The root GCS directory for the pipeline components.
- `target_column`: The target column name.
- `prediction_type`: The type of prediction the model is to produce.
  'classification' or 'regression'.
- `transform_config`: The path to a GCS file containing the transformations to apply.
- `data_source_csv_filenames`: The CSV data source.
- `data_source_bigquery_table_path`: The BigQuery data source.
- `predefined_split_key`: The predefined_split column name.
- `timestamp_split_key`: The timestamp_split column name.
- `stratified_split_key`: The stratified_split column name.
- `training_fraction`: The training fraction.
- `validation_fraction`: The validation fraction.
- `test_fraction`: The test fraction.
- `weight_column`: The weight column name.
- `stats_and_example_gen_dataflow_machine_type`: The dataflow machine type for
  stats_and_example_gen component.
- `stats_and_example_gen_dataflow_max_num_workers`: The max number of Dataflow
  workers for stats_and_example_gen component.
- `stats_and_example_gen_dataflow_disk_size_gb`: Dataflow worker's disk size in GB for stats_and_example_gen component.
- `transform_dataflow_machine_type`: The dataflow machine type for transform
  component.
- `transform_dataflow_max_num_workers`: The max number of Dataflow workers for
  transform component.
- `transform_dataflow_disk_size_gb`: Dataflow worker's disk size in GB for
  transform component.
- `training_machine_spec`: The machine spec for trainer component. See https://cloud.google.com/compute/docs/machine-types for options.
- `training_replica_count`: The replica count for the trainer component.
- `run_evaluation`: Whether to run evaluation steps during training.
- `evaluation_batch_predict_machine_type`: The prediction server machine type for batch predict components during evaluation.
- `evaluation_batch_predict_starting_replica_count`: The initial number of prediction server for batch predict components during evaluation.
- `evaluation_batch_predict_max_replica_count`: The max number of prediction server for batch predict components during evaluation.
- `evaluation_dataflow_machine_type`: The dataflow machine type for evaluation components.
- `evaluation_dataflow_max_num_workers`: The max number of Dataflow workers for evaluation components.
- `evaluation_dataflow_disk_size_gb`: Dataflow worker's disk size in GB for evaluation components.
- `dataflow_service_account`: Custom service account to run dataflow jobs.
- `dataflow_subnetwork`: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example:
https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
- `dataflow_use_public_ips`: Specifies whether Dataflow workers use public IP
  addresses.
- `encryption_spec_key_name`: The KMS key name.

##### Model hyperparameters
- `learning_rate`: The learning rate used by the linear optimizer.
- `max_steps`: Number of steps to run the trainer for.
- `max_train_secs`: Amount of time in seconds to run the trainer for.
- `large_category_dim`: Embedding dimension for categorical feature with large
  number of categories.
- `large_category_thresh`: Threshold for number of categories to apply
  large_category_dim embedding dimension to.
- `yeo_johnson_transform`: Enables trainable Yeo-Johnson power transform.
- `feature_dim`: Dimensionality of the hidden representation in feature
  transformation block.
- `feature_dim_ratio`: The ratio of output dimension (dimensionality of the
  outputs of each decision step) to feature dimension.
- `num_decision_steps`: Number of sequential decision steps.
- `relaxation_factor`: Relaxation factor that promotes the reuse of each feature at different decision steps. When it is 1, a feature is enforced to be
  used only at one decision step and as it increases, more flexibility is
  provided to use a feature at multiple decision steps.
- `decay_rate`: Learning rate decaying.
- `decay_every`: Number of iterations for periodically applying learning rate
  decaying.
- `gradient_thresh`: Threshold for the norm of gradients for clipping.
- `sparsity_loss_weight`: Weight of the loss for sparsity regularization
  (increasing it will yield more sparse feature selection).
- `batch_momentum`: Momentum in ghost batch normalization.
- `batch_size_ratio`: The ratio of virtual batch size (size of the ghost batch
  normalization) to batch size.
- `num_transformer_layers`: The number of transformer layers for each decision
  step.
- `num_transformer_layers_ratio`: The ratio of shared transformer layer to
  transformer layers.
- `class_weight`: The class weight is used to compute a weighted cross entropy, which is helpful when classifying an imbalanced dataset. Only used for classification.
- `loss_function_type`: Loss function type. Loss function in classification
  [cross_entropy, weighted_cross_entropy, focal_loss], default is
  cross_entropy. Loss function in regression: [rmse, mae, mse], default is mse.
- `alpha_focal_loss`: Alpha value (balancing factor) in focal_loss function. Only used for classification.
- `gamma_focal_loss`: Gamma value (modulating factor) for focal loss. Only used
  for classification.
- `enable_profiler`: Enables profiling and saves a trace during evaluation.
- `seed`: Seed to be used for this run.
- `eval_steps`: Number of steps to run evaluation for. If not
  specified or negative, it means run evaluation on the whole validation
  dataset. If set to 0, it means run evaluation for a fixed number of
  samples.
- `batch_size`: Batch size for training.
- `eval_frequency_secs`: Frequency at which evaluation and checkpointing will
  take place.

##### HyperparameterTuningJob-specific parameters
- `study_spec_metrics`: List of dictionaries representing metrics to optimize.
The dictionary contains the metric_id, which is reported by the training job, and the optimization goal of the metric. One of 'minimize' or 'maximize'.
- `study_spec_parameters_override`: List of dictionaries representing parameters to
optimize. The dictionary key is the parameter_id, which is passed to
training job as a command line argument, and the dictionary value is the
parameter specification of the metric.
- `max_trial_count`: The desired total number of trials.
- `parallel_trial_count`: The desired number of trials to run in parallel.
- `algorithm`: Which algorithm to train. Specify 'tabnet' here.
- `max_failed_trial_count`: The number of failed trials that need to be seen before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides how many trials must fail before the whole job fails.
- `study_spec_algorithm`: The search algorithm specified for the study. One of
'ALGORITHM_UNSPECIFIED', 'GRID_SEARCH', or 'RANDOM_SEARCH'.
- `study_spec_measurement_selection_type`: Which measurement to use if/when the
service automatically selects the final measurement from previously reported intermediate measurements. One of 'BEST_MEASUREMENT' or 'LAST_MEASUREMENT'.


### Configure dataset

In [None]:
# Training data location. This notebook reads the CSV source; the BigQuery
# source is left unset.
data_source_csv_filenames = (
    "gs://cloud-samples-data/vertex-ai/tabular-workflows/datasets/safe-driver/train.csv"
)
data_source_bigquery_table_path = None  # format: bq://bq_project.bq_dataset.bq_table

# Input feature columns of the dataset. The list does not contain the label
# column ("target"); it is used below to build the auto-transform config.
features = [
    "ps_ind_01",
    "ps_ind_02_cat",
    "ps_ind_03",
    "ps_ind_04_cat",
    "ps_ind_05_cat",
    "ps_ind_06_bin",
    "ps_ind_07_bin",
    "ps_ind_08_bin",
    "ps_ind_09_bin",
    "ps_ind_10_bin",
    "ps_ind_11_bin",
    "ps_ind_12_bin",
    "ps_ind_13_bin",
    "ps_ind_14",
    "ps_ind_15",
    "ps_ind_16_bin",
    "ps_ind_17_bin",
    "ps_ind_18_bin",
    "ps_reg_01",
    "ps_reg_02",
    "ps_reg_03",
    "ps_car_01_cat",
    "ps_car_02_cat",
    "ps_car_03_cat",
    "ps_car_04_cat",
    "ps_car_05_cat",
    "ps_car_06_cat",
    "ps_car_07_cat",
    "ps_car_08_cat",
    "ps_car_09_cat",
    "ps_car_10_cat",
    "ps_car_11_cat",
    "ps_car_11",
    "ps_car_12",
    "ps_car_13",
    "ps_car_14",
    "ps_car_15",
    "ps_calc_01",
    "ps_calc_02",
    "ps_calc_03",
    "ps_calc_04",
    "ps_calc_05",
    "ps_calc_06",
    "ps_calc_07",
    "ps_calc_08",
    "ps_calc_09",
    "ps_calc_10",
    "ps_calc_11",
    "ps_calc_12",
    "ps_calc_13",
    "ps_calc_14",
    "ps_calc_15_bin",
    "ps_calc_16_bin",
    "ps_calc_17_bin",
    "ps_calc_18_bin",
    "ps_calc_19_bin",
    "ps_calc_20_bin",
]

### Configure feature transformation

Transformations can be specified using Feature Transform Engine (FTE) specific configurations. In the following, we provide some sample transform configurations to demonstrate FTE's capabilities:
- Full auto transformations (i.e., `auto_transform_config`): FTE automatically configures a set of built-in transformations for each input column based on its data statistics.
- Fully specified transformations (i.e., `no_auto_transform_config`): All transformations on input columns are explicitly specified with FTE's built-in transformations. Chaining of multiple transformations on a single column is also supported.
- Mix of auto and explicit transformations (i.e., `mixed_transform_config`).
- Custom transformations (i.e., `transform_config_with_custom_transform`): A mixture of auto and explicit transformations and custom, bring-your-own transform function, where users can define and import their own transform function and use it with FTE's built-in transformations.

In [None]:
# Full auto: every column in `features` is listed under "auto_transforms".
auto_transform_config = {"auto_transforms": features}

# Fully specified: each listed column is given an explicit built-in transform.
no_auto_transform_config = {
    "transforms": [
        {"transform": "ZScaleTransform", "input_column_names": ["ps_reg_01"]},
        {"transform": "ZScaleTransform", "input_column_names": ["ps_reg_02"]},
        {"transform": "ZScaleTransform", "input_column_names": ["ps_reg_03"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_10_bin"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_11_bin"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_12_bin"]},
        {
            "transform": "VocabularyTransform",
            "input_column_names": ["target"],
            "output_column_names": ["target"],
        },
    ]
}

# Mixed: auto transforms for the ps_reg_* columns, explicit vocabulary
# transforms for three ps_ind_*_bin columns.
mixed_transform_config = {
    "auto_transforms": ["ps_reg_01", "ps_reg_02", "ps_reg_03"],
    "transforms": [
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_10_bin"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_11_bin"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_12_bin"]},
    ],
}

# The bare string below is not used by the code; it shows the source of the
# custom transform module referenced by `module_path` further down.
"""
$gsutil cat gs://pvnguyen-us-central1/mp_notebook/custom_transform_fn.py
import tensorflow.compat.v1 as tf


def plus_one_transform(x: tf.SparseTensor) -> tf.SparseTensor:
  return tf.SparseTensor(x.indices, tf.add(x.values, 1), x.dense_shape)
"""
# Custom: registers "PlusOneTransform" from a user-provided module under
# "modules", then uses it alongside built-in transforms on ps_reg_01.
# NOTE(review): `module_path` points at a bucket outside this notebook's
# project — confirm it is readable before using this config.
transform_config_with_custom_transform = {
    "auto_transforms": ["ps_reg_02", "ps_reg_03"],
    "modules": [
        {
            "transform": "PlusOneTransform",
            "module_path": "gs://pvnguyen-us-central1/mp_notebook/custom_transform_fn.py",
            "function_name": "plus_one_transform",
        }
    ],
    "transforms": [
        {
            "transform": "CastToFloatTransform",
            "input_column_names": ["ps_reg_01"],
            "output_column_names": ["ps_reg_01"],
        },
        {"transform": "PlusOneTransform", "input_column_names": ["ps_reg_01"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_10_bin"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_11_bin"]},
        {"transform": "VocabularyTransform", "input_column_names": ["ps_ind_12_bin"]},
    ],
}

Additional transformations to try out and their sample configurations:

* `DatetimeTransform`:
``` python
# Outputs columns with granular datetime information (year, month, day, etc.).
{
    'transform': 'DatetimeTransform',
    'input_column_names': ['feature_1'],
    'time_format': '%Y-%m-%d'  # time format of input column
}
```

* `LogTransform`:
``` python
# Outputs a column of the element-wise, natural logarithm of our input.
{
    'transform': 'LogTransform',
    'input_column_names': ['feature_1']
}
```

* `ZScaleTransform`:
``` python
# Outputs a z-scale normalized input column.
{
    'transform': 'ZScaleTransform',
    'input_column_names': ['feature_1']
}
```

* `NGramTransform`:
``` python
# Outputs a column containing the vocab lookup indices of n-grams in our
# input.
{
    'transform': 'NGramTransform',
    'input_column_names': ['feature_1'],
    'min_ngram_size': 1,  # min number of tokens in our n-gram
    'max_ngram_size': 2,  # max number of tokens in our n-gram
    'separator': ' '  # separator between tokens
  }
```
* `ClipTransform`:
``` python
# Outputs a column where all values < min_value are assigned min_value
# and all values > max_value are assigned max_value.
{
    'transform': 'ClipTransform',
    'input_column_names': ['col1'],
    'output_column_names': ['col1_clipped'],
    'min_value': 1.,
    'max_value': 10.,
}
```
* `MaxAbsScaleTransform`:
``` python
# Outputs a column where all input elements are divided by abs(max(input)).
{
    'transform': 'MaxAbsScaleTransform',
    'input_column_names': ['col1'],
    'output_column_names': ['col1_max_abs_scaled']
}
```

### Setup training configuration

In [None]:
run_evaluation = True  # @param {type:"boolean"}
prediction_type = "classification"
target_column = "target"
optimization_objective = "maximize-au-roc"

# Optional split columns; all left unset here, so the fraction split applies.
timestamp_split_key = None  # timestamp column name when using timestamp split
stratified_split_key = None  # target column name when using stratified split
predefined_split_key = None

# Fraction split
training_fraction = 0.8
validation_fraction = 0.1
test_fraction = 0.1

# The fractions are cleared when a predefined split column is configured.
if predefined_split_key:
    training_fraction = None
    validation_fraction = None
    test_fraction = None

weight_column = None

# Transform configuration used for training; any of the sample configs defined
# above can be substituted here.
transform_config = auto_transform_config

## VPC related config

If you need to use a custom Dataflow subnetwork, you can set it through the `dataflow_subnetwork` parameter. The requirements are:
1. `dataflow_subnetwork` must be fully qualified subnetwork name.
   [[reference](https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications)]
1. The following service accounts must have [Compute Network User role](https://cloud.google.com/compute/docs/access/iam#compute.networkUser) assigned on the specified dataflow subnetwork [[reference](https://cloud.google.com/dataflow/docs/guides/specifying-networks#shared)]:
    1. Compute Engine default service account: PROJECT_NUMBER-compute@developer.gserviceaccount.com
    1. Dataflow service account: service-PROJECT_NUMBER@dataflow-service-producer-prod.iam.gserviceaccount.com

If your project has VPC-SC enabled, please make sure:

1. The dataflow subnetwork used in VPC-SC is configured properly for Dataflow.
   [[reference](https://cloud.google.com/dataflow/docs/guides/routes-firewall)]
1. `dataflow_use_public_ips` is set to False.


In [None]:
# Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used.
# Fully qualified subnetwork name is in the form of
# https://www.googleapis.com/compute/v1/projects/HOST_PROJECT_ID/regions/REGION_NAME/subnetworks/SUBNETWORK_NAME
# reference: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
dataflow_subnetwork = ""  # @param {type:"string"}
# Specifies whether Dataflow workers use public IP addresses.
# Set this to False when the project has VPC-SC enabled.
dataflow_use_public_ips = True  # @param {type:"boolean"}

## Customize TabNet CustomJob configuration and create pipeline

We will create a TabNet CustomJob pipeline with the following specifications:
- Custom training machine type
- Specify the following hyperparameters: `learning_rate`, `max_steps`, `max_train_secs`

In [None]:
# Root GCS directory for the files of this pipeline run.
custom_job_root_dir = os.path.join(BUCKET_URI, "tabnet_custom_job")

# max_steps and/or max_train_secs must be set. If both are
# specified, training will stop after either condition is met.
# By default, max_train_secs is set to -1.
max_steps = 1000
max_train_secs = -1

learning_rate = 0.01  # The learning rate used by the linear optimizer.
# NOTE(review): dnn_learning_rate is not passed to the pipeline in this cell —
# confirm whether it is still needed.
dnn_learning_rate = 0.01  # The learning rate for training the deep part of the model.

training_machine_spec = {"machine_type": "c2-standard-16"}  # Override for TF chief node

# The pipeline receives the transform configuration as a GCS file path, so the
# selected config is serialized to JSON and uploaded first.
transform_config_path = os.path.join(custom_job_root_dir, "transform_config.json")
write_to_gcs(transform_config_path, json.dumps(transform_config))

# To test GPU training, the training_machine_spec can be specified like this.
# training_machine_spec = {
#     'machine_type': 'n1-highmem-32',
#     'accelerator_type': 'NVIDIA_TESLA_V100',
#     'accelerator_count': 2
# }

# If your system does not use Python, you can save the JSON file (`template_path`),
# and use another programming language to submit the pipeline.
(
    custom_job_template_path,
    custom_job_parameter_values,
) = automl_tabular_utils.get_tabnet_trainer_pipeline_and_parameters(
    project=PROJECT_ID,
    location=REGION,
    root_dir=custom_job_root_dir,
    max_steps=max_steps,
    max_train_secs=max_train_secs,
    learning_rate=learning_rate,
    target_column=target_column,
    prediction_type=prediction_type,
    transform_config=transform_config_path,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    data_source_csv_filenames=data_source_csv_filenames,
    training_machine_spec=training_machine_spec,
    dataflow_use_public_ips=dataflow_use_public_ips,
    dataflow_subnetwork=dataflow_subnetwork,
    run_evaluation=run_evaluation,
)

# A random suffix keeps the job ID unique; the same value is used as the
# display name and is needed by the next cell to look the job up again.
custom_job_id = f"automl-tabular-tabnet-{uuid.uuid4()}"
# More info on parameters PipelineJob accepts:
# https://cloud.google.com/vertex-ai/docs/pipelines/run-pipeline#create_a_pipeline_run
custom_job = aiplatform.PipelineJob(
    display_name=custom_job_id,
    template_path=custom_job_template_path,
    job_id=custom_job_id,
    pipeline_root=custom_job_root_dir,
    parameter_values=custom_job_parameter_values,
    enable_caching=False,
)

custom_job.run()

In [None]:
# Look up the TabNet trainer pipeline run by its job ID and read the
# per-task details from the underlying pipeline job resource.
tabnet_trainer_pipeline_task_details = (
    aiplatform.PipelineJob.get(custom_job_id).gca_resource.job_detail.task_details
)

# Report the model URI extracted from the task details.
trainer_model_uri = get_model_uri(tabnet_trainer_pipeline_task_details)
print("model uri:", trainer_model_uri)

# Report the model artifacts path. The second argument is handed to the helper
# unchanged; presumably it names the pipeline task to inspect (verify against
# the helper's definition earlier in the notebook).
trainer_model_artifacts = get_model_artifacts_path(
    tabnet_trainer_pipeline_task_details, "automl-tabular-tabnet-trainer"
)
print("model artifacts:", trainer_model_artifacts)

## Customize TabNet HyperparameterTuningJob configuration and create pipeline

We will create a TabNet HyperparameterTuningJob pipeline with the following specifications:
- Change training machine type

The parameter specs specified in `parameters` can be modified to tune different hyperparameters. The available parameter spec types are [double_value_spec](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/StudySpec#doublevaluespec), [integer_value_spec](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/StudySpec#integervaluespec), [categorical_value_spec](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/StudySpec#categoricalvaluespec), and [discrete_value_spec](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/StudySpec#discretevaluespec).

In [None]:
# GCS directory that holds everything for this run; it is passed both as the
# pipeline's `root_dir` parameter and as the PipelineJob's `pipeline_root`.
hpt_job_root_dir = os.path.join(BUCKET_URI, "tabnet_hyperparameter_tuning_job")

training_machine_spec = {"machine_type": "c2-standard-16"}  # Override for TF chief node

# To test GPU training, the training_machine_spec can be specified like this.
# training_machine_spec = {
#     'machine_type': 'n1-highmem-32',
#     'accelerator_type': 'NVIDIA_TESLA_V100',
#     'accelerator_count': 2
# }

# The study minimizes the "loss" metric.
metrics = [{"metric_id": "loss", "goal": "MINIMIZE"}]
# Parameter specs for the study, requested for the "small" dataset-size and
# training-budget buckets.
parameters = automl_tabular_utils.get_tabnet_study_spec_parameters_override(
    dataset_size_bucket="small",
    prediction_type=prediction_type,
    training_budget_bucket="small",
)
# max_steps and/or max_train_secs must be set. If both are
# specified, training will stop after either condition is met.
# By default, max_train_secs is set to -1 and max_steps is set to
# an appropriate range given dataset_size and training budget.

# If your system does not use Python, you can save the JSON file (`template_path`),
# and use another programming language to submit the pipeline.
# NOTE: `transform_config_path` is the file written in the CustomJob section
# above, so that cell must have been run before this one.
(
    hpt_job_template_path,
    hpt_job_parameter_values,
) = automl_tabular_utils.get_builtin_algorithm_hyperparameter_tuning_job_pipeline_and_parameters(
    project=PROJECT_ID,
    location=REGION,
    root_dir=hpt_job_root_dir,
    algorithm="tabnet",
    target_column=target_column,
    prediction_type=prediction_type,
    transform_config=transform_config_path,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    data_source_csv_filenames=data_source_csv_filenames,
    study_spec_metrics=metrics,
    study_spec_parameters_override=parameters,
    max_trial_count=1,
    parallel_trial_count=1,
    max_failed_trial_count=0,
    training_machine_spec=training_machine_spec,
    dataflow_use_public_ips=dataflow_use_public_ips,
    dataflow_subnetwork=dataflow_subnetwork,
    # Honor the notebook-level setting, as the CustomJob pipeline does,
    # instead of always forcing evaluation on.
    run_evaluation=run_evaluation,
)

# A fresh UUID suffix gives every run its own job ID; the same value is
# reused as the display name.
hpt_job_id = f"automl-tabular-tabnet-hpt-{uuid.uuid4()}"
# More info on parameters PipelineJob accepts:
# https://cloud.google.com/vertex-ai/docs/pipelines/run-pipeline#create_a_pipeline_run
hpt_job = aiplatform.PipelineJob(
    display_name=hpt_job_id,
    template_path=hpt_job_template_path,
    job_id=hpt_job_id,
    pipeline_root=hpt_job_root_dir,
    parameter_values=hpt_job_parameter_values,
    enable_caching=False,
)

hpt_job.run()

In [None]:
# Look up the TabNet hyperparameter tuning pipeline run by its job ID and read
# the per-task details from the underlying pipeline job resource.
tabnet_hpt_pipeline_task_details = (
    aiplatform.PipelineJob.get(hpt_job_id).gca_resource.job_detail.task_details
)

# Report the model URI extracted from the task details.
hpt_model_uri = get_model_uri(tabnet_hpt_pipeline_task_details)
print("model uri:", hpt_model_uri)

# Report the model artifacts path. The second argument is handed to the helper
# unchanged; presumably it names the pipeline task to inspect (verify against
# the helper's definition earlier in the notebook).
hpt_model_artifacts = get_model_artifacts_path(
    tabnet_hpt_pipeline_task_details, "get-best-hyperparameter-tuning-job-trial"
)
print("model artifacts:", hpt_model_artifacts)

## Clean up Vertex and BigQuery resources

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Cloud Storage Bucket

In [None]:
if os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI