In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train a TabNet model using Vertex AI Remote Training with Vertex AI SDK 2.0

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/tabnet/sdk2_remote_tabnet_training.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/tabnet/sdk2_remote_tabnet_training.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/tabnet/sdk2_remote_tabnet_training.ipynb">
       <img src="https://www.gstatic.com/cloud/images/navigation/vertex-ai.svg" alt="Vertex AI logo"> Open in Vertex AI Workbench
    </a>
  </td>
</table>


## Overview

This tutorial demonstrates how to use Vertex AI SDK 2.0 to run a local TabNet model training job remotely on Vertex AI.

### Objective

In this tutorial, you learn to use `Vertex AI SDK 2.0` to run a local (on-prem) TabNet training job remotely on Vertex AI.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Training`
- `Vertex AI Remote Training`


The steps performed include:

- Download and split the dataset
- Ingest the data into a DataFrame and perform transformations.
- Train a tabular classification model.
- Train a tabular regression model.


**TabNet remote training**
```
from vertexai.preview.tabular_models import TabNetTrainer

vertex_ai.init(remote=True, project="my-project", location="my-location", staging_bucket="gs://my-bucket")

# Instantiate TabNetTrainer
tabnet_trainer = TabNetTrainer(...)

# Optionally set the remote training config
tabnet_trainer.fit.vertex.remote_config.staging_bucket = "gs://my-bucket"

# This `fit` call will be executed remotely
tabnet_trainer.fit(...)

# This `predict` call will be executed locally
tabnet_trainer.predict(...)
```


### Dataset

This tutorial uses the <a href="https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html">Iris dataset</a>, which is used to predict the iris species.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the following packages required to execute this notebook.

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform[preview] \
                                 scikit-learn \
                                 pyarrow \
                                 fastparquet \
                                 tensorflow

### Colab only: Uncomment the following cell to restart the kernel

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Project ID passed to gcloud, gsutil, and vertexai.init in this notebook.
# Replace the placeholder with your own project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the active one for the gcloud CLI.
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region passed to `gsutil mb -l` and to `vertexai.init(location=...)` in later cells.
REGION = "us-central1"

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage URI used as the Vertex AI staging bucket; the bucket name embeds PROJECT_ID.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket at BUCKET_URI in REGION (-l) under PROJECT_ID (-p).
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

### Set up variables

Next, set up some variables used throughout the tutorial.
### Import libraries and define constants

In [None]:
import uuid

import pandas as pd
import vertexai.preview
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from vertexai.preview.tabular_models import TabNetTrainer

## Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Point the Vertex AI SDK at the project, region, and staging bucket defined above.
vertexai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

### Prepare the dataset

Now load the Iris dataset and split the data into train, validation, and test sets.

In [None]:
dataset = load_iris()

# First split: hold out 60% of all samples as the validation set.
X, X_validation, y, y_validation = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.60,
    random_state=42,
)

# Second split: divide the remaining samples 80/20 into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Report the size of the full dataset and of each split.
for label, rows in (
    ("Data size: ", dataset.target),
    ("X_train size: ", X_train),
    ("X_validation size: ", X_validation),
    ("X_test size: ", X_test),
):
    print(label, len(rows))

## Feature transformation

Next, you ingest the training, validation, and test data into pandas DataFrames and perform feature transformations.

In [None]:
# One generic column name per feature: arg_0, arg_1, ...
columns = [f"arg_{i}" for i in range(X_train.shape[1])]


def _labeled_frame(features, labels):
    """Return a DataFrame of `features` with `labels` added as a categorical "target" column."""
    frame = pd.DataFrame(features, columns=columns)
    frame["target"] = labels
    frame["target"] = frame["target"].astype("category")
    return frame


training_data = _labeled_frame(X_train, y_train)
validation_data = _labeled_frame(X_validation, y_validation)

# The test split keeps only the feature columns; y_test is used separately for scoring.
eval_data = pd.DataFrame(X_test, columns=columns)

training_data.head()

## Tabular classification model

### Initialize TabNetTrainer

Next, initialize the TabNetTrainer for training a tabular classification model.

In [None]:
# TabNet trainer configured for classification on the "target" column,
# the label column added to the training and validation DataFrames.
tabnet_trainer = TabNetTrainer(
    model_type="classification",
    target_column="target",
    learning_rate=0.01,
    # NOTE(review): presumably a training-time limit in seconds (1800 s = 30 min) — confirm in the TabNetTrainer docs.
    max_train_secs=1800,
)

### Remote training

Now, train a TabNet tabular classification model as a remote training job.

In [None]:
# Base path inside the bucket under which remote-training artifacts are staged.
REMOTE_JOB_NAME = "test-sdk2-remote-tabnet-training"
REMOTE_JOB_BUCKET = f"{BUCKET_URI}/{REMOTE_JOB_NAME}"

# Give this run its own staging location by appending a random UUID to the base path.
staging_uri = f"{REMOTE_JOB_BUCKET}_{uuid.uuid4()}"
tabnet_trainer.fit.vertex.remote_config.staging_bucket = staging_uri

# Executes remote training
tabnet_trainer.fit(
    training_data=training_data,
    validation_data=validation_data,
)

### Local prediction

Finally, make a local prediction with the trained tabular classification model.

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the test-split features (eval_data was built from X_test).
predictions = tabnet_trainer.predict(eval_data)
# Last expression in the cell, so the notebook displays the accuracy against y_test.
accuracy_score(y_test, predictions)

## Tabular regression model

### Initialize TabNetTrainer

Next, initialize the TabNetTrainer for training a tabular regression model.

In [None]:
# Rebinds `tabnet_trainer` to a new trainer configured for regression on the
# same "target" column; the classification trainer is no longer referenced by this name.
tabnet_trainer = TabNetTrainer(
    model_type="regression",
    target_column="target",
    learning_rate=0.01,
    # NOTE(review): presumably a training-time limit in seconds (1800 s = 30 min) — confirm in the TabNetTrainer docs.
    max_train_secs=1800,
)

### Remote training

Now, train a TabNet tabular regression model as a remote training job.

In [None]:
# Stage this run under its own location: the shared base path plus a fresh random UUID.
staging_uri = f"{REMOTE_JOB_BUCKET}_{uuid.uuid4()}"
tabnet_trainer.fit.vertex.remote_config.staging_bucket = staging_uri

# Executes remote training
# NOTE(review): the "target" column of these frames was cast to the pandas
# "category" dtype earlier — confirm that is acceptable for a regression target.
tabnet_trainer.fit(
    training_data=training_data,
    validation_data=validation_data,
)

### Local prediction

Finally, make a local prediction with the trained tabular regression model.

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the test-split features with the regression model.
predictions = tabnet_trainer.predict(eval_data)

# Report the root mean squared error against y_test. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases no longer accept; the install cell upgrades scikit-learn without a
# version pin, so this cell must work on the latest release.
mean_squared_error(y_test, predictions) ** 0.5

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:
import os

# Safety switch: set to True to delete the Cloud Storage bucket at BUCKET_URI.
delete_bucket = False

# The bucket is also removed when the IS_TESTING environment variable is set to a non-empty value.
if delete_bucket or os.getenv("IS_TESTING"):
    # Recursively (-r) remove the bucket and its contents, continuing past errors (-f).
    ! gsutil rm -rf {BUCKET_URI}