# Vertex SDK Custom Training with Custom Container for PyTorch Image Classification GPU and Custom Prediction with TorchServe

## Setup

In [None]:
# Google Cloud project, staging bucket, and region used throughout this notebook.
# Edit these three values to match your own environment before running.
PROJECT_ID = "aiplatform-dev"
BUCKET_NAME = "gs://aiplatform-dev"
REGION = "us-central1"

In [None]:
# Create the Cloud Storage bucket BUCKET_NAME in REGION.
!gsutil mb -l $REGION $BUCKET_NAME

In [None]:
# List the bucket contents in long format, including all object versions.
!gsutil ls -al $BUCKET_NAME

In [None]:
# Base names reused below for container image names and Vertex AI display names.
tutorial_name_train = "pytorch-img-cls-gpu"
tutorial_name_serve = "pytorch-img-cls-torchserve"

## Local Training

In [None]:
# Show the files in the trainer directory.
!ls trainer

In [None]:
# Print the dependency list used for training.
!cat trainer/requirements.txt

In [None]:
!pip install -r trainer/requirements.txt

In [None]:
# Print the training script for reference.
!cat trainer/task.py

In [None]:
# Run the training script inside the notebook kernel, passing --epochs 5.
%run trainer/task.py \
  --epochs 5

In [None]:
# List ./tmp (presumably where the local training run wrote its output —
# TODO confirm against trainer/task.py).
!ls ./tmp

In [None]:
# Remove the local ./tmp directory.
# NOTE(review): a later cell reads ./tmp/model; skip this cleanup or re-run
# local training if those artifacts are still needed.
!rm -rf ./tmp

## Vertex SDK Custom Training using Custom Container

### Custom PyTorch Container for Training

In [None]:
# Registry host, image name, and tag for the training container image.
hostname = "gcr.io"
image_name_train = tutorial_name_train
tag = "latest"

# Fully qualified image URI: <registry host>/<project>/<image>:<tag>.
custom_container_image_uri_train = f"{hostname}/{PROJECT_ID}/{image_name_train}:{tag}"

In [None]:
# Build the training image from trainer/Dockerfile and tag it with the URI above.
!cd trainer && docker build -t $custom_container_image_uri_train -f Dockerfile .

In [None]:
# Smoke-test the training image locally; --rm removes the container on exit and
# `--epochs 5` is passed through to the container.
!docker run --rm $custom_container_image_uri_train --epochs 5

In [None]:
# Push the training image to the registry.
!docker push $custom_container_image_uri_train

In [None]:
# List the images stored in the project's repository on the registry host.
!gcloud container images list --repository $hostname/$PROJECT_ID

### Configs

In [None]:
# Print the notebook-level dependency list.
!cat requirements.txt

In [None]:
!pip install -r requirements.txt

In [None]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project, staging bucket, and region
# chosen in the setup cell, so later SDK calls do not need to repeat them.
aiplatform.init(
    project=PROJECT_ID,
    staging_bucket=BUCKET_NAME,
    location=REGION,
)

In [None]:
# Job display name and the Cloud Storage prefix that receives the job output.
display_name = tutorial_name_train
gcs_output_uri_prefix = f"{BUCKET_NAME}/{display_name}"

# Worker pool: one replica on a 32-vCPU machine with four P100 accelerators.
replica_count = 1
machine_type = "n1-standard-32"
accelerator_type = "NVIDIA_TESLA_P100"
accelerator_count = 4

# Command-line arguments forwarded to the training container.
container_args = ["--batch-size", "4", "--epochs", "5"]

### Run a CustomContainerTrainingJob

In [None]:
# Define a Vertex AI training job that runs the custom training container image.
custom_container_training_job = aiplatform.CustomContainerTrainingJob(
    display_name=display_name,
    container_uri=custom_container_image_uri_train,
)

In [None]:
# Launch the training job with the container arguments and machine spec defined
# above; output is directed to gcs_output_uri_prefix.
# NOTE(review): sync=False presumably returns before the job finishes, so the
# cells that follow may run while training is still in progress — confirm.
custom_container_training_job.run(
    args=container_args,
    base_output_dir=gcs_output_uri_prefix,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    sync=False,
)

In [None]:
# Report the job's resource name and where its output is written.
print(f'Custom Training Job Name: {custom_container_training_job.resource_name}')
print(f'GCS Output URI Prefix: {gcs_output_uri_prefix}')

### Training Artifact

In [None]:
# List the job's output prefix.
# NOTE(review): the job was launched with sync=False, so this listing may be
# empty until training has finished — confirm before relying on it.
!gsutil ls $gcs_output_uri_prefix

## Vertex SDK Custom Prediction using TorchServe

### Test Sample Image

In [None]:
# Download a sample image into the working directory as sample.jpg.
!curl -O https://raw.githubusercontent.com/alvarobartt/pytorch-model-serving/master/images/sample.jpg

In [None]:
# Confirm the sample image was downloaded.
!ls sample.jpg

In [None]:
# Run the helper script; presumably it writes sample_b64.json, which the next
# cell checks for — TODO confirm against convert_b64.py.
%run convert_b64.py

In [None]:
# Confirm the Base64 request payload file exists.
!ls sample_b64.json

### Model Archive for TorchServe

In [None]:
!cp -r ./tmp/model ./model_server/
# !gsutil cp -r $gcs_output_uri_prefix/model ./model_server/

In [None]:
# Confirm the model artifacts were staged.
!ls ./model_server/model/

In [None]:
# Package the serialized weights, model definition, handler, and label mapping
# into a TorchServe model archive named "antandbee"; -f overwrites an existing
# archive.
!cd model_server && torch-model-archiver \
     --model-name antandbee \
     --version 1.0 \
     --serialized-file ./model/antandbee.pth \
     --model-file ./model.py \
     --handler ./handler.py \
     --extra-files ./index_to_name.json \
     -f

In [None]:
# Confirm the model archive was created.
!ls model_server/antandbee.mar

### TorchServe Local Run

```
cd model_server
torchserve --model-store ./ \
  --ts-config ./config.properties \
  --models antandbee=antandbee.mar

curl http://localhost:8080/ping

curl http://127.0.0.1:8081/models/antandbee

curl -X POST \
  -H "Content-Type: application/json; charset=utf-8" \
  -d @sample_b64.json \
  http://localhost:8080/predictions/antandbee

torchserve --stop

```

In [None]:
# Remove the locally built model archive and the model_server/logs directory.
!rm model_server/antandbee.mar
!rm -rf model_server/logs

### Custom TorchServe Container for Prediction

In [None]:
# Image name and fully qualified URI for the TorchServe prediction container;
# the registry host and tag are reused from the training image cell.
image_name_serve = tutorial_name_serve
custom_container_image_uri_serve = f"{hostname}/{PROJECT_ID}/{image_name_serve}:{tag}"

In [None]:
# Build the serving image from model_server/Dockerfile and tag it with the URI above.
!cd model_server && docker build -t $custom_container_image_uri_serve -f Dockerfile .

In [None]:
# Remove the staged model copy now that the serving image has been built.
!rm -rf ./model_server/model/

In [None]:
# Start the serving container in the background (-d) under the name
# ts_antandbee, publishing ports 8080 and 8081 on the host; --rm removes the
# container once it is stopped.
# NOTE(review): -it is likely unnecessary alongside -d in a notebook — confirm.
!docker run \
    --rm -it \
    -d \
    --name ts_antandbee \
    -p 8080:8080 \
    -p 8081:8081 \
    $custom_container_image_uri_serve

In [None]:
# Health check against the container on port 8080.
# NOTE(review): this may fail if the server inside the container has not
# finished starting yet; re-run the cell after a few seconds in that case.
!curl http://localhost:8080/ping

In [None]:
# Query the status of the "antandbee" model on port 8081.
!curl http://127.0.0.1:8081/models/antandbee

In [None]:
# Send the Base64-encoded sample image to the local prediction route.
!curl -X POST \
  -H "Content-Type: application/json; charset=utf-8" \
  -d @sample_b64.json \
  localhost:8080/predictions/antandbee

In [None]:
# Stop the local serving container (it was started with --rm, so it is removed too).
!docker stop ts_antandbee

In [None]:
# Push the serving image to the registry.
!docker push $custom_container_image_uri_serve

In [None]:
# List the images stored in the project's repository on the registry host.
!gcloud container images list --repository $hostname/$PROJECT_ID

In [None]:
# model_name is used to build the prediction route below and matches the
# --model-name given to torch-model-archiver; model_display_name is the name
# shown for the uploaded Vertex AI model.
model_name = "antandbee"
model_display_name = tutorial_name_serve

In [None]:
# Register the serving container as a Vertex AI model. The port and routes
# match the ones exercised against the local container earlier (8080,
# /predictions/<model_name>, /ping).
model = aiplatform.Model.upload(
    display_name=model_display_name,
    serving_container_image_uri=custom_container_image_uri_serve,
    serving_container_ports=[8080],
    serving_container_predict_route=f"/predictions/{model_name}",
    serving_container_health_route="/ping",
)

In [None]:
# Deploy the uploaded model on n1-standard-4 machines and keep the returned
# endpoint handle for online prediction.
endpoint = model.deploy(
    machine_type="n1-standard-4",
)

In [None]:
# Display the endpoint's resource name.
endpoint.resource_name

In [None]:
import base64

def convert_b64(input_file_name):
    """Read a file and return its contents encoded as Base64 text.

    Args:
        input_file_name: Path of the file to read; it is opened in binary mode.

    Returns:
        The file's bytes, Base64-encoded and decoded to a UTF-8 ``str``.
    """
    with open(input_file_name, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [None]:
# Build one request instance carrying the Base64-encoded sample image under
# data.b64, send it to the deployed endpoint, and display the response.
image_file_name = "./sample.jpg"
instance = {"data": {"b64": convert_b64(image_file_name)}}
prediction = endpoint.predict(instances=[instance])
prediction

In [None]:
# Delete everything under the training job's output prefix in Cloud Storage.
# NOTE(review): no visible cell undeploys or deletes `endpoint` or `model`;
# presumably they must be cleaned up separately to avoid ongoing charges.
!gsutil rm -rf $gcs_output_uri_prefix

In [None]:
# Remove the downloaded sample image and its Base64 request payload.
!rm sample.jpg
!rm sample_b64.json