In [12]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Define GCP Project
### Assume you already have a GCS path set up as below
<pre>
[YOUR GCS Path - gs://dlvm-dataset/bert]    
      ├──────────  checkpoint 
      │               └─── bert_tf_v1_1_large_fp16_384_v2
      │
      ├──────────  output   
      ├──────────  squad  
      ├──────────  trt_engine  
      └──────────  trt_deployment  
                      └─── bert
                             └─── 1
</pre>

This notebook demonstrates a three-step workflow: fine-tuning, TensorRT (TRT) optimization, and serving with Triton Inference Server. Users who are only interested in integrating Triton with the CAIP Prediction custom container can skip the first two steps.

Container images for the first two steps are included in the repo as well.

In [11]:
# Notebook-wide settings, exported as environment variables so that both the
# `!` shell cells and the os.getenv() calls in later cells can read them.
# Replace the bracketed placeholders with your own values before running.
%env PROJECT_ID=[YOUR GCP Project]
%env MODEL_GCS_PATH=[YOUR GCS Path] 
%env ENDPOINT=https://alpha-ml.googleapis.com/v1

env: PROJECT_ID=k80-exploration
env: MODEL_GCS_PATH=gs://dlvm-dataset/bert/trt_deployment
env: ENDPOINT=https://alpha-ml.googleapis.com/v1


# Launching NGC BERT Fine Tuning Training Job in AI Platform Training Custom Container

In [None]:
import os

# Name under which the fine-tuning job is submitted. It is exported through
# the process environment so the `!gcloud` shell cells can read it as
# $FINE_TUNE_JOB_NAME.
os.environ.update(FINE_TUNE_JOB_NAME="bert_finetuning_0001")

In [None]:
# Submit the BERT fine-tuning job to AI Platform Training as a custom-container
# job on a single n1-highmem-96 master with 8 V100 GPUs.
# Shell variable names are case-sensitive: the project is exported as
# PROJECT_ID, so the image URI must reference $PROJECT_ID. The previous
# $Project_ID was never defined and expanded to an empty string.
!gcloud ai-platform jobs submit training $FINE_TUNE_JOB_NAME \
    --master-image-uri gcr.io/$PROJECT_ID/tf_bert_gcsfuse:latest \
    --region us-central1 \
    --master-accelerator count=8,type=nvidia-tesla-v100 \
    --master-machine-type n1-highmem-96 \
    --scale-tier custom

In [None]:
# Stream the logs of the fine-tuning job named by $FINE_TUNE_JOB_NAME into the
# notebook output.
!gcloud ai-platform jobs stream-logs $FINE_TUNE_JOB_NAME 

# Launching NGC TensorRT container to Optimize TF checkpoint to TRT Engine in AI Platform Training Custom Container

In [None]:
# Name for the TensorRT engine-building job. It is exported through the
# process environment so shell cells can read it as $TRT_JOB_NAME.
os.environ.update(TRT_JOB_NAME="bert_trt_123129")

In [None]:
# Submit the TensorRT optimization job to AI Platform Training as a
# custom-container job on a single n1-highmem-8 master with one T4 GPU.
# The job name is exported as TRT_JOB_NAME and the project as PROJECT_ID.
# The previous $JOB_NAME and $Project_ID were never defined, so they expanded
# to empty strings and produced an invalid command.
!gcloud ai-platform jobs submit training $TRT_JOB_NAME \
    --master-image-uri gcr.io/$PROJECT_ID/bert_trt_gcsfuse:latest \
    --region us-central1 \
    --master-accelerator count=1,type=nvidia-tesla-t4 \
    --master-machine-type n1-highmem-8 \
    --scale-tier custom

In [None]:
# Stream the logs of the TensorRT optimization job. The job name is exported
# as TRT_JOB_NAME; $JOB_NAME is never set in this notebook.
!gcloud ai-platform jobs stream-logs $TRT_JOB_NAME

In [None]:
# Copy the INT8 TensorRT engine from trt_engine/ to
# trt_deployment/bert/1/model.plan, i.e. the bert/1 directory shown in the
# GCS layout at the top of this notebook.
# NOTE(review): both GCS paths are hard-coded to the gs://dlvm-dataset bucket;
# adjust them to match your own bucket before running.
!gsutil cp gs://dlvm-dataset/bert/trt_engine/bert_large_384_int8.engine \
    gs://dlvm-dataset/bert/trt_deployment/bert/1/model.plan

# Deploying NVIDIA Triton Inference Server in AI Platform Prediction Custom Container (REST API)

In this notebook, we will walk through the process of deploying NVIDIA's Triton Inference Server into AI Platform Prediction Custom Container service in the Direct Model Server mode:

![](img/caip_triton_container_diagram_direct.jpg)


## Create and deploy Model and Model Version

In this section, we will deploy the BERT-large QA TensorRT engine, optimized for INT8 on T4, to the AI Platform Prediction custom container.

### BERT TRT Model

#### Create Model

AI Platform Prediction uses a Model/Model Version Hierarchy, where the Model is a logical grouping of Model Versions.  We will first create the Model.

Because the MODEL_NAME variable will be used later to specify the predict route, and Triton will use that route to run prediction on a specific model, we must set the value of this variable to a valid name of a model.  For this section, we will use the "bert" model.

In [1]:
# Name of the AI Platform Prediction Model; the REST cells below read it as
# ${MODEL_NAME} when building request URLs.
%env MODEL_NAME=bert

env: MODEL_NAME=bert


In [4]:
# Create the AI Platform Prediction Model named $MODEL_NAME through the REST API.
# TLS certificate verification stays enabled (no -k/--insecure): the request
# carries an OAuth bearer token, which must not be sent to an unverified host.
!curl -X POST -H "Content-Type: application/json" \
    -d "{'name': '"$MODEL_NAME"'}" \
    -H "Authorization: Bearer `gcloud auth print-access-token`" \
    "${ENDPOINT}/projects/${PROJECT_ID}/models/"

{
  "error": {
    "code": 409,
    "message": "Field: model.name Error: A model with the same name already exists.",
    "status": "ALREADY_EXISTS",
    "details": [
      {
        "@type": "type.googleapis.com/google.rpc.BadRequest",
        "fieldViolations": [
          {
            "field": "model.name",
            "description": "A model with the same name already exists."
          }
        ]
      }
    ]
  }
}


#### Create Model Version

After the Model is created, we can now create a Model Version under this Model.  Each Model Version will need a name that is unique within the Model.  In AI Platform Prediction Custom Container, a {Project}/{Model}/{ModelVersion} uniquely identifies the specific container and model artifact used for inference.

In [5]:
# VERSION_NAME: name of the Model Version to create (must be unique within the Model).
# TRITON_MODEL_NAME: model name used to build the Triton predict and health routes.
%env VERSION_NAME=vdongm02
%env TRITON_MODEL_NAME=bert

env: VERSION_NAME=vdongm02
env: TRITON_MODEL_NAME=bert


The following specifications tell AI Platform how to create the Model Version.

In [None]:
import json
import os

# Base Triton route for the served model; the predict route appends "/infer".
triton_model_route = "/v2/models/" + os.getenv("TRITON_MODEL_NAME")

# Container section: the Triton server image from the project's registry,
# started against the model repository that AI Platform exposes through
# AIP_STORAGE_URI, listening on port 8000.
triton_container = {
    "image": "gcr.io/" + os.getenv("PROJECT_ID") + "/tritonserver:20.08-py3",
    "args": [
        "tritonserver",
        "--model-repository=$(AIP_STORAGE_URI)",
        "--strict-model-config=false",
    ],
    "env": [],
    "ports": [{"containerPort": 8000}],
}

# Full Model Version specification sent to the versions endpoint.
triton_bert_version = {
    "name": os.getenv("VERSION_NAME"),
    "deployment_uri": os.getenv("MODEL_GCS_PATH"),
    "container": triton_container,
    "routes": {
        "predict": triton_model_route + "/infer",
        "health": triton_model_route,
    },
    "machine_type": "n1-standard-4",
    "acceleratorConfig": {"count": 1, "type": "nvidia-tesla-t4"},
    "autoScaling": {"minNodes": 1},
}

# Persist the specification so the curl cell can upload it with -d @file.
with open("triton_bert_version.json", "w") as spec_file:
    spec_file.write(json.dumps(triton_bert_version))

In [8]:
# Create the Model Version by POSTing the specification written to
# triton_bert_version.json.
# TLS certificate verification stays enabled (no -k/--insecure): the request
# carries an OAuth bearer token, which must not be sent to an unverified host.
!curl -X POST -H "Content-Type: application/json" \
    -d @triton_bert_version.json \
    -H "Authorization: Bearer `gcloud auth print-access-token`" \
    "${ENDPOINT}/projects/${PROJECT_ID}/models/${MODEL_NAME}/versions"

{
  "name": "projects/k80-exploration/operations/create_bert_vdongm02-1600377033412",
  "metadata": {
    "@type": "type.googleapis.com/google.cloud.ml.v1.OperationMetadata",
    "createTime": "2020-09-17T21:10:34Z",
    "operationType": "CREATE_VERSION",
    "modelName": "projects/k80-exploration/models/bert",
    "version": {
      "name": "projects/k80-exploration/models/bert/versions/vdongm02",
      "deploymentUri": "gs://dlvm-dataset/bert/trt_deployment",
      "createTime": "2020-09-17T21:10:33Z",
      "autoScaling": {
        "minNodes": 1
      },
      "etag": "XbXqCY1HEiI=",
      "machineType": "n1-standard-4",
      "acceleratorConfig": {
        "count": "1",
        "type": "NVIDIA_TESLA_T4"
      },
      "container": {
        "image": "gcr.io/k80-exploration/tritonserver:20.08-py3",
        "args": [
          "tritonserver",
          "--model-repository=$(AIP_STORAGE_URI)",
          "--strict-model-config=false"
        ],
        "ports": [
          {
          

#### Check the status of Model Version creation

Creating a Model Version may take several minutes.  You can check the status of this specific Model Version with the following request; a successful deployment will show:

`"state": "READY"`

In [14]:
# Select the Model Version whose status is queried in the next cell.
# NOTE(review): this overrides the VERSION_NAME (vdongm02) that was set when
# creating the version above; confirm which version you intend to inspect.
%env VERSION_NAME=dongm01

env: VERSION_NAME=dongm01


In [15]:
# Fetch the Model Version named by $VERSION_NAME; its "state" field reports
# deployment progress.
# TLS certificate verification stays enabled (no -k/--insecure): the request
# carries an OAuth bearer token, which must not be sent to an unverified host.
!curl -X GET -H "Content-Type: application/json" \
    -H "Authorization: Bearer `gcloud auth print-access-token`" \
    "${ENDPOINT}/projects/${PROJECT_ID}/models/${MODEL_NAME}/versions/${VERSION_NAME}"

{
  "name": "projects/k80-exploration/models/bert/versions/dongm01",
  "deploymentUri": "gs://dlvm-dataset/bert/trt_deployment",
  "createTime": "2020-09-17T03:16:54Z",
  "lastUseTime": "2020-09-17T20:58:45Z",
  "autoScaling": {
    "minNodes": 1
  },
  "state": "READY",
  "etag": "73gEAhBaORs=",
  "machineType": "n1-standard-4",
  "acceleratorConfig": {
    "count": "1",
    "type": "NVIDIA_TESLA_T4"
  },
  "container": {
    "image": "gcr.io/k80-exploration/tritonserver:20.08-py3",
    "args": [
      "tritonserver",
      "--model-repository=$(AIP_STORAGE_URI)",
      "--strict-model-config=false"
    ],
    "ports": [
      {
        "containerPort": 8000
      }
    ]
  },
  "routes": {
    "predict": "/v2/models/bert/infer",
    "health": "/v2/models/bert"
  }
}


#### To list all Model Versions and their states in this Model:

In [None]:
# List every Model Version under $MODEL_NAME.
# TLS certificate verification stays enabled (no -k/--insecure): the request
# carries an OAuth bearer token, which must not be sent to an unverified host.
!curl -X GET -H "Content-Type: application/json" \
    -H "Authorization: Bearer `gcloud auth print-access-token`" \
    "${ENDPOINT}/projects/${PROJECT_ID}/models/${MODEL_NAME}/versions/"

#### Run Prediction

[TODO] Add a basic description of pre-processing and post-processing with BERT.

In [4]:
from utils.create_squad_data import read_squad_examples, convert_examples_to_features

import os, requests, ast
import tokenization
import tensorflow as tf

from get_request_body_bert import *

In [5]:
def init_bert_config():
    """
    Publish the BERT inference settings as module-level globals.

    Every key of the settings table below is bound as a global variable of
    this module, so later cells can refer to the values by bare name.
    Returns None.
    """
    settings = {
        # Set True for uncased model
        "do_lower_case": True,
        # Total batch size for predictions
        "predict_batch_size": 1,
        # The maximum total input sequence length after WordPiece tokenization.
        # Longer sequences are truncated and shorter ones are padded.
        "max_seq_length": 384,
        # Stride taken between chunks when a long document is split up.
        "doc_stride": 128,
        # The maximum number of tokens for the question; longer questions
        # are truncated to this length.
        "max_query_length": 64,
        # Set True for verbosity
        "verbose_logging": True,
        # Set True if the dataset has samples with no answers.
        # For SQuAD 1.1, this is set to False.
        "version_2_with_negative": False,
        # The total number of n-best predictions to generate in the
        # nbest_predictions.json output file.
        "n_best_size": 20,
        # The maximum length of an answer that can be generated. Needed
        # because the start and end predictions are not conditioned on one
        # another.
        "max_answer_length": 30,
    }

    # Equivalent to declaring each name `global` and assigning it in turn.
    globals().update(settings)

In [6]:
# Bind the BERT settings (max_seq_length, doc_stride, ...) as globals.
init_bert_config()

# Passage the question is asked about. The literal's line breaks, indentation
# and trailing spaces are part of the string, so they are kept exactly as
# authored.
tensorrt_context = """TensorRT is a high performance deep learning inference platform 
                         that delivers low latency and high throughput for apps such as 
                         recommenders, speech and image/video on NVIDIA GPUs. It includes 
                         parsers to import models, and plugins to support novel ops and 
                         layers before applying optimizations for inference. Today NVIDIA 
                         is open-sourcing parsers and plugins in TensorRT so that the deep 
                         learning community can customize and extend these components to 
                         take advantage of powerful TensorRT optimizations for your apps."""

tensorrt_question = {
    "question": "What is TensorRT?",
    "id": "Q1",
}

# Input structure: one entry holding one paragraph with a single question.
input_data = [
    {
        "paragraphs": [
            {
                "context": tensorrt_context,
                "qas": [tensorrt_question],
            }
        ]
    }
]

# Alternative example (uncomment to use it instead of the one above):
# input_data = [{"paragraphs":
#                    [{"context":
#                          """The Apollo program, also known as Project Apollo, was the third 
#                          United States human spaceflight program carried out by the National 
#                          Aeronautics and Space Administration (NASA), which accomplished 
#                          landing the first humans on the Moon from 1969 to 1972. First 
#                          conceived during Dwight D. Eisenhower's administration as a 
#                          three-man spacecraft to follow the one-man Project Mercury which 
#                          put the first Americans in space, Apollo was later dedicated to 
#                          President John F. Kennedy's national goal of landing a man on 
#                          the Moon and returning him safely to the Earth by the end of the 
#                          1960s, which he proposed in a May 25, 1961, address to Congress. 
#                          Project Mercury was followed by the two-man Project Gemini. 
#                          The first manned flight of Apollo was in 1968. Apollo ran from 
#                          1961 to 1972, and was supported by the two man Gemini program 
#                          which ran concurrently with it from 1962 to 1966. Gemini missions 
#                          developed some of the space travel techniques that were necessary 
#                          for the success of the Apollo missions. Apollo used Saturn family 
#                          rockets as launch vehicles. Apollo/Saturn vehicles were also used 
#                          for an Apollo Applications Program, which consisted of Skylab, 
#                          a space station that supported three manned missions in 1973-74, 
#                          and the Apollo-Soyuz Test Project, a joint Earth orbit mission with 
#                          the Soviet Union in 1975.""",
#                     "qas":[{
#                         "id": 'Q1', 
#                         "question":"What project put the first Americans into space?"}]}]}]

# Vocabulary file handed to the tokenizer in the next cell.
vocab_file = "vocab.txt"

In [7]:
# Tokenizer built from the vocabulary file, with the lower-casing flag taken
# from the BERT settings.
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

# Turn the input into the request payload plus the intermediate structures
# (inputs, examples, features) that post-processing needs later.
bert_request = get_bert_request_body(
    input_data,
    version_2_with_negative,
    tokenizer,
    max_seq_length,
    doc_stride,
    max_query_length,
)
request_body, inputs_dict, eval_examples, eval_features = bert_request

In [8]:
# Build the predict URL from the same environment variables the REST cells use
# (ENDPOINT, PROJECT_ID, MODEL_NAME, VERSION_NAME) rather than hard-coding one
# particular project and version. A missing variable raises KeyError naming it.
url = "{endpoint}/projects/{project}/models/{model}/versions/{version}:predict".format(
    endpoint=os.environ["ENDPOINT"],
    project=os.environ["PROJECT_ID"],
    model=os.environ["MODEL_NAME"],
    version=os.environ["VERSION_NAME"],
)

# OAuth access token obtained from the gcloud CLI (application-default
# credentials). os.popen yields an empty string when the command fails, so
# check explicitly instead of sending an empty "Bearer " header.
access_token = os.popen('gcloud auth application-default print-access-token').read().rstrip()
if not access_token:
    raise RuntimeError(
        "Could not obtain an access token from "
        "'gcloud auth application-default print-access-token'."
    )

headers = {
  'Content-Type': 'application/json',
  'Authorization': 'Bearer {}'.format(access_token)
}

In [9]:
# Send the inference request. The Response object is kept so that an HTTP
# error (4xx/5xx) is raised here instead of surfacing later as a confusing
# KeyError on the parsed body.
http_response = requests.request("POST", url,
                 headers=headers,
                 data = request_body)
http_response.raise_for_status()

# Raw response bytes, kept under the original name for any later use.
response = http_response.content

# The body is JSON, so parse it as JSON. ast.literal_eval only understands
# Python literals and fails on JSON's true/false/null.
response_data = http_response.json()

# The first output's flat "data" list is de-interleaved: even positions are
# taken as start logits, odd positions as end logits.
# NOTE(review): assumes the model emits (start, end) pairs per token — confirm.
start_logits = response_data["outputs"][0]['data'][0::2]
end_logits = response_data["outputs"][0]['data'][1::2]

In [10]:
from run_squad import get_predictions, RawResult

# Identifier of the single feature that was sent in the request.
unique_id = inputs_dict['unique_ids'][0][0]

# get_predictions takes a list of raw results; this request produced one.
all_results = [
    RawResult(
        unique_id=unique_id,
        start_logits=start_logits,
        end_logits=end_logits,
    )
]

# Convert the raw logits into answer text keyed by question id.
all_predictions, all_nbest_json, scores_diff_json = get_predictions(
    eval_examples,
    eval_features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    version_2_with_negative,
    verbose_logging,
)

# Show the first question of the input next to its predicted answer.
first_qa = input_data[0]['paragraphs'][0]['qas'][0]
qas_id = first_qa['id']
question = first_qa['question']

print(f'Question: {question}')
print(f'Answer: {all_predictions[qas_id]}')

Question: What project put the first Americans into space?
Answer: Project Mercury
