In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Model Deployment with Merlin Inference API

## Overview

In the previous notebook we explained and showed how we can preprocess data with NVTabular, and train a TF MLP model using the NVTabular KerasSequenceLoader. We learned how to save a workflow, a trained TF model, and the ensemble model. In this notebook, we will show example request scripts sent to Triton Inference Server:
- to transform new/streaming data with NVTabular library
- to generate prediction results for new data from trained model 
- to deploy the end-to-end pipeline.

## Getting Started

In [2]:
# External dependencies
import os
from time import time
import warnings 

from tritonclient.utils import *
import tritonclient.grpc as grpcclient
import nvtabular
import cudf
from timeit import default_timer as timer
from datetime import timedelta

We define the base directories containing the saved models and the raw and processed data.

In [3]:
# Base directories for the Triton model repository and the input data.
# Each one can be overridden through the environment variable of the same
# purpose; otherwise the container defaults below are used.
MODEL_PATH = os.getenv('MODEL_BASE_DIR', '/model/models/')
INPUT_DATA_DIR = os.getenv('INPUT_DATA_DIR', '/model/data/')

Let's deactivate the warnings before sending requests. 

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Verify Triton Is Running Correctly

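Use Triton’s health endpoints to verify that the server is up before sending inference requests. The cell below queries the liveness endpoint (`/v2/health/live`); `is_server_ready()` and `is_model_ready(<model-name>)` can additionally be used to confirm that the server and the models are ready for inference. Replace `localhost` with your host IP address.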

In [8]:
# Create an HTTP client for the Triton server's REST endpoint.
# `tritonclient.http` is the supported module; the standalone `tritonhttpclient`
# package is a deprecated alias for it. It ships in the same `tritonclient`
# distribution as the gRPC client imported at the top of the notebook.
import tritonclient.http as httpclient

try:
    # verbose=True makes the client print each HTTP request/response it sends.
    triton_client = httpclient.InferenceServerClient(url="localhost:8000", verbose=True)
    print("client created.")
except Exception as e:
    print("channel creation failed: " + str(e))
    # Re-raise so execution stops here with the real cause, instead of the
    # next cell failing with a NameError on the undefined `triton_client`.
    raise

client created.


In [9]:
# Liveness check against the Triton server. In the recorded run below this
# issues `GET /v2/health/live`, receives status 200, and evaluates to True.
triton_client.is_server_live() 

GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>


True

The call returns `True` (HTTP status 200) if Triton is live and `False` (non-200 status) if it is not.

## Send request to Triton IS to transform raw dataset

Now we send a request to the running triton inference server using our raw validation set in parquet format. This request is going to load the saved NVTabular workflow and then transform the new dataset samples.

In [10]:
# Load the saved NVTabular workflow; its column group supplies the input and
# output column names used to build the Triton request.
MODEL_NAME_NVT = os.environ.get('MODEL_NAME_NVT', 'movielens_nvt')
MODEL_PATH_NVT = os.path.join(MODEL_PATH, MODEL_NAME_NVT)
workflow_dir = os.path.join(MODEL_PATH_NVT, "1/workflow")
workflow = nvtabular.Workflow.load(workflow_dir)

input_names = workflow.column_group.input_column_names
output_names = workflow.column_group.columns

# Read three rows of the raw validation set, restricted to the workflow's
# input columns.
raw_path = os.path.join(INPUT_DATA_DIR, "valid.parquet")
batch = cudf.read_parquet(raw_path, num_rows=3)[input_names]
print("raw data:\n", batch, "\n")

# Turn every input column into a Triton InferInput carrying an (n_rows, 1)
# int64 array copied to host memory.
columns = [(col, batch[col][0:3]) for col in input_names]
col_dtypes = [np.int64, np.int64]
inputs = []
for i, (name, col) in enumerate(columns):
    dtype = col_dtypes[i]
    d = col.values_host.astype(dtype).reshape(-1, 1)
    infer_input = grpcclient.InferInput(name, d.shape, np_to_triton_dtype(dtype))
    infer_input.set_data_from_numpy(d)
    inputs.append(infer_input)

# Request one output tensor per column produced by the workflow.
outputs = [grpcclient.InferRequestedOutput(col_name) for col_name in output_names]

# Send the request over gRPC.
# replace <localhost> with your host ip address.
with grpcclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(MODEL_NAME_NVT, inputs, request_id="1", outputs=outputs)

# Collect the returned tensors, one column each, back into a cudf DataFrame.
transformed = {}
for col_name in output_names:
    transformed[col_name] = response.as_numpy(col_name).T[0]
output = cudf.DataFrame(transformed)
print("transformed data:\n", output)

raw data:
           userId  movieId
15347762   99476   104374
16647840  107979     2634
23915192  155372     1614 

transformed data:
    userId  movieId
0   99476    19997
1  107979     2543
2  155372     1557


## Running the MovieLens rating classification example

A minimal model repository for a TensorFlow SavedModel model is:
```
  <model-repository-path>/<model-name>/
      config.pbtxt
      1/
        model.savedmodel/
           <saved-model files>
```


Let's check out our model repository layout. You can install `tree` library with  `apt-get install tree`, and then run `tree /model/models/` to print out the model repository layout as below:
```
/model/models/
├── movielens
│   ├── 1
│   └── config.pbtxt
├── movielens_nvt
│   ├── 1
│   │   ├── model.py
│   │   └── workflow
│   │       ├── categories
│   │       │   ├── unique.movieId.parquet
│   │       │   └── unique.userId.parquet
│   │       ├── metadata.json
│   │       └── workflow.pkl
│   └── config.pbtxt
└── movielens_tf
    ├── 1
    │   └── model.savedmodel
    │       ├── assets
    │       ├── saved_model.pb
    │       └── variables
    │           ├── variables.data-00000-of-00001
    │           └── variables.index
    └── config.pbtxt
 ```

You can see that we have a config.pbtxt file. Each model in a model repository must include a model configuration that provides required and optional information about the model. Typically, this configuration is provided in a `config.pbtxt` file specified as [ModelConfig protobuf](https://github.com/triton-inference-server/server/blob/r20.12/src/core/model_config.proto).

In [11]:
# Query the TensorFlow model directly with three rows read from the
# `valid/*.parquet` files.
MODEL_NAME_TF = os.environ.get('MODEL_NAME_TF', 'movielens_tf')

valid_glob = os.path.join(INPUT_DATA_DIR, "valid/*.parquet")
batch = cudf.read_parquet(valid_glob, num_rows=3)

# Keep the first three rows and drop `rating`, which is not sent to the model.
batch = batch[batch.columns][0:3].drop(columns=["rating"])

print("input data:\n", batch, "\n")

# Each remaining column becomes an (n_rows, 1) int32 input tensor named after
# the column.
inputs = []
for col in batch.columns:
    d = batch[col].values_host.astype(np.int32).reshape(-1, 1)
    infer_input = grpcclient.InferInput(col, d.shape, np_to_triton_dtype(np.int32))
    infer_input.set_data_from_numpy(d)
    inputs.append(infer_input)

# Only the `dense_3` output tensor is requested from the model.
outputs = [grpcclient.InferRequestedOutput("dense_3")]

with grpcclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(MODEL_NAME_TF, inputs, request_id="1", outputs=outputs)

print("predicted sigmoid result:\n", response.as_numpy('dense_3'))

input data:
    userId  movieId
0   99476    19997
1  107979     2543
2  155372     1557 

predicted sigmoid result:
 [[0.6250564]
 [0.6247618]
 [0.6251831]]


## END-2-END INFERENCE PIPELINE

In the request example below, we show that we can feed a raw, unprocessed parquet file and obtain the final prediction results coming from the last layer of the TF model that we built in the `movielens_TF` notebook. The output we get is a sigmoid value.

We use `InferInput` to describe the tensors we'll be sending to the server. It needs the name of the input, the shape of the tensor we'll be passing to the server, and its datatype.

## Send request to Triton IS to generate prediction results for raw dataset

In [12]:
# Send a small batch of raw (unprocessed) rows to the ensemble model and print
# the `dense_3` output it returns.
# `batch_size` replaces the previous misspelled, unused `batc_size = 64`; it
# now drives both the number of rows read and the slice, matching the three
# rows this cell has always sent.
batch_size = 3
batch = cudf.read_parquet(os.path.join(INPUT_DATA_DIR, "valid.parquet"), num_rows=batch_size, columns=['userId', 'movieId'])
batch = batch[batch.columns][0:batch_size]

print("raw data:\n", batch, "\n")

# convert the batch to triton inputs: one (n_rows, 1) int64 tensor per column
inputs = []

col_names = ['userId', 'movieId'] 
col_dtypes = [np.int64, np.int64]

for i, col in enumerate(batch.columns):
    d = batch[col].values_host.astype(col_dtypes[i])
    d = d.reshape(len(d),1)
    inputs.append(grpcclient.InferInput(col_names[i], d.shape, np_to_triton_dtype(col_dtypes[i])))
    inputs[i].set_data_from_numpy(d)

# placeholder variables for the output
outputs = [grpcclient.InferRequestedOutput("dense_3")]

MODEL_NAME_ENSEMBLE = os.environ.get('MODEL_NAME_ENSEMBLE', 'movielens')

# build a client to connect to our server. 
# This InferenceServerClient object is what we'll be using to talk to Triton.
# make the request with tritonclient.grpc.InferInput object
with grpcclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(MODEL_NAME_ENSEMBLE, inputs, request_id="1",outputs=outputs)

print("predicted sigmoid result:\n", response.as_numpy('dense_3'))

raw data:
           userId  movieId
15347762   99476   104374
16647840  107979     2634
23915192  155372     1614 

predicted sigmoid result:
 [[0.6250564]
 [0.6247618]
 [0.6251831]]


Let's send a request with a larger batch size and measure the total run time and throughput.

In [13]:
# Time an end-to-end request to the ensemble model for a larger batch of raw
# rows and report the run time and throughput.
batch_size = 64
batch = cudf.read_parquet(os.path.join(INPUT_DATA_DIR, "valid.parquet"), num_rows=batch_size, columns=['userId', 'movieId'])
batch = batch[batch.columns][0:batch_size]

print("raw data:\n", batch, "\n")

# `timer` is `timeit.default_timer`, imported at the top of the notebook. It is
# the clock intended for measuring elapsed intervals, unlike the wall-clock
# `time()`, which can jump if the system clock is adjusted.
# The measured span covers input construction, client setup and the request.
start = timer()
# convert the batch to triton inputs: one (n_rows, 1) int64 tensor per column
inputs = []

col_names = ['userId', 'movieId'] 
col_dtypes = [np.int64, np.int64]

for i, col in enumerate(batch.columns):
    d = batch[col].values_host.astype(col_dtypes[i])
    d = d.reshape(len(d),1)
    inputs.append(grpcclient.InferInput(col_names[i], d.shape, np_to_triton_dtype(col_dtypes[i])))
    inputs[i].set_data_from_numpy(d)

# placeholder variables for the output
outputs = [grpcclient.InferRequestedOutput("dense_3")]

MODEL_NAME_ENSEMBLE = os.environ.get('MODEL_NAME_ENSEMBLE', 'movielens')

# build a client to connect to our server. 
# This InferenceServerClient object is what we'll be using to talk to Triton.
# make the request with tritonclient.grpc.InferInput object
with grpcclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(MODEL_NAME_ENSEMBLE, inputs, request_id="1",outputs=outputs)

t_final = timer() - start
print("predicted sigmoid result:\n", response.as_numpy('dense_3'), "\n")

print(f"run_time(sec): {t_final} - rows: {batch_size} - inference_thru: {batch_size / t_final}")

raw data:
           userId  movieId
15347762   99476   104374
16647840  107979     2634
23915192  155372     1614
10052313   65225     7153
12214125   79161      500
...          ...      ...
17138306  111072     1625
21326655  138575    81591
5664631    36671     8861
217658      1535   111759
11842246   76766   109487

[64 rows x 2 columns] 

predicted sigmoid result:
 [[0.6250564 ]
 [0.6247618 ]
 [0.6251831 ]
 [0.6250345 ]
 [0.62659883]
 [0.62566674]
 [0.62582093]
 [0.62555826]
 [0.62475634]
 [0.62531734]
 [0.62435687]
 [0.6254294 ]
 [0.6249607 ]
 [0.6250089 ]
 [0.62605655]
 [0.6254521 ]
 [0.62492216]
 [0.6259799 ]
 [0.6250382 ]
 [0.62497985]
 [0.6252896 ]
 [0.6264592 ]
 [0.625206  ]
 [0.6248848 ]
 [0.62532496]
 [0.6252597 ]
 [0.6252666 ]
 [0.6246876 ]
 [0.62549347]
 [0.6246998 ]
 [0.6252818 ]
 [0.62472236]
 [0.62552464]
 [0.62541705]
 [0.62485063]
 [0.62516356]
 [0.6248119 ]
 [0.62541467]
 [0.6259715 ]
 [0.6251849 ]
 [0.62577355]
 [0.62549895]
 [0.6247672 ]
 [0.62464315]
 [0.62576