In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

## Overview

In this notebook, we will show how we do inference with our trained deep learning recommender model using Triton Inference Server. In this example, we deploy the NVTabular workflow and HugeCTR model with Triton Inference Server. We deploy them as an ensemble. For each request, Triton Inference Server will feed the input data through the NVTabular workflow and its output through the HugeCTR model.

As we went through in the previous notebook, [movielens-HugeCTR](https://github.com/NVIDIA/NVTabular/blob/main/examples/inference_triton/inference-HugeCTR/movielens-HugeCTR.ipynb), NVTabular provides a function to save the NVTabular workflow via `export_hugectr_ensemble`. This function not only saves the NVTabular workflow, but also saves the trained HugeCTR model and the ensemble model to be served by Triton IS.

## Getting Started

Let's import required libraries.

In [2]:
import tritonclient.grpc as httpclient

import cudf
import numpy as np

### Load Models on Triton Server

At this stage, you should launch the Triton Inference Server docker container with the following script:

```
docker run -it --gpus=all -p 8000:8000 -p 8001:8001 -p 8002:8002 -v ${PWD}:/model nvcr.io/nvidia/merlin/merlin-inference:0.5.1
```

After you have started the container, you can start Triton Server with the command below:

```
tritonserver --model-repository=path_to_models --backend-config=hugectr,movielens=path_to_json_file --backend-config=hugectr,supportlonglong=true --model-control-mode=explicit 
```

Note: The model-repository path is `/model/models/`. The models haven't been loaded yet, so we will ask Triton Server to load the saved ensemble; to do that, we first initialize a Triton client. The path for the JSON file is `/model/models/movielens/1/movielens.json`.

In [12]:
# disable warnings
import warnings

warnings.filterwarnings("ignore")

  and should_run_async(code)


In [4]:
# `tritonhttpclient` is a deprecated top-level shim; the supported module is
# `tritonclient.http`. The old name is kept as an alias so that any cell still
# referring to `tritonhttpclient` continues to work.
import tritonclient.http as tritonhttpclient

# HTTP client used for server/model management (health check, repository
# index, explicit model loading). verbose=True echoes each request/response.
try:
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000", verbose=True)
    print("client created.")
except Exception as e:
    print("channel creation failed: " + str(e))
    # Re-raise: without a client every following cell would fail with a
    # confusing NameError (or silently reuse a stale client from an earlier run).
    raise

client created.




In [5]:
# Liveness probe: issues a request to the server's health endpoint and
# evaluates to True when the server responds as live.
triton_client.is_server_live()

GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>


  and should_run_async(code)


True

In [6]:
# List the models present in the model repository; at this point they are
# only discovered by the server, not yet loaded.
triton_client.get_model_repository_index()

POST /v2/repository/index, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '72'}>
bytearray(b'[{"name":"movielens"},{"name":"movielens_ens"},{"name":"movielens_nvt"}]')


  and should_run_async(code)


[{'name': 'movielens'}, {'name': 'movielens_ens'}, {'name': 'movielens_nvt'}]

Let's load our models to Triton Server.

In [7]:
%%time

# The server was started with --model-control-mode=explicit, so each model has
# to be loaded on request. `movielens_nvt` is presumably the NVTabular
# workflow model (inferred from its name).
triton_client.load_model(model_name="movielens_nvt")

  and should_run_async(code)


POST /v2/repository/models/movielens_nvt/load, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'movielens_nvt'
CPU times: user 2.51 ms, sys: 2.96 ms, total: 5.47 ms
Wall time: 4.12 s


In [8]:
%%time

# Load `movielens`, the model name registered with the HugeCTR backend in the
# tritonserver launch command (--backend-config=hugectr,movielens=...).
triton_client.load_model(model_name="movielens")

POST /v2/repository/models/movielens/load, headers None



  and should_run_async(code)


<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'movielens'
CPU times: user 4.64 ms, sys: 450 µs, total: 5.09 ms
Wall time: 5.06 s


Finally, we load our ensemble model `movielens_ens`.

In [9]:
%%time

# Load the ensemble model; inference requests below are sent to this model.
triton_client.load_model(model_name="movielens_ens")

POST /v2/repository/models/movielens_ens/load, headers None



  and should_run_async(code)


<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'movielens_ens'
CPU times: user 4.63 ms, sys: 438 µs, total: 5.07 ms
Wall time: 4.48 s


Let's send a request to the Inference Server and print out the response. Since our example has no continuous columns, the only inputs are the categorical columns.

In [13]:
from tritonclient.utils import np_to_triton_dtype

model_name = "movielens_ens"
col_names = ["movieId", "userId"]

# Take a small sample of the validation data to use as the request payload.
batch = cudf.read_parquet("/model/data/valid.parquet", num_rows=64)[col_names]
print(batch, "\n")

# Turn each column of the batch into one Triton input tensor.
columns = [(col, batch[col]) for col in col_names]
col_dtypes = [np.int64, np.int64]

inputs = []
for (name, col), dtype in zip(columns, col_dtypes):
    # Bring the column to host memory, cast it, and make it a (rows, 1) array.
    d = col.values_host.astype(dtype)
    d = d.reshape(len(d), 1)
    tensor = httpclient.InferInput(name, d.shape, np_to_triton_dtype(dtype))
    tensor.set_data_from_numpy(d)
    inputs.append(tensor)

# Only the ensemble's "OUTPUT0" tensor is requested back.
outputs = [httpclient.InferRequestedOutput("OUTPUT0")]

# NOTE: despite its name, `httpclient` is bound to `tritonclient.grpc` in the
# import cell, so this request goes to port 8001 rather than the HTTP port
# 8000 used by the management client above.
with httpclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(model_name, inputs, request_id="1", outputs=outputs)

print("predicted sigmoid result:\n", response.as_numpy("OUTPUT0"))

          movieId  userId
15347762   104374   99476
16647840     2634  107979
23915192     1614  155372
10052313     7153   65225
12214125      500   79161
...           ...     ...
17138306     1625  111072
21326655    81591  138575
5664631      8861   36671
217658     111759    1535
11842246   109487   76766

[64 rows x 2 columns] 

predicted sigmoid result:
 [0.7573269  0.6642067  0.5219038  0.9162213  0.58373827 0.6324592
 0.1261984  0.7433809  0.7342346  0.5113202  0.32252765 0.32908657
 0.73969156 0.81043386 0.9233688  0.63236904 0.4797384  0.75307035
 0.53202295 0.7541297  0.40705425 0.9277518  0.689459   0.72485703
 0.8788407  0.83017814 0.88228446 0.93667686 0.8267219  0.6621109
 0.86495745 0.81340396 0.2001776  0.4336695  0.7589197  0.40920126
 0.05241419 0.507262   0.86438596 0.64993507 0.8638992  0.8295686
 0.5768085  0.7233483  0.8432365  0.92196935 0.6212369  0.03016632
 0.90098035 0.9210639  0.49144918 0.18722329 0.500137   0.73863095
 0.72936064 0.8874768  0.4512655  0.