In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

## Overview

In this notebook, we show how to run inference with our trained deep learning recommender model using Triton Inference Server. In this example, we deploy the NVTabular workflow and the HugeCTR model together as an ensemble on Triton Inference Server. For each request, Triton Inference Server feeds the input data through the NVTabular workflow and passes the workflow's output to the HugeCTR model.

As we saw in the previous notebook, [movielens-HugeCTR](https://github.com/NVIDIA/NVTabular/blob/main/examples/inference_triton/inference-HugeCTR/movielens-HugeCTR.ipynb), NVTabular provides the function `export_hugectr_ensemble` to save the NVTabular workflow. This function saves not only the NVTabular workflow, but also the trained HugeCTR model and the ensemble model, so that all of them can be served by Triton Inference Server.

## Getting Started

Let's import required libraries.

In [2]:
# External dependencies
import os
import time
import gc

from tritonclient.utils import *
import tritonclient.grpc as httpclient

import nvtabular as nvt
import cudf
import numpy as np

### Load Models on Triton Server

At this stage, you should have already launched the Triton Inference Server docker container with the following script:

```
docker run -it --gpus=all -p 8000:8000 -p 8001:8001 -p 8002:8002 -v ${PWD}:/model nvcr.io/nvidia/merlin_inference

source activate rapids
```

After you have started the container, you can start Triton Inference Server with the command below:

```
tritonserver --model-repository=path_to_models --backend-config=hugectr,movielens=path_to_json_file --backend-config=hugectr,supportlonglong=true --model-control-mode=explicit 
```

Note: The model-repository path is `/model/models/`. The models haven't been loaded yet, so we can ask Triton Inference Server to load the saved ensemble. To do that, we first initialize a Triton client. The path for the JSON file is `/model/models/movielens/1/movielens.json`.

In [6]:
# The standalone `tritonhttpclient` package is a deprecated shim; `tritonclient.http`
# is the maintained module. The old alias is kept so the name stays available.
import tritonclient.http as tritonhttpclient

try:
    # localhost:8000 is the HTTP port published by the `docker run` command above.
    # verbose=True makes the client echo every request and response it handles.
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000", verbose=True)
except Exception as e:
    print("channel creation failed: " + str(e))
    # Stop here: without a client, the following cells would only fail later
    # with an unrelated NameError on `triton_client`.
    raise
else:
    print("client created.")

client created.


  and should_run_async(code)


In [7]:
# Liveness probe: because the client is verbose, the GET /v2/health/live exchange is echoed too.
triton_client.is_server_live() 

GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>


  and should_run_async(code)


True

In [8]:
# List the models found in the server's model repository together with their version and state.
triton_client.get_model_repository_index() 

POST /v2/repository/index, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '162'}>
bytearray(b'[{"name":"movielens","version":"1","state":"READY"},{"name":"movielens_ens","version":"1","state":"READY"},{"name":"movielens_nvt","version":"1","state":"READY"}]')


  and should_run_async(code)


[{'name': 'movielens', 'version': '1', 'state': 'READY'},
 {'name': 'movielens_ens', 'version': '1', 'state': 'READY'},
 {'name': 'movielens_nvt', 'version': '1', 'state': 'READY'}]

We load our ensemble model `movielens_ens`.

In [9]:
%%time

# The server was started with --model-control-mode=explicit, so the ensemble is
# loaded by name through the client; %%time reports how long the load takes.
triton_client.load_model(model_name='movielens_ens')

  and should_run_async(code)


POST /v2/repository/models/movielens_ens/load, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'movielens_ens'
CPU times: user 1.17 ms, sys: 6.15 ms, total: 7.32 ms
Wall time: 32.8 s


In [11]:
# Optional: silence Python warnings for the cells that follow.
# Nothing is changed when the interpreter was started with explicit -W options
# (those populate sys.warnoptions), so a user's own choice is respected.
import sys
import warnings

if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

Let's send a request to the Inference Server and print out the response. Since our example above has no continuous columns, the only inputs below are categorical columns.

In [12]:
# This request goes over gRPC (port 8001), so the module is bound to `grpcclient`
# here rather than to the misleading `httpclient` alias.
import tritonclient.grpc as grpcclient
# The remaining imports are not all used in this cell; they are kept so that any
# later cell relying on these names keeps working.
import nvtabular
import cudf
from timeit import default_timer as timer
from datetime import timedelta

model_name = 'movielens_ens'
col_names = ["movieId", "userId"]
# dtype sent to the server for each entry of `col_names`, in the same order
col_dtypes = [np.int64, np.int64]

# read in a batch of data to get transforms for
batch = cudf.read_parquet('/model/data/valid/*.parquet', num_rows=3)[col_names]
print(batch, "\n")

# convert the batch to triton inputs: one (rows, 1) array per column
columns = [(col, batch[col][0:3]) for col in col_names]
inputs = []
for (name, col), dtype in zip(columns, col_dtypes):
    # copy the column to host memory, cast it, and reshape it to a single-column 2-D array
    d = col.values_host.astype(dtype).reshape(-1, 1)
    infer_input = grpcclient.InferInput(name, d.shape, np_to_triton_dtype(dtype))
    infer_input.set_data_from_numpy(d)
    inputs.append(infer_input)

# placeholder for the output tensor requested from the ensemble
outputs = [grpcclient.InferRequestedOutput("OUTPUT0")]

# make the request; the context manager closes the gRPC channel afterwards
with grpcclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(model_name, inputs, request_id="1", outputs=outputs)

print("predicted softmax result:\n", response.as_numpy("OUTPUT0"))

   movieId  userId
0    19997   99476
1     2543  107979
2     1557  155372 

predicted softmax result:
 [0.57567745 0.5913081  0.53693664]
