In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

The last step is to deploy the ETL workflow and saved model to production. In the production setting, we want to transform the input data as during training (ETL). We need to apply the same mean/std for continuous features and use the same categorical mapping to convert the categories to continuous integer before we use the deep learning model for a prediction. Therefore, we deploy the NVTabular workflow with the HugeCTR model as an ensemble model to Triton Inference. The ensemble model garantuees that the same transformation are applied to the raw inputs.

## Learning objectives
In this notebook, we learn how to deploy our models to production

- Use NVTabular to generate config and model files for Triton Inference Server
- Deploy an ensemble of NVTabular workflow and HugeCTR model
- Send example request to Triton Inference Server

## Getting Started

Before we get started, you should launch the Triton Inference Server docker container with the following script. This script will mount your local `model-repository` folder where you stored the models to `/model` into the `merlin-inference` docker container.

```
docker run -it --gpus=all -p 8000:8000 -p 8001:8001 -p 8002:8002 -v ${PWD}:/model nvcr.io/nvidia/merlin/merlin-inference:0.5
```
Once the container launches, activate the `merlin` environment:
```
source activate merlin
```


### Saving Ensemble Model for Triton Inference Server¶

First, we need to generate the Triton Inference Server configurations and save the models in the correct format. In the previous notebooks [02-ETL-with-NVTabular](https://github.com/NVIDIA/NVTabular/blob/main/examples/getting-started-movielens/02-ETL-with-NVTabular.ipynb) and [03c-Training-with-HugeCTR](https://github.com/NVIDIA/NVTabular/blob/main/examples/getting-started-movielens/03a-Training-with-HugeCTR.ipynb) we saved the NVTabular workflow and HugeCTR model to disk. We will load them.

After training terminates, we can see that two `.model` files are generated. We need to move them inside `1` folder under the `movielens_hugectr` folder. Let's create these folders first.

In [2]:
import os

In [3]:
os.system('mkdir -p /model/movielens_hugectr/1')

0

Now we move our saved .model files inside 1 folder. We use only the last snapshot after 1900 iterations.

In [4]:
os.system('mv *.model /model/movielens_hugectr/1/')

256

Note that these stored .model files will be used in the inference. Now we have to create a JSON file for inference which has a similar configuration as our training file. We should remove the solver and optimizer clauses and add the inference clause in the JSON file. The paths of the stored dense model and sparse model(s) should be specified at dense_model_file and sparse_model_file within the inference clause. We need to make some modifications to data in the layers clause. Besides, we need to change the last layer from BinaryCrossEntropyLoss to Sigmoid. The rest of "layers" should be exactly the same as that in the training model.py file.

Now let's create a movielens.json file inside the movielens/1 folder. We have already retrieved the cardinality of each categorical column using get_embedding_sizes function above. We will use these cardinalities below in the movielens.json file as well.



In [5]:
%%writefile '/model/movielens_hugectr/1/movielens.json'

{
   "inference": {
    "max_batchsize": 64,
    "hit_rate_threshold": 0.6,
    "dense_model_file": "/model/models/movielens/1/_dense_1900.model",
    "sparse_model_file": "/model/models/movielens/1/0_sparse_1900.model",
    "label": 1,
    "input_key_type": "I64"
  },
  "layers": [
    {
      "name": "data",
      "type": "Data",
      "format": "Parquet",
      "slot_size_array": [56586, 162542],
      "source": "/model/data/train/_file_list.txt",
      "eval_source": "/model/data/valid/_file_list.txt",
      "check": "Sum",
      "label": {
        "top": "label",
        "label_dim": 1
      },
      "dense": {
        "top": "dense",
        "dense_dim": 0
      },
      "sparse": [
        {
          "top": "data1",
          "type": "DistributedSlot",
          "max_feature_num_per_sample": 3,
          "slot_num": 2
        }
      ]
    },
    {
      "name": "sparse_embedding1",
      "type": "DistributedSlotSparseEmbeddingHash",
      "bottom": "data1",
      "top": "sparse_embedding1",
      "sparse_embedding_hparam": {
        "max_vocabulary_size_per_gpu": 219128,
        "embedding_vec_size": 16,
        "combiner": 0
      }
    },
    {
      "name": "reshape1",
      "type": "Reshape",
      "bottom": "sparse_embedding1",
      "top": "reshape1",
      "leading_dim": 32
    },
    {
      "name": "fc1",
      "type": "InnerProduct",
      "bottom": "reshape1",
      "top": "fc1",
      "fc_param": {
        "num_output": 128
      }
    },
    {
      "name": "relu1",
      "type": "ReLU",
      "bottom": "fc1",
      "top": "relu1"
    },
    {
      "name": "fc2",
      "type": "InnerProduct",
      "bottom": "relu1",
      "top": "fc2",
      "fc_param": {
        "num_output": 128
      }
    },
    {
      "name": "relu2",
      "type": "ReLU",
      "bottom": "fc2",
      "top": "relu2"
    },
    {
      "name": "fc3",
      "type": "InnerProduct",
      "bottom": "relu2",
      "top": "fc3",
      "fc_param": {
        "num_output": 1
      }
    },
    {
      "name": "sigmoid",
      "type": "Sigmoid",
      "bottom": "fc3",
      "top": "sigmoid"
    } 
  ]
}

Overwriting /model/movielens_hugectr/1/movielens.json


Now we can save our models to be deployed at the inference stage. To do so we will use `export_hugectr_ensemble` method below. With this method, we can generate the `config.pbtxt` files automatically for each model. In doing so, we should also create a `hugectr_params` dictionary, and define the parameters  like where the `movielens.json` file will be read, `slots` which corresponds to number of categorical features, `embedding_vector_size`, `max_nnz`, and `n_outputs` which is number of outputs.

The script below creates an ensemble triton server model where  

- `workflow` is the the nvtabular workflow used in preprocessing, 
- `hugectr_model_path` is the HugeCTR model that should be served. This path includes the `.model` files.
- `name` is the base name of the various triton models
- `output_path` is the path where is model will be saved to.
- `cats` are the categorical column names
- `label_columns` are the label column names

In [8]:
import nvtabular as nvt

# path to store raw and preprocessed data
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR', os.path.expanduser("~/nvt-examples/movielens/data/"))
# path to save models
MODEL_BASE_DIR = os.environ.get('MODEL_BASE_DIR', '/model/')

workflow = nvt.Workflow.load(os.path.join(INPUT_DATA_DIR, "workflow"))

In [9]:
CATEGORICAL_COLUMNS = ['userId', 'movieId']
LABEL_COLUMNS = ['rating']

In [None]:
from nvtabular.inference.triton import export_hugectr_ensemble
hugectr_params = dict()
hugectr_params["config"] = "/model/models/movielens/1/movielens.json"
hugectr_params["slots"] = 2
hugectr_params["max_nnz"] = 2
hugectr_params["embedding_vector_size"] = 16
hugectr_params["n_outputs"] = 1
export_hugectr_ensemble(workflow=workflow, 
                        hugectr_model_path="/model/movielens_hugectr/1/",
                        hugectr_params=hugectr_params,
                        name="movielens", 
                        output_path="/model/models/", 
                        label_columns=LABEL_COLUMNS, 
                        cats=CATEGORICAL_COLUMNS,
                        max_batch_size=64)

After we run the script above, we will have three model folders saved as `movielens_nvt`, `movielens` and `movielens_ens`.

## Load Models on Triton Server

At this stage, you should have already started the `merlin-inference` container. After you started the container you can start triton server with the command below:

```
tritonserver --model-repository=path_to_models --backend-config=hugectr,movielens=path_to_json_file --backend-config=hugectr,supportlonglong=true --model-control-mode=explicit
```

Note: The model-repository path is `/model/models/`. The models haven't been loaded, yet. We can request triton server to load the saved ensemble. We initialize a triton client. The path for the json file is `/model/models/movielens/1/movielens.json`.

In [9]:
# disable warnings
import warnings
warnings.filterwarnings('ignore')

In [10]:
import tritonhttpclient
try:
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000", verbose=True)
    print("client created.")
except Exception as e:
    print("channel creation failed: " + str(e))



client created.


In [None]:
triton_client.is_server_live()

In [None]:
triton_client.get_model_repository_index()

We check the available models in the repositories:

- criteo_ens: Ensemble
- criteo_nvt: NVTabular
- criteo: HugeCTR model

In [None]:
triton_client.get_model_repository_index()

Let's load our models to Triton Server.

In [None]:
%%time

triton_client.load_model(model_name='movielens_nvt')

In [None]:
%%time

triton_client.load_model(model_name='movielens')

Finally, we load our ensemble model movielens_ens.

In [None]:
%%time

triton_client.load_model(model_name='movielens_ens')

Let's send a request to Inference Server and print out the response. Since in our example above we do not have continuous columns, below our only inputs are categorical columns.

In [None]:
import tritonclient.grpc as httpclient
import nvtabular
import cudf
from timeit import default_timer as timer
from datetime import timedelta

model_name = 'movielens_ens'
col_names = ["movieId", "userId"]
# read in a batch of data to get transforms for
batch = cudf.read_parquet('/model/data/valid.parquet', num_rows=3)[col_names]
print(batch, "\n")

# convert the batch to a triton inputs
columns = [(col, batch[col][0:3]) for col in col_names]
inputs = []

col_dtypes = [np.int64, np.int64]
for i, (name, col) in enumerate(columns):
    d = col.values_host.astype(col_dtypes[i])
    d = d.reshape(len(d), 1)
    inputs.append(httpclient.InferInput(name, d.shape, np_to_triton_dtype(col_dtypes[i])))
    inputs[i].set_data_from_numpy(d)
# placeholder variables for the output
outputs = []
outputs.append(httpclient.InferRequestedOutput("OUTPUT0"))
# make the request
with httpclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs)
# print(response.as_numpy('OUTPUT0'))
print("predicted sigmoid result:\n", response.as_numpy("OUTPUT0"))