In [1]:
# Copyright 2022 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.


## Serving Ranking Models With Merlin Systems
This notebook is created using the latest stable merlin-tensorflow container. This Jupyter notebook example demonstrates how to deploy a ranking model to Triton Inference Server (TIS) and generate prediction results for a given query. As a prerequisite, the ranking model must be trained and saved with Merlin Models. Please read the README for the instructions.

## Overview

NVIDIA Merlin is an open source framework that accelerates and scales end-to-end recommender system pipelines. The Merlin framework is broken up into several subcomponents, including Merlin-Core, Merlin-Models, NVTabular, and Merlin-Systems. Merlin Systems will be the focus of this example.

The purpose of the Merlin Systems library is to make it easy for Merlin users to quickly deploy their recommender systems from development to Triton Inference Server. We extended the same user-friendly API users are accustomed to in NVTabular and leveraged it to accommodate deploying recommender system components to TIS.

There are some points we need to ensure before we continue with this notebook. Please ensure you have a working NVTabular workflow and model stored in an accessible location. Merlin Systems takes the data preprocessing workflow defined in NVTabular and loads that into Triton Inference Server as a model. Subsequently, it does the same for the trained model. Let's take a closer look at how Merlin Systems makes deploying to TIS simple and effortless in the rest of this notebook.

### Starting Triton Inference Server

After we export the ensemble, we are ready to start the Triton Inference Server. The server is installed in all the Merlin inference containers. If you are not using one of our containers, then ensure it is installed in your environment. For more information, see the Triton Inference Server documentation.

You can start the server by running the following command:

tritonserver --model-repository=/workspace/data/ensemble --backend-config=tensorflow,version=2
For the --model-repository argument, specify the same value as the export_path that you specified previously in the ensemble.export method.

After you run the tritonserver command, wait until your terminal shows messages like the following example:

I0414 18:29:50.741833 4067 grpc_server.cc:4421] Started GRPCInferenceService at 0.0.0.0:8001
I0414 18:29:50.742197 4067 http_server.cc:3113] Started HTTPService at 0.0.0.0:8000
I0414 18:29:50.783470 4067 http_server.cc:178] Started Metrics Service at 0.0.0.0:8002

In [2]:
import os
# Request the "cuda_malloc_async" GPU allocator for TensorFlow. The assignment
# is made before the TensorFlow-based imports below are executed.
# NOTE(review): presumably it has to precede the TensorFlow import to take
# effect -- keep this ordering unless confirmed otherwise.
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
from nvtabular.workflow import Workflow
# NOTE(review): `mm` is not referenced in the cells visible here; presumably the
# import is needed for loading the saved Merlin model later -- confirm.
import merlin.models.tf as mm
import tensorflow as tf

2023-05-09 21:02:53.368848: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  warn(f"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}")


[INFO]: sparse_operation_kit is imported
[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.1.4-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so
[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.1.4-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so


2023-05-09 21:03:01.087078: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-09 21:03:01.282550: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-05-09 21:03:01.282603: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0
2023-05-09 21:03:01.282795: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8192 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:07:00.0, compute capability: 7.0
  from .autonotebook import tqd

[SOK INFO] Initialize finished, communication tool: horovod


In [3]:
# Folder that holds the dataset outputs; the INPUT_FOLDER environment variable
# overrides the default location.
input_path = os.getenv("INPUT_FOLDER", "/workspace/data/Tenrec/outputs/dataset/")

# The saved NVTabular workflow is read from the "workflow" entry of that folder.
workflow_stored_path = os.path.join(input_path, "workflow")
workflow = Workflow.load(workflow_stored_path)

In [4]:
# Display the input schema of the loaded workflow (column names and dtypes).
workflow.input_schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged
0,user_id,(),"DType(name='int32', element_type=<ElementType....",False,False
1,item_id,(),"DType(name='int32', element_type=<ElementType....",False,False
2,video_category,(),"DType(name='int8', element_type=<ElementType.I...",False,False
3,gender,(),"DType(name='int8', element_type=<ElementType.I...",False,False
4,age,(),"DType(name='int8', element_type=<ElementType.I...",False,False


In [5]:
# Display the output schema of the loaded workflow, including the tags and
# properties attached to each column.
workflow.output_schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.start_index,properties.cat_path,properties.domain.min,properties.domain.max,properties.domain.name,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension
0,user_id,"(Tags.USER, Tags.CATEGORICAL, Tags.ID, Tags.US...","DType(name='int64', element_type=<ElementType....",False,False,,0,0,0,.//categories/unique.user_id.parquet,0,16844,user_id,16845,372
1,item_id,"(Tags.ITEM_ID, Tags.CATEGORICAL, Tags.ITEM, Ta...","DType(name='int64', element_type=<ElementType....",False,False,,0,0,0,.//categories/unique.item_id.parquet,0,7539,item_id,7540,237
2,video_category,(Tags.CATEGORICAL),"DType(name='int64', element_type=<ElementType....",False,False,,0,0,0,.//categories/unique.video_category.parquet,0,3,video_category,4,16
3,gender,(Tags.CATEGORICAL),"DType(name='int64', element_type=<ElementType....",False,False,,0,0,0,.//categories/unique.gender.parquet,0,7,gender,8,16
4,age,(Tags.CATEGORICAL),"DType(name='int64', element_type=<ElementType....",False,False,,0,0,0,.//categories/unique.age.parquet,0,2,age,3,16


In [6]:
from merlin.schema.tags import Tags

# Collect the names of the output-schema columns that carry the TARGET tag.
label_columns = workflow.output_schema.select_by_tag(Tags.TARGET).column_names
# Remove those columns from the workflow's inputs.
# NOTE(review): presumably so label values are not required in serving
# requests -- confirm. The call is the cell's last expression, so its return
# value (a Workflow object, per the recorded output) is displayed.
workflow.remove_inputs(label_columns)

<nvtabular.workflow.workflow.Workflow at 0x7f693b07ee80>

In [7]:
# Display the label column names collected from the workflow's output schema.
# The recorded output is an empty list.
label_columns

[]

In [None]:
# Directory of the trained ranking model to load. The default is the path this
# notebook originally hard-coded; set the TF_MODEL_PATH environment variable to
# load a model stored elsewhere, in the same way INPUT_FOLDER configures the
# dataset folder earlier in the notebook.
tf_model_path = os.environ.get("TF_MODEL_PATH", "/workspace/data/Tenrec/saved_model/")

# Load the saved Keras model from that directory.
model = tf.keras.models.load_model(tf_model_path)