In [1]:
# Copyright 2024 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.

<img src="http://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_hugectr_hps-sok-to-dlrm-demo/nvidia_logo.png" style="width: 90px; float: right;">

# HPS Inference DLRM with SOK weights Demo

## Overview

This notebook demonstrates how to enable HPS to use SOK's embedding weights for inference. 

**Note:** Before running this notebook, you must first run the [sok_train_hps_inference_demo.ipynb](sok_train_hps_inference_demo.ipynb) notebook to generate SOK's weights; otherwise, this notebook will fail because the weight files it reads do not exist.

For more details about SOK, please refer to [SOK Documentation](https://nvidia-merlin.github.io/HugeCTR/sparse_operation_kit/master/index.html). For more details about HPS APIs, please refer to [HPS APIs](https://nvidia-merlin.github.io/HugeCTR/main/hierarchical_parameter_server/api/index.html).

## Installation

### Get SOK from NGC

The SOK and HPS Python modules are preinstalled in the 23.12 and later releases of the [Merlin TensorFlow Container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow): `nvcr.io/nvidia/merlin/merlin-tensorflow:nightly`.

You can check that the required libraries are available by running the following commands after launching this container.

```bash
$ python3 -c "import sparse_operation_kit as sok"
$ python3 -c "import hierarchical_parameter_server as hps"
```

## Configurations

First, we need to convert the key-value weights trained with SOK into the key-value file format that HPS can load. The function `generate_kv_file_for_hps` performs this weight conversion. In addition, HPS needs some configuration information to run, which is also defined in the cell below.

In [1]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4'
import numpy as np
import tensorflow as tf
import struct
import json

# Demo-wide settings shared by the weight-conversion, model-building and inference cells.
args = dict()
args["gpu_num"] = 1                               # the number of GPUs made visible to this process
args["slot_num"] = 26                             # the number of feature fields in this embedding layer
args["iter_num"] = 10                             # the number of inference iterations (batches) to run
args["embed_vec_sizes"] = [16]*args["slot_num"]   # the dimension of embedding vectors, one entry per table
args["dense_dim"] = 13                            # the dimension of dense features
args["global_batch_size"] = 1024                  # the global batch size for all GPUs
args["table_names"] = ["table"+str(i) for i in range(args["slot_num"])]   # embedding table names
# Exclusive upper bound, per table, of the random keys generated for inference.
args["max_vocabulary_sizes"] = np.random.randint(1000, 1200, size=args["slot_num"]).tolist()
args["max_nnz"] = [1]*args["slot_num"]            # the maximum number of keys per sample, one entry per table

args["ps_config_file"] = "dlrm.json"
args["dense_model_path"] = "dlrm_dense.model"
args["sparse_model_path"] = "dlrm_sparse.model"
args["saved_path"] = "dlrm_tf_saved_model"
args["np_key_type"] = np.int64
args["np_vector_type"] = np.float32
args["tf_key_type"] = tf.int64
args["tf_vector_type"] = tf.float32
args["sok_embedding_table_path"] = "sok_dlrm_sparse.model"

os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(args["gpu_num"])))

# HPS configuration (dumped to args["ps_config_file"] later). Entries that describe the
# embedding tables are derived from `args` so the two cannot drift apart.
data = {
    "supportlonglong": True,
    "models": [{
        "model": "dlrm",
        "sparse_files": [args["sparse_model_path"]+"/"+name for name in args["table_names"]],
        "num_of_worker_buffer_in_pool": 30,
        "embedding_table_names": list(args["table_names"]),
        "embedding_vecsize_per_table": list(args["embed_vec_sizes"]),
        "maxnum_catfeature_query_per_table_per_sample": [10]*args["slot_num"],
        "default_value_for_each_table": [1.0]*args["slot_num"],
        "deployed_device_list": [0],
        # NOTE(review): presumably has to stay in sync with args["global_batch_size"] - confirm.
        "max_batch_size": 1024,
        "cache_refresh_percentage_per_iteration": 0.2,
        "hit_rate_threshold": 1.0,
        "gpucacheper": 1.0,
        "gpucache": True
    }]
}


def generate_kv_file_for_hps(args):
    """Convert the SOK key/weight dump files into the per-table files written for HPS.

    For every name in ``args["table_names"]`` this reads
    ``<sok_embedding_table_path>/<name>_0-key`` and ``<name>_0-weight``, skips the
    fixed-size header at the start of each file, and writes the remaining raw data to
    ``<sparse_model_path>/<name>/key`` and ``<sparse_model_path>/<name>/emb_vector``.
    Output directories are created when missing.

    Args:
        args: dict providing "sok_embedding_table_path", "sparse_model_path",
            "table_names", "np_key_type", "np_vector_type" and "embed_vec_sizes"
            (one vector size per table, in the order of "table_names").

    Raises:
        FileNotFoundError: if one of the SOK dump files does not exist.
        ValueError: if the number of values read from a weight file is not a
            multiple of that table's embedding vector size.
    """
    # Number of leading bytes skipped in every SOK dump file before the raw array data.
    sok_file_head_length = 296

    def convert_sok_weights_for_hps(sok_file_path, dtype, dim, output_path):
        """Copy everything after the header of one SOK dump file to output_path."""
        with open(sok_file_path, "rb") as f:
            f.seek(sok_file_head_length, os.SEEK_SET)
            array_np = np.fromfile(f, dtype=dtype)
        # The reshape does not change the bytes written; it only fails loudly when the
        # data is not made of whole rows of `dim` elements.
        array_np = array_np.reshape((-1, dim))
        with open(output_path, mode="wb") as f:
            array_np.tofile(f)

    sok_weight_dir = args["sok_embedding_table_path"]
    hps_sparse_model_path = args["sparse_model_path"]
    # exist_ok=True already covers the "directory is there" case, no separate check needed.
    os.makedirs(hps_sparse_model_path, exist_ok=True)

    for i, table_name in enumerate(args["table_names"]):
        table_output_path = os.path.join(hps_sparse_model_path, table_name)
        os.makedirs(table_output_path, exist_ok=True)

        #Note:The suffix "_0" is an automatic numbering added by SOK to prevent users from inputting duplicate table names.
        #For example, if the names of two sok.DynamicVariable are "table1", the first created will have the name "table1_0"
        #and the subsequently created will have the name "table1_1".
        #In this example, there are no duplicate names inputted, so SOK generates the weight names as name+"_0".
        key_path = os.path.join(sok_weight_dir, table_name+"_0-key")
        value_path = os.path.join(sok_weight_dir, table_name+"_0-weight")

        key_output_path = os.path.join(table_output_path, "key")
        value_output_path = os.path.join(table_output_path, "emb_vector")
        convert_sok_weights_for_hps(key_path, args["np_key_type"], 1, key_output_path)
        convert_sok_weights_for_hps(value_path, args["np_vector_type"], args["embed_vec_sizes"][i], value_output_path)
    print("generate hps weight success!")
    
# Run the conversion; it reads the SOK weight files under args["sok_embedding_table_path"].
generate_kv_file_for_hps(args)

generate hps weight success!


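Write the HPS configuration to a JSON file, import HPS, and define the functions that generate random samples and wrap them in a batched dataset.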

In [2]:
# Write the HPS configuration to the same file name that is later passed to hps.Init()
# via args["ps_config_file"], instead of repeating the literal file name here.
with open(args["ps_config_file"], 'w') as json_file:
    json.dump(data, json_file, indent=4)

import hierarchical_parameter_server as hps

def generate_random_samples(num_samples, vocabulary_range_per_slot, max_nnz, dense_dim):
    """Create random sparse keys, dense features and binary labels.

    Args:
        num_samples: number of samples to generate.
        vocabulary_range_per_slot: exclusive upper bound of the keys, one entry per slot.
        max_nnz: maximum number of keys per sample in a slot.
        dense_dim: number of dense features per sample.

    Returns:
        A tuple ``(sparse_keys, dense_features, labels)``: ``sparse_keys`` is a list
        with one ``tf.sparse.SparseTensor`` of dense shape ``(num_samples, 1, max_nnz)``
        per slot, ``dense_features`` is a float32 array of shape
        ``(num_samples, dense_dim)`` and ``labels`` is an integer array of shape
        ``(num_samples, 1)`` with values 0 or 1.
    """
    key_dtype = args["np_key_type"]

    sparse_keys = []
    for slot in range(len(vocabulary_range_per_slot)):
        vocab_range = vocabulary_range_per_slot[slot]
        coords = []
        keys = []
        for sample_id in range(num_samples):
            # Every sample holds between 1 and max_nnz keys, placed at distinct positions.
            nnz = np.random.randint(low=1, high=max_nnz+1)
            positions = sorted(np.random.choice(max_nnz, nnz, replace=False))
            coords.extend([sample_id, 0, position] for position in positions)
            keys.extend(np.random.randint(low=0, high=vocab_range, size=(nnz, )))
        slot_tensor = tf.sparse.SparseTensor(indices = coords,
                                             values = np.array(keys, dtype=key_dtype),
                                             dense_shape = (num_samples, 1, max_nnz))
        sparse_keys.append(slot_tensor)

    dense_features = np.random.random((num_samples, dense_dim)).astype(np.float32)
    labels = np.random.randint(low=0, high=2, size=(num_samples, 1))
    return sparse_keys, dense_features, labels

def tf_dataset(sparse_keys, dense_features, labels, batchsize):
    """Bundle sparse keys, dense features and labels into a batched tf.data.Dataset.

    Every dataset element is a tuple laid out as
    ``(*sparse_keys, dense_features, labels)``. Batches hold ``batchsize`` samples and
    a trailing incomplete batch is dropped.
    """
    components = (*sparse_keys, dense_features, labels)
    sliced = tf.data.Dataset.from_tensor_slices(components)
    return sliced.batch(batchsize, drop_remainder=True)

[INFO] hierarchical_parameter_server is imported


## Build model with HPS embedding layers

Create an inference model using `hps.SparseLookupLayer` and args.

In [3]:
class InferenceModel(tf.keras.models.Model):
    """DLRM inference model: HPS embedding lookups feeding a previously saved dense model.

    Args:
        slot_num: number of embedding tables, i.e. number of sparse inputs.
        embed_vec_size: sequence holding the embedding vector size of each table.
        dense_model_path: path handed to ``tf.keras.models.load_model`` to load the
            dense part of the network.
        **kwargs: forwarded to ``tf.keras.models.Model``.

    The key/vector dtypes, ``max_nnz`` and ``dense_dim`` are still read from the
    module-level ``args`` dict, because the constructor has no parameters for them.
    """
    def __init__(self,
                 slot_num,
                 embed_vec_size,
                 dense_model_path,
                 **kwargs):
        super().__init__(**kwargs)

        self.slot_num = slot_num
        self.embed_vec_size = embed_vec_size

        # One HPS lookup layer per table; table_id selects the table of model "dlrm".
        self.sparse_lookup_layer_list = [
            hps.SparseLookupLayer(model_name = "dlrm",
                                  table_id = i,
                                  emb_vec_size = self.embed_vec_size[i],
                                  emb_vec_dtype = args["tf_vector_type"])
            for i in range(self.slot_num)
        ]
        # Use the constructor arguments (not the global args) so the reshape layers
        # always agree with the lookup layers created above.
        self.reshape_layer_list = [
            tf.keras.layers.Reshape((1, self.embed_vec_size[i]), name = "reshape"+str(i))
            for i in range(self.slot_num)
        ]
        self.concat1 = tf.keras.layers.Concatenate(axis=1, name = "concat1")

        self.dense_model = tf.keras.models.load_model(dense_model_path)
        self.dense_model.summary()

    def call(self, inputs):
        """Look up the embeddings of every table and run the dense model.

        Args:
            inputs: sequence whose first ``slot_num`` entries are the sparse key
                tensors (one per table) and whose last entry is the dense features.

        Returns:
            ``(logit, embeddings)`` where ``embeddings`` is the list of per-table
            lookup results.
        """
        input_sparse = inputs[:-1]
        input_dense = inputs[-1]
        embeddings = [
            self.sparse_lookup_layer_list[i](sp_ids=input_sparse[i], sp_weights=None, combiner="mean")
            for i in range(self.slot_num)
        ]

        # Reshape each table's result to (batch, 1, vec_size) and stack them along axis 1.
        concat_embeddings = self.concat1(
            [self.reshape_layer_list[i](embeddings[i]) for i in range(self.slot_num)]
        )

        logit = self.dense_model([concat_embeddings,input_dense])
        return logit, embeddings

    def summary(self):
        """Print the summary of a functional model built by calling this model on symbolic inputs."""
        inputs = [
            tf.keras.Input(shape=(args["max_nnz"][i], ), sparse=True, dtype=args["tf_key_type"])
            for i in range(self.slot_num)
        ]
        dense_input = tf.keras.Input(shape=(args["dense_dim"], ), dtype=tf.float32)
        inputs.append(dense_input)
        model = tf.keras.models.Model(inputs=inputs, outputs=self.call(inputs))
        return model.summary()
    
def create_and_save_inference_graph(args): 
    """Build the InferenceModel, trace it once on symbolic inputs and save it.

    The model is written to ``args["saved_path"]``.
    """
    model = InferenceModel(args["slot_num"], args["embed_vec_sizes"], args["dense_model_path"])
    model.summary()

    # One sparse key input per table, followed by the dense feature input.
    sparse_inputs = [
        tf.keras.Input(shape=(args["max_nnz"][slot], ), sparse=True, dtype=args["tf_key_type"])
        for slot in range(args["slot_num"])
    ]
    dense_input = tf.keras.Input(shape=(args["dense_dim"], ), dtype=tf.float32)
    symbolic_inputs = sparse_inputs + [dense_input]

    # Call the model once before saving; the outputs themselves are not needed.
    model(symbolic_inputs)
    model.save(args["saved_path"])

# Build the inference model and export it to args["saved_path"].
create_and_save_inference_graph(args)


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_27 (InputLayer)          [(None, 13)]         0           []                               
                                                                                                  
 input_28 (InputLayer)          [(None, 26, 16)]     0           []                               
                                                                                                  
 bottom (MLP)                   (None, 16)           38544       ['input_27[0][0]']               
                                                                                                  
 second_order_feature_interacti  (None, None)        0           ['input_28[0][0]']               
 on (SecondOrderFeatureInteract                                                             



INFO:tensorflow:Assets written to: dlrm_tf_saved_model/assets


INFO:tensorflow:Assets written to: dlrm_tf_saved_model/assets


## Inference with HPS models

Run the HPS model and print the results.

In [4]:
def inference_with_saved_model(args):
    """Initialize HPS, load the saved model and run it over randomly generated batches.

    Returns:
        ``(embeddings_peek, inputs_peek)``: two lists with one entry per step, holding
        the embeddings returned by the model and the inputs that were fed to it.
    """
    hps.Init(global_batch_size = args["global_batch_size"],
             ps_config_file = args["ps_config_file"])
    model = tf.keras.models.load_model(args["saved_path"])
    model.summary()

    total_samples = args["global_batch_size"]  * args["iter_num"]
    sparse_keys, dense_features, labels = generate_random_samples(total_samples, args["max_vocabulary_sizes"], args["max_nnz"][0], args["dense_dim"])
    dataset = tf_dataset(sparse_keys, dense_features, labels, args["global_batch_size"])

    embeddings_peek = []
    inputs_peek = []
    for step, batch in enumerate(dataset):
        # Batch layout: slot_num sparse key tensors, then dense features, then labels.
        # Each sparse tensor is flattened to 2-D, keeping only its last dimension.
        step_inputs = [
            tf.sparse.reshape(batch[table_id], [-1, batch[table_id].shape[-1]])
            for table_id in range(args["slot_num"])
        ]
        step_inputs.append(batch[-2])
        _, embeddings = model(step_inputs)
        embeddings_peek.append(embeddings)
        inputs_peek.append(step_inputs)
        print("-"*20, "Step {}".format(step),  "-"*20)

    return embeddings_peek, inputs_peek

embeddings_peek, inputs_peek = inference_with_saved_model(args)

# Show the input keys (the values of a SparseTensor) of the first table together with
# the embeddings returned for it. Both come from the same (last) step so that the two
# printouts correspond to each other.
print(inputs_peek[-1][0].values)
print(embeddings_peek[-1][0])

[HCTR][04:03:31.190][INFO][RK0][main]: fuse_embedding_table is not specified using default: 0
[HCTR][04:03:31.190][INFO][RK0][main]: dense_file is not specified using default: 
[HCTR][04:03:31.190][INFO][RK0][main]: num_of_refresher_buffer_in_pool is not specified using default: 1
[HCTR][04:03:31.190][INFO][RK0][main]: maxnum_des_feature_per_sample is not specified using default: 26
[HCTR][04:03:31.190][INFO][RK0][main]: refresh_delay is not specified using default: 0
[HCTR][04:03:31.190][INFO][RK0][main]: refresh_interval is not specified using default: 0
[HCTR][04:03:31.190][INFO][RK0][main]: use_static_table is not specified using default: 0
[HCTR][04:03:31.190][INFO][RK0][main]: use_context_stream is not specified using default: 1
[HCTR][04:03:31.190][INFO][RK0][main]: use_hctr_cache_implementation is not specified using default: 1
[HCTR][04:03:31.190][INFO][RK0][main]: thread_pool_size is not specified using default: 16
[HCTR][04:03:31.190][INFO][RK0][main]: init_ec is not specifi



Model: "inference_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sparse_lookup_layer (Sparse  multiple                 0         
 LookupLayer)                                                    
                                                                 
 sparse_lookup_layer_1 (Spar  multiple                 0         
 seLookupLayer)                                                  
                                                                 
 sparse_lookup_layer_2 (Spar  multiple                 0         
 seLookupLayer)                                                  
                                                                 
 sparse_lookup_layer_3 (Spar  multiple                 0         
 seLookupLayer)                                                  
                                                                 
 sparse_lookup_layer_4 (Spar  multiple             