In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.

<img src="http://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_hugectr_training-with-hdfs/nvidia_logo.png" style="width: 90px; float: right;">

# HugeCTR End-end Example with NVTabular

## Overview

In this sample notebook, we are going to:
1. Preprocess data using **NVTabular**
2. Training model with **HugeCTR**
3. Do offline inference using **HugeCTR HPS**

## Setup

To setup the environment, refer to [HugeCTR Example Notebooks](../notebooks) and follow the instructions there before running the following.

## Data Preparation

In [1]:
import os
import shutil

In [2]:
!mkdir -p /hugectr_e2e
!mkdir -p /hugectr_e2e/criteo/train
!mkdir -p /hugectr_e2e/criteo/val
!mkdir -p /hugectr_e2e/model

In [3]:
BASE_DIR = os.environ.get("BASE_DIR", "/hugectr_e2e")
DATA_DIR = os.environ.get("DATA_DIR", BASE_DIR + "/criteo")
TRAIN_DIR = os.environ.get("TRAIN_DIR", DATA_DIR +"/train")
VAL_DIR = os.environ.get("VAL_DIR", DATA_DIR +"/val")
MODEL_DIR = os.environ.get("MODEL_DIR", BASE_DIR + "/model")

**Download the Criteo data for 1 day:**

In [4]:
#!wget -P $DATA_DIR https://storage.googleapis.com/criteo-cail-datasets/day_0.gz  #decomment this line to download, otherwise soft link the data.
#!gzip -d -c $DATA_DIR/day_0.gz > $DATA_DIR/day_0
INPUT_DATA = os.environ.get("INPUT_DATA", DATA_DIR + "/day_0")
!ln -s $INPUT_DATA $DATA_DIR/day_0

ln: failed to create symbolic link '/hugectr_e2e/criteo/day_0': File exists


**Unzip and split data**

In [5]:
!head -n 10000000 $DATA_DIR/day_0 > $DATA_DIR/train/train.txt
!tail -n 2000000 $DATA_DIR/day_0 > $DATA_DIR/val/test.txt 

## Data Preprocessing using NVTabular

In [6]:
import sys
import argparse
import glob
import time
import numpy as np
import numba

import dask_cudf
import cudf
import nvtabular as nvt
from nvtabular.io import Shuffle
from nvtabular.ops import Categorify, Clip, FillMissing, Normalize, get_embedding_sizes
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from nvtabular.utils import pynvml_mem_size, device_mem_size
import warnings

import logging
logging.basicConfig(format='%(asctime)s %(message)s')
logging.root.setLevel(logging.NOTSET)

# define dataset schema
CATEGORICAL_COLUMNS=["C" + str(x) for x in range(1, 27)]
CONTINUOUS_COLUMNS=["I" + str(x) for x in range(1, 14)]
LABEL_COLUMNS = ['label']
COLUMNS =  LABEL_COLUMNS + CONTINUOUS_COLUMNS +  CATEGORICAL_COLUMNS
#/samples/criteo mode doesn't have dense features
criteo_COLUMN=LABEL_COLUMNS +  CATEGORICAL_COLUMNS
#For new feature cross columns
CROSS_COLUMNS = ["C1_C2", "C3_C4"]

NUM_INTEGER_COLUMNS = 13
NUM_CATEGORICAL_COLUMNS = 26
NUM_TOTAL_COLUMNS = 1 + NUM_INTEGER_COLUMNS + NUM_CATEGORICAL_COLUMNS

In [7]:
# Dask dashboard
dashboard_port = "8787"

# Deploy a Single-Machine Multi-GPU Cluster
protocol = "tcp"  # "tcp" or "ucx"
if numba.cuda.is_available():
    NUM_GPUS = list(range(len(numba.cuda.gpus)))
else:
    NUM_GPUS = []
visible_devices = ",".join([str(n) for n in NUM_GPUS])  # Delect devices to place workers
device_limit_frac = 0.7  # Spill GPU-Worker memory to host at this limit.
device_pool_frac = 0.8
part_mem_frac = 0.15

# Use total device size to calculate args.device_limit_frac
device_size = device_mem_size(kind="total")
device_limit = int(device_limit_frac * device_size)
device_pool_size = int(device_pool_frac * device_size)
part_size = int(part_mem_frac * device_size)

# Check if any device memory is already occupied
for dev in visible_devices.split(","):
    fmem = pynvml_mem_size(kind="free", index=int(dev))
    used = (device_size - fmem) / 1e9
    if used > 1.0:
        warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!")

cluster = None  # (Optional) Specify existing scheduler port
if cluster is None:
    cluster = LocalCUDACluster(
        protocol=protocol,
        n_workers=len(visible_devices.split(",")),
        CUDA_VISIBLE_DEVICES=visible_devices,
        device_memory_limit=device_limit,
        dashboard_address=":" + dashboard_port,
        rmm_pool_size=(device_pool_size // 256) * 256
    )

# Create the distributed client
client = Client(cluster)
client

2023-01-06 04:03:09,380 Using selector: EpollSelector
2023-01-06 04:03:11,334 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-01-06 04:03:11,334 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-01-06 04:03:11,343 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-01-06 04:03:11,344 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-01-06 04:03:11,362 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-01-06 04:03:11,362 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-01-06 04:03:11,381 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-01-06 04:03:11,381 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-01-06 04:03:11,402 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-01-06 04:03:11,402 - distributed.preloading - IN

0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 8,Total memory: 503.79 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44759,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 503.79 GiB

0,1
Comm: tcp://127.0.0.1:41549,Total threads: 1
Dashboard: http://127.0.0.1:44715/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:35427,
Local directory: /tmp/dask-worker-space/worker-j9o6tjnq,Local directory: /tmp/dask-worker-space/worker-j9o6tjnq
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB

0,1
Comm: tcp://127.0.0.1:33573,Total threads: 1
Dashboard: http://127.0.0.1:36839/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:43633,
Local directory: /tmp/dask-worker-space/worker-g048l4_5,Local directory: /tmp/dask-worker-space/worker-g048l4_5
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB

0,1
Comm: tcp://127.0.0.1:40435,Total threads: 1
Dashboard: http://127.0.0.1:37093/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:41905,
Local directory: /tmp/dask-worker-space/worker-nb0yv_rz,Local directory: /tmp/dask-worker-space/worker-nb0yv_rz
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB

0,1
Comm: tcp://127.0.0.1:41707,Total threads: 1
Dashboard: http://127.0.0.1:37285/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:43925,
Local directory: /tmp/dask-worker-space/worker-8vibnk55,Local directory: /tmp/dask-worker-space/worker-8vibnk55
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB

0,1
Comm: tcp://127.0.0.1:40165,Total threads: 1
Dashboard: http://127.0.0.1:46549/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:40305,
Local directory: /tmp/dask-worker-space/worker-p9qwcklg,Local directory: /tmp/dask-worker-space/worker-p9qwcklg
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB

0,1
Comm: tcp://127.0.0.1:36597,Total threads: 1
Dashboard: http://127.0.0.1:42895/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:41439,
Local directory: /tmp/dask-worker-space/worker-y00valwq,Local directory: /tmp/dask-worker-space/worker-y00valwq
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB

0,1
Comm: tcp://127.0.0.1:45495,Total threads: 1
Dashboard: http://127.0.0.1:44953/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:37091,
Local directory: /tmp/dask-worker-space/worker-4lj3i2cp,Local directory: /tmp/dask-worker-space/worker-4lj3i2cp
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB

0,1
Comm: tcp://127.0.0.1:46123,Total threads: 1
Dashboard: http://127.0.0.1:40675/status,Memory: 62.97 GiB
Nanny: tcp://127.0.0.1:36179,
Local directory: /tmp/dask-worker-space/worker-dvzzo2d1,Local directory: /tmp/dask-worker-space/worker-dvzzo2d1
GPU: Tesla V100-SXM2-32GB,GPU memory: 31.75 GiB


In [8]:
train_output = os.path.join(DATA_DIR, "train")
print("Training output data: "+train_output)
val_output = os.path.join(DATA_DIR, "val")
print("Validation output data: "+val_output)
train_input = os.path.join(DATA_DIR, "train/train.txt")
print("Training dataset: "+train_input)
val_input = os.path.join(DATA_DIR, "val/test.txt")
PREPROCESS_DIR_temp_train = os.path.join(DATA_DIR, 'train/temp-parquet-after-conversion')  
PREPROCESS_DIR_temp_val = os.path.join(DATA_DIR, "val/temp-parquet-after-conversion")
if not os.path.exists(PREPROCESS_DIR_temp_train):
    os.makedirs(PREPROCESS_DIR_temp_train)

if not os.path.exists(PREPROCESS_DIR_temp_val):
    os.makedirs(PREPROCESS_DIR_temp_val)

PREPROCESS_DIR_temp = [PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val]

# Make sure we have a clean parquet space for cudf conversion
for one_path in PREPROCESS_DIR_temp:
    if os.path.exists(one_path):
        shutil.rmtree(one_path)
    os.mkdir(one_path)

#calculate the total processing time
runtime = time.time()

## train/valid txt to parquet
train_valid_paths = [(train_input,PREPROCESS_DIR_temp_train),(val_input,PREPROCESS_DIR_temp_val)]

for input, temp_output in train_valid_paths:

    ddf = dask_cudf.read_csv(input,sep='\t',names=LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS)

    ddf["label"] = ddf['label'].astype('float32')
    ddf[CONTINUOUS_COLUMNS] = ddf[CONTINUOUS_COLUMNS].astype('float32')

    # Save it as parquet format for better memory usage
    ddf.to_parquet(temp_output,header=True)
    ##-----------------------------------##

COLUMNS =  LABEL_COLUMNS + CONTINUOUS_COLUMNS + CROSS_COLUMNS + CATEGORICAL_COLUMNS
train_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))

categorify_op = Categorify()
cat_features = CATEGORICAL_COLUMNS >> categorify_op
cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize()
cross_cat_op = Categorify(encode_type="combo")

features = LABEL_COLUMNS

features += cont_features
if CROSS_COLUMNS:
    feature_pairs = [pair.split("_") for pair in CROSS_COLUMNS]
    for pair in feature_pairs:
        features += [pair] >> cross_cat_op

features += cat_features

workflow = nvt.Workflow(features)

logging.info("Preprocessing")

output_format = 'parquet'

# just for /samples/criteo model
train_ds_iterator = nvt.Dataset(train_paths, engine='parquet')
valid_ds_iterator = nvt.Dataset(valid_paths, engine='parquet')

shuffle = nvt.io.Shuffle.PER_PARTITION

logging.info('Train Datasets Preprocessing.....')

dict_dtypes = {}
for col in CATEGORICAL_COLUMNS:
    dict_dtypes[col] = np.int64
for col in CONTINUOUS_COLUMNS:
    dict_dtypes[col] = np.float32
for col in CROSS_COLUMNS:
    dict_dtypes[col] = np.int64
for col in LABEL_COLUMNS:
    dict_dtypes[col] = np.float32

conts = CONTINUOUS_COLUMNS

workflow.fit(train_ds_iterator)

if output_format == 'hugectr':
    workflow.transform(train_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=train_output,
            shuffle=shuffle)
else:
    workflow.transform(train_ds_iterator).to_parquet(
            output_path=train_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle)

###Getting slot size###    
#--------------------##
embeddings_dict_cat = categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS)
embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
embeddings = [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS] + [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS]

print(embeddings)
##--------------------##

logging.info('Valid Datasets Preprocessing.....')

if output_format == 'hugectr':
    workflow.transform(valid_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=val_output,
            shuffle=shuffle)
else:
    workflow.transform(valid_ds_iterator).to_parquet(
            output_path=val_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle)

embeddings_dict_cat = categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS)
embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
embeddings = [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS] + [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS]

print(embeddings)
##--------------------##

## Shutdown clusters
client.close()

runtime = time.time() - runtime

print("\nDask-NVTabular Criteo Preprocessing Done!")
print(f"Runtime[s]         | {runtime}")
print("======================================\n")

Training output data: /hugectr_e2e/criteo/train
Validation output data: /hugectr_e2e/criteo/val
Training dataset: /hugectr_e2e/criteo/train/train.txt


2023-01-06 04:03:19,407 Preprocessing
2023-01-06 04:03:19,749 Train Datasets Preprocessing.....
2023-01-06 04:03:25,189 Valid Datasets Preprocessing.....


[1234907, 19683, 13780, 6867, 18490, 4, 6264, 1235, 50, 854680, 114026, 75736, 11, 2159, 7533, 61, 4, 919, 15, 1307783, 404742, 1105613, 87714, 9032, 77, 34, 1581605, 1093030]
[1234907, 19683, 13780, 6867, 18490, 4, 6264, 1235, 50, 854680, 114026, 75736, 11, 2159, 7533, 61, 4, 919, 15, 1307783, 404742, 1105613, 87714, 9032, 77, 34, 1581605, 1093030]

Dask-NVTabular Criteo Preprocessing Done!
Runtime[s]         | 11.454381227493286



In [9]:
client.shutdown()
cluster.close()

In [10]:
### Record the slot size array
SLOT_SIZE_ARRAY = embeddings

## Training a WDL model with HugeCTR

In [11]:
%%writefile './train.py'
import hugectr
import os
import argparse
from mpi4py import MPI
parser = argparse.ArgumentParser(description=("HugeCTR Training"))
parser.add_argument("--data_path", type=str, help="Input dataset path (Required)")
parser.add_argument("--model_path", type=str, help="Directory path to write output (Required)")
args = parser.parse_args()
SLOT_SIZE_ARRAY = [1234907, 19683, 13780, 6867, 18490, 4, 6264, 1235, 50, 854680, 114026, 75736, 11, 2159, 7533, 61, 4, 919, 15, 1307783, 404742, 1105613, 87714, 9032, 77, 34, 1581605, 1093030]

solver = hugectr.CreateSolver(max_eval_batches = 4000,
                              batchsize_eval = 2720,
                              batchsize = 2720,
                              lr = 0.001,
                              vvgpu = [[0]],
                              repeat_dataset = True,
                              i64_input_key = True)

reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,
                                  source = [os.path.join(args.data_path, "train/_file_list.txt")],
                                  eval_source = os.path.join(args.data_path, "val/_file_list.txt"),
                                  check_type = hugectr.Check_t.Non,
                                  slot_size_array = SLOT_SIZE_ARRAY)
optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam,
                                    update_type = hugectr.Update_t.Global,
                                    beta1 = 0.9,
                                    beta2 = 0.999,
                                    epsilon = 0.0000001)
model = hugectr.Model(solver, reader, optimizer)

model.add(hugectr.Input(label_dim = 1, label_name = "label",
                        dense_dim = 13, dense_name = "dense",
                        data_reader_sparse_param_array = 
                        [hugectr.DataReaderSparseParam("wide_data", 1, True, 2),
                        hugectr.DataReaderSparseParam("deep_data", 2, False, 26)]))

model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 80,
                            embedding_vec_size = 1,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding2",
                            bottom_name = "wide_data",
                            optimizer = optimizer))
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 1350,
                            embedding_vec_size = 16,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding1",
                            bottom_name = "deep_data",
                            optimizer = optimizer))

model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding1"],
                            top_names = ["reshape1"],
                            leading_dim=416))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding2"],
                            top_names = ["reshape2"],
                            leading_dim=2))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReduceSum,
                            bottom_names = ["reshape2"],
                            top_names = ["wide_redn"],
                            axis = 1))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["reshape1", "dense"],
                            top_names = ["concat1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["concat1"],
                            top_names = ["fc1"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc1"],
                            top_names = ["relu1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu1"],
                            top_names = ["dropout1"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout1"],
                            top_names = ["fc2"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc2"],
                            top_names = ["relu2"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu2"],
                            top_names = ["dropout2"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout2"],
                            top_names = ["fc3"],
                            num_output=1))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Add,
                            bottom_names = ["fc3", "wide_redn"],
                            top_names = ["add1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                            bottom_names = ["add1", "label"],
                            top_names = ["loss"]))
model.compile()
model.summary()
model.fit(max_iter = 21000, display = 1000, eval_interval = 4000, snapshot = 20000, snapshot_prefix = os.path.join(args.model_path, "wdl/"))
model.graph_to_json(graph_config_file = os.path.join(args.model_path, "wdl.json"))

Overwriting ./train.py


In [12]:
!python train.py --data_path $DATA_DIR --model_path $MODEL_DIR

HugeCTR Version: 4.2
[HCTR][04:03:34.493][INFO][RK0][main]: Global seed is 1268002110
[HCTR][04:03:34.495][INFO][RK0][main]: Device to NUMA mapping:
  GPU 0 ->  node 0
[HCTR][04:03:36.281][DEBUG][RK0][main]: [device 0] allocating 0.0000 GB, available 30.1804 
[HCTR][04:03:36.281][INFO][RK0][main]: Start all2all warmup
[HCTR][04:03:36.281][INFO][RK0][main]: End all2all warmup
[HCTR][04:03:36.282][INFO][RK0][main]: Using All-reduce algorithm: NCCL
[HCTR][04:03:36.282][INFO][RK0][main]: Device 0: Tesla V100-SXM2-32GB
[HCTR][04:03:36.283][INFO][RK0][main]: num of DataReader workers for train: 1
[HCTR][04:03:36.283][INFO][RK0][main]: num of DataReader workers for eval: 1
[HCTR][04:03:36.283][DEBUG][RK0][main]: [device 0] allocating 0.0054 GB, available 29.9246 
[HCTR][04:03:36.283][DEBUG][RK0][main]: [device 0] allocating 0.0054 GB, available 29.9187 
[HCTR][04:03:36.284][DEBUG][RK0][main]: [device 0] allocating 0.0000 GB, available 29.9187 
[HCTR][04:03:36.284][DEBUG][RK0][main]: [device 0

## Load model to HPS and inference with HugeCTR

In [15]:
from hugectr.inference import InferenceModel, InferenceParams
import hugectr
import os

model_config = os.path.join(MODEL_DIR, "wdl.json")
inference_params = InferenceParams(
    model_name = "wdl",
    max_batchsize = 1024,
    hit_rate_threshold = 1.0,
    dense_model_file = os.path.join(MODEL_DIR, "wdl/_dense_20000.model"),
    sparse_model_files = [os.path.join(MODEL_DIR, "wdl/0_sparse_20000.model"), os.path.join(MODEL_DIR, "wdl/1_sparse_20000.model")],
    deployed_devices = [0],
    use_gpu_embedding_cache = True,
    cache_size_percentage = 1.0,
    i64_input_key = True
)
inference_model = InferenceModel(model_config, inference_params)
pred = inference_model.predict(
    10,
    "/hugectr_e2e/criteo/val/_file_list.txt",
    hugectr.DataReaderType_t.Parquet,
    hugectr.Check_t.Non,
    SLOT_SIZE_ARRAY
)
print(pred.shape)
print(pred)

[HCTR][05:22:53.279][INFO][RK0][main]: Global seed is 2968606722
[HCTR][05:22:53.279][INFO][RK0][main]: Device to NUMA mapping:
  GPU 0 ->  node 0
[HCTR][05:22:53.317][DEBUG][RK0][main]: [device 0] allocating 0.0000 GB, available 29.8757 
[HCTR][05:22:53.317][INFO][RK0][main]: Start all2all warmup
[HCTR][05:22:53.317][INFO][RK0][main]: End all2all warmup
[HCTR][05:22:53.318][INFO][RK0][main]: default_emb_vec_value is not specified using default: 0
[HCTR][05:22:53.318][INFO][RK0][main]: default_emb_vec_value is not specified using default: 0
[HCTR][05:22:53.318][INFO][RK0][main]: Creating HashMap CPU database backend...
[HCTR][05:22:53.318][DEBUG][RK0][main]: Created blank database backend in local memory!
[HCTR][05:22:53.318][INFO][RK0][main]: Volatile DB: initial cache rate = 1
[HCTR][05:22:53.318][INFO][RK0][main]: Volatile DB: cache missed embeddings = 0
[HCTR][05:22:53.318][DEBUG][RK0][main]: Created raw model loader in local memory!
[HCTR][05:22:53.318][INFO][RK0][main]: Using Loc