In [1]:
# Copyright 2024 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.

<img src="http://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_hugectr_hps-sok-to-dlrm-demo/nvidia_logo.png" style="width: 90px; float: right;">

# SOK Incremental Dump Demo

## Overview

Some users' embedding tables occupy so much memory that dumping an entire table to the filesystem is very expensive. SOK therefore lets you pass in a time threshold and dump only the key-value pairs updated after that threshold into host memory. This notebook demonstrates how to use the SOK `incremental_model_dump` API.

For more details about SOK, please refer to [SOK Documentation](https://nvidia-merlin.github.io/HugeCTR/sparse_operation_kit/master/index.html). 

## Installation

### Get SOK from NGC

SOK Python modules are preinstalled in the 23.12 and later [Merlin Tensorflow Container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow): `nvcr.io/nvidia/merlin/merlin-tensorflow:nightly`.

You can check the existence of the required libraries by running the following Python code after launching this container.

```bash
$ python3 -c "import sparse_operation_kit as sok"
```

## Configurations

SOK dumps the incrementally trained key-value pairs into NumPy arrays, so that users can push these keys and values to their inference processes. This demo shows how to use the `sok.incremental_model_dump` API.

This notebook is run following the approach described in [Introduction to Horovod](https://enccs.github.io/upscalingAIcontainer/hvd_intro/?highlight=jupyter#training-with-model-fit): first define a function, and then use `horovod.run` to execute it.

First, we need to configure some properties of the SOK variables and define a function for the SOK forward and backward passes. This function takes the SOK variables, the lookup ids, and an optimizer, and performs the forward pass, the backward pass, and the optimizer update for the SOK variables.

In [6]:
import time
import pytz
from datetime import datetime
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np
import sparse_operation_kit as sok

# ---- Demo configuration ----
# List-valued settings hold one entry per embedding table (two tables here).
batch_size = 8192             # samples per iteration, summed over all ranks
iters = 5                     # number of training iterations
sleep_seconds = 15            # pause taken before and after every train step
rows = [10 * 8192, 8192]      # exclusive upper bound of the random lookup keys
cols = [128, 4]               # embedding dimension of each SOK variable
hotness = [10, 3]             # maximum number of keys per sample
combiners = ["sum", "sum"]    # combiner handed to sok.lookup_sparse
initial_vals = [13, 17]       # value passed (as a string) as each variable's initializer

# train step
def train_step(params, indices, sok_optimizer):
    """Run one forward / backward / optimizer-update step on the SOK variables.

    Args:
        params: SOK variables that are looked up and then updated.
        indices: lookup ids passed to ``sok.lookup_sparse`` together with
            ``params`` (the caller builds one entry per variable).
        sok_optimizer: optimizer whose ``apply_gradients`` updates ``params``.

    Returns:
        The loss after a Horovod sum-allreduce over all ranks.
    """
    with tf.GradientTape() as grad_tape:
        looked_up = sok.lookup_sparse(params, indices, combiners=combiners)
        # Toy objective: add up every element of every looked-up embedding.
        local_loss = sum(tf.reduce_sum(embedding) for embedding in looked_up)
    gradients = grad_tape.gradient(local_loss, params)
    sok_optimizer.apply_gradients(zip(gradients, params))
    return hvd.allreduce(local_loss, op=hvd.Sum)

## Train the SOK variables for N iterations and incrementally dump the keys and values updated in each iteration

In this section, starting from the second iteration, the code incrementally dumps the keys and values updated since the timestamp recorded at the end of the previous iteration, and then compares the dumped keys with the lookup keys recorded for the current iteration, to verify that all keys updated after that point in time are dumped. Note: incremental dump can only be performed on a `sok.DynamicVariable` with HKV as the backend, and the internal time threshold of HKV is in UTC, so the threshold passed in must be a UTC time.

In [3]:
def sok_incremental_dump_evaluate():
    """Train SOK dynamic variables and check ``sok.incremental_model_dump`` each iteration.

    For every iteration the function looks up this rank's slice of the global
    batch, runs one ``train_step`` and records the current UTC time.  From the
    second iteration on it calls ``sok.incremental_model_dump`` with the
    timestamp recorded at the end of the previous iteration and asserts that
    the dumped keys of each variable equal the unique keys looked up (by all
    ranks together) in the current iteration.

    Relies on the notebook-level configuration (``rows``, ``cols``,
    ``hotness``, ``initial_vals``, ``batch_size``, ``iters``,
    ``sleep_seconds``) and on the caller having run ``hvd.init()`` and
    ``sok.init()`` beforehand.

    Raises:
        AssertionError: if the dumped keys differ from the expected keys.
    """
    world_size = hvd.size()
    rank = hvd.rank()

    # NOTE(review): the notebook text says incremental dump needs the HKV
    # backend; presumably var_type="hybrid" selects it - confirm in SOK docs.
    sok_vars = [
        sok.DynamicVariable(
            dimension=cols[i],
            var_type="hybrid",
            initializer=str(initial_vals[i]),
            init_capacity=1024 * 1024,
            max_capacity=1024 * 1024,
        )
        for i in range(len(cols))
    ]

    optimizer = tf.optimizers.SGD(learning_rate=1.0, momentum=0.9)
    sok_optimizer = sok.OptimizerWrapper(optimizer)

    # Build the lookup ids of all iterations up front, one ragged tensor per
    # table.  Both the row lengths and the keys are broadcast from rank 0 so
    # that every rank works on the same global batches.
    total_indices = []
    for i in range(len(rows)):
        offsets = np.random.randint(1, hotness[i] + 1, iters * batch_size)
        offsets = tf.convert_to_tensor(offsets, dtype=tf.int64)
        offsets = hvd.broadcast(offsets, root_rank=0)
        num_values = int(tf.reduce_sum(offsets))
        values = np.random.randint(0, rows[i], num_values)
        values = tf.convert_to_tensor(values, dtype=tf.int64)
        values = hvd.broadcast(values, root_rank=0)
        total_indices.append(tf.RaggedTensor.from_row_lengths(values, offsets))

    # Each rank trains on an equally sized slice of the global batch.
    local_batch = batch_size // world_size
    left = local_batch * rank
    right = local_batch * (rank + 1)
    # Number of samples per iteration that some rank actually looks up.  This
    # equals batch_size when it is divisible by the number of ranks; otherwise
    # the tail samples are never trained and must not be expected in the dump.
    used_batch = local_batch * world_size

    previous_dump_time = None
    for i in range(iters):
        start = i * batch_size
        indices = [table[start + left : start + right] for table in total_indices]
        indices_global = [table[start : start + used_batch] for table in total_indices]

        # Pause on both sides of the update; presumably this keeps the updates
        # of consecutive iterations clearly apart in time - TODO confirm.
        time.sleep(sleep_seconds)
        train_step(sok_vars, indices, sok_optimizer)
        time.sleep(sleep_seconds)

        # The threshold handed to SOK must be a UTC time, not local time.
        time_before = previous_dump_time
        previous_dump_time = datetime.now(pytz.utc)
        if time_before is None:
            # First iteration: there is no earlier timestamp to dump against.
            continue

        # incremental_model_dump accepts several SOK variables and returns one
        # entry of keys and one of values per variable:
        # len(sok_vars) == len(keys) == len(values)
        keys, values = sok.incremental_model_dump(sok_vars, time_before)

        for lookup_id in range(len(keys)):
            looked_up_keys = indices_global[lookup_id].flat_values.numpy()
            # np.unique already returns the unique keys in sorted order.
            expected_keys = np.unique(looked_up_keys)
            dumped_keys = np.sort(keys[lookup_id])
            np.testing.assert_array_equal(expected_keys, dumped_keys)
        print("____________iter {} is pass!________________".format(str(i)))

Define a function that initializes Horovod and SOK in each worker process and then calls `sok_incremental_dump_evaluate`.

In [4]:
import horovod

def training_func():
    """Per-process entry point: set up Horovod, the GPU and SOK, then run the demo.

    The imports live inside the function body, so they are executed in
    whichever process calls this function.
    """
    import os
    # Set before the TensorFlow import below; presumably this lowers the
    # verbosity of TensorFlow's native logging - confirm against the TF docs.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    import time
    import pytz
    from datetime import datetime
    import tensorflow as tf
    import horovod.tensorflow as hvd
    import numpy as np
    import sparse_operation_kit as sok

    class _StderrSilencer:
        """Context manager that points file descriptor 2 at os.devnull while active."""

        def __init__(self):
            self._devnull_fd = os.open(os.devnull, os.O_RDWR)
            # Duplicate of the current fd 2, used to restore it on exit.
            self._saved_fd = os.dup(2)

        def __enter__(self):
            os.dup2(self._devnull_fd, 2)

        def __exit__(self, *_exc_info):
            # Put the saved descriptor back on fd 2, then release both helpers.
            os.dup2(self._saved_fd, 2)
            os.close(self._devnull_fd)
            os.close(self._saved_fd)

    hvd.init()

    # Enable memory growth on every GPU, then make only the GPU indexed by
    # this process's Horovod local rank visible.
    physical_gpus = tf.config.experimental.list_physical_devices("GPU")
    for device in physical_gpus:
        tf.config.experimental.set_memory_growth(device, True)
    if len(physical_gpus) > 0:
        tf.config.experimental.set_visible_devices(physical_gpus[hvd.local_rank()], "GPU")

    sok.init()

    # Hide the stderr output written to fd 2 while the evaluation runs.
    with _StderrSilencer():
        sok_incremental_dump_evaluate()

## Run With Horovod

Use `horovod.run` to launch the task with 2 processes.

In [7]:
# Launch training_func in 2 processes.  The call is the cell's last
# expression, so its return value is displayed as the cell output.
horovod.run(
    training_func,
    np=2,
    verbose=False,
    disable_cache=True,
    use_mpi=True,
)

[1,0]<stderr>:2024-06-03 01:52:59.067307: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9373] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[1,0]<stderr>:2024-06-03 01:52:59.067348: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[1,0]<stderr>:2024-06-03 01:52:59.068589: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1534] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[1,1]<stderr>:2024-06-03 01:52:59.104917: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9373] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[1,1]<stderr>:2024-06-03 01:52:59.104956: E external/local_xla/xla/stream_executor/cuda/cuda_fft.c

[1,0]<stdout>:[SOK INFO] Import /usr/local/lib/libsparse_operation_kit.so
[1,1]<stdout>:[SOK INFO] Import /usr/local/lib/libsparse_operation_kit.so
[1,0]<stdout>:[SOK INFO] Initialize finished, communication tool: horovod
[1,1]<stdout>:[SOK INFO] Initialize finished, communication tool: horovod
[1,0]<stdout>:____________iter 1 is pass!________________
[1,1]<stdout>:____________iter 1 is pass!________________
[1,0]<stdout>:____________iter 2 is pass!________________
[1,1]<stdout>:____________iter 2 is pass!________________
[1,0]<stdout>:____________iter 3 is pass!________________
[1,1]<stdout>:____________iter 3 is pass!________________
[1,0]<stdout>:____________iter 4 is pass!________________
[1,1]<stdout>:____________iter 4 is pass!________________


[None, None]