In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Scaling Criteo: Multi-GPU Training with TensorFlow

## Overview

We observed that TensorFlow training pipelines can be slow because the dataloader is a bottleneck. The native dataloader in TensorFlow randomly samples each item from the dataset, which is very slow. The window dataloader in TensorFlow is not much faster. In our experiments, we were able to speed up existing TensorFlow pipelines by 9x using a highly optimized dataloader.<br><br>

We have already discussed the NVTabular dataloader for TensorFlow in more detail in our [Getting Started with Movielens notebooks](https://github.com/NVIDIA/NVTabular/tree/main/examples/getting-started-movielens). We provided an example to [train a TensorFlow model for the Criteo dataset with a single GPU](./03-Training-with-TF.ipynb) and an [example for multi-GPU training on Movielens](../multi-gpu-movielens/).<br><br>

This notebook is another example to train a TensorFlow model with NVTabular data loaders using multiple GPUs. Our multi-GPU examples use [Horovod](https://github.com/horovod/horovod) for distributed training.

### Learning objectives

In this notebook, we learn how to:

- Use **NVTabular dataloader** with TensorFlow Keras model
- Use **[Horovod](https://github.com/horovod/horovod)** to train with multiple GPUs.

## NVTabular dataloader for TensorFlow

We’ve identified that the dataloader is one bottleneck in deep learning recommender systems when training pipelines with TensorFlow. The dataloader cannot prepare the next batch fast enough and therefore, the GPU is not fully utilized. 

We developed a highly customized tabular dataloader for accelerating existing pipelines in TensorFlow. In our experiments, we see a speed-up by 9x of the same training workflow with NVTabular dataloader. NVTabular dataloader’s features are:

- removing bottleneck of item-by-item dataloading
- enabling larger than memory dataset by streaming from disk
- reading data directly into GPU memory and removing CPU-GPU communication
- preparing batch asynchronously in GPU to avoid CPU-GPU communication
- supporting commonly used .parquet format
- easy integration into existing TensorFlow pipelines by using similar API - works with tf.keras models
- **supporting multi-GPU training with Horovod**

More information in our [blogpost](https://medium.com/nvidia-merlin/training-deep-learning-based-recommender-systems-9x-faster-with-tensorflow-cc5a2572ea49).

**If you are not familiar with NVTabular data loaders, we recommend that you review the basics in our starting notebooks [Movielens](../getting-started-movielens/03-Training-with-TF.ipynb) and [Criteo](03-Training-with-TF.ipynb).**

Horovod is a framework that enables distributed training for TensorFlow, Keras, PyTorch and Apache MXNet models. It launches the distributed training from the command line and requires a Python file, so we will transform the [Single-GPU example of Criteo](03-Training-with-TF.ipynb) into a Python file and launch it with Horovod.
<br><br>
**If you are not familiar with NVTabular+Horovod, we recommend that you review our [basic examples for distributed training](../multi-gpu-movielens/). They explain the required modifications.**

In [None]:
import os

# Data locations used by the rest of the notebook. Each one can be overridden
# through an environment variable of the same name; OUTPUT_DATA_DIR is what the
# launch cell passes to the training script as --dir_in.
BASE_DIR = os.getenv("BASE_DIR", "/raid/data/criteo")
OUTPUT_DATA_DIR = os.getenv("OUTPUT_DATA_DIR", f"{BASE_DIR}/test_dask/output")

In [None]:
%%writefile './tf_trainer.py'

# External dependencies
import argparse
import glob
import os

import cupy

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
os.environ["TF_MEMORY_ALLOCATION"] = "0.4"  # fraction of free memory

import nvtabular as nvt
from nvtabular.framework_utils.tensorflow import layers
from nvtabular.loader.tensorflow import KerasSequenceLoader

import tensorflow as tf
import horovod.tensorflow as hvd

# Command-line interface. Both options are required: without them the script
# has no data directory to read from and no batch size for the dataloaders.
parser = argparse.ArgumentParser(
    description="Train a Keras model on preprocessed Criteo data with NVTabular and Horovod."
)
parser.add_argument("--dir_in", required=True, help="Input directory")
parser.add_argument("--batch_size", type=int, required=True, help="batch size")
args = parser.parse_args()

BASE_DIR = args.dir_in
BATCH_SIZE = args.batch_size

# Join path components instead of concatenating strings so that BASE_DIR works
# with or without a trailing slash.
schema = nvt.Schema().load(os.path.join(BASE_DIR, "train", "schema.pbtxt"))

# Column names grouped by role, selected from the schema by tag.
# NUMERIC_COLUMNS is the name the dataloaders and the model definition below read.
NUMERIC_COLUMNS = [x.name for x in schema.select_by_tag(nvt.graph.tags.Tags.CONTINUOUS)]
CATEGORICAL_COLUMNS = [x.name for x in schema.select_by_tag(nvt.graph.tags.Tags.CATEGORICAL)]
LABEL_COLUMNS = [x.name for x in schema.select_by_tag("label")]

# Parquet files for training and validation (output from ETL-with-NVTabular),
# sorted so the file list has a stable order.
TRAIN_PATHS = sorted(glob.glob(os.path.join(BASE_DIR, "train", "*.parquet")))
VALID_PATHS = sorted(glob.glob(os.path.join(BASE_DIR, "valid", "*.parquet")))

# Initialize Horovod before any hvd.size() / hvd.rank() call below.
hvd.init()

# Seed with system randomness (or a static seed)
cupy.random.seed(None)


def seed_fn():
    """
    Produce a dataloader shuffle seed that all Horovod workers agree on.

    Every worker draws its own random fragment; the fragments are summed
    across workers with an allreduce, so each worker ends up with the same
    value. This function is handed to the dataloaders as ``seed_fn``.
    """
    # Cap each fragment so that the sum over hvd.size() workers fits in int32.
    _, int32_max = tf.int32.limits
    fragment_limit = int32_max // hvd.size()

    # This worker's contribution to the shared seed.
    local_fragment = cupy.random.randint(0, fragment_limit).get()

    # Sum the contributions from all workers.
    summed = hvd.allreduce(
        tf.constant(local_fragment), name="shuffle_seed", op=hvd.mpi_ops.Sum
    )

    return summed % fragment_limit


# Load the saved NVTabular workflow and read the embedding table shapes from it.
proc = nvt.Workflow.load(os.path.join(BASE_DIR, "workflow/"))

# Each shape keeps its first entry unchanged; the second entry is limited to
# at most 4. The model definition uses them as (vocab size, embedding dimension).
EMBEDDING_TABLE_SHAPES = {
    name: (shape[0], min(4, shape[1]))
    for name, shape in nvt.ops.get_embedding_sizes(proc).items()
}

# Keyword arguments shared by the training and validation loaders; the two
# loaders differ only in the list of parquet files they read.
# NOTE(review): shuffle is disabled for both loaders, so seed_fn presumably has
# no effect here - confirm whether the training data should be shuffled.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    label_names=LABEL_COLUMNS,
    cat_names=CATEGORICAL_COLUMNS,
    cont_names=NUMERIC_COLUMNS,
    engine="parquet",
    shuffle=False,
    # NOTE(review): previously described as "how many batches to load at once";
    # a value below 1 is presumably a fraction of GPU memory - confirm.
    buffer_size=0.06,
    parts_per_chunk=1,
    # Tell each loader how many workers exist and which one this process is.
    global_size=hvd.size(),
    global_rank=hvd.rank(),
    seed_fn=seed_fn,
)

# The first argument is a list of files; you could also use a glob pattern.
train_dataset_tf = KerasSequenceLoader(TRAIN_PATHS, **loader_kwargs)
valid_dataset_tf = KerasSequenceLoader(VALID_PATHS, **loader_kwargs)

# tf.keras.Input placeholders for each feature, keyed by column name:
# int32 inputs for the categorical columns first, then float32 inputs for the
# numeric columns.
inputs = {
    col: tf.keras.Input(name=col, dtype=tf.int32, shape=(1,)) for col in CATEGORICAL_COLUMNS
}
inputs.update(
    {col: tf.keras.Input(name=col, dtype=tf.float32, shape=(1,)) for col in NUMERIC_COLUMNS}
)

# One embedding feature column per categorical column, sized from
# EMBEDDING_TABLE_SHAPES as (input dimension / vocab size, embedding output dimension).
emb_layers = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_identity(col, EMBEDDING_TABLE_SHAPES[col][0]),
        EMBEDDING_TABLE_SHAPES[col][1],
    )
    for col in CATEGORICAL_COLUMNS
]

# One numeric feature column per numeric column.
num_layers = [tf.feature_column.numeric_column(col) for col in NUMERIC_COLUMNS]

# Combine embedding and numeric features into a single dense input.
emb_layer = layers.DenseFeatures(emb_layers + num_layers)
x_emb_output = emb_layer(inputs)

# Three hidden layers of 128 ReLU units followed by a single sigmoid output.
x = x_emb_output
for _ in range(3):
    x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(x)

model = tf.keras.Model(inputs=inputs, outputs=x)

# The learning rate is scaled by the number of Horovod workers, and the
# optimizer is wrapped in Horovod's distributed optimizer.
opt = hvd.DistributedOptimizer(tf.keras.optimizers.Adam(0.01 * hvd.size()))

model.compile(
    loss=tf.losses.BinaryCrossentropy(),
    optimizer=opt,
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

# Only the worker with rank 0 writes files and prints progress.
is_rank_zero = hvd.rank() == 0

# Callbacks used on every worker.
callbacks = [
    hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0),
    hvd.keras.callbacks.MetricAverageCallback(),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if is_rank_zero:
    callbacks += [
        tf.keras.callbacks.ModelCheckpoint("./checkpoint-{epoch}.h5"),
        tf.keras.callbacks.TensorBoard("./logs"),
    ]

verbose = 1 if is_rank_zero else 0

model.fit(
    train_dataset_tf,
    validation_data=valid_dataset_tf,
    callbacks=callbacks,
    # NOTE(review): 6435 is presumably the number of training batches per epoch
    # for the intended dataset and batch size - confirm. It is divided evenly
    # among the workers.
    steps_per_epoch=6435 // hvd.size(),
    epochs=2,
    verbose=verbose,
)

We'll also need a small wrapper script to check environment variables set by the Horovod runner to see which rank we'll be assigned, in order to set CUDA_VISIBLE_DEVICES properly for each worker:

In [None]:
%%writefile './hvd_wrapper.sh'

#!/bin/bash
# Pin the wrapped command to a single GPU based on the local rank of this
# worker, then replace this shell with the command.
#
# Usage: hvd_wrapper.sh <command> [args...]
#
# If CUDA_VISIBLE_DEVICES is already set and non-empty, it is left untouched.

# Get local process ID from OpenMPI, alternatively from SLURM, or from the
# variable Horovod's Gloo launcher is expected to set.
if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then
    if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then
        LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK}"
    elif [ -n "${SLURM_LOCALID:-}" ]; then
        LOCAL_RANK="${SLURM_LOCALID}"
    elif [ -n "${HOROVOD_LOCAL_RANK:-}" ]; then
        LOCAL_RANK="${HOROVOD_LOCAL_RANK}"
    fi

    # Only export when a rank was found: exporting an empty
    # CUDA_VISIBLE_DEVICES would hide every GPU from the worker.
    if [ -n "${LOCAL_RANK:-}" ]; then
        export CUDA_VISIBLE_DEVICES="${LOCAL_RANK}"
    else
        echo "hvd_wrapper.sh: could not determine local rank; leaving CUDA_VISIBLE_DEVICES unchanged" >&2
    fi
fi

exec "$@"

In [None]:
# Launch 2 Horovod worker processes. Each one runs tf_trainer.py through
# hvd_wrapper.sh, reading data from OUTPUT_DATA_DIR with a batch size of 16384.
!horovodrun -np 2 sh hvd_wrapper.sh python tf_trainer.py --dir_in $OUTPUT_DATA_DIR --batch_size 16384