# Using Tensorflow DALI plugin: DALI tf.data.Dataset

### Overview

In this tutorial you will find out how to integrate a DALI pipeline with tf.data API and use it in training with various TensorFlow APIs. We will use well known MNIST dataset converted to JPEGs. You can find it in DALI_extra repository ready to use.

Let's start with creating a pipeline to read MNIST images.

In [1]:
import nvidia.dali as dali
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types

import os

# Path to MNIST dataset
data_path = os.path.join(os.environ['DALI_EXTRA_PATH'], 'db/MNIST/training/')


class MnistPipeline(Pipeline):
    def __init__(self, device, device_id=0, num_threads=4, seed=0):
        super(MnistPipeline, self).__init__(
            batch_size, num_threads, device_id, seed)
        self.device = device
        self.reader = ops.Caffe2Reader(path=data_path, random_shuffle=True)
        self.decode = ops.ImageDecoder(
            device='mixed' if device is 'gpu' else 'cpu',
            output_type=types.GRAY)
        self.cmn = ops.CropMirrorNormalize(
            device=device,
            output_dtype=types.FLOAT,
            image_type=types.GRAY,
            mean=[0.],
            std=[255.],
            output_layout=types.NCHW)

    def define_graph(self):
        inputs, labels = self.reader(name="Reader")
        images = self.decode(inputs)
        if self.device is 'gpu':
            labels = labels.gpu()
        images = self.cmn(images)

        return (images, labels)

Now we define some parameters of the training:

In [2]:
batch_size = 32
dropout = 0.2
image_size = 28
num_classes = 10
hidden_size = 128
epochs = 5
iterations = 100

Now, instead of the usuall workflow of building a pipeline we wrap it with `DALIDataset` object from DALI TensorFlow plugin. This class is compatible with `tf.data.Dataset`. We need to pass expected shapes and types of the outputs with the pipeline.

In [3]:
import nvidia.dali.plugin.tf as dali_tf
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()


# Create pipeline
mnist_pipeline = MnistPipeline(device='cpu', device_id=0)

# Define shapes and types of the outputs
shapes = [
    (batch_size, image_size, image_size),
    (batch_size)]
dtypes = [
    tf.float32,
    tf.int32]

# Create dataset
mnist_set = dali_tf.DALIDataset(
    pipeline=mnist_pipeline,
    batch_size=batch_size,
    shapes=shapes,
    dtypes=dtypes,
    device_id=0)

We are ready to start the training. 

### Keras

First, we will pass `mnist_set` to `tf.keras` model.

In [4]:
# Define the model
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(image_size, image_size), name='images'),
    tf.keras.layers.Flatten(input_shape=(image_size, image_size)),
    tf.keras.layers.Dense(hidden_size, activation='relu'),
    tf.keras.layers.Dropout(dropout),
    tf.keras.layers.Dense(num_classes, activation='softmax')])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

# Train using DALI dataset
model.fit(
    mnist_set,
    epochs=epochs,
    steps_per_epoch=iterations)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Train on 100 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f2926327588>

As you can see, it was very easy to integrate DALI pipeline with `tf.keras` API.

Above code performed the training usgin the CPU. We can easily move the whole processing to the GPU. Both the DALI pipelien and the Keras model will be using the GPU without any CPU buffer between them.

In [5]:
# Create pipeline
mnist_pipeline = MnistPipeline(device='gpu', device_id=0)

# Define the model and place it on the GPU
with tf.device('/gpu:0'):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(image_size, image_size), name='images'),
        tf.keras.layers.Flatten(input_shape=(image_size, image_size)),
        tf.keras.layers.Dense(hidden_size, activation='relu'),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(num_classes, activation='softmax')])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    
# Train on the GPU. Data pipeline will be using the GPU as well.
model.fit(
    mnist_set,
    epochs=epochs,
    steps_per_epoch=iterations)

Train on 100 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f291c370dd8>

That is all that was needed to use the GPU as a training accelerator.



### Estimators

This part of the tutorial focuses on how to use `tf.estimator` API with DALI dataset. 

In [6]:
# Define the feature columns for Estimator
feature_columns = [tf.feature_column.numeric_column(
    "images", shape=[image_size, image_size])]

# And the run config
run_config = tf.estimator.RunConfig(
    model_dir='/tmp/tensorflow-checkpoints',
    device_fn=lambda op: '/gpu:0')

# Finally create the model based on `DNNClassifier`
model = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[hidden_size],
    n_classes=num_classes,
    dropout=dropout,
    config=run_config,
    optimizer='Adam')

# In tf.estimator API data is passed with the function returning the dataset
# We define this function to return DALI dataset placed on the GPU
def train_data_fn():
    with tf.device('/gpu:0'):
        mnist_pipeline = MnistPipeline(device='gpu', device_id=0)
        mnist_set = dali_tf.DALIDataset(
            pipeline=mnist_pipeline,
            batch_size=batch_size,
            shapes=shapes,
            dtypes=dtypes,
            device_id=0)
        mnist_set = mnist_set.map(
            lambda features, labels: ({'images': features}, labels))
        
    return mnist_set

INFO:tensorflow:Using config: {'_model_dir': '/tmp/tensorflow-checkpoints', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': <function <lambda> at 0x7f291c163400>, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f29284ab7b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


With everything set up we are ready to run the training.

In [7]:
# Running the training
model.train(input_fn=train_data_fn, steps=epochs * iterations)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tensorflow-checkpoints/model.ckpt-500
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 500 into /tmp/tensorflow-checkpoints/model.ckpt.
INFO:tensorflow:loss = 43.148834, step = 500
INFO:tensorflow:global_step/sec: 216.331
INFO:tensorflow:loss = 41.12014, step = 600 (0.465 sec)
INFO:tensorflow:global_step/sec: 228.112
INFO:tensorflow:loss = 51.369453, step = 700 (0.439 sec)
INFO:tensorflow:global_step/sec: 221.138
INFO:tensorflow:loss = 33.94025, step 

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x7f290c4b9978>

In [8]:
model.evaluate(input_fn=train_data_fn, steps=iterations)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-11-21T14:29:51Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tensorflow-checkpoints/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2019-11-21-14:29:52
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.8425, average_loss = 0.63225484, global_step = 1000, loss = 20.232155
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /tmp/tensorflow-checkpoints/model.ckp

{'accuracy': 0.8425,
 'average_loss': 0.63225484,
 'loss': 20.232155,
 'global_step': 1000}

### Custom models and training loops

Finally, last part of this tutorial focuses on integrating DALI dataset with custom models and training loops. Complete example below shows from start to finish how to use DALI dataset with native TensorFlow model and run training using `tf.Session`

In [10]:
tf.reset_default_graph()

options = tf.data.Options()
options.experimental_optimization.apply_default_optimizations = False
options.experimental_optimization.autotune = False


with tf.device('/gpu:0'):
    daliset = dali_tf.DALIDataset(
        pipeline=MnistPipeline(device='gpu', device_id=0),
        batch_size=batch_size,
        shapes=shapes,
        dtypes=dtypes,
        device_id=0).with_options(options)

    iterator = tf.data.make_initializable_iterator(daliset)
    images, labels = iterator.get_next()

    images = tf.reshape(images, [batch_size, image_size*image_size])
    labels = tf.reshape(
        tf.one_hot(labels, num_classes),
        [batch_size, num_classes])
    
    with tf.variable_scope('mnist_net', reuse=False):
        images = tf.layers.flatten(images)
        images = tf.layers.dense(images, hidden_size, activation=tf.nn.relu)
        images = tf.layers.dropout(images, rate=dropout, training=True)
        images = tf.layers.dense(images, num_classes, activation=tf.nn.softmax)

    logits_train = images
    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits_train, labels=labels))
    train_step = tf.train.AdamOptimizer().minimize(loss_op)

    correct_pred = tf.equal(
            tf.argmax(logits_train, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(iterator.initializer)

        for i in range(epochs * iterations):
            sess.run(train_step)
            if i % iterations == 0:
                train_accuracy = sess.run(accuracy)
                print("Step %d, accuracy: %g" % (i, train_accuracy))

        final_accuracy = 0
        for _ in range(iterations):
            final_accuracy = final_accuracy + \
                accuracy.eval()
        final_accuracy = final_accuracy / iterations

        print('Final accuracy: ', final_accuracy)

Step 0, accuracy: 0.125
Step 100, accuracy: 0.8125
Step 200, accuracy: 0.90625
Step 300, accuracy: 0.84375
Step 400, accuracy: 0.84375
Final accuracy:  0.9021875
