# Using Tensorflow DALI plugin: DALI tf.data.Dataset

### Overview

In this tutorial you will find out how to integrate a DALI pipeline with the tf.data API and use it in training with various TensorFlow APIs. We will use the well-known MNIST dataset converted to JPEGs. You can find it in the DALI_extra repository, ready to use.

Let's start with creating a pipeline to read MNIST images.

In [None]:
import nvidia.dali as dali
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types

import os

# Path to MNIST dataset
# Resolved relative to the directory named by the DALI_EXTRA_PATH environment
# variable; a KeyError is raised here if that variable is not set.
data_path = os.path.join(os.environ['DALI_EXTRA_PATH'], 'db/MNIST/training/')


class MnistPipeline(Pipeline):
    """DALI pipeline that reads, decodes and normalizes MNIST samples.

    The batch size is not a constructor argument: it is read from the
    module-level ``batch_size`` variable, which therefore has to be defined
    before the pipeline is instantiated.

    Args:
        device: ``'gpu'`` to decode with the ``'mixed'`` backend and run the
            normalization (and return the labels) on the GPU; any other
            value selects CPU decoding. The value is also passed verbatim
            as the ``device`` of the normalization operator.
        device_id: Forwarded to ``Pipeline.__init__``.
        num_threads: Forwarded to ``Pipeline.__init__``.
        seed: Forwarded to ``Pipeline.__init__``.
    """

    def __init__(self, device, device_id=0, num_threads=4, seed=0):
        super(MnistPipeline, self).__init__(
            batch_size, num_threads, device_id, seed)
        self.device = device
        self.reader = ops.Caffe2Reader(path=data_path, random_shuffle=True)
        # Compare by value: `is` on string literals tests object identity,
        # which only works by accident of interning and triggers a
        # SyntaxWarning on Python 3.8+.
        self.decode = ops.ImageDecoder(
            device='mixed' if device == 'gpu' else 'cpu',
            output_type=types.GRAY)
        self.cmn = ops.CropMirrorNormalize(
            device=device,
            output_dtype=types.FLOAT,
            image_type=types.GRAY,
            mean=[0.],
            std=[255.],
            output_layout=types.NCHW)

    def define_graph(self):
        """Wire reader -> decoder -> normalizer.

        Returns:
            A ``(images, labels)`` tuple of pipeline outputs. When the
            pipeline was created with ``device='gpu'`` the labels are moved
            to the GPU as well.
        """
        inputs, labels = self.reader(name="Reader")
        images = self.decode(inputs)
        if self.device == 'gpu':
            labels = labels.gpu()
        images = self.cmn(images)

        return (images, labels)

Now we define some parameters of the training:

In [None]:
# Input data
batch_size = 32      # samples per batch
image_size = 28      # side length of the (square) input images
num_classes = 10     # number of target classes

# Network
hidden_size = 128    # units in the hidden dense layer
dropout = 0.2        # dropout rate

# Training schedule
epochs = 5
iterations = 100     # steps per epoch

Now, instead of the usual workflow of building a pipeline, we wrap it with a `DALIDataset` object from the DALI TensorFlow plugin. This class is compatible with `tf.data.Dataset`. We need to pass the expected shapes and types of the outputs along with the pipeline.

In [None]:
import nvidia.dali.plugin.tf as dali_tf
try:
    # Prefer the TF 1.x-compatible API when this TensorFlow build ships it
    import tensorflow.compat.v1 as tf
#     tf.compat.v1.disable_eager_execution()
except ImportError:
    # No `compat.v1` module available: fall back to the top-level package.
    # Only import failures are handled so unrelated errors are not hidden.
    import tensorflow as tf

# Create pipeline
mnist_pipeline = MnistPipeline(device='cpu', device_id=0)

# Define shapes and types of the outputs
# Each shape is a tuple; note the trailing comma for the one-dimensional
# label batch -- `(batch_size)` without it is just an int.
shapes = [
    (batch_size, image_size, image_size),
    (batch_size,)]
dtypes = [
    tf.float32,
    tf.int32]

# Create dataset
mnist_set = dali_tf.DALIDataset(
    pipeline=mnist_pipeline,
    batch_size=batch_size,
    shapes=shapes,
    dtypes=dtypes,
    device_id=0)

We are ready to start the training. 

### Keras

First, we will pass `mnist_set` to `tf.keras` model.

In [None]:
# Assemble the model one layer at a time:
# input -> flatten -> dense (ReLU) -> dropout -> dense (softmax)
model = tf.keras.models.Sequential()
model.add(
    tf.keras.layers.Input(shape=(image_size, image_size), name='images'))
model.add(
    tf.keras.layers.Flatten(input_shape=(image_size, image_size)))
model.add(tf.keras.layers.Dense(hidden_size, activation='relu'))
model.add(tf.keras.layers.Dropout(dropout))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

# Feed the DALI dataset straight into Keras training
model.fit(mnist_set, epochs=epochs, steps_per_epoch=iterations)

As you can see, it was very easy to integrate a DALI pipeline with the `tf.keras` API.

The code above performed the training using the CPU. We can easily move the whole processing to the GPU. Both the DALI pipeline and the Keras model will be using the GPU without any CPU buffer between them.

In [None]:
# Create pipeline
mnist_pipeline = MnistPipeline(device='gpu', device_id=0)

# Define the dataset and the model, and place both on the GPU
with tf.device('/gpu:0'):
    # Rebuild the dataset around the GPU pipeline. Without this step
    # `mnist_set` would still wrap the CPU pipeline created earlier and the
    # GPU pipeline above would never be used.
    mnist_set = dali_tf.DALIDataset(
        pipeline=mnist_pipeline,
        batch_size=batch_size,
        shapes=shapes,
        dtypes=dtypes,
        device_id=0)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(image_size, image_size), name='images'),
        tf.keras.layers.Flatten(input_shape=(image_size, image_size)),
        tf.keras.layers.Dense(hidden_size, activation='relu'),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(num_classes, activation='softmax')])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

# Train on the GPU. Data pipeline will be using the GPU as well.
model.fit(
    mnist_set,
    epochs=epochs,
    steps_per_epoch=iterations)

That is all that was needed to use the GPU as a training accelerator.



### Estimators

This part of the tutorial focuses on how to use the `tf.estimator` API with a DALI dataset.

In [None]:
# Run configuration: where checkpoints go, plus a device function that
# returns '/gpu:0' for every op
run_config = tf.estimator.RunConfig(
    device_fn=lambda op: '/gpu:0',
    model_dir='/tmp/tensorflow-checkpoints')

# A single numeric feature column, keyed "images"
feature_columns = [
    tf.feature_column.numeric_column(
        "images", shape=[image_size, image_size]),
]

# The classifier itself, built on `DNNClassifier`
model = tf.estimator.DNNClassifier(
    hidden_units=[hidden_size],
    feature_columns=feature_columns,
    n_classes=num_classes,
    optimizer='Adam',
    dropout=dropout,
    config=run_config)

# The tf.estimator API receives its data through a callable that returns
# a dataset; this one returns a DALI dataset created under '/gpu:0'
def train_data_fn():
    """Build the GPU-placed DALI dataset in the form the Estimator expects.

    Returns:
        A dataset whose elements are ``({'images': features}, labels)``
        pairs, i.e. the image batch is wrapped in a dict under the key
        ``'images'``.
    """
    def as_feature_dict(features, labels):
        # Wrap the image batch in a dict under the 'images' key
        return ({'images': features}, labels)

    with tf.device('/gpu:0'):
        gpu_pipeline = MnistPipeline(device='gpu', device_id=0)
        dataset = dali_tf.DALIDataset(
            pipeline=gpu_pipeline,
            batch_size=batch_size,
            shapes=shapes,
            dtypes=dtypes,
            device_id=0)
        return dataset.map(as_feature_dict)

With everything set up we are ready to run the training.

In [None]:
# Running the training
# Batches come from `train_data_fn`; the total number of steps is
# `epochs * iterations`.
model.train(input_fn=train_data_fn, steps=epochs * iterations)

In [None]:
# NOTE(review): evaluation uses the same input function as training
# (`train_data_fn`), so the reported metrics are measured on training data.
model.evaluate(input_fn=train_data_fn, steps=iterations)

### Custom models and training loops

Finally, the last part of this tutorial focuses on integrating a DALI dataset with custom models and training loops.

In [None]:
# Build the training graph by hand: input iterator, a small dense network,
# the loss, the optimizer step and an accuracy metric -- all under '/gpu:0'.
with tf.device('/gpu:0'):
    daliset = train_data_fn()

    iterator = tf.data.make_initializable_iterator(daliset)
    # `train_data_fn` yields ({'images': batch}, labels) pairs, so the image
    # tensor has to be taken out of the feature dict before it can be used.
    features, labels = iterator.get_next()
    images = features['images']

    # Flatten every image into a vector and one-hot encode the labels
    images = tf.reshape(images, [batch_size, image_size * image_size])
    labels = tf.reshape(
        tf.one_hot(labels, num_classes),
        [batch_size, num_classes])

    with tf.variable_scope('mnist_net', reuse=False):
        images = tf.layers.flatten(images)
        images = tf.layers.dense(images, hidden_size, activation=tf.nn.relu)
        images = tf.layers.dropout(images, rate=dropout, training=True)
        # No activation on the last layer: the loss below applies softmax
        # itself and expects unscaled logits.
        images = tf.layers.dense(images, num_classes, activation=None)

    logits_train = images
    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits_train, labels=labels))
    train_step = tf.train.AdamOptimizer().minimize(loss_op)

    # argmax over logits picks the same class as argmax over probabilities
    correct_pred = tf.equal(
        tf.argmax(logits_train, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))