In [1]:
import tensorflow as tf

from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import nvidia.dali.tfrecord as tfrec
import numpy as np
from timeit import default_timer as timer

# Dataset location: a validation split stored as a record file plus its index
# (the paths are consumed by the reader as `path` / `index_path`).
base = "/data/imagenet/train-val-recordio-256/"
idx_files = [base + "val.idx"]
rec_files = [base + "val.rec"]

# Benchmark settings.
BURNIN_STEPS = 6   # warm-up iterations left out of the reported average
BATCH_SIZE = 128   # samples produced by each pipeline per iteration
DEVICES = 2        # number of GPUs; one pipeline is created per device
ITERATIONS = 32    # total number of timed iterations

In [2]:
class HybridPipe(Pipeline):
    """DALI pipeline: read records, decode JPEGs, resize, crop and normalize.

    The reader takes its data from the module-level ``rec_files`` /
    ``idx_files`` lists. Decoding uses the "mixed" device and the resize and
    crop/normalize operators run on "gpu".

    Args:
        batch_size: Batch size forwarded to ``Pipeline``.
        num_threads: Number of worker threads forwarded to ``Pipeline``.
        device_id: GPU ordinal forwarded to ``Pipeline``; also used to pick
            this pipeline's reader shard.
        num_gpus: Total number of shards the dataset is split into. With
            ``num_gpus == 1`` every pipeline reads the whole dataset.

    Raises:
        ValueError: If ``num_gpus`` is smaller than 1.
    """

    def __init__(self, batch_size, num_threads, device_id, num_gpus):
        if num_gpus < 1:
            raise ValueError("num_gpus must be >= 1, got %r" % (num_gpus,))
        super(HybridPipe, self).__init__(batch_size,
                                         num_threads,
                                         device_id)
        # Give each device its own shard instead of hard-coding shard 0 of 1.
        # The modulo keeps shard_id < num_shards even when device ordinals do
        # not start at 0 or when num_gpus is 1.
        self.input = ops.MXNetReader(path = rec_files, index_path = idx_files,
                                     shard_id = device_id % num_gpus,
                                     num_shards = num_gpus)

        self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB)
        # Random resize bounded by resize_a / resize_b
        # (presumably the target size range -- confirm in the DALI Resize docs).
        self.resize = ops.Resize(device = "gpu", random_resize = True,
                                 resize_a = 256, resize_b = 480,
                                 image_type = types.RGB,
                                 interp_type = types.INTERP_LINEAR)
        # 224x224 crop, float output, mean 128 / std 1 per channel.
        self.cmn = ops.CropMirrorNormalize(device = "gpu",
                                            output_dtype = types.FLOAT,
                                            crop = (224, 224),
                                            image_type = types.RGB,
                                            mean = [128., 128., 128.],
                                            std = [1., 1., 1.])
        # Random values in the range (0.0, 1.0), used as crop positions.
        self.uniform = ops.Uniform(range = (0.0, 1.0))
        self.iter = 0

    def define_graph(self):
        """Wire the operators together and return ``(images, labels)``.

        Crop positions along x and y are drawn from two separate calls to the
        uniform generator. Labels are returned through ``labels.gpu()``.
        """
        inputs, labels = self.input(name="Reader")
        images = self.decode(inputs)
        images = self.resize(images)
        output = self.cmn(images, crop_pos_x = self.uniform(),
                          crop_pos_y = self.uniform())
        return (output, labels.gpu())

    def iter_setup(self):
        """No per-iteration setup is performed by this pipeline."""
        pass

In [3]:
# One pipeline per GPU. num_gpus is the real device count so that it agrees
# with the number of pipelines built here.
pipes = [
    HybridPipe(batch_size=BATCH_SIZE, num_threads=2,
               device_id=device_id, num_gpus=DEVICES)
    for device_id in range(DEVICES)
]

In [4]:
# Keep only the serialized form of each pipeline (indexed by device later on)
# and drop the Python-side pipeline objects.
serialized_pipes = list(map(lambda p: p.serialize(), pipes))
del pipes

In [None]:
import tensorflow as tf
import nvidia.dali.plugin.tf as dali_tf
import time
# Handle used to create one DALI-backed TensorFlow op per device.
daliop = dali_tf.DALIIterator()

# Let TensorFlow claim at most 80% of each GPU's memory
# (presumably to leave room for the DALI pipelines -- confirm).
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
config = tf.ConfigProto(gpu_options=gpu_options)

# Build one (image, label) op pair per device, each fed by that device's
# serialized pipeline.
images = []
labels = []
for d in range(DEVICES):
    print("DALIOP with device %i" % d)
    with tf.device('/gpu:%i' % d):
        image, label = daliop(serialized_pipeline = serialized_pipes[d],
            batch_size = BATCH_SIZE,
            height = 224,
            width = 224,
            device_id = d)
        images.append(image)
        labels.append(label)

# Stays None if no iteration runs, so the shape print below can be skipped.
res = None
with tf.Session(config=config) as sess:
    print("in session")
    all_img_per_sec = []
    total_batch_size = BATCH_SIZE * DEVICES

    for i in range(ITERATIONS):
        # `timer` is timeit.default_timer, which is better suited to measuring
        # short intervals than time.time().
        start_time = timer()
        res = sess.run([images, labels])
        elapsed_time = timer() - start_time
        img_per_sec = total_batch_size / elapsed_time
        # Skip exactly BURNIN_STEPS warm-up iterations (indices 0..BURNIN_STEPS-1).
        if i >= BURNIN_STEPS:
            all_img_per_sec.append(img_per_sec)
        print("\t%7.1f img/s" %  img_per_sec)

    # Avoid a ZeroDivisionError when every iteration fell into the burn-in window.
    if all_img_per_sec:
        print("TOTAL AVG %7.1f img/s" % (sum(all_img_per_sec) / len(all_img_per_sec)))
    else:
        print("TOTAL AVG     n/a (ITERATIONS must be greater than BURNIN_STEPS)")
    if res is not None:
        print(res[0][0][0].shape)

In [None]:
import matplotlib.pyplot as plt
# res[0] holds the per-device image batches; res[0][0] is the first device's batch.
print(res[0][0].shape)
# Assumes each sample is channel-first (C, H, W) -- confirm against the shape
# printed above. Move channels last for imshow while keeping rows and columns
# in place; a bare .transpose() would reverse all axes and swap height with
# width. Adding 128 undoes the mean subtraction (mean 128, std 1) applied by
# the pipeline.
img = res[0][0][10].transpose((1, 2, 0)) + 128
imgplot = plt.imshow(img.astype('uint8'))