In [1]:
from collections import defaultdict
import ndll.plugin.tf as ndll_tf
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import init_ops

from ndll.pipeline import Pipeline
import ndll.ops as ops
import ndll.types as types
import ndll.tfrecord as tfrec
import numpy as np
from timeit import default_timer as timer
import argparse
import time
import math
import os
import sys

# NCCL is optional: record whether it could be imported so the multi-GPU
# helpers can fall back to the parameter-server code path.
try:
    from tensorflow.contrib import nccl
    have_nccl = True
except ImportError:
    have_nccl = False
    print("WARNING: NCCL support not available")

__version__ = "1.4"

# RecordIO dataset read by the NDLL pipeline (HybridPipe).
base = "/data/imagenet/train-val-recordio-256/"
#base = '/opt/ndll/examples/recordio/'
idx_files = [base + "val.idx"]
rec_files = [base + "val.rec"]

# TFRecord dataset read by the TensorFlow-native ImagePreprocessor.
TFRECORD_DIR = "/data/imagenet/train-val-tfrecord-480-subset/"
#TFRECORD_DIR = "/opt/ndll/examples/train-val-tfrecord-480-subset/"

BATCH_SIZE = 128
DEVICES = 1
# The record count is taken as the number of lines in the index file.
# The context manager closes the handle instead of leaking it.
with open(idx_files[0]) as idx_file:
    NRECORDS = sum(1 for _ in idx_file)

# True: feed the model from the NDLL pipeline; False: use the TF input path.
NDLL_ON = True

model_dtype = tf.float32 # or tf.float16
nlayer = 50
nclass = 1000

total_batch_size = BATCH_SIZE * DEVICES

learning_rate = 0.001
weight_decay = 1e-4
momentum = 0.9
lr_decay_policy = 'step'
lr_decay_epochs = 30
lr_decay_rate = 0.1
loss_scale = 1
nccl_on = False
# Steps excluded from the throughput statistics at the start of training.
nstep_burnin = 20
display_every = 1
# RecordInput buffer size, capped at the dataset size.
input_buffer_size = min(10000, NRECORDS)

# Resnet[nlayer] model

In [2]:
class DummyScope(object):
    """Context manager that does nothing on entry or exit.

    Used as a stand-in wherever an optional scope object is expected.
    """

    def __enter__(self):
        # Nothing to set up; `with DummyScope() as s` binds `s` to None.
        return None

    def __exit__(self, exc_type, exc_val, exc_tb):
        # A falsy return value lets any in-flight exception propagate.
        return None


def resnet_bottleneck_v1(net, input_layer, depth, depth_bottleneck, stride,
                         basic=False):
    """Builds one ResNet v1 residual block on top of `input_layer`.

    Args:
        net: network builder providing `conv`, `pool`, `activate` and
            `jit_scope`.
        input_layer: input tensor; its dimension 1 is compared with `depth`
            to choose the shortcut type.
        depth: number of output filters of the block.
        depth_bottleneck: number of filters of the inner convolutions.
        stride: stride applied by the first convolution and the shortcut.
        basic: if True use two 3x3 convolutions, otherwise the
            1x1 / 3x3 / 1x1 bottleneck.

    Returns:
        The activated sum of the main branch and the shortcut.
    """
    in_channels = input_layer.get_shape().as_list()[1]
    strides = (stride, stride)
    with tf.name_scope('resnet_v1'):
        # The shortcut is created before the main branch; the builder numbers
        # layers in call order, so this ordering is kept.
        if depth != in_channels:
            shortcut = net.conv(input_layer, depth, (1,1), strides,
                                activation='LINEAR')
        elif stride != 1:
            shortcut = net.pool(input_layer, 'MAX', (1,1), strides)
        else:
            shortcut = input_layer

        if basic:
            out = net.conv(input_layer, depth_bottleneck, (3,3), strides,
                           padding='SAME_RESNET')
            out = net.conv(out, depth, (3,3), activation='LINEAR')
        else:
            out = net.conv(input_layer, depth_bottleneck, (1,1), strides)
            out = net.conv(out, depth_bottleneck, (3,3), padding='SAME')
            out = net.conv(out, depth, (1,1), activation='LINEAR')

        with net.jit_scope():
            out = net.activate(out + shortcut)
        return out

def resnext_split_branch(net, input_layer, stride):
    """Builds one ResNeXt branch: a strided 1x1 conv followed by a 3x3 conv.

    Both convolutions use `net.bottleneck_width` filters, batch norm and
    ReLU. Returns the output of the second convolution.
    """
    with tf.name_scope('resnext_split_branch'):
        reduced = net.conv(input_layer, net.bottleneck_width, (1, 1),
                           (stride, stride),
                           activation='RELU', use_batch_norm=True)
        branch = net.conv(reduced, net.bottleneck_width, (3, 3), (1, 1),
                          activation='RELU', use_batch_norm=True)
    return branch

def resnext_shortcut(net, input_layer, stride, input_size, output_size):
    """Builds the shortcut connection of a ResNeXt block.

    `net.shortcut_type` selects the behaviour:
      * 'C', or 'B' with differing sizes: 1x1 strided conv with batch norm.
      * otherwise, equal sizes and stride != 1: 1x1 max-pool with that stride.
      * anything else: the input is returned unchanged.
    """
    kind = net.shortcut_type
    needs_projection = (kind == 'C') or (kind == 'B' and
                                         input_size != output_size)
    with tf.name_scope('resnext_shortcut'):
        if needs_projection:
            return net.conv(input_layer, output_size, (1,1), (stride, stride),
                            use_batch_norm=True)
        if output_size == input_size and stride != 1:
            return net.pool(input_layer, 'MAX', (1,1), (stride, stride))
        # Identity: either nothing to adapt, or no projection was requested.
        return input_layer

def resnext_bottleneck_v1(net, input_layer, depth, depth_bottleneck, stride):
    """Builds one ResNeXt block: parallel branches, concat, 1x1 conv, add.

    Args:
        net: network builder; must expose `cardinality` (number of branches)
            in addition to what `resnext_shortcut` and `resnext_split_branch`
            read from it.
        input_layer: input tensor; its dimension 1 is passed to
            `resnext_shortcut` as the input size.
        depth: number of output filters of the block.
        depth_bottleneck: not read by this function; it keeps the signature
            parallel to `resnet_bottleneck_v1` so either can serve as the
            block callback.
        stride: stride applied by the shortcut and by every branch.

    Returns:
        ReLU of the merged branches plus the shortcut.
    """
    num_inputs = input_layer.get_shape().as_list()[1]
    with tf.name_scope('resnext_bottleneck_v1'):
        shortcut = resnext_shortcut(net, input_layer, stride, num_inputs, depth)
        branches = [resnext_split_branch(net, input_layer, stride)
                    for _ in range(net.cardinality)]
        merged = tf.concat(values=branches, axis=1, name='concat')
        # NOTE(review): with GPUNetworkBuilder, activation=None selects the
        # builder's default activation rather than 'LINEAR', so an activation
        # may run before the residual add - confirm this is intended.
        x = net.conv(merged, depth, (1, 1), (1, 1), activation=None)
        x = net.activate(x + shortcut, 'RELU')
    return x

def inference_residual(net, input_layer, layer_counts, bottleneck_callback):
    """Builds a four-stage residual trunk and returns its pooled features.

    Args:
        net: network builder; `use_batch_norm` is switched on here.
        input_layer: raw input handed to `net.input_layer`.
        layer_counts: number of blocks in each of the four stages.
        bottleneck_callback: called as
            `bottleneck_callback(net, x, depth, depth_bottleneck, stride)`
            to build each block.

    Returns:
        The result of `net.spatial_avg` applied to the last block's output.
    """
    # (output depth, bottleneck depth, stride of the first block) per stage.
    stage_specs = (
        (256,   64, 1),
        (512,  128, 2),
        (1024, 256, 2),
        (2048, 512, 2),
    )
    net.use_batch_norm = True
    x = net.input_layer(input_layer)
    x = net.conv(x, 64,    (7,7), (2,2), padding='SAME_RESNET')
    x = net.pool(x, 'MAX', (3,3), (2,2), padding='SAME')
    for stage, (depth, depth_bottleneck, first_stride) in enumerate(stage_specs):
        for block in range(layer_counts[stage]):
            # Only the first block of a stage may change the resolution.
            block_stride = first_stride if block == 0 else 1
            x = bottleneck_callback(net, x, depth, depth_bottleneck,
                                    block_stride)
    return net.spatial_avg(x)

def inference_resnet_v1_impl(net, input_layer, layer_counts):
    """Builds the residual trunk using `resnet_bottleneck_v1` blocks."""
    block_builder = resnet_bottleneck_v1
    return inference_residual(net, input_layer, layer_counts, block_builder)

def inference_resnet_v1(net, input_layer, nlayer):
    """Deep Residual Networks family of models
    https://arxiv.org/abs/1512.03385

    `nlayer` selects the per-stage block counts; only 50 and 152 are
    supported and any other value raises ValueError.
    """
    supported = (
        (50,  [3, 4,  6, 3]),
        (152, [3, 8, 36, 3]),
    )
    for depth, layer_counts in supported:
        if nlayer == depth:
            return inference_resnet_v1_impl(net, input_layer, layer_counts)
    raise ValueError("Invalid nlayer (%i); must be one of: 50,152" % nlayer)

In [3]:
class GPUNetworkBuilder(object):
    """This class provides convenient methods for constructing feed-forward
    networks with internal data layout of 'NCHW'.

    Layers are named from a per-type running counter ('conv0', 'conv1', ...),
    so the order in which the layer methods are called determines the
    variable scope names.
    """
    def __init__(self,
                 is_training,
                 dtype=tf.float32,
                 activation='RELU',
                 use_batch_norm=True,
                 batch_norm_config=None):
        """Stores the options shared by every layer.

        Args:
            is_training: forwarded to batch norm as `is_training`.
            dtype: dtype the input is cast to, and default variable dtype.
            activation: default activation, used when a layer is called with
                `activation=None`; a name or a `(name, param, ...)` tuple.
            use_batch_norm: default choice between batch norm and a bias.
            batch_norm_config: keyword arguments forwarded to
                `tf.contrib.layers.batch_norm`; None selects the defaults
                defined below.
        """
        if batch_norm_config is None:
            # Built per instance so that builders never share one mutable
            # default dictionary.
            batch_norm_config = {'decay':   0.9,
                                 'epsilon': 1e-4,
                                 'scale':   True,
                                 'zero_debias_moving_mean': False}
        self.dtype             = dtype
        self.activation_func   = activation
        self.is_training       = is_training
        self.use_batch_norm    = use_batch_norm
        self.batch_norm_config = batch_norm_config
        self._layer_counts     = defaultdict(lambda: 0)
        # Called as `with self.jit_scope():`; the default is a no-op scope.
        self.jit_scope = DummyScope

    def _count_layer(self, layer_type):
        """Returns the next unused name for `layer_type`, e.g. 'conv3'."""
        idx  = self._layer_counts[layer_type]
        name = layer_type + str(idx)
        self._layer_counts[layer_type] += 1
        return name
    def _get_variable(self, name, shape, dtype=None,
                      initializer=None, seed=None):
        """Creates or fetches a variable through `tf.get_variable`.

        `dtype` defaults to the builder dtype. `initializer` defaults to
        Glorot-uniform (using `seed`); a plain int/float is turned into a
        constant initializer.
        """
        if dtype is None:
            dtype = self.dtype
        if initializer is None:
            initializer = init_ops.glorot_uniform_initializer(seed=seed)
        elif (isinstance(initializer, float) or
              isinstance(initializer, int)):
            initializer = tf.constant_initializer(float(initializer))
        return tf.get_variable(name, shape, dtype, initializer)
    def _to_nhwc(self, x):
        """Transposes a 4D tensor from NCHW to NHWC."""
        return tf.transpose(x, [0,2,3,1])
    def _from_nhwc(self, x):
        """Transposes a 4D tensor from NHWC to NCHW."""
        return tf.transpose(x, [0,3,1,2])
    def _bias(self, input_layer):
        """Adds a zero-initialised bias along dimension 1."""
        num_outputs = input_layer.get_shape().as_list()[1]
        biases = self._get_variable('biases', [num_outputs], input_layer.dtype,
                                    initializer=0)
        if len(input_layer.get_shape()) == 4:
            return tf.nn.bias_add(input_layer, biases,
                                  data_format='NCHW')
        else:
            return input_layer + biases
    def _batch_norm(self, input_layer, scope):
        """Applies fused NCHW batch norm configured by `batch_norm_config`."""
        return tf.contrib.layers.batch_norm(input_layer,
                                            is_training=self.is_training,
                                            scope=scope,
                                            data_format='NCHW',
                                            fused=True,
                                            **self.batch_norm_config)
    def _bias_or_batch_norm(self, input_layer, scope, use_batch_norm):
        """Applies batch norm or a bias; None defers to the builder default."""
        if use_batch_norm is None:
            use_batch_norm = self.use_batch_norm
        if use_batch_norm:
            return self._batch_norm(input_layer, scope)
        else:
            return self._bias(input_layer)
    def input_layer(self, input_layer):
        """Converts input data into the internal format"""
        with self.jit_scope():
            # for ndll
            x = self._from_nhwc(input_layer)
            #x = input_layer
            x = tf.cast(x, self.dtype)
            # Rescale and shift to [-1,1]
            x = x * (1./127.5) - 1
        #print("input layer")
        #print(x.get_shape())
        return x
    def conv(self, input_layer, num_filters, filter_size,
             filter_strides=(1,1), padding='SAME',
             activation=None, use_batch_norm=None):
        """Applies a 2D convolution layer that includes bias or batch-norm
        and an activation function.

        `padding` accepts the TensorFlow values plus 'SAME_RESNET', which
        pads the spatial axes explicitly and then convolves with 'VALID'.
        """
        num_inputs = input_layer.get_shape().as_list()[1]
        kernel_shape = [filter_size[0], filter_size[1],
                        num_inputs, num_filters]
        strides = [1, 1, filter_strides[0], filter_strides[1]]
        with tf.variable_scope(self._count_layer('conv')) as scope:
            kernel = self._get_variable('weights', kernel_shape,
                                        input_layer.dtype)
            if padding == 'SAME_RESNET': # ResNet models require custom padding
                rate = 1
                # Each spatial axis is padded from its own kernel extent; for
                # square kernels this equals padding both axes identically.
                spatial_pads = []
                for k in filter_size:
                    kernel_size_effective = k + (k - 1) * (rate - 1)
                    pad_total = kernel_size_effective - 1
                    pad_beg = pad_total // 2
                    pad_end = pad_total - pad_beg
                    spatial_pads.append([pad_beg, pad_end])
                input_layer = tf.pad(input_layer,
                                     [[0, 0], [0, 0]] + spatial_pads)
                padding = 'VALID'
            x = tf.nn.conv2d(input_layer, kernel, strides,
                             padding=padding, data_format='NCHW')
            #print(x.get_shape())
            x = self._bias_or_batch_norm(x, scope, use_batch_norm)
            x = self.activate(x, activation)
            return x
    def activate(self, input_layer, funcname=None):
        """Applies an activation function

        `funcname` is an activation name, a `(name, param, ...)` tuple, or
        None for the builder default. 'LINEAR' returns the input untouched.
        'LRELU' takes its slope as the first tuple parameter. Unknown names
        raise KeyError.
        """
        # Resolve the default first so a tuple default is unpacked as well.
        if funcname is None:
            funcname = self.activation_func
        params = ()
        if isinstance(funcname, tuple):
            # Split name and parameters in one step: rebinding `funcname`
            # first would slice the parameters out of the name string.
            funcname, params = funcname[0], funcname[1:]
        if funcname == 'LINEAR':
            return input_layer
        if funcname == 'LRELU' and not params:
            raise ValueError(
                "LRELU needs a slope, e.g. activation=('LRELU', 0.1)")
        activation_map = {
            'RELU':    tf.nn.relu,
            'RELU6':   tf.nn.relu6,
            'ELU':     tf.nn.elu,
            'SIGMOID': tf.nn.sigmoid,
            'TANH':    tf.nn.tanh,
            'LRELU':   lambda x, name: tf.maximum(params[0]*x, x, name=name)
        }
        return activation_map[funcname](input_layer, name=funcname.lower())
    def pool(self, input_layer, funcname, window_size,
                 window_strides=(2,2),
                 padding='VALID'):
        """Applies spatial pooling ('MAX' or 'AVG') in NCHW layout"""
        pool_map = {
            'MAX': tf.nn.max_pool,
            'AVG': tf.nn.avg_pool
        }
        kernel_size    = [1, 1, window_size[0], window_size[1]]
        kernel_strides = [1, 1, window_strides[0], window_strides[1]]
        return pool_map[funcname](input_layer, kernel_size, kernel_strides,
                                  padding, data_format='NCHW',
                                  name=funcname.lower())
    def spatial_avg(self, input_layer):
        """Averages over spatial dimensions (4D->2D)"""
        return tf.reduce_mean(input_layer, [2, 3], name='spatial_avg')
    def fully_connected(self, input_layer, num_outputs, activation=None):
        """Applies a fully-connected set of weights"""
        num_inputs = input_layer.get_shape().as_list()[1]
        kernel_size = [num_inputs, num_outputs]
        with tf.variable_scope(self._count_layer('fully_connected')):
            kernel = self._get_variable('weights', kernel_size,
                                        input_layer.dtype)
            x = tf.matmul(input_layer, kernel)
            x = self._bias(x)
            x = self.activate(x, activation)
            return x
    def residual(self, input_layer, net, scale=1.0, activation='RELU'):
        """Applies a residual layer

        `net` is called as `net(self, input_layer)` to build the main branch.
        When the branch changes the channel count or the spatial size, the
        input is projected with a strided 1x1 linear convolution before the
        two are added.
        """
        input_size     = input_layer.get_shape().as_list()
        num_inputs     = input_size[1]
        output_layer   = scale*net(self, input_layer)
        output_size    = output_layer.get_shape().as_list()
        num_outputs    = output_size[1]
        kernel_strides = (input_size[2]//output_size[2],
                          input_size[3]//output_size[3])
        with tf.name_scope('residual'):
            if (num_outputs != num_inputs or
                kernel_strides[0] != 1 or
                kernel_strides[1] != 1):
                input_layer = self.conv(input_layer, num_outputs, [1, 1],
                                        kernel_strides, activation='LINEAR')
            with self.jit_scope():
                x = self.activate(input_layer + output_layer, activation)
            return x

# TF vanilla image loading

In [4]:
def deserialize_image_record(record):
    """Parses one serialized example into image bytes, label, bbox and text.

    Returns:
        (imgdata, label, bbox, text): the encoded image string, the label
        cast to int32, the bounding boxes stacked in
        (ymin, xmin, ymax, xmax) order and transposed with a leading
        dimension of 1, and the class text.
    """
    feature_map = {
        'image/encoded':     tf.FixedLenFeature([ ], tf.string, ''),
        'image/class/label': tf.FixedLenFeature([1], tf.int64,  -1),
        'image/class/text':  tf.FixedLenFeature([ ], tf.string, ''),
    }
    # A record may carry any number of boxes, hence the variable-length type.
    for edge in ('xmin', 'ymin', 'xmax', 'ymax'):
        feature_map['image/object/bbox/%s' % edge] = tf.VarLenFeature(
            dtype=tf.float32)

    with tf.name_scope('deserialize_image_record'):
        parsed  = tf.parse_single_example(record, feature_map)
        imgdata = parsed['image/encoded']
        label   = tf.cast(parsed['image/class/label'], tf.int32)
        corners = [parsed['image/object/bbox/%s' % edge].values
                   for edge in ['ymin', 'xmin', 'ymax', 'xmax']]
        bbox    = tf.stack(corners)
        bbox    = tf.transpose(tf.expand_dims(bbox, 0), [0,2,1])
        text    = parsed['image/class/text']
    return imgdata, label, bbox, text

def decode_jpeg(imgdata, channels=3):
    """Decodes JPEG bytes with the 'INTEGER_FAST' DCT and no fancy upscaling."""
    decode_options = dict(channels=channels,
                          fancy_upscaling=False,
                          dct_method='INTEGER_FAST')
    return tf.image.decode_jpeg(imgdata, **decode_options)

def decode_png(imgdata, channels=3):
    """Decodes PNG bytes into an image tensor with `channels` channels."""
    image = tf.image.decode_png(imgdata, channels=channels)
    return image

def random_crop_and_resize_image(image, bbox, height, width):
    """Takes a random distorted crop of `image` and resizes it.

    The crop is sampled with `tf.image.sample_distorted_bounding_box` around
    `bbox` (the whole image is used when no box is given) and then resized
    bilinearly to `height` x `width`. The result has static shape
    [height, width, 3].
    """
    with tf.name_scope('random_crop_and_resize'):
        # Third result (the sampled box itself) is not needed here.
        crop_begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
            tf.shape(image),
            bounding_boxes=bbox,
            min_object_covered=0.1,
            aspect_ratio_range=[0.8, 1.25],
            area_range=[0.1, 1.0],
            max_attempts=100,
            use_image_if_no_bounding_boxes=True)
        cropped = tf.slice(image, crop_begin, crop_size)
        resized = tf.image.resize_images(
            cropped,
            [height, width],
            tf.image.ResizeMethod.BILINEAR,
            align_corners=False)
        # The sampled crop size is dynamic, so pin the static shape.
        resized.set_shape([height, width, 3])
        return resized

def stage(tensors):
    """Stages the given tensors in a StagingArea for asynchronous put/get.

    Returns:
        (put_op, staged): the op that pushes `tensors` into the staging
        area, and the tensors read back from it, each reshaped to the static
        shape of its source tensor.
    """
    dtypes = [tensor.dtype for tensor in tensors]
    shapes = [tensor.get_shape() for tensor in tensors]
    area = data_flow_ops.StagingArea(dtypes=dtypes, shapes=shapes)
    put_op = area.put(tensors)
    fetched = area.get()

    staged = []
    for out, source in zip(fetched, tensors):
        staged.append(tf.reshape(out, source.get_shape()))
    return put_op, staged


class ImagePreprocessor(object):
    """TensorFlow-native input pipeline producing per-device minibatches.

    Records are read from the files matching `<TFRECORD_DIR>/<subset>-*`,
    decoded, randomly cropped/resized/flipped and distributed round-robin
    over `DEVICES` devices.
    """
    def __init__(self, height, width, subset='train', dtype=tf.uint8):
        """Stores the output image size, dataset subset and output dtype."""
        self.height = height
        self.width  = width
        self.num_devices = DEVICES
        self.subset = subset
        self.dtype = dtype
        self.nsummary = 10 # Max no. images to generate summaries for
    def preprocess(self, imgdata, bbox, thread_id):
        """Decodes and augments one image; returns the augmented tensor.

        Image summaries are emitted only for `thread_id < self.nsummary`.
        """
        with tf.name_scope('preprocess_image'):
            try:
                image = decode_jpeg(imgdata)
            except Exception:
                # Fall back to PNG if the JPEG decode op cannot be built.
                # Catching Exception (not a bare except) keeps
                # KeyboardInterrupt and SystemExit from being swallowed.
                image = decode_png(imgdata)
            if thread_id < self.nsummary:
                image_with_bbox = tf.image.draw_bounding_boxes(
                    tf.expand_dims(tf.to_float(image), 0), bbox)
                tf.summary.image('original_image_and_bbox', image_with_bbox)
            image = random_crop_and_resize_image(image, bbox,
                                                 self.height, self.width)
            if thread_id < self.nsummary:
                tf.summary.image('cropped_resized_image',
                                 tf.expand_dims(image, 0))
            image = tf.image.random_flip_left_right(image)
            if thread_id < self.nsummary:
                tf.summary.image('flipped_image',
                                 tf.expand_dims(image, 0))
        return image
    def device_minibatches(self, total_batch_size):
        """Builds the input graph for one step.

        Args:
            total_batch_size: number of records read per step, summed over
                all devices.

        Returns:
            (images, labels): dictionaries keyed by device index. Each image
            entry is a [-1, height, width, 3] batch clipped to [0, 255] and
            cast to `self.dtype`; each label entry is the concatenation of
            that device's labels.
        """
        record_input = data_flow_ops.RecordInput(
            file_pattern=os.path.join(TFRECORD_DIR, '%s-*' % self.subset),
            parallelism=64,
            # Note: This causes deadlock during init if larger than dataset preprocess
            buffer_size=input_buffer_size,
            batch_size=total_batch_size)
        records = record_input.get_yield_op()
        # Split batch into individual images
        records = tf.split(records, total_batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        # Deserialize and preprocess images into batches for each device
        images = defaultdict(list)
        labels = defaultdict(list)
        with tf.name_scope('input_pipeline'):
            for i, record in enumerate(records):
                imgdata, label, bbox, text = deserialize_image_record(record)
                image = self.preprocess(imgdata, bbox, thread_id=i)
                label -= 1 # Change to 0-based (don't use background class)
                # Round-robin assignment of records to devices.
                device_num = i % self.num_devices
                images[device_num].append(image)
                labels[device_num].append(label)
            # Stack images back into a sub-batch for each device
            for device_num in range(self.num_devices):
                images[device_num] = tf.parallel_stack(images[device_num])
                labels[device_num] = tf.concat(labels[device_num], 0)
                images[device_num] = tf.reshape(images[device_num],
                                                [-1, self.height, self.width, 3])
                images[device_num] = tf.clip_by_value(images[device_num], 0., 255.)
                images[device_num] = tf.cast(images[device_num], self.dtype)
        return images, labels

# DALI pipeline preparation

In [5]:
class HybridPipe(Pipeline):
    def __init__(self, batch_size, num_threads, device_id, num_gpus, pipelined = True, async = True):
        super(HybridPipe, self).__init__(batch_size, num_threads, device_id, pipelined, async)
        self.input = ops.MXNetReader(path = rec_files, index_path = idx_files, shard_id = device_id, num_shards = num_gpus)

        self.huffman = ops.HuffmanDecoder()
        self.idct = ops.DCTQuantInv(device = "gpu", output_type = types.RGB)
        self.resize = ops.Resize(device = "gpu", random_resize = True,
                                 resize_a = 256, resize_b = 480,
                                 image_type = types.RGB,
                                 interp_type = types.INTERP_LINEAR)
        self.cmn = ops.CropMirrorNormalize(device = "gpu",
                                            output_dtype = types.FLOAT16,
                                            random_crop = True,
                                            crop = (224, 224),
                                            image_type = types.RGB,
                                            mean = [128., 128., 128.],
                                            std = [1., 1., 1.],
                                            output_layout=types.NHWC)
        self.iter = 0

    def define_graph(self):
        inputs, labels = self.input(name="Reader")
        dct_coeff, jpeg_meta = self.huffman(inputs)
        images = self.idct(dct_coeff.gpu(), jpeg_meta)
        images = self.resize(images)
        output = self.cmn(images)
        return (output, labels.gpu())

    def iter_setup(self):
        pass

class NdllPreprocessor(object):
    def __init__(self, height, width, batch_size, dtype=tf.uint8):
        self.height = height
        self.width  = width
        self.batch = batch_size
        pipe = HybridPipe(batch_size=self.batch, num_threads=2, device_id = 0, num_gpus = 1, pipelined = True, async = True)
        serialized_pipe = pipe.serialize()
        del pipe
        ndllop = ndll_tf.NDLLIterator()
        self.images, self.labels = ndllop(serialized_pipeline = serialized_pipe,
                batch_size = batch_size,
                height = 224,
                width = 224)
    def get_device_minibatch(self):
        return self.images, self.labels                                                                                     

# Trainer

In [6]:
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Custom getter that stores trainable variables as float32.

    Trainable variables are created in float32 and, when another dtype was
    requested, returned as a cast to that dtype. Non-trainable variables are
    created in the requested dtype unchanged.
    """
    needs_cast = trainable and dtype != tf.float32
    if trainable:
        storage_dtype = tf.float32
    else:
        storage_dtype = dtype
    variable = getter(name, shape, dtype=storage_dtype,
                      initializer=initializer, regularizer=regularizer,
                      trainable=trainable,
                      *args, **kwargs)
    if needs_cast:
        return tf.cast(variable, dtype)
    return variable

def all_avg_gradients(tower_gradvars, devices, param_server_device='/gpu:0'):
    """Averages gradients across towers.

    Args:
        tower_gradvars: one list of (gradient, variable) pairs per tower.
        devices: device names, one per tower.
        param_server_device: device on which the mean is computed when NCCL
            is not used.

    Returns:
        `tower_gradvars` itself for a single device; otherwise one sequence
        of (averaged gradient, variable) pairs per tower.
    """
    if len(devices) == 1:
        return tower_gradvars

    if not (have_nccl and nccl_on):
        # Parameter-server path: average each layer once and hand the same
        # averaged tensor to every tower.
        tower_count = len(tower_gradvars)
        per_layer = []
        for layer_gradvars in zip(*tower_gradvars):
            layer_grads, layer_vars = zip(*layer_gradvars)
            with tf.device(param_server_device):
                mean_grad = tf.reduce_mean(tf.stack(layer_grads), 0)
            per_layer.append(zip([mean_grad]*tower_count, layer_vars))
        return list(zip(*per_layer))

    # NCCL path: flatten each tower's gradients into one contiguous vector so
    # a single all-reduce covers every parameter.
    flattened = []
    for device, gradvars in zip(devices, tower_gradvars):
        with tf.device(device):
            pieces = [tf.reshape(grad, [-1]) for (grad, _) in gradvars]
            flattened.append(tf.concat(pieces, 0))

    reduced = nccl.all_sum(flattened)

    averaged_towers = []
    for device, total, gradvars in zip(devices, reduced, tower_gradvars):
        with tf.device(device):
            sizes = [tf.size(grad) for (grad, _) in gradvars]
            chunks = tf.split(total, sizes)
            averaged = []
            for chunk, (grad, var) in zip(chunks, gradvars):
                # Restore the original shape, then turn the sum into a mean.
                restored = tf.reshape(chunk, tf.shape(grad))
                restored *= 1. / len(devices)
                averaged.append((restored, var))
            averaged_towers.append(averaged)
    return averaged_towers

def all_sync_params(tower_params, devices):
    """Assigns the params from the first tower to all others

    Returns a no-op for a single device, otherwise one grouped op. Without
    NCCL the loop also visits the first device, so tower 0 is assigned its
    own values as well.
    """
    if len(devices) == 1:
        return tf.no_op()

    assign_ops = []
    if have_nccl and nccl_on:
        receivers = devices[1:]
        for replicas in zip(*tower_params):
            # `replicas` holds one parameter as seen by each tower, tower 0 first.
            source, targets = replicas[0], replicas[1:]
            send_op, received_tensors = nccl.broadcast(source, receivers)
            assign_ops.append(send_op)
            for device, target, received in zip(receivers, targets,
                                                received_tensors):
                with tf.device(device):
                    assign_ops.append(target.assign(received))
    else:
        reference_params = tower_params[0]
        for device, params in zip(devices, tower_params):
            with tf.device(device):
                for param, reference in zip(params, reference_params):
                    assign_ops.append(param.assign(reference.read_value()))
    return tf.group(*assign_ops)

class FeedForwardTrainer(object):
    """Builds and drives a multi-tower training graph.

    One copy of the model (a "tower") is built per device; losses and
    gradients are averaged across towers and applied with a Nesterov
    momentum optimizer.
    """
    def __init__(self, preprocessor, loss_func, nstep_per_epoch=None):
        """Creates the global step and the learning-rate tensor.

        Args:
            preprocessor: an NdllPreprocessor or ImagePreprocessor.
            loss_func: called as `loss_func(images, labels, var_scope)` and
                expected to return `(loss, logits)`.
            nstep_per_epoch: steps per epoch, used to express the decay
                interval in steps. With None no schedule can be derived and
                the learning rate stays constant.
        """
        self.image_preprocessor = preprocessor
        self.loss_func          = loss_func
        with tf.device('/cpu:0'):
            self.global_step = tf.get_variable(
                'global_step', [],
                initializer=tf.constant_initializer(0),
                dtype=tf.int64,
                trainable=False)
        if nstep_per_epoch is None:
            # The declared default used to fail on `lr_decay_epochs * None`;
            # keep the base rate constant instead.
            self.learning_rate = tf.convert_to_tensor(learning_rate,
                                                      dtype=tf.float32)
        else:
            self.learning_rate = tf.train.exponential_decay(
                learning_rate,
                self.global_step,
                decay_steps=lr_decay_epochs*nstep_per_epoch,
                decay_rate=lr_decay_rate,
                staircase=True)
    def make_optimizer(self):
        """Returns a new Nesterov momentum optimizer."""
        opt = tf.train.MomentumOptimizer(self.learning_rate, momentum,
                                         use_nesterov=True)
        return opt
    def training_step(self, total_batch_size, devices):
        """Builds the graph for one training step.

        Args:
            total_batch_size: batch size summed over all devices; only used
                by the ImagePreprocessor input path.
            devices: device names, one tower is built per entry.

        Returns:
            (total_loss_avg, learning_rate, all_training_ops) where
            `all_training_ops` is the list
            [host preload group, device copy group, train+update group].
        """
        preload_ops = [] # CPU pre-load
        gpucopy_ops = [] # H2D transfer
        self.tower_params = []
        tower_losses   = []
        tower_gradvars = []
        tower_top1s    = []
        tower_top5s    = []
        if type(self.image_preprocessor) is NdllPreprocessor:
            dev_images, dev_labels = self.image_preprocessor.get_device_minibatch()
        else:
            with tf.device('/cpu:0'):
                dev_images, dev_labels = self.image_preprocessor.device_minibatches(
                    total_batch_size)

        # Each device has its own copy of the model, referred to as a tower
        for device_num, device in enumerate(devices):
            if type(self.image_preprocessor) is NdllPreprocessor:
                # The NDLL path exposes a single (images, labels) pair that
                # every tower reads.
                images, labels = dev_images, dev_labels
            else:
                images, labels = dev_images[device_num], dev_labels[device_num]
                with tf.device('/cpu:0'):
                    # Stage images on the host
                    preload_op, (images, labels) = stage([images, labels])
                    preload_ops.append(preload_op)

            with tf.device(device):
                if type(self.image_preprocessor) is ImagePreprocessor:
                    # Copy images from host to device
                    gpucopy_op, (images, labels) = stage([images, labels])
                    gpucopy_ops.append(gpucopy_op)

                # Evaluate the loss and compute the gradients
                with tf.variable_scope(
                        'GPU_%i' % device_num,
                        # Force all variables to be stored as float32
                        custom_getter=float32_variable_storage_getter) \
                        as var_scope, \
                     tf.name_scope('tower_%i' % device_num):
                    loss, logits = self.loss_func(images, labels, var_scope)
                    tower_losses.append(loss)
                    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope=var_scope.name)
                    self.tower_params.append(params)
                    # Apply loss scaling to improve numerical stability
                    if loss_scale != 1:
                        scale = loss_scale
                        grads  = [grad*(1./scale)
                                  for grad in tf.gradients(loss*scale, params)]
                    else:
                        grads = tf.gradients(loss, params)
                    gradvars = list(zip(grads, params))
                    tower_gradvars.append(gradvars)
                    with tf.device('/cpu:0'): # No in_top_k implem on GPU
                        labels = tf.cast(labels, tf.int32)
                        top1 = tf.reduce_mean(
                            tf.cast(tf.nn.in_top_k(logits, labels, 1), tf.float32))
                        top5 = tf.reduce_mean(
                            tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32))
                    tower_top1s.append(top1)
                    tower_top5s.append(top5)
        # Average the losses and gradients from each tower
        with tf.device('/cpu:0'):
            total_loss = tf.reduce_mean(tower_losses)
            total_top1 = tf.reduce_mean(tower_top1s)
            total_top5 = tf.reduce_mean(tower_top5s)
            averager = tf.train.ExponentialMovingAverage(0.90, name='loss_avg',
                                                         zero_debias=True)
            avg_op = averager.apply([total_loss])
            total_loss_avg = averager.average(total_loss)
            # Note: This must be done _after_ the averager.average() call
            #         because it changes total_loss into a new object.
            with tf.control_dependencies([avg_op]):
                total_loss     = tf.identity(total_loss)
                total_loss_avg = tf.identity(total_loss_avg)
            tf.summary.scalar('total loss raw', total_loss)
            tf.summary.scalar('total loss avg', total_loss_avg)
            tf.summary.scalar('train accuracy top-1 %', 100.*total_top1)
            tf.summary.scalar('train accuracy top-5 %', 100.*total_top5)
            tf.summary.scalar('learning rate', self.learning_rate)
        tower_gradvars = all_avg_gradients(tower_gradvars, devices)

        # Histograms are taken from the first tower only.
        for grad, var in tower_gradvars[0]:
            tf.summary.histogram(var.op.name + '/values', var)
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Apply the gradients to optimize the loss function
        train_ops = []
        for device_num, device in enumerate(devices):
            with tf.device(device):
                gradvars = tower_gradvars[device_num]
                opt = self.make_optimizer()
                train_op = opt.apply_gradients(gradvars)
                train_ops.append(train_op)
        # Combine all of the ops required for a training step
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
        with tf.device('/cpu:0'):
            increment_global_step_op = tf.assign_add(self.global_step, 1)
        update_ops.append(increment_global_step_op)
        self.enqueue_ops = []
        self.enqueue_ops.append(tf.group(*preload_ops))
        self.enqueue_ops.append(tf.group(*gpucopy_ops))
        train_and_update_ops = tf.group(*(train_ops + update_ops))
        all_training_ops = (self.enqueue_ops + [train_and_update_ops])
        return total_loss_avg, self.learning_rate, all_training_ops
    def init(self, sess, devices):
        """Initializes all variables, then copies tower 0 to the others.

        Must be called after `training_step`, which sets `tower_params`.
        """
        init_op = tf.global_variables_initializer()
        sync_op = all_sync_params(self.tower_params, devices)
        sess.run(init_op)
        sess.run(sync_op)
    def prefill_pipeline(self, sess):
        """Runs the enqueue ops in growing prefixes to fill each stage."""
        # Pre-fill the input pipeline with data
        for i in range(len(self.enqueue_ops)):
            sess.run(self.enqueue_ops[:i+1])

In [7]:
def model_func(net, images):
    """Builds the ResNet v1 trunk whose depth is the module-level `nlayer`."""
    return inference_resnet_v1(net, images, nlayer)

def loss_func(images, labels, var_scope):
    """Builds the forward model and its training loss.

    Args:
        images: input image batch handed to `model_func`.
        labels: class labels; cast to int32 when needed.
        var_scope: variable scope of the current tower; its trainable
            variables are the ones subject to weight decay.

    Returns:
        (loss, logits): softmax cross-entropy plus optional L2 weight decay,
        and the float32 logits.
    """
    net = GPUNetworkBuilder(True, dtype=model_dtype)
    # Temporary: the NDLL op does not yet report a static shape for `images`.
    features = model_func(net, images)
    # Final fully-connected layer producing `nclass` outputs.
    logits = net.fully_connected(features, nclass, activation='LINEAR')
    if logits.dtype != tf.float32:
        logits = tf.cast(logits, tf.float32)
    if labels.dtype != tf.int32:
        labels = tf.cast(labels, tf.int32)
    loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels)

    decay_enabled = weight_decay is not None and weight_decay != 0.
    if not decay_enabled:
        return loss, logits

    # Weight decay over every trainable variable of this tower.
    tower_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=var_scope.name)
    with net.jit_scope():
        l2_terms = [tf.nn.l2_loss(w) for w in tower_params]
        l2_loss = tf.add_n(l2_terms)
        if l2_loss.dtype != tf.float32:
            l2_loss = tf.cast(l2_loss, tf.float32)
        loss += weight_decay * l2_loss
    return loss, logits


In [None]:
height, width = 224, 224
# A fixed step count is used here instead of a full pass over the dataset.
#nstep_per_epoch = NRECORDS / BATCH_SIZE
nstep_per_epoch = 1000

if NDLL_ON:
    preprocessor = NdllPreprocessor(height, width, BATCH_SIZE)
else:
    preprocessor = ImagePreprocessor(height, width, "train")


trainer = FeedForwardTrainer(preprocessor, loss_func, nstep_per_epoch)
print("Building training graph")
devices = ['/gpu:%i' % i for i in range(DEVICES)]
# training_step expects the batch size summed over all devices, the same
# quantity the throughput report divides by; BATCH_SIZE is per device.
# NOTE: this rebinds the module-level `learning_rate` (a float until now) to
# the learning-rate tensor; the training cell fetches it under this name.
total_loss, learning_rate, train_ops = trainer.training_step(
    total_batch_size, devices)

In [None]:
# TODO: remove per_process_gpu_memory_fraction once DALI use TF allocator
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
config = tf.ConfigProto(gpu_options=gpu_options)
config.intra_op_parallelism_threads = 1

sess = tf.Session(config=config)

trainer.init(sess, devices)

print("Training")
print("  Step Epoch Img/sec   Loss   LR")

oom = False
batch_times = []
nstep = nstep_per_epoch
step0 = int(sess.run(trainer.global_step))
# The fetch list is the same for every step, so build it once.
ops_to_run = [total_loss, learning_rate] + train_ops

for step in range(step0, nstep):
    try:
        start_time = time.time()
        loss, lr = sess.run(ops_to_run)[:2]
        elapsed = time.time() - start_time
    except KeyboardInterrupt:
        print("Keyboard interrupt")
        break
    except tf.errors.ResourceExhaustedError:
        # Placeholder values for the final progress line; the loop stops
        # right after printing it.
        elapsed = -1.
        loss    = 0.
        lr      = -1
        oom = True

    # Only real measurements enter the statistics: skip the burn-in steps
    # and the -1 placeholder recorded for an out-of-memory step.
    if step >= nstep_burnin and not oom:
        batch_times.append(elapsed)
    img_per_sec = total_batch_size / elapsed
    if step == 0 or (step+1) % display_every == 0:
        epoch = step*total_batch_size // NRECORDS
        print("%6i %5i %7.1f %7.3f %7.5f" % (
            step+1, epoch+1, img_per_sec, loss, lr))
    if oom:
        break

# From here on `nstep` is the number of timed (post burn-in) steps.
nstep = len(batch_times)
if nstep > 0:
    batch_times = np.array(batch_times)
    speeds = total_batch_size / batch_times
    speed_mean = np.mean(speeds)
    if nstep > 2:
        # Standard error of the mean throughput.
        speed_uncertainty = np.std(speeds, ddof=1) / np.sqrt(float(nstep))
    else:
        speed_uncertainty = float('nan')
    # Median absolute deviation scaled by 1.4826, reported as jitter.
    speed_madstd = 1.4826*np.median(np.abs(speeds - np.median(speeds)))
    speed_jitter = speed_madstd
    print('-' * 64)
    print('Images/sec: %.1f +/- %.1f (jitter = %.1f)' % (
        speed_mean, speed_uncertainty, speed_jitter))
    print('-' * 64)
else:
    print("No results, did not get past burn-in phase (%i steps)" %
          nstep_burnin)

if oom:
    print("Out of memory error detected, exiting")
    sys.exit(-2)