# Building an input pipeline with the Dataset API

This script builds an input pipeline using the Dataset API from TensorFlow. It performs the following tasks:
1. parses images from TFRecords
2. resizes and normalises the input
3. shuffles the dataset and returns a batch of size "batch_size"
4. through an iterator, provides access to the batches inside a session

Furthermore, this notebook also implements the functions parse and input_fn. These functions take a dataset batch and select one element from each class, thereby performing uniform sampling of each class.

In [1]:
import tensorflow as tf
import os
import numpy as np
from matplotlib.image import imread
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes


In [2]:
# Directory that holds the training record files.
data_dir = '/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train'
# Seven files, data_batch_0.bin ... data_batch_6.bin. `range` is used instead
# of the Python-2-only `xrange` so the cell runs on both Python 2 and 3.
path_tfrecords_train = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
                        for i in range(0, 7)]
# sampling parameters
# Percentages scaled to fractions; presumably the target share of each of the
# five classes - confirm against the label distribution.
target_probs = np.array([77,  5, 12,  4,  2], dtype=np.float32)/100
batch_size = 1000
print(path_tfrecords_train)


['/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train/data_batch_0.bin', '/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train/data_batch_1.bin', '/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train/data_batch_2.bin', '/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train/data_batch_3.bin', '/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train/data_batch_4.bin', '/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train/data_batch_5.bin', '/home/olle/PycharmProjects/Diabetic_Retinopathy_Detection/data/train/data_batch_6.bin']


In [3]:
def undersampling_filter(example):
    """
    Decide at random whether the given example is accepted or rejected.

    The acceptance threshold is (class_target_prob / class_prob) raised to
    the power `undersampling_coef` and capped at 1.0; a uniform random draw
    is accepted when it does not exceed that threshold.

    NOTE(review): `undersampling_coef` is read from the enclosing scope and is
    not defined in the visible part of this file - confirm it exists before
    enabling this filter.

    Args:
        example: mapping that provides 'class_prob' and 'class_target_prob'.

    Returns:
        Boolean tensor that is True when the example is accepted.
    """
    observed = example['class_prob']
    wanted = example['class_target_prob']

    # Cap at 1.0 so over-represented targets are always accepted.
    threshold = tf.minimum(
        tf.cast(wanted / observed, dtype=tf.float32) ** undersampling_coef,
        1.0)

    draw = tf.random_uniform([], dtype=tf.float32)
    return tf.less_equal(draw, threshold)

In [4]:
def load_images(image_paths):
    """
    Read every image in `image_paths` from disk.

    Args:
        image_paths: iterable of paths readable by `imread`.

    Returns:
        The loaded images converted to a single numpy array.
    """
    loaded = []
    for image_path in image_paths:
        loaded.append(imread(image_path))

    return np.asarray(loaded)

In [5]:
def parse(serialized):
    """
    Decode one serialized example into an (image, label) pair.

    Args:
        serialized: serialized example holding a bytes feature 'image_raw'
            and an int64 feature 'label'.

    Returns:
        image: float32 tensor reshaped to [256, 256, 3], values divided by 255.
        label: the 'label' feature cast to int32.
    """
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    record = tf.parse_single_example(serialized=serialized,
                                     features=feature_spec)

    # Raw bytes -> uint8 -> float32, then scale by 1/255.
    pixels = tf.decode_raw(record['image_raw'], tf.uint8)
    pixels = tf.divide(tf.cast(pixels, tf.float32), 255)
    image = tf.reshape(pixels, [256, 256, 3])

    label = tf.cast(record['label'], tf.int32)

    return image, label

In [6]:
def input_fn(filenames, train, batch_size=batch_size, buffer_size=2048):
    """
    Build the TFRecords input pipeline and return the next-batch tensors.

    Args:
        filenames:   Filenames for the TFRecords files.
        train:       True for training (shuffle, repeat indefinitely),
                     False for testing (no shuffle, a single pass).
        batch_size:  Number of examples per returned batch.
        buffer_size: Size of the shuffle buffer used when training; the
                     shuffling is only as random as this buffer is large.

    Returns:
        (images_batch, labels_batch) tensors obtained from a one-shot iterator.
    """
    records = tf.data.TFRecordDataset(filenames=filenames)

    # Turn every serialized record into an (image, label) pair.
    examples = records.map(parse)

    # Testing: keep the file order and go through the data once.
    passes = 1
    if train:
        # Training: shuffle within the buffer and repeat without end
        # (None means "repeat forever").
        examples = examples.shuffle(buffer_size=buffer_size)
        passes = None

    # NOTE: the undersampling_filter step is currently not applied here.
    batches = examples.repeat(passes).batch(batch_size)

    images_batch, labels_batch = batches.make_one_shot_iterator().get_next()

    return images_batch, labels_batch

In [7]:
def train_input_fn():
    """Return the (images, labels) batch tensors for the training files."""
    return input_fn(path_tfrecords_train, True)

In [8]:
# Build the training pipeline; x and y are the image and label batch tensors
# that the cells below operate on.
x, y = train_input_fn()

In [None]:
# Scratch cell: shows what range(0, 10) evaluates to.
range(0,10)

In [21]:
# Exploratory per-class selection, hard-coded for class 4.
classes = tf.constant(4)
# Boolean tensor: True where the label equals the class.
condition = tf.equal(y, classes)
# Positions of the matching labels, one row per match.
indices = tf.where(condition)
# Number of matches, and that count divided by batch_size.
number_class = tf.size(indices)
class_prob = tf.divide(number_class,batch_size)

# One-hot boolean encoding of the matching positions over the batch.
mask = tf.one_hot(indices, depth=batch_size, dtype=tf.bool, on_value=True, off_value=False)


# Take the image and the label at the SAME (first) matching position.
# Previously the label was gathered from the second match (row 1), so the
# pair described two different examples and needed at least two matches.
x_sample = tf.gather(x,indices=tf.gather(indices,0),name=None)
y_sample = tf.gather(y,indices=tf.gather(indices,0),name=None)

In [19]:
def return_elem(classes):
    """
    Pick the first example of class `classes` from the current batch.

    Reads the module-level batch tensors `x` (images) and `y` (labels).
    The batch must contain at least one example of the class, because the
    first matching position is indexed unconditionally.

    Args:
        classes: class label to look for (e.g. a scalar tf.constant).

    Returns:
        (x_sample, y_sample): the image and the label gathered at the first
        position where `y` equals `classes`.
    """
    # Positions where the label equals the class, one row per match.
    indices = tf.where(tf.equal(y, classes))
    # Row 0 is the first match; the same position is used for image and
    # label so that the returned pair belongs to one example.
    first = tf.gather(indices, 0)
    x_sample = tf.gather(x, indices=first)
    y_sample = tf.gather(y, indices=first)
    return x_sample, y_sample

In [10]:
def extract_one_from_each_class(x,y):
    """
    Select one example of each class 4, 3, 2, 1, 0 from the batch (x, y).

    The selection is made on the `x` and `y` arguments themselves, so the
    function works for any batch passed in rather than only for the
    module-level tensors.

    Every class must occur at least once in the batch, because the first
    matching position of each class is indexed unconditionally.

    Args:
        x: batch of images.
        y: batch of integer labels aligned with `x`.

    Returns:
        (sample_x, sample_y): the selected images and labels concatenated
        along axis 0, in class order 4, 3, 2, 1, 0.
    """
    def _first_of_class(label):
        # Image and label at the first position where y equals `label`.
        indices = tf.where(tf.equal(y, tf.constant(label)))
        first = tf.gather(indices, 0)
        return tf.gather(x, indices=first), tf.gather(y, indices=first)

    picks = [_first_of_class(label) for label in (4, 3, 2, 1, 0)]

    sample_y = tf.concat([picked_y for _, picked_y in picks], axis=0)
    sample_x = tf.concat([picked_x for picked_x, _ in picks], axis=0)

    return sample_x, sample_y


In [None]:
# Pick one example per label; every return_elem call yields an (x, y) pair.
# NOTE(review): target_probs lists five classes and
# extract_one_from_each_class only uses labels 0-4, so label 5 may never
# occur in a batch - confirm before running this cell.
class_5, class_4 = tf.constant(5), tf.constant(4)
(sample_5x, sample_5y), (sample_4x, sample_4y) = (
    return_elem(class_5), return_elem(class_4))

class_3, class_2 = tf.constant(3), tf.constant(2)
(sample_3x, sample_3y), (sample_2x, sample_2y) = (
    return_elem(class_3), return_elem(class_2))

class_1, class_0 = tf.constant(1), tf.constant(0)
(sample_1x, sample_1y), (sample_0x, sample_0y) = (
    return_elem(class_1), return_elem(class_0))


In [None]:
# No session is used here, so this presumably prints the tensor objects
# themselves rather than evaluated label values.
print([sample_5y,sample_4y,sample_3y,sample_2y,sample_1y,sample_0y])

In [22]:
# Evaluate the matching positions and the picked label in a single run so
# both values come from the same batch. Other fetches, such as
# extract_one_from_each_class(x, y), can be evaluated the same way.
with tf.Session() as sess:
    class_rows, picked_label = sess.run([indices, y_sample])
    print([class_rows, picked_label])

[array([[ 24],
       [ 39],
       [121],
       [176],
       [270],
       [424],
       [444],
       [474],
       [547],
       [551],
       [573],
       [619],
       [827],
       [929],
       [967]]), array([4], dtype=int32)]


In [None]:
with tf.Session() as sess:
    # NOTE: every sess.run call pulls a new batch from the iterator, so this
    # shape comes from a different batch than the values fetched below.
    print(sess.run(x).shape)
    # Fetch everything in one run so all values refer to the same batch.
    # The count tensor is `number_class`; the name `number` was undefined.
    labels, mask_one, mask_two, ind, size, prop, x_sub, y_sub = sess.run(
        [y, condition, mask, indices, number_class, class_prob, x_sample, y_sample])
    print(labels)
    # Print the fetched boolean condition, not the `mask` graph tensor.
    print(mask_one)
    print(ind)
    print(size)
    print(prop)
    print(mask_two)

    print(x_sub)
    print(y_sub)
    

In [None]:
# Placeholder for the file list, so the same pipeline can be pointed at
# different files each time the iterator is initialized.
filenames = tf.placeholder(tf.string, shape=[None])
# Build the dataset from the placeholder; otherwise the value fed to
# `filenames` below would have no effect on what is read.
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(parse)  # Parse the record into tensors.
dataset = dataset.repeat()  # Repeat the input indefinitely.
dataset = dataset.batch(32)
iterator = dataset.make_initializable_iterator()

# You can feed the initializer with the appropriate filenames for the current
# phase of execution, e.g. training vs. validation.
with tf.Session() as sess:
    # Initialize `iterator` with training data.
    training_filenames = path_tfrecords_train
    sess.run(iterator.initializer, feed_dict={filenames: training_filenames})


In [None]:
# Two counters zipped element-wise: 0, 1, 2, ... paired with 0, -1, -2, ...
inc_dataset = tf.data.Dataset.range(100)
dec_dataset = tf.data.Dataset.range(0, -100, -1)
dataset = tf.data.Dataset.zip((inc_dataset, dec_dataset))
batched_dataset = dataset.batch(4)

iterator = batched_dataset.make_one_shot_iterator()
next_element = iterator.get_next()
with tf.Session() as sess:
    # Every run yields the next batch of four pairs:
    #   ([0, 1, 2,   3],   [ 0, -1,  -2,  -3])
    #   ([4, 5, 6,   7],   [-4, -5,  -6,  -7])
    #   ([8, 9, 10, 11],   [-8, -9, -10, -11])
    for _batch_no in range(3):
        print(sess.run(next_element))

In [None]:
# Display the object returned by iterator.get_next(); nothing is evaluated
# in a session here.
next_element