## Using Input Pipelines to Read Data from TFRecords Files

TensorFlow provides users with multiple options for providing data to the model. Probably one of the most common methods is to define placeholders in the TensorFlow graph and feed the data from the current Python session into the TensorFlow Session using the feed_dict parameter. Using this approach, a large dataset that does not fit into memory is most conveniently and efficiently stored using NumPy archives, as explained in [Chunking an Image Dataset for Minibatch Training using NumPy NPZ Archives](https://render.githubusercontent.com/view/image-data-chunking-npz.ipynb), or HDF5 database files ([Storing an Image Dataset for Minibatch Training using HDF5](https://render.githubusercontent.com/view/image-data-chunking-hdf5.ipynb)).

Another approach, which is often preferred when it comes to computational efficiency, is to do the "data loading" directly in the graph using input queues from so-called TFRecords files, which will be illustrated in this notebook.

Beyond the examples in this notebook, you are encouraged to read more in TensorFlow's "[Reading Data](https://www.tensorflow.org/programmers_guide/reading_data)" guide.

### 0. Dataset

In [None]:
import numpy as np

import sys
sys.path.insert(0, '..')
from helper import mnist_export_to_jpg

In [None]:
# Seed NumPy's global random number generator.
random_seed = 123
np.random.seed(random_seed)

# Machine-specific data directory handed to the local helper; adjust to your setup.
# NOTE(review): presumably mnist_export_to_jpg creates the mnist_train/,
# mnist_valid/ and mnist_test/ image folders under this path -- confirm in helper.py.
mnist_path = "D:/work/data/Python/tensorflow/mnist/data/"
mnist_export_to_jpg(path=mnist_path)

In [None]:
import os

# Show which sub-folders exist inside each of the three dataset folders.
for split in ('train', 'valid', 'test'):
    split_dir = os.path.join(mnist_path, 'mnist_%s' % split)
    print('mnist_%s subdirectories' % split, os.listdir(split_dir))

In [None]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import os

# Take whichever file os.listdir reports first in the '9' training folder.
mnist_train_path = os.path.join(mnist_path, 'mnist_train/9/')
first_file = os.listdir(mnist_train_path)[0]
some_img = os.path.join(mnist_train_path, first_file)

# Load the image, report its array shape, and render it.
img = mpimg.imread(some_img)
print(img.shape)
plt.imshow(img, cmap='binary');

### 1. Saving images as TFRecords files

In [None]:
import glob
import numpy as np
import tensorflow as tf

In [None]:
def images_to_tfrecords(data_stempath,
                        shuffle=False,
                        random_seed=None):
    """Serialize the MNIST JPEG folders into TFRecords files.

    For each split in ('train', 'valid', 'test'), every ``*.jpg`` found
    recursively below ``<data_stempath>/mnist_<split>`` is read, flattened
    to a 1D list of pixel values, and written together with its label to
    ``mnist_<split>.tfrecords`` in the current working directory. The label
    is the integer name of the directory that directly contains the image
    (e.g. ``.../mnist_train/9/img.jpg`` -> 9).

    Each record holds two int64-list features: 'image' (the flattened
    pixels) and 'label' (a single-element list).

    Parameters
    ----------
    data_stempath : str
        Directory containing the ``mnist_train``, ``mnist_valid`` and
        ``mnist_test`` folders.
    shuffle : bool (default: False)
        If True, shuffle the image paths of each split before writing.
    random_seed : int or None (default: None)
        Seed for the shuffling RNG; only used if ``shuffle`` is True.

    Raises
    ------
    ValueError
        If no ``*.jpg`` file is found for one of the splits.

    """

    def int64_to_feature(value):
        # `value` must be an iterable of ints (scalars are wrapped in a list
        # by the caller).
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

    for s in ['train', 'valid', 'test']:

        split_dir = os.path.join(data_stempath, "mnist_" + s)

        # glob yields paths in arbitrary, filesystem-dependent order. Sorting
        # first makes the record order -- and therefore the seeded shuffle
        # below -- reproducible across machines.
        img_paths = np.array(sorted(glob.iglob('%s/**/*.jpg' % split_dir,
                                               recursive=True)))

        # Fail early with a clear message, before an empty .tfrecords file
        # is created and before indexing into an empty array.
        if img_paths.size == 0:
            raise ValueError('No .jpg images found under %s' % split_dir)

        print(img_paths.shape)
        print(img_paths[0])

        if shuffle:
            rng = np.random.RandomState(random_seed)
            rng.shuffle(img_paths)

        with tf.python_io.TFRecordWriter('mnist_%s.tfrecords' % s) as writer:

            for idx, path in enumerate(img_paths):
                # The parent directory name encodes the class label.
                label = int(os.path.basename(os.path.dirname(path)))
                image = mpimg.imread(path)
                image = image.reshape(-1).tolist()

                # Progress message every 10,000 images.
                if (idx + 1) % 10000 == 0:
                    print("dealing mnist_{}, idx: {}".format(s, (idx+1)))

                example = tf.train.Example(features=tf.train.Features(feature={
                    'image': int64_to_feature(image),
                    'label': int64_to_feature([label])}))

                writer.write(example.SerializeToString())

In [None]:
# Write the train/valid/test TFRecords files, shuffling each split with a fixed seed.
images_to_tfrecords(data_stempath=mnist_path, shuffle=True, random_seed=123)

Just to make sure that the images were serialized correctly, let us load an image back from TFRecords using the [`tf.python_io.tf_record_iterator`](https://www.tensorflow.org/api_docs/python/tf/python_io/tf_record_iterator) and display it:

In [None]:
import tensorflow as tf
import numpy as np

record_iterator = tf.python_io.tf_record_iterator(path='mnist_train.tfrecords')

# Sanity check: decode only the first record and display it.
for r in record_iterator:
    example = tf.train.Example()
    example.ParseFromString(r)

    label = example.features.feature['label'].int64_list.value[0]
    print('Label:', label)

    # The image was stored as a flat list of pixels; restore the 28x28 layout.
    img = np.array(example.features.feature['image'].int64_list.value)
    img = img.reshape((28, 28))
    plt.imshow(img, cmap='binary')
    # Call show(); the bare attribute reference `plt.show` does nothing.
    plt.show()
    break

### 2. Loading images via the TFRecordReader


Roughly speaking, we can regard the TFRecordReader as a class that lets us load images "symbolically" inside a TensorFlow graph. A TFRecordReader uses the state in the graph to remember the location of a .tfrecords file that it reads and lets us iterate over training examples and batches after initializing the graph, as we will see later.

To see how it works, let's start with a simple function that reads one image at a time:

In [None]:
def read_one_image(tfrecords_queue, normalize=True):
    """Build graph ops that read and decode one example from a TFRecords queue.

    Parameters
    ----------
    tfrecords_queue : filename queue
        Queue of TFRecords file names, e.g. as returned by
        `tf.train.string_input_producer`.
    normalize : bool (default: True)
        If True, divide the pixel values by 255.

    Returns
    -------
    onehot_label : Tensor
        One-hot encoded label with 10 entries.
    image : Tensor
        float32 vector with 784 entries.

    """
    # Layout of a serialized example: a scalar label and 784 pixel values.
    feature_spec = {'label': tf.FixedLenFeature([], tf.int64),
                    'image': tf.FixedLenFeature([784], tf.int64)}

    _, serialized = tf.TFRecordReader().read(tfrecords_queue)
    parsed = tf.parse_single_example(serialized, features=feature_spec)

    label = tf.cast(parsed['label'], tf.int32)
    image = tf.cast(parsed['image'], tf.float32)
    onehot_label = tf.one_hot(indices=label, depth=10)

    if not normalize:
        return onehot_label, image

    # normalize to [0, 1] range
    return onehot_label, image / 255.

In [None]:
# Build a small graph that yields one (label, image) pair per run.
g = tf.Graph()
with g.as_default():

    queue = tf.train.string_input_producer(['mnist_train.tfrecords'],
                                           num_epochs=10)
    label, image = read_one_image(queue)


with tf.Session(graph=g) as sess:
    # num_epochs is set on the producer, so its epoch counter (a local
    # variable) has to be initialized as well.
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        # Each run dequeues the next example; only the last one is printed.
        for i in range(10):
            one_label, one_image = sess.run([label, image])

        print('Label:', one_label, '\nImage dimensions:', one_image.shape)
    finally:
        # Always stop and join the queue-runner threads, even if reading
        # raised an exception.
        coord.request_stop()
        coord.join(threads)

In [None]:
# Build a graph that yields shuffled minibatches of 64 examples.
g = tf.Graph()
with g.as_default():

    queue = tf.train.string_input_producer(['mnist_train.tfrecords'],
                                           num_epochs=10)
    label, image = read_one_image(queue)

    # Single examples are buffered in a shuffling queue and dequeued in
    # batches of `batch_size`.
    label_batch, image_batch = tf.train.shuffle_batch([label, image],
                                                       batch_size=64,
                                                       capacity=5000,
                                                       min_after_dequeue=2000,
                                                       num_threads=8,
                                                       seed=123)

with tf.Session(graph=g) as sess:
    # num_epochs is set on the producer, so its epoch counter (a local
    # variable) has to be initialized as well.
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        # Each run dequeues the next batch; only the last one is inspected.
        for i in range(10):
            many_labels, many_images = sess.run([label_batch, image_batch])

        print('Batch size:', many_labels.shape[0])
    finally:
        # Always stop and join the queue-runner threads, even if reading
        # raised an exception.
        coord.request_stop()
        coord.join(threads)

### 3. Use queue runners to train a neural network


In this section, we will take the concepts that were introduced in the previous sections and train a multilayer perceptron from the 'mnist_train.tfrecords' file:

In [None]:
# Hyperparameters
learning_rate = 0.1
batch_size = 128
n_epochs = 15
# Total number of minibatch updates: full batches per epoch times epochs.
# NOTE(review): 45000 is presumably the number of training examples -- confirm
# against the array shape printed when the TFRecords files were written.
n_iter = n_epochs * (45000 // batch_size)

# Architecture
n_hidden_1 = 128  # units in the first hidden layer
n_hidden_2 = 256  # units in the second hidden layer
height, width = 28, 28  # image size; height*width is the input dimension
n_classes = 10  # size of the output layer



##########################
### GRAPH DEFINITION
##########################

g = tf.Graph()
with g.as_default():

    tf.set_random_seed(123)

    # Input data: num_epochs=None lets the producer cycle through the file
    # indefinitely; the training loop decides when to stop.
    queue = tf.train.string_input_producer(['mnist_train.tfrecords'],
                                           num_epochs=None)
    label, image = read_one_image(queue)

    label_batch, image_batch = tf.train.shuffle_batch([label, image],
                                                       batch_size=batch_size,
                                                       seed=123,
                                                       num_threads=8,
                                                       capacity=5000,
                                                       min_after_dequeue=2000)

    # By default the model consumes batches from the queue; feeding
    # 'images:0' / 'labels:0' overrides them with user-supplied data.
    tf_images = tf.placeholder_with_default(image_batch,
                                            shape=[None, 784],
                                            name='images')
    tf_labels = tf.placeholder_with_default(label_batch,
                                            shape=[None, 10],
                                            name='labels')

    # Model parameters
    weights = {
        'h1': tf.Variable(tf.truncated_normal([height*width, n_hidden_1], stddev=0.1)),
        'h2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2], stddev=0.1)),
        'out': tf.Variable(tf.truncated_normal([n_hidden_2, n_classes], stddev=0.1))
    }
    biases = {
        'b1': tf.Variable(tf.zeros([n_hidden_1])),
        'b2': tf.Variable(tf.zeros([n_hidden_2])),
        'out': tf.Variable(tf.zeros([n_classes]))
    }

    # Multilayer perceptron
    layer_1 = tf.add(tf.matmul(tf_images, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']

    # Loss and optimizer
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=out_layer, labels=tf_labels)
    cost = tf.reduce_mean(loss, name='cost')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train = optimizer.minimize(cost, name='train')

    # Prediction
    prediction = tf.argmax(out_layer, 1, name='prediction')
    # Compare against tf_labels (not label_batch) so that labels fed through
    # the 'labels' placeholder are the ones the accuracy is computed on.
    correct_prediction = tf.equal(tf.argmax(tf_labels, 1), tf.argmax(out_layer, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
    
    
    
with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    saver0 = tf.train.Saver()

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    avg_cost = 0.
    iter_per_epoch = n_iter // n_epochs
    epoch = 0

    try:
        for i in range(n_iter):
            # Fetch by name; use a separate Python name for the fetched value
            # so the graph's `cost` tensor is not shadowed.
            _, batch_cost = sess.run(['train', 'cost:0'])
            avg_cost += batch_cost

            # Report once a full epoch's worth of batches has accumulated
            # (i is zero-based, hence i + 1).
            if (i + 1) % iter_per_epoch == 0:
                epoch += 1
                avg_cost /= iter_per_epoch
                print("Epoch: %03d | AvgCost: %.3f" % (epoch, avg_cost))
                avg_cost = 0.
    finally:
        # Always stop and join the queue-runner threads, even if training
        # raised an exception.
        coord.request_stop()
        coord.join(threads)

    # Only reached when training completed without an exception.
    saver0.save(sess, save_path='./mlp')