# MNIST classification: how to feed the data to the model
1- MNIST dataset in memory  
2- Transform the numpy array as images  
3- Create and save the JPEG images on the Cloud Storage  
4- Dataset to read numpy array in memory  
5- Create TFRecords to store numpy array  
6- Dataset to read TFRecord with numpy array  
7- Create TFRecords to store JPEG images  
8- Dataset to read TFRecord with JPEG images  

## Install packages on Google  Cloud Datalab (locally use conda env)

### Select in the Python3 Kernel:
In the menu bar the of 'Kernel', select   
**python3**
### Install needed packages
copy the command below in a Google Cloud Datalab cell  
**!pip install tensorflow==1.12**
### Restart the Kernel 
this is to take into account the new installed packages. Click in the menu bar on:  
**Reset Session**

## Include paths to our functions

In [41]:
import sys
import os
import pathlib

workingdir=os.getcwd()
print(workingdir)
d=[d for d in os.listdir(workingdir)]
n=0
while not set(['notebook']).issubset(set(d)):
   workingdir=str(pathlib.Path(workingdir).parents[0])
   print(workingdir)
   d=[d for d in os.listdir(str(workingdir))]
   n+=1
   if n>5:
       break
sys.path.insert(0, workingdir)
os.chdir(workingdir)

/Users/tarrade/Desktop/Work/Data_Science/Tutorials_Codes/Python/proj_DL_models_and_pipelines_with_GCP/notebook
/Users/tarrade/Desktop/Work/Data_Science/Tutorials_Codes/Python/proj_DL_models_and_pipelines_with_GCP


## Setup librairies import and plots style

### Import librairies

In [185]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import gzip
import sys
import _pickle as cPickle
from PIL import Image
import glob as glob

In [43]:
print(tf.__version__)
print(tf.keras.__version__)

1.12.0
2.1.6-tf


### Import our utils functions

In [44]:
import src.utils.mnist_utils as mnist_utils
import src.utils.tensorflow_helper as tensorflow_helper
import src.model_mnist_v1.trainer.model as mnist_v1

In [45]:
import importlib
importlib.reload(mnist_utils)
importlib.reload(mnist_v1)
importlib.reload(tensorflow_helper);# to reload the function and mask the output

## Input Data
### Load the data

In [111]:
# load the data: path is relative to the python path!
(x_train, y_train), (x_test, y_test) = mnist_utils.load_data(path='data/mnist/raw/mnist.pkl.gz')

### Basics checks

In [112]:
# check data shape (training)
x_train.shape

(60000, 28, 28)

In [113]:
# check data shape (train)
x_test.shape

(10000, 28, 28)

In [114]:
x_train.dtype, x_test.dtype

(dtype('uint8'), dtype('uint8'))

In [115]:
np.max(x_train), np.min(x_train), np.max(x_test), np.min(x_test) 

(255, 0, 255, 0)

## Save Numpy array as JPEG as images

In [55]:
path_train_images='data/mnist/images_train/'
path_test_images='data/mnist/images_test/'

In [56]:
if not os.path.exists(path_train_images):
    os.makedirs(path_train_images)

In [57]:
if not os.path.exists(path_test_images):
    os.makedirs(path_test_images)

In [24]:
for i, im_array in enumerate(x_train):
    im = Image.fromarray(im_array)
    im.save(path_train_images+'image_train_'+str(i).zfill(5)+'_label_'+str(y_train[i]).zfill(2)+'.jpeg')

In [25]:
for i, im_array in enumerate(x_test):
    im = Image.fromarray(im_array)
    im.save(path_test_images+'image_test_'+str(i).zfill(5)+'_label_'+str(y_test[i]).zfill(2)+'.jpeg')

## Save JPEG images on GCP

In [88]:
!gcloud info --format='value(config.project)'
!gcloud info --format='value(config.properties.compute.region)'
!gcloud info --format='value(config.properties.compute.zone)'

ml-productive-pipeline-53122
europe-west1
europe-west1-c


In [78]:
!gcloud info --format=json

{
  "basic": {
    "architecture": {
      "file_name": "x86_64",
      "id": "x86_64",
      "name": "x86_64"
    },
    "operating_system": {
      "file_name": "darwin",
      "id": "MACOSX",
      "name": "Mac OS X"
    },
    "python_location": "/Users/tarrade/anaconda/bin/python2",
    "python_version": "2.7.15 |Anaconda, Inc.| (default, Nov 13 2018, 17:07:45) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]",
    "site_packages": false,
    "version": "234.0.0"
  },
  "config": {
    "account": "fabien.tarrade@gmail.com",
    "active_config_name": "default",
    "paths": {
      "active_config_path": "/Users/tarrade/.config/gcloud/configurations/config_default",
      "global_config_dir": "/Users/tarrade/.config/gcloud",
      "installation_properties_path": "/Users/tarrade/Test/google-cloud-sdk/properties"
    },
    "project": "ml-productive-pipeline-53122",
    "properties": {
      "compute": {
        "region": "europe-west1",
     

In [89]:
GCS_BUCKET = 'gs://ml-productive-pipeline-53122' 
PROJECT = 'ml-productive-pipeline-53122'
REGION = 'europe-west1'
LOCAL_DATA_TEST = path_test_images
LOCAL_DATA_TRAIN = path_train_images

In [91]:
os.environ['GCS_BUCKET'] = GCS_BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['LOCAL_DATA_TEST'] = LOCAL_DATA_TEST
os.environ['LOCAL_DATA_TRAIN'] = LOCAL_DATA_TRAIN

In [94]:
!gsutil ls $GCS_BUCKET/mnist/raw

gs://ml-productive-pipeline-53122/mnist/raw/test/
gs://ml-productive-pipeline-53122/mnist/raw/train/


## Set parameters

In [107]:
tf.logging.set_verbosity(tf.logging.INFO)

In [194]:
# number of classes
NUM_CLASSES =10

# dimension of the input data
DIM_INPUT = 784

# number of epoch to train our model
EPOCHS = 2

# size of our mini batch
BATCH_SIZE = 128

# shuffle buffer size
SHUFFLE_BUFFER_SIZE = 10 * BATCH_SIZE

# prefetch buffer size
PREFETCH_BUFFER_SIZE = tf.contrib.data.AUTOTUNE

# number of paralell calls
NUM_PARALELL_CALL = 4

# model version
MODEL='v1'

## Defined flags

In [195]:
tensorflow_helper.del_all_flags(tf.flags.FLAGS)

In [196]:
# just for jupyter notebook and avoir : "UnrecognizedFlagError: Unknown command line flag 'f'"
tf.app.flags.DEFINE_string('f', '', 'kernel') 

# path to store the model and input for Tensorboard
tf.app.flags.DEFINE_string('model_dir_keras', './results/Models/Mnist/tf_1_12/keras/'+MODEL+'/ckpt/', 'Dir to save a model and checkpoints with keras')
tf.app.flags.DEFINE_string('tensorboard_dir_keras', './results/Models/Mnist/tf_1_12/keras/'+MODEL+'/logs/', 'Dir to save logs for TensorBoard with keras')

# parameters for the input dataset and train the model
tf.app.flags.DEFINE_integer('epoch', EPOCHS, 'number of epoch')
tf.app.flags.DEFINE_integer('step_per_epoch', len(x_train) // BATCH_SIZE, 'number of step per epoch')
tf.app.flags.DEFINE_integer('batch_size', BATCH_SIZE, 'Batch size')
tf.app.flags.DEFINE_integer('shuffle_buffer_size', SHUFFLE_BUFFER_SIZE , 'Shuffle buffer size')
tf.app.flags.DEFINE_integer('prefetch_buffer_size', PREFETCH_BUFFER_SIZE, 'Prefetch buffer size')
tf.app.flags.DEFINE_integer('num_parallel_calls', NUM_PARALELL_CALL, 'Number of paralell calls')

# parameters for the model
tf.app.flags.DEFINE_integer('num_classes', NUM_CLASSES, 'number of classes in our model')
tf.app.flags.DEFINE_integer('dim_input', DIM_INPUT, 'dimension of the input data for our model')

FLAGS = tf.app.flags.FLAGS

## Dataset to preprocess and feed data in our model 
Use tf.data.dataset it will prepare mini batches of data, reshuffle the data, parallelized the pre-processing the data. 

https://www.tensorflow.org/guide/performance/datasets  
To summarize, one good order for the different transformations is:
- create the dataset
- shuffle (with a big enough buffer size)  
https://stackoverflow.com/questions/46444018/meaning-of-buffer-size-in-dataset-map-dataset-prefetch-and-dataset-shuffle)
- repeat
- map with the actual work (preprocessing, augmentation…) using multiple parallel calls
- batch
- prefetch

ModeKeys:  
https://www.tensorflow.org/api_docs/python/tf/estimator/ModeKeys  
- EVAL
- PREDICT
- TRAIN

### Printing the number relater to the number of events (epoch, batch size, ...)

In [120]:
def print_summary_input(data, step='training'):
    print('Summary for the {} dataset:'.format(step))
    if step=='training':
        print('  - number of epoch            :', FLAGS.epoch)
        print('  - number of events per epoch :', len(data))
        print('  - batch size                 :', FLAGS.batch_size)
        print('  - number of step per epoch   :', FLAGS.step_per_epoch)
        print('  - total number of steps      :', FLAGS.epoch * FLAGS.step_per_epoch)
    else:
        print('  - number of epoch            :', 1)
        print('  - number of events per epoch :', len(data))
        print('  - batch size                 :', None)
        print('  - number of step per epoch   :', 1)
        print('  - total number of steps      :', 1) 

In [121]:
print_summary_input(x_train)

Summary for the training dataset:
  - number of epoch            : 10
  - number of events per epoch : 60000
  - batch size                 : 128
  - number of step per epoch   : 468
  - total number of steps      : 4680


In [122]:
print_summary_input(x_test)

Summary for the training dataset:
  - number of epoch            : 10
  - number of events per epoch : 10000
  - batch size                 : 128
  - number of step per epoch   : 468
  - total number of steps      : 4680


### Dataset using numpy array to preprocess and feed data in our model 

#### Creating "Graph" for the datasets

In [125]:
training_dataset = mnist_v1.input_mnist_array_dataset_fn(x_train, 
                                                         y_train, 
                                                         FLAGS,
                                                         mode=tf.estimator.ModeKeys.TRAIN, 
                                                         batch_size=FLAGS.batch_size)

INFO:tensorflow:input_dataset_fn: TRAIN, train


In [126]:
testing_dataset = mnist_v1.input_mnist_array_dataset_fn(x_test, 
                                                        y_test,
                                                        FLAGS,
                                                        mode=tf.estimator.ModeKeys.EVAL, 
                                                        batch_size=len(x_test))

INFO:tensorflow:input_dataset_fn: EVAL, eval


In [127]:
training_dataset, testing_dataset

(<PrefetchDataset shapes: ((128, 784), (128, 10)), types: (tf.float32, tf.float32)>,
 <PrefetchDataset shapes: ((10000, 784), (10000, 10)), types: (tf.float32, tf.float32)>)

#### Executing the "Graph for the datasets" for training

In [133]:
# create an iterator
iterator = training_dataset.make_one_shot_iterator()

# next_element
features, labels = iterator.get_next()

In [134]:
n=0

n_iter=12
with tf.Session() as sess:
    while True:
        try:
            start_time = time.clock()
            x,y = sess.run([features, labels])
            print('iteration n:', n, 'execution time:', time.clock() - start_time, 'seconds')
            print(x.shape)
            print(y.shape)
            print('first label of the batch',np.argmax(y[0]),'\n')
            n+=1
            if n>=n_iter:
                print('number of iteration reached')
                break
        except tf.errors.OutOfRangeError:
            print('tf.errors.OutOfRangeError')
            break

iteration n: 0 execution time: 2.063270000000017 seconds
(128, 784)
(128, 10)
first label of the batch 9 

iteration n: 1 execution time: 0.02947199999999839 seconds
(128, 784)
(128, 10)
first label of the batch 2 

iteration n: 2 execution time: 0.030369000000007418 seconds
(128, 784)
(128, 10)
first label of the batch 6 

iteration n: 3 execution time: 0.028968999999989364 seconds
(128, 784)
(128, 10)
first label of the batch 1 

iteration n: 4 execution time: 0.028436999999996715 seconds
(128, 784)
(128, 10)
first label of the batch 5 

iteration n: 5 execution time: 0.029730999999998176 seconds
(128, 784)
(128, 10)
first label of the batch 1 

iteration n: 6 execution time: 0.02815099999997983 seconds
(128, 784)
(128, 10)
first label of the batch 9 

iteration n: 7 execution time: 0.02917199999998843 seconds
(128, 784)
(128, 10)
first label of the batch 8 

iteration n: 8 execution time: 0.028329000000013593 seconds
(128, 784)
(128, 10)
first label of the batch 2 

iteration n: 9 e

#### Executing the "Graph for the datasets" for testing

In [135]:
# create an iterator
iterator = testing_dataset.make_one_shot_iterator()

# next_element
features, labels = iterator.get_next()

In [136]:
n=0

n_iter=10
with tf.Session() as sess:
    while True:
        try:
            start_time = time.clock()
            x,y = sess.run([features, labels])
            print('iteration n:', n, 'execution time:', time.clock() - start_time, 'seconds')
            print(x.shape)
            print(y.shape)
            print('first label of the batch',np.argmax(y[0]),'\n')
            n+=1
            if n>=n_iter:
                print('number of iteration reached')
                break
        except tf.errors.OutOfRangeError:
            print('tf.errors.OutOfRangeError')
            break

iteration n: 0 execution time: 2.8718900000000076 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 1 execution time: 2.057161000000008 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 2 execution time: 1.7940890000000138 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 3 execution time: 2.1304879999999855 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 4 execution time: 2.154911999999996 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 5 execution time: 2.1948069999999973 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 6 execution time: 2.1568949999999916 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 7 execution time: 2.387498999999991 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

iteration n: 8 execution time: 1.8869400000000098 seconds
(10000, 784)
(10000, 10)
first label of the batch

## Stored the input data as TFRecords files
TFRecord file format is a simple record-oriented binary format  
- https://medium.com/coinmonks/storage-efficient-tfrecord-for-images-6dc322b81db4
- https://www.damienpontifex.com/2017/09/18/convert-and-using-the-mnist-dataset-as-tfrecords/
- https://docs.databricks.com/_static/notebooks/horovodrunner/mnist-tensorflow-to-tfrecords.html

Contrary to numpy array or pandas dataframe this is will scale with any amount of data.

### Helper function for TFRecords

In [138]:
# to be move in .py
def _int64_feature(value:int) -> tf.train.Features.FeatureEntry:
    """Create a Int64List Feature
    
    Args:
        value: The value to store in the feature
    
    Returns:
        The FeatureEntry
    """
    
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [398]:
# to be deleted ?
def _floatlist_feature(value:str) -> tf.train.Features.FeatureEntry:
    """Create a FloatList Feature
    
    Args:
        value: The value to store in the feature
    
    Returns:
        The FeatureEntry
    """
    
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

In [139]:
# to be move in .py
def _bytes_feature(value:str) -> tf.train.Features.FeatureEntry:
    """Create a BytesList Feature
    
    Args:
        value: The value to store in the feature
    
    Returns:
        The FeatureEntry
    """
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [143]:
def _data_path(data_directory:str, name:str) -> str:
    """Construct a full path to a TFRecord file to be stored in the 
    data_directory. Will also ensure the data directory exists
    
    Args:
        data_directory: The directory where the records will be stored
        name:           The name of the TFRecord
    
    Returns:
        The full path to the TFRecord file
    """
    if not os.path.isdir(data_directory):
        os.makedirs(data_directory)

    return os.path.join(data_directory, f'{name}.tfrecords')

### Numpy array to TFRecords files

#### Creating the TFRecords files

In [141]:
def _numpy_to_tfrecords(example_dataset, filename:str):
    print(f'Processing {filename} data')
    dataset_length = len(example_dataset)
    with tf.python_io.TFRecordWriter(filename) as writer:
        for index, (image, label) in enumerate(example_dataset):
            sys.stdout.write(f"\rProcessing sample {index+1} of {dataset_length}")
            sys.stdout.flush()
            image_raw = image.tostring()
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': _int64_feature(int(label)),
                'image_raw': _bytes_feature(image_raw)
            }))            
            writer.write(example.SerializeToString())
        print()

In [142]:
def convert_numpy_to_tfrecords(x_data, y_data, name:str, data_directory:str, num_shards:int=1):
    """Convert the dataset into TFRecords on disk
    
    Args:
        x_data:         The MNIST data set to convert: data
        y_data:         The MNIST data set to convert: label
        name:           The name of the data set
        data_directory: The directory where records will be stored
        num_shards:     The number of files on disk to separate records into
    """

    data_set = list(zip(x_data, y_data))
    data_directory=os.path.abspath(data_directory)
    
    if num_shards == 1:
        _numpy_to_tfrecords(data_set, _data_path(data_directory, name))
    else:
        sharded_dataset = np.array_split(data_set, num_shards)
        for shard, dataset in enumerate(sharded_dataset):
            _numpy_to_tfrecords(dataset, _data_path(data_directory, f'{name}-{shard+1}'))

In [153]:
path_test_tfrecords = 'data/mnist/tfrecord_numpy_test'
path_train_tfrecords = 'data/mnist/tfrecord_numpy_train'

In [154]:
# creating TFRecords files for the training dataset
convert_numpy_to_tfrecords(x_train, y_train, 'train', path_train_tfrecords, 1)

Processing /Users/tarrade/Desktop/Work/Data_Science/Tutorials_Codes/Python/proj_DL_models_and_pipelines_with_GCP/data/mnist/tfrecord_numpy_train/train.tfrecords data
Processing sample 60000 of 60000


In [155]:
# creating TFRecords files for the testing dataset
convert_numpy_to_tfrecords(x_test, y_test, 'test', path_test_tfrecords, 1)

Processing /Users/tarrade/Desktop/Work/Data_Science/Tutorials_Codes/Python/proj_DL_models_and_pipelines_with_GCP/data/mnist/tfrecord_numpy_test/test.tfrecords data
Processing sample 9774 of 10000

#### Save TFRecord file with numpy array on GCP

In [156]:
LOCAL_DATA_TEST = path_test_tfrecords
LOCAL_DATA_TRAIN = path_train_tfrecords

In [157]:
os.environ['LOCAL_DATA_TEST'] = LOCAL_DATA_TEST
os.environ['LOCAL_DATA_TRAIN'] = LOCAL_DATA_TRAIN

In [160]:
!gsutil ls $GCS_BUCKET/mnist/

gs://ml-productive-pipeline-53122/mnist/image/
gs://ml-productive-pipeline-53122/mnist/raw/
gs://ml-productive-pipeline-53122/mnist/tfrecords/


In [159]:
!gsutil -m cp -R $LOCAL_DATA_TRAIN/ $GCS_BUCKET/mnist/tfrecords/numpy_train

Copying file://data/mnist/tfrecord_numpy_train/train.tfrecords [Content-Type=application/octet-stream]...
- [1/1 files][ 48.2 MiB/ 48.2 MiB] 100% Done                                    
Operation completed over 1 objects/48.2 MiB.                                     


In [161]:
!gsutil -m cp -R $LOCAL_DATA_TEST/ $GCS_BUCKET/mnist/tfrecords/numpy_test

Copying file://data/mnist/tfrecord_numpy_test/.DS_Store [Content-Type=application/octet-stream]...
Copying file://data/mnist/tfrecord_numpy_test/test.tfrecords [Content-Type=application/octet-stream]...
| [2/2 files][  8.0 MiB/  8.0 MiB] 100% Done                                    
Operation completed over 2 objects/8.0 MiB.                                      


#### Dataset using TFRecords file to preprocess and feed data in our model 

In [169]:
def input_numpy_tfrecords_fn(filenames, batch_size=128, mode=tf.estimator.ModeKeys.TRAIN):
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        tf.logging.info("input_dataset_fn: PREDICT, {}".format(mode))
    elif mode == tf.estimator.ModeKeys.EVAL:
        tf.logging.info("input_dataset_fn: EVAL, {}".format(mode))
    elif mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info("input_dataset_fn: TRAIN, {}".format(mode))
    
    def _parser(record):
        # 1. define a parser
        features={
            # the label are parsed as int
            'label': tf.FixedLenFeature(shape=[], dtype=tf.int64),
            # the bytes_list data is parsed into tf.string.
            'image_raw': tf.FixedLenFeature(shape=[], dtype=tf.string)
        }
        print('step 1')
        parsed_record = tf.parse_single_example(record, features)
        print('step 2')
        print(parsed_record)
        # 2. Convert the data
        label = parsed_record['label']
        #image = parsed_record['image_raw']#
        image =  tf.cast(tf.decode_raw(parsed_record['image_raw'], out_type=tf.uint8), tf.float64)
        print('TEST',label.shape)
        print('TEST',image.shape)
        # 3. reshape
        #tf.reshape(image, [1,784])
        #print('---',image.shape())
        
        # 4. hot emcoding
        num_classes=10
        #label = tf.keras.utils.to_categorical( label, num_classes)
        label=tf.one_hot(label, num_classes)

        return image, label
        #dataset = tf.data.Dataset.from_tensor_slices((image, label))
        #return dataset
    
    def _normalize(image, label):
        """Convert image from [0, 255] -> [-0.5, 0.5] floats."""
        image = image * (1. / 255) - 0.5
        return image, label
    
    def _dense_to_one_hot(labels_dense, num_classes):
        """Convert class labels from scalars to one-hot vectors."""
        num_labels = labels_dense.shape[0]
        index_offset = numpy.arange(num_labels) * num_classes
        labels_one_hot = numpy.zeros((num_labels, num_classes))
        labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
        return labels_one_hot
    
    def _input_fn():
        # 1) read data from TFRecordDataset
        dataset = (tf.data.TFRecordDataset(filenames).map(_parser))
        
        # 2) shuffle (with a big enough buffer size)    :        
        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None # loop indefinitely
            dataset = dataset.shuffle(buffer_size=FLAGS.shuffle_buffer_size, seed=2)# depends on sample size
        else:
            num_epochs = 1 # end-of-input after this
        print('the number of epoch: num_epoch =', num_epochs)
        
        # caching data
        #dataset = dataset.cache()
    
        # 3) automatically refill the data queue when empty
        dataset = dataset.repeat(num_epochs)

        # 4) map
        dataset = dataset.map(map_func=_normalize, num_parallel_calls=FLAGS.num_parallel_calls)

        # 5) create batches of data
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

        # 6) prefetch data for faster consumption, based on your system and environment, allows the tf.data runtime to automatically tune the prefetch buffer sizes
        dataset = dataset.prefetch(FLAGS.prefetch_buffer_size)

        return dataset
    return _input_fn()

In [186]:
glob.glob(path_train_tfrecords+'/train*.tfrecords')

['data/mnist/tfrecord_numpy_train/train.tfrecords']

In [187]:
training_dataset = input_numpy_tfrecords_fn(glob.glob(path_train_tfrecords+'/train*.tfrecords'), 
                                            mode=tf.estimator.ModeKeys.TRAIN, 
                                            batch_size=FLAGS.batch_size)

INFO:tensorflow:input_dataset_fn: TRAIN, train
step 1
step 2
{'image_raw': <tf.Tensor 'ParseSingleExample/ParseSingleExample:0' shape=() dtype=string>, 'label': <tf.Tensor 'ParseSingleExample/ParseSingleExample:1' shape=() dtype=int64>}
TEST ()
TEST (?,)
the number of epoch: num_epoch = None


In [192]:
testing_dataset = input_numpy_tfrecords_fn(glob.glob(path_test_tfrecords+'/test*.tfrecords'), 
                                           mode=tf.estimator.ModeKeys.EVAL, 
                                           batch_size=len(x_test))

INFO:tensorflow:input_dataset_fn: EVAL, eval
step 1
step 2
{'image_raw': <tf.Tensor 'ParseSingleExample/ParseSingleExample:0' shape=() dtype=string>, 'label': <tf.Tensor 'ParseSingleExample/ParseSingleExample:1' shape=() dtype=int64>}
TEST ()
TEST (?,)
the number of epoch: num_epoch = 1


In [189]:
iterator = training_dataset.make_one_shot_iterator()
# next_element
features, labels = iterator.get_next()

In [197]:
n=0
n_iter=10

with tf.Session() as sess:
    while True:
        try:
            start_time = time.clock()
            x,y = sess.run([features, labels])
            print('iteration n:', n, 'execution time:', time.clock() - start_time, 'seconds')
            print(x.shape)
            print(y.shape)
            print('first label of the batch',np.argmax(y[0]),'\n')
            n+=1
            if n>=n_iter:
                print('number of iteration reached')
                break
        except tf.errors.OutOfRangeError:
            print('tf.errors.OutOfRangeError')
            break

iteration n: 0 execution time: 0.8165169999999762 seconds
(128, 784)
(128, 10)
first label of the batch 9 

iteration n: 1 execution time: 0.06866899999999987 seconds
(128, 784)
(128, 10)
first label of the batch 2 

iteration n: 2 execution time: 0.06222000000002481 seconds
(128, 784)
(128, 10)
first label of the batch 6 

iteration n: 3 execution time: 0.06054800000003979 seconds
(128, 784)
(128, 10)
first label of the batch 1 

iteration n: 4 execution time: 0.06055100000003222 seconds
(128, 784)
(128, 10)
first label of the batch 5 

iteration n: 5 execution time: 0.06222900000000209 seconds
(128, 784)
(128, 10)
first label of the batch 1 

iteration n: 6 execution time: 0.06071299999996427 seconds
(128, 784)
(128, 10)
first label of the batch 9 

iteration n: 7 execution time: 0.07545900000002348 seconds
(128, 784)
(128, 10)
first label of the batch 8 

iteration n: 8 execution time: 0.05505900000002839 seconds
(128, 784)
(128, 10)
first label of the batch 2 

iteration n: 9 execu

In [198]:
iterator = testing_dataset.make_one_shot_iterator()
# next_element
features, labels = iterator.get_next()

In [199]:
n=0

n_iter=10
with tf.Session() as sess:
    while True:
        try:
            start_time = time.clock()
            x,y = sess.run([features, labels])
            print('iteration n:', n, 'execution time:', time.clock() - start_time, 'seconds')
            print(x.shape)
            print(y.shape)
            print('first label of the batch',np.argmax(y[0]),'\n')
            n+=1
            if n>=n_iter:
                print('number of iteration reached')
                break
        except tf.errors.OutOfRangeError:
            print('tf.errors.OutOfRangeError')
            break

iteration n: 0 execution time: 4.225740000000087 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

tf.errors.OutOfRangeError


## TFRecord files based on images

In [526]:
os.environ['CLOUDSDK_PYTHON']='/Users/tarrade/anaconda3/bin/python'

In [527]:
!gcloud version

Google Cloud SDK 232.0.0
bq 2.0.40
core 2019.01.27
gsutil 4.35


In [530]:
os.environ['CLOUDSDK_PYTHON']='/Users/tarrade/anaconda/bin/python'

In [531]:
!gsutil cp ../data/*tfrecords gs://ml-productive-pipeline-53122

Copying file://../data/test.tfrecords [Content-Type=application/octet-stream]...
Copying file://../data/train.tfrecords [Content-Type=application/octet-stream]...
\ [2 files][ 56.2 MiB/ 56.2 MiB]    4.9 MiB/s                                   
Operation completed over 2 objects/56.2 MiB.                                     


In [491]:
def input_dataset_tfrecords_fn(filenames, batch_size=128, mode=tf.estimator.ModeKeys.TRAIN):
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        tf.logging.info("input_dataset_fn: PREDICT, {}".format(mode))
    elif mode == tf.estimator.ModeKeys.EVAL:
        tf.logging.info("input_dataset_fn: EVAL, {}".format(mode))
    elif mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info("input_dataset_fn: TRAIN, {}".format(mode))
    
    def _parser(record):
        # 1. define a parser
        features={
            #'label': tf.FixedLenFeature([], tf.int64),
            #'image_raw': tf.FixedLenFeature([], tf.train.FloatList)#tf.string)
            'label': tf.FixedLenFeature(shape=[], dtype=tf.int64),
            # The bytes_list data is parsed into tf.string.
            'image_raw': tf.FixedLenFeature(shape=[], dtype=tf.string)
        }
        print('step 1')
        parsed_record = tf.parse_single_example(record, features)
        print('step 2')
        print(parsed_record)
        # 2. Convert the data
        label = parsed_record['label']
        #image = parsed_record['image_raw']#
        image =  tf.cast(tf.decode_raw(parsed_record['image_raw'], out_type=tf.uint8), tf.float64)
        print('TEST',label.shape)
        print('TEST',image.shape)
        # 3. reshape
        #tf.reshape(image, [1,784])
        #print('---',image.shape())
        
        # 4. hot emcoding
        num_classes=10
        #label = tf.keras.utils.to_categorical( label, num_classes)
        label=tf.one_hot(label, num_classes)

        return image, label
        #dataset = tf.data.Dataset.from_tensor_slices((image, label))
        #return dataset
    
    def _normalize(image, label):
        """Convert image from [0, 255] -> [-0.5, 0.5] floats."""
        image = image * (1. / 255) - 0.5
        return image, label
    
    def _dense_to_one_hot(labels_dense, num_classes):
        """Convert class labels from scalars to one-hot vectors."""
        num_labels = labels_dense.shape[0]
        index_offset = numpy.arange(num_labels) * num_classes
        labels_one_hot = numpy.zeros((num_labels, num_classes))
        labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
        return labels_one_hot
    
    def _input_fn():
        # 1) read data from TFRecordDataset
        dataset = (tf.data.TFRecordDataset(filenames).map(_parser))
        
        # 2) shuffle (with a big enough buffer size)    :        
        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None # loop indefinitely
            dataset = dataset.shuffle(buffer_size=FLAGS.shuffle_buffer_size, seed=2)# depends on sample size
        else:
            num_epochs = 1 # end-of-input after this
        print('the number of epoch: num_epoch =', num_epochs)
        
        # caching data
        #dataset = dataset.cache()
    
        # 3) automatically refill the data queue when empty
        dataset = dataset.repeat(num_epochs)

        # 4) map
        dataset = dataset.map(map_func=_normalize, num_parallel_calls=FLAGS.num_parallel_calls)

        # 5) create batches of data
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

        # 6) prefetch data for faster consumption, based on your system and environment, allows the tf.data runtime to automatically tune the prefetch buffer sizes
        dataset = dataset.prefetch(FLAGS.prefetch_buffer_size)

        return dataset
    return _input_fn()

### Exploration dataset API

In [509]:
iterator = training_dataset.make_one_shot_iterator()

In [510]:
# next_element
features, labels = iterator.get_next()

In [511]:
n=0
n_iter=10 #len(x_train)//BATCH_SIZE
with tf.Session() as sess:
    while True:
        try:
            start_time = time.clock()
            x,y = sess.run([features, labels])
            print('iteration n:', n, 'execution time:', time.clock() - start_time, 'seconds')
            print(x.shape)
            print(y.shape)
            print('first label of the batch',np.argmax(y[0]),'\n')
            n+=1
            if n>=n_iter:
                print('number of iteration reached')
                break
        except tf.errors.OutOfRangeError:
            print('tf.errors.OutOfRangeError')
            break

iteration n: 0 execution time: 0.5324680000001081 seconds
(128, 784)
(128, 10)
first label of the batch 9 

iteration n: 1 execution time: 0.055957000000034895 seconds
(128, 784)
(128, 10)
first label of the batch 2 

iteration n: 2 execution time: 0.05217799999991257 seconds
(128, 784)
(128, 10)
first label of the batch 6 

iteration n: 3 execution time: 0.04504799999995157 seconds
(128, 784)
(128, 10)
first label of the batch 1 

iteration n: 4 execution time: 0.04535500000019965 seconds
(128, 784)
(128, 10)
first label of the batch 5 

iteration n: 5 execution time: 0.04520900000011352 seconds
(128, 784)
(128, 10)
first label of the batch 1 

iteration n: 6 execution time: 0.04962000000000444 seconds
(128, 784)
(128, 10)
first label of the batch 9 

iteration n: 7 execution time: 0.050853999999844746 seconds
(128, 784)
(128, 10)
first label of the batch 8 

iteration n: 8 execution time: 0.04601600000000872 seconds
(128, 784)
(128, 10)
first label of the batch 2 

iteration n: 9 exe

In [512]:
iterator = testing_dataset.make_one_shot_iterator()

In [513]:
# next_element
features, labels = iterator.get_next()

In [514]:
with tf.Session() as sess:
    while True:
        try:
            start_time = time.clock()
            x,y = sess.run([features, labels])
            print(time.clock() - start_time, 'seconds')
            print(x.shape)
            print(y.shape)
            print('first label of the batch',np.argmax(y[0]),'\n')
        except tf.errors.OutOfRangeError:
            print('tf.errors.OutOfRangeError')
            break

3.6481220000000576 seconds
(10000, 784)
(10000, 10)
first label of the batch 7 

tf.errors.OutOfRangeError
