In [14]:
import os
import sys
import pickle
import numpy as np
import tensorflow as tf

In [3]:
from modis_utils.misc import restore_data, cache_data

In [4]:
data_dir = '../sequence_data/12'

In [6]:
def _get_file_names():
    """Returns the file names expected to exist in the input_dir."""
    file_names = {}
    file_names['train'] = ['data_batch_%d' % i for i in range(1, 5)]
    file_names['validation'] = ['data_batch_5']
    file_names['eval'] = ['test_batch']
    return file_names

In [44]:
def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

In [62]:
def convert_to_tfrecord(input_files, output_file):
    """Converts a file to TFRecords."""
    print('Generating %s' % output_file)
    with tf.python_io.TFRecordWriter(output_file) as record_writer:
        for input_file in input_files:
            inputs, labels, inputs_pw, labels_pw = restore_data(input_file)
            num_entries_in_batch = len(inputs)
            for i in range(num_entries_in_batch):
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'inputs': _float_feature(inputs[i].flatten().tolist()),
                        'labels': _float_feature(labels[i].flatten().tolist())
                    }))
                record_writer.write(example.SerializeToString())

In [63]:
for subset in ('val', 'test', 'train'):
    input_dir = os.path.join(data_dir, subset)
    input_files = [os.path.join(input_dir, f) for f in os.listdir(input_dir)]
    output_file = os.path.join(data_dir, subset + '.tfrecords')
    try:
        os.remove(output_file)
    except OSError:
        pass
    # Convert to tf.train.Example and write the to TFRecords.
    convert_to_tfrecord(input_files, output_file)
    print('Done {}!'.format(subset))

Generating ../sequence_data/12/val.tfrecords
Done val!
Generating ../sequence_data/12/test.tfrecords
Done test!
Generating ../sequence_data/12/train.tfrecords
Done train!


In [30]:
n_examples = {'train': 0, 'val': 0, 'test': 0}
for subset in ('train', 'val', 'test'):
    n = 0
    subset_data_dir = os.path.join(data_dir, subset)
    for filename in os.listdir(subset_data_dir):
        data = restore_data(os.path.join(subset_data_dir, filename))
        n += len(data[0])
    n_examples[subset] = n
print(n_examples)

{'train': 15534, 'val': 1380, 'test': 2454}


In [74]:
in_steps = 14
out_steps = 12
HEIGHT = 32
WIDTH = 32
DEPTH = 1


class BCLDataSet(object):

  def __init__(self, data_dir, subset='train', use_distortion=True):
    self.data_dir = data_dir
    self.subset = subset
    self.use_distortion = use_distortion

  def get_filenames(self):
    if self.subset in ['train', 'val', 'test']:
      return [os.path.join(self.data_dir, self.subset + '.tfrecords')]
    else:
      raise ValueError('Invalid data subset "%s"' % self.subset)

  def parser(self, serialized_example):
    """Parses a single tf.Example into image and label tensors."""
    # Dimensions of the images in the CIFAR-10 dataset.
    # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
    # input format.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'inputs': tf.FixedLenFeature([in_steps * DEPTH * HEIGHT * WIDTH], tf.float32),
            'labels': tf.FixedLenFeature([out_steps * DEPTH * HEIGHT * WIDTH], tf.float32),
        })
    #features['inputs'].set_shape([in_steps * DEPTH * HEIGHT * WIDTH])
    #features['labels'].set_shape([out_steps * DEPTH * HEIGHT * WIDTH])

    # Reshape from [in_steps * depth * height * width] to [in_steps, height, width, depth].
    inputs = tf.reshape(features['inputs'], [in_steps, HEIGHT, WIDTH, DEPTH])
    labels = tf.reshape(features['labels'], [out_steps, HEIGHT, WIDTH, DEPTH])

    return inputs, labels

  def make_batch(self, batch_size):
    """Read the images and labels from 'filenames'."""
    filenames = self.get_filenames()
    # Repeat infinitely.
    dataset = tf.data.TFRecordDataset(filenames).repeat()

    # Parse records.
    dataset = dataset.map(
        self.parser, num_parallel_calls=batch_size)

    # Potentially shuffle records.
    if self.subset == 'train':
      min_queue_examples = int(
          BCLDataSet.num_examples_per_epoch(self.subset) * 0.4)
      # Ensure that the capacity is sufficiently large to provide good random
      # shuffling.
      dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)

    # Batch it up.
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    inputs_batch, labels_batch = iterator.get_next()

    return inputs_batch, labels_batch

  @staticmethod
  def num_examples_per_epoch(subset='train'):
    if subset == 'train':
      return 15534
    elif subset == 'val':
      return 1380
    elif subset == 'test':
      return 2454
    else:
      raise ValueError('Invalid data subset "%s"' % subset)

In [75]:
bcl_dataset = BCLDataSet(data_dir, 'train', False)

In [76]:
batches = bcl_dataset.make_batch(32)

features[inputs] = Tensor("ParseSingleExample/ParseSingleExample:0", shape=(14336,), dtype=float32)
features[labels] = Tensor("ParseSingleExample/ParseSingleExample:1", shape=(12288,), dtype=float32)


In [78]:
type(batches)

tuple

In [79]:
type(batches[0])

tensorflow.python.framework.ops.Tensor

In [81]:
batches[1].shape

TensorShape([Dimension(None), Dimension(12), Dimension(32), Dimension(32), Dimension(1)])

In [84]:
b = batches[0][2]

In [88]:
c = batches[0].get_next()

AttributeError: 'Tensor' object has no attribute 'get_next'