This code accompanies my blog post, where I go into more detail about the decisions I've made and any notes I took along the way.

My notes and reflections are linked here: [TODO]()

In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import tensorflow as tf

tf.enable_eager_execution()
import numpy as np
import os
import datetime
import tqdm
import sys
import pprint

# 1) Training Pipeline

## 1.1) Producer:

- takes `numpy` arrays or `CSVs`

- $\lambda$ `np.ndarray` -> `tf.TfRecords`

- loads from the `unprocessed_data` folder in this tutorial and writes to the `processed_data` folder

## 1.2) Provider:

- loads from `processed_data` folder 

- processes the data (so that the processing is part of the computation graph)

- loads `tf.TfRecords` and sends it directly to tensorflow. Avoids `feed_dict` which is [reportedly slow](https://www.tensorflow.org/guide/performance/overview#input_pipeline)

# Producer:

## Method 1

1: Load the numpy array

2: Conversion of a row of data into formats compatible with tf

3: Save it as a tfRecords file

## Method 2 (not shown here)

You can load it in as a tf.data.Dataset, then use the `experimental` library to construct a tfRecords file. A good resource for this is the [official docs](https://www.tensorflow.org/tutorials/load_data/tf-records#tfexample)


```
serialized_features_dataset = features_dataset.map(tf_serialize_example)
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(serialized_features_dataset)
```


# Dataset Information

    Elevation                               quantitative    meters                       Elevation in meters
    
    Aspect                                  quantitative    azimuth                      Aspect in degrees azimuth
    
    Slope                                   quantitative    degrees                      Slope in degrees
    
    Horizontal_Distance_To_Hydrology        quantitative    meters                       Horz Dist to nearest surface water features
    
    Vertical_Distance_To_Hydrology          quantitative    meters                       Vert Dist to nearest surface water features
    
    Horizontal_Distance_To_Roadways         quantitative    meters                       Horz Dist to nearest roadway
    
    Hillshade_9am                           quantitative    0 to 255 index               Hillshade index at 9am, summer solstice
    
    Hillshade_Noon                          quantitative    0 to 255 index               Hillshade index at noon, summer solstice
    
    Hillshade_3pm                           quantitative    0 to 255 index               Hillshade index at 3pm, summer solstice
    
    Horizontal_Distance_To_Fire_Points      quantitative    meters                       Horz Dist to nearest wildfire ignition points
    
    Wilderness_Area (4 binary columns)      qualitative     0 (absence) or 1 (presence)  Wilderness area designation
    
    Soil_Type (40 binary columns)           qualitative     0 (absence) or 1 (presence)  Soil Type designation
    
    Cover_Type (7 types)                    integer         1 to 7                       Forest Cover Type designation
    

In [14]:
# Parse records

def _bytes_feature(value):
    """Wrap one string / bytes value in a ``tf.train.Feature`` backed by a one-element ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap one float / double value in a ``tf.train.Feature`` backed by a one-element ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint value in a ``tf.train.Feature`` backed by a one-element ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _tensor_feature(value, kwd, underlying_type):
    """Wrap a sequence of values in a ``tf.train.Feature``.

    Args:
        value: the values to store; passed unchanged as ``value=`` to
            ``underlying_type``.
        kwd: name of the ``tf.train.Feature`` keyword argument that receives the
            typed list (the caller in this file passes ``'float_list'``).
        underlying_type: the typed-list constructor matching ``kwd`` (the caller
            in this file passes ``tf.train.FloatList``).

    Returns:
        A ``tf.train.Feature`` built with ``{kwd: underlying_type(value=value)}``.

    Note:
        An earlier attempt converted ``value`` to a tensor, ran
        ``tf.serialize_tensor`` on it and stored the result in a ``BytesList``.
        That was abandoned because ``tf.train.Feature`` expects bytes, not a
        Tensor of bytes -- avoid that route.
    """
    typed_list = underlying_type(value=value)
    return tf.train.Feature(**{kwd: typed_list})

In [19]:
# Prototype class - enables easier feature definition and better for refactoring

class FeatureProto(object):
    """Shared description of the record layout, used for both writing and parsing.

    ``features`` lists every feature in the order its columns appear in a raw
    data row. ``dataset_creation`` uses the list to encode one row into
    ``tf.train.Feature`` objects, and ``dataset_parsing`` uses the same list to
    build the matching parsing spec, so both directions stay in sync.
    """
    from collections import namedtuple

    # name  : key the feature is stored under in the encoded / parsed record
    # dtype : dtype used for the feature (only tf.float32 is supported when encoding)
    # shape : number of consecutive columns the feature occupies in a raw row
    proto = namedtuple('prototype', ['name', 'dtype', 'shape'])

    features = [
        proto(name='Elevation', dtype=tf.float32, shape=1),
        proto(name='Aspect', dtype=tf.float32, shape=1),
        proto(name='Slope', dtype=tf.float32, shape=1),
        proto(name='Horizontal_Distance_To_Hydrology', dtype=tf.float32, shape=1),
        proto(name='Vertical_Distance_To_Hydrology', dtype=tf.float32, shape=1),
        proto(name='Horizontal_Distance_To_Roadways', dtype=tf.float32, shape=1),
        proto(name='Hillshade_9am', dtype=tf.float32, shape=1),
        proto(name='Hillshade_Noon', dtype=tf.float32, shape=1),
        proto(name='Hillshade_3pm', dtype=tf.float32, shape=1),
        proto(name='Horizontal_Distance_To_Fire_Points', dtype=tf.float32, shape=1),
        proto(name='Wilderness_Area', dtype=tf.float32, shape=4),
        proto(name='Soil_Type', dtype=tf.float32, shape=40),
        proto(name='Cover_Type', dtype=tf.float32, shape=1),
    ]

    def dataset_creation(self, data):
        """Encode one raw row into a ``{feature name: tf.train.Feature}`` dict.

        Args:
            data: an indexable row whose columns follow the order of
                ``features``; each feature consumes ``shape`` consecutive columns.

        Returns:
            dict mapping every feature name to its encoded ``tf.train.Feature``.

        Raises:
            NotImplementedError: if a feature declares a dtype other than
                ``tf.float32``.
        """
        idx = 0  # column at which the current feature starts
        collection = {}
        for prototype in self.features:
            if prototype.dtype != tf.float32:
                raise NotImplementedError('dataset creation for non-float32 not supported')

            width = prototype.shape
            if width == 1:
                # Scalar feature: a single column stored as a one-element FloatList.
                encoded_feature = _float_feature(data[idx])
            else:
                # Multi-column feature: the whole slice goes into one FloatList.
                encoded_feature = _tensor_feature(
                    data[idx: idx + width], 'float_list', tf.train.FloatList
                )

            collection[prototype.name] = encoded_feature
            idx += width
        return collection

    def dataset_parsing(self):
        """Return the ``{feature name: tf.FixedLenFeature}`` parsing spec.

        The spec is built on first use and cached on the instance as
        ``parser_proto``; later calls return the cached dict.
        """
        if not hasattr(self, 'parser_proto'):
            self.parser_proto = {
                prototype.name: tf.FixedLenFeature(
                    # Width-1 features parse as scalars. Wider ones need a real
                    # 1-tuple: ``(shape)`` is just the int, ``(shape,)`` is the tuple.
                    () if prototype.shape == 1 else (prototype.shape,),
                    prototype.dtype,
                )
                for prototype in self.features
            }
        return self.parser_proto


# Module-level schema instance; later cells pass it to generate_samples and proto_wrap.
feature_proto = FeatureProto()

In [20]:
def load_data(path='unprocessed_data/covtype.data', train_fraction=0.8):
    """Load a comma-separated data file and split its row indices into train / test.

    The split is positional (no shuffling): the first ``train_fraction`` of the
    rows become the training indices and the remainder the test indices.

    Args:
        path: location of the comma-delimited file to read.
        train_fraction: share of rows, between 0 and 1 inclusive, assigned to
            the training split.

    Returns:
        Tuple ``(loaded, all_ind, train_ind, test_ind)`` where ``loaded`` is the
        2-D array of rows and the other three are integer index arrays into it.

    Raises:
        ValueError: if ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError('train_fraction must be between 0 and 1, got {}'.format(train_fraction))

    # Avoid tf.contrib since we want to get our hands dirty.
    # ndmin=2 keeps a single-row file 2-D so that len() counts rows, not columns.
    loaded = np.loadtxt(path, delimiter=',', ndmin=2)
    print(loaded.shape)

    all_ind = np.arange(0, len(loaded))
    split_at = int(len(loaded) * train_fraction)  # computed once so both slices agree
    train_ind = all_ind[:split_at]
    test_ind = all_ind[split_at:]

    return loaded, all_ind, train_ind, test_ind

# Loading is disabled by default. The names unpacked on the next line (`loaded`,
# `train_ind`, `test_ind`) are the ones generate_samples reads as globals.
#loaded, all_ind, train_ind, test_ind = load_data()
print('Uncomment above line to load in data from unprocessed_data folder')

Uncomment above line to load in data from unprocessed_data folder


In [18]:
def generate_samples(feature_proto, data=None, splits=None):
    """Write one TFRecord file per split into the ``processed_data`` folder.

    Args:
        feature_proto: object whose ``dataset_creation(row)`` returns the
            ``{name: tf.train.Feature}`` dict for a single row.
        data: 2-D array of raw rows. Defaults to the module-level ``loaded``.
        splits: iterable of ``(split_name, row_indices)`` pairs. Defaults to
            ``(('train', train_ind), ('test', test_ind))`` taken from the
            module-level names.

    Side effects:
        Creates ``processed_data`` if it is missing and writes a file named
        ``tf_record_covtype_<split>_<timestamp>`` for every split. The timestamp
        is the current time rounded down to the hour, so a re-run within the
        same hour reuses the same filenames.
    """
    # Fall back to the notebook globals so generate_samples(feature_proto) still works.
    if data is None:
        data = loaded
    if splits is None:
        splits = (('train', train_ind), ('test', test_ind))

    try:
        os.mkdir('processed_data')
    except FileExistsError:
        # Only an existing folder is expected; any other OSError now propagates.
        print('Folder exists')

    # Round to the previous hour. Computed once so that every split shares the
    # same stamp even if the run crosses an hour boundary.
    stamp = datetime.datetime.now().replace(microsecond=0, second=0, minute=0)

    for split_name, indices in splits:
        # NOTE: str(stamp) contains a space and colons, which some filesystems reject.
        filename = 'processed_data/tf_record_covtype_{}_{}'.format(split_name, stamp)
        with tf.python_io.TFRecordWriter(filename) as writer:
            for i in tqdm.tqdm_notebook(indices):
                datum = data[i, :]
                feature = feature_proto.dataset_creation(datum)
                example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(example_proto.SerializeToString())

        # using your storage system -S3 or some other file hosting service, add the export here
# When called with only feature_proto, generate_samples reads the module-level
# `loaded`, `train_ind` and `test_ind`, so the load_data() line in the earlier
# cell must be uncommented and run first.
#generate_samples(feature_proto)
print('Uncomment line above to use feature_proto to generate the tfRecords files')

HBox(children=(IntProgress(value=0, max=464809), HTML(value='')))




HBox(children=(IntProgress(value=0, max=116203), HTML(value='')))




# Provider

Cell 1: Initialize the loader

- even though both the train and the test data end up in the tfRecordDataset, we pretend that they're two different runs of our pre-processor

- The proto_wrap function is unnecessary here but for the sake of clarity I left it in. In the next tutorial, where I show you how to use the loaded data, we will remove it

Cell 2: Provide

- return the iterator's next element (a batch of features and labels), which you can use to step through your dataset

In [7]:
def proto_wrap(feature_proto):
    """Build a per-record parser bound to ``feature_proto``'s parsing spec.

    Args:
        feature_proto: object whose ``dataset_parsing()`` returns the feature
            spec handed to ``tf.parse_single_example``.

    Returns:
        A function ``unpack(example_proto) -> (parsed_features, labels)``
        suitable for ``Dataset.map``.
    """
    schema = feature_proto.dataset_parsing()

    def unpack(example_proto):
        """Parse one serialized example into a feature dict and one-hot labels."""
        parsed = tf.parse_single_example(example_proto, schema)

        # The label must not stay among the input features.
        cover_type = parsed.pop('Cover_Type')

        # Then, convert the dataset into tensors which tensorflow expects?
        parsed['Soil_Type'] = tf.convert_to_tensor(parsed['Soil_Type'])

        # Collapse the binary Wilderness_Area columns into the index of the largest one.
        wilderness_index = tf.argmax(parsed['Wilderness_Area'], axis=0)
        parsed['Wilderness_Area'] = tf.cast(wilderness_index, dtype=tf.float32)

        # Depth 8 with labels documented as 1 to 7 leaves index 0 unused.
        one_hot_labels = tf.one_hot(
            tf.cast(cover_type, dtype=tf.uint8), 8, on_value=1, off_value=0, axis=-1
        )

        return parsed, one_hot_labels

    return unpack
 


def dataset_config(filenames: list, mapper=None, repeat=False, batch_size=32,
                  initializable=False, sess=None, feed_dict=None, num_cpus=None):
    """Build a batched TFRecord input pipeline and return its next element.

    Args:
        filenames: TFRecord files to read.
        mapper: optional per-record function applied with ``Dataset.map``.
        repeat: when true, the dataset repeats indefinitely.
        batch_size: records per batch; also used as the prefetch buffer size.
        initializable: use an initializable iterator instead of a one-shot one.
        sess: session used to run the initializer (required when
            ``initializable`` is true).
        feed_dict: values fed while running the initializer (required when
            ``initializable`` is true).
        num_cpus: forwarded to ``Dataset.map`` as ``num_parallel_calls``.

    Returns:
        The result of ``iterator.get_next()`` for the configured dataset.

    Raises:
        Exception: ``initializable`` is true but ``sess`` is None.
        AssertionError: ``initializable`` is true but ``feed_dict`` is None.
    """
    records = tf.data.TFRecordDataset(filenames)

    if mapper is not None:
        records = records.map(mapper, num_parallel_calls=num_cpus)

    if repeat:
        records = records.repeat()

    records = records.batch(batch_size).prefetch(buffer_size=batch_size)

    if not initializable:
        # A one-shot iterator is the simplest form of iterator, which only supports
        # iterating once through a dataset, with no need for explicit initialization.
        # One-shot iterators handle almost all of the cases that the existing
        # queue-based input pipelines support, but they do not support parameterization.
        return records.make_one_shot_iterator().get_next()

    # An initializable iterator requires you to run an explicit iterator.initializer
    # operation before using it. In exchange for this inconvenience, it enables you
    # to parameterize the definition of the dataset, using one or more
    # tf.placeholder() tensors that can be fed when you initialize the iterator.
    if sess is None:
        raise Exception('Initializable dataset configuration specified but session not supplied')
    iterator = records.make_initializable_iterator()

    assert feed_dict is not None, 'Supply feed dict to initializable iterator'
    sess.run(iterator.initializer, feed_dict=feed_dict)

    return iterator.get_next()

In [8]:
# Collect every file found under processed_data/, including nested folders.
filename_list = []
for dirname, dirnames, filenames in os.walk('processed_data/'):
    for f in filenames:
        # os.path.join inserts the separator that plain concatenation drops for
        # files inside subdirectories (os.walk yields those without a trailing slash).
        filename_list.append(os.path.join(dirname, f))
# os.walk makes no ordering promise; sort so the file order is reproducible.
filename_list.sort()
print(filename_list)
dataset = tf.data.TFRecordDataset(filename_list)

num_cpus = os.cpu_count()
# Bind the shared schema to the per-record parser, then build the batched pipeline.
contextual_unpacker = proto_wrap(feature_proto)
training_dataset_next = dataset_config(filename_list, mapper=contextual_unpacker, num_cpus=num_cpus)

['processed_data/tf_record_covtype_test_2018-12-27 21:00:00', 'processed_data/tf_record_covtype_train_2018-12-27 21:00:00']


In [9]:
# Lazy execution
# Graph-mode variant, left commented out; this notebook enables eager execution
# in its imports cell.

# init = tf.global_variables_initializer()
# with tf.Session() as sess:
#     sess.run(init)
#     features, label = sess.run(training_dataset_next)
    
    
# Eager execution
# training_dataset_next is what dataset_config returned from iterator.get_next();
# the mapper produces (parsed_features, labels) pairs, so it unpacks into two names.
features, label = training_dataset_next

# Done! 

And with that, we're done! We've 

1) taken a non-trivial dataset, 
2) converted it into a `tfRecord`
3) shown how to unload it and read from it