This code accompanies my blog post, where I go into more detail about the decisions I've made, along with any additional notes.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf

tf.enable_eager_execution()
import numpy as np
import os
import datetime
import tqdm
import sys

# 1) Training Pipeline

## 1.1) Producer:

- takes `numpy` arrays or `CSVs`

- $\lambda$ `np.ndarray` -> `tf.TfRecords`

- loads from the `unprocessed_data` folder in this tutorial and writes to the `processed_data` folder

## 1.2) Provider:

- loads from `processed_data` folder 

- processes the data (so that the processing is part of the computation graph)

- loads `tf.TfRecords` and sends it directly to tensorflow. Avoids `feed_dict` which is [reportedly slow](https://www.tensorflow.org/guide/performance/overview#input_pipeline)

# Producer:

## Method 1

Cell 1: Load the numpy array

Cell 2: Conversion of a row of data into formats compatible with tf

Cell 3: Save it as a tfRecords file

## Method 2 (not shown here)

You can load it in as a `tf.data.Dataset`, then use the `experimental` library to construct a TFRecords file. A good resource for this is the [official docs](https://www.tensorflow.org/tutorials/load_data/tf-records#tfexample)


```
serialized_features_dataset = features_dataset.map(tf_serialize_example)
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(serialized_features_dataset)
```


In [3]:
# Avoid tf.contrib since we want to get our hands dirty
loaded = np.loadtxt('unprocessed_data/covtype.data', delimiter=',')
print(loaded.shape)

# Row indices for an ordered 80/20 split: the first 80% of rows are used for
# training and the remainder for testing (rows are not shuffled).
all_ind = np.arange(len(loaded))
train_ind, test_ind = np.split(all_ind, [int(len(loaded) * 0.8)])

(581012, 55)


In [4]:
# Parse records

"""
Elevation                               quantitative    meters                       Elevation in meters
Aspect                                  quantitative    azimuth                      Aspect in degrees azimuth
Slope                                   quantitative    degrees                      Slope in degrees
Horizontal_Distance_To_Hydrology        quantitative    meters                       Horz Dist to nearest surface water features
Vertical_Distance_To_Hydrology          quantitative    meters                       Vert Dist to nearest surface water features
Horizontal_Distance_To_Roadways         quantitative    meters                       Horz Dist to nearest roadway
Hillshade_9am                           quantitative    0 to 255 index               Hillshade index at 9am, summer solstice
Hillshade_Noon                          quantitative    0 to 255 index               Hillshade index at noon, summer solstice
Hillshade_3pm                           quantitative    0 to 255 index               Hillshade index at 3pm, summer solstice
Horizontal_Distance_To_Fire_Points      quantitative    meters                       Horz Dist to nearest wildfire ignition points
Wilderness_Area (4 binary columns)      qualitative     0 (absence) or 1 (presence)  Wilderness area designation
Soil_Type (40 binary columns)           qualitative     0 (absence) or 1 (presence)  Soil Type designation
Cover_Type (7 types)                    integer         1 to 7                       Forest Cover Type designation
"""
def _bytes_feature(value):
    """Wrap a single string / byte value in a `tf.train.Feature` holding a `BytesList`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a `tf.train.Feature` holding a `FloatList`."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a `tf.train.Feature` holding an `Int64List`."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _tensor_feature(value, kwd, underlying_type):
    """Build a multi-valued `tf.train.Feature` from a sequence of values.

    Args:
        value: The values to store; passed as `value=` to `underlying_type`.
        kwd: Name of the `tf.train.Feature` keyword argument to populate
            (e.g. 'float_list').
        underlying_type: Constructor of the list message that matches `kwd`
            (e.g. `tf.train.FloatList`).

    Returns:
        A `tf.train.Feature` whose `kwd` field holds `underlying_type(value=value)`.

    Note:
        An earlier attempt converted `value` with `tf.convert_to_tensor`, ran
        it through `tf.serialize_tensor`, and stored the result in a
        `BytesList`. That did not work: `tf.train.Feature` expects bytes, not
        a Tensor of bytes, so avoid that approach.
    """
    typed_list = underlying_type(value=value)
    feature_kwargs = {kwd: typed_list}
    return tf.train.Feature(**feature_kwargs)


In [5]:
def generate_samples():
    """Serialize the train and test rows of `loaded` into TFRecord files.

    For each split ('train' -> `train_ind`, 'test' -> `test_ind`) one file is
    written to `processed_data/`, named with the split and the current time
    rounded down to the hour. Every row becomes one `tf.train.Example`.

    Column layout of a row (55 columns, in the order of the dataset
    description above):
        0-9    ten quantitative columns (Elevation ... Fire_Points)
        10-13  Wilderness_Area (4 binary columns)
        14-53  Soil_Type (40 binary columns)
        54     Cover_Type

    The previous mapping skipped column 5 (Horizontal_Distance_To_Roadways),
    which shifted the Hillshade_* and Fire_Points features by one column and
    made Wilderness_Area read columns 9-12 instead of 10-13. Records written
    with the old mapping contain misaligned values and should be regenerated.
    """
    # TFRecordWriter does not create missing parent directories.
    os.makedirs('processed_data', exist_ok=True)

    for split_name, row_indices in [('train', train_ind), ('test', test_ind)]:

        # NOTE(review): the formatted timestamp contains a space and colons,
        # which are not valid in file names on every platform.
        filename = 'processed_data/tf_record_covtype_{}_{}'.format(
            split_name, datetime.datetime.now().replace(microsecond=0, second=0, minute=0)
        )  # Round to the previous hour
        with tf.python_io.TFRecordWriter(filename) as writer:
            for i in tqdm.tqdm_notebook(row_indices):
                datum = loaded[i, :]
                feature = {
                    'Elevation': _float_feature(datum[0]),
                    'Aspect': _float_feature(datum[1]),
                    'Slope': _float_feature(datum[2]),
                    'Horizontal_Distance_To_Hydrology': _float_feature(datum[3]),
                    'Vertical_Distance_To_Hydrology': _float_feature(datum[4]),
                    'Horizontal_Distance_To_Roadways': _float_feature(datum[5]),
                    'Hillshade_9am': _float_feature(datum[6]),
                    'Hillshade_Noon': _float_feature(datum[7]),
                    'Hillshade_3pm': _float_feature(datum[8]),
                    'Horizontal_Distance_To_Fire_Points': _float_feature(datum[9]),
                    'Wilderness_Area': _tensor_feature(datum[10:14], 'float_list', tf.train.FloatList),
                    'Soil_Type': _tensor_feature(datum[14:54], 'float_list', tf.train.FloatList),
                    # Key is spelled with a space; the parser reads it under the same name.
                    'Cover Type': _float_feature(datum[54])
                }
                example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(example_proto.SerializeToString())

        # using your storage system -S3 or some other file hosting service, add the export here

# Provider

Cell 1: Initialize the loader

- even though both the train and the test data end up in the same `TFRecordDataset`, we pretend that they come from two different runs of our pre-processor

Cell 2: Provide

In [6]:
def unpack(example_proto):
    """Parse one serialized `tf.train.Example` into a (features, label) pair.

    Args:
        example_proto: A single serialized example, as read from a TFRecord file.

    Returns:
        A tuple `(parsed_features, labels)`. `parsed_features` is the dict
        returned by `tf.parse_single_example` with the 'Cover Type' entry
        removed; `labels` is that removed 'Cover Type' entry.
    """
    scalar_keys = [
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ]

    # Parsing schema: every entry is a fixed-length float32 feature.
    features = {key: tf.FixedLenFeature((), tf.float32) for key in scalar_keys}
    # The shape is passed as a bare int (the original `(4)` / `(40)` are ints,
    # not tuples) -- presumably interpreted as a 1-D shape of that length.
    features['Wilderness_Area'] = tf.FixedLenFeature(4, tf.float32)
    features['Soil_Type'] = tf.FixedLenFeature(40, tf.float32)
    features['Cover Type'] = tf.FixedLenFeature((), tf.float32)

    parsed_features = tf.parse_single_example(example_proto, features)

    # Split the label off so that only model inputs remain in the dict.
    labels = parsed_features.pop('Cover Type')

    return parsed_features, labels
 


def dataset_config(filenames: list, mapper=None, repeat=False, batch_size=32,
                  initializable=False, sess=None, feed_dict=None, num_cpus=None):
    """Build a batched, prefetching TFRecord input pipeline and return its iterator.

    Args:
        filenames: TFRecord file paths handed to `tf.data.TFRecordDataset`.
        mapper: Optional function applied to every record via `dataset.map`.
        repeat: If true, the dataset is repeated indefinitely.
        batch_size: Number of records per batch; also used as the prefetch
            buffer size.
        initializable: If true, create an initializable iterator and run its
            initializer in `sess`; otherwise create a one-shot iterator.
        sess: Session used to run the initializer (required when
            `initializable` is true).
        feed_dict: Feed dict passed when running the initializer (required
            when `initializable` is true).
        num_cpus: Passed to `dataset.map` as `num_parallel_calls`.

    Returns:
        The dataset iterator. No element is fetched from it here: the previous
        version called `iterator.get_next()` and discarded the result, which
        under eager execution pulls a batch immediately, so callers never saw
        the first batch.

    Raises:
        Exception: If `initializable` is true and `sess` is None.
        AssertionError: If `initializable` is true and `feed_dict` is None.
    """
    # Validate the initializable-iterator arguments before building anything.
    if initializable:
        if sess is None:
            raise Exception('Initializable dataset configuration specified but session not supplied')
        assert feed_dict is not None, 'Supply feed dict to initializable iterator'

    dataset = tf.data.TFRecordDataset(filenames)

    if mapper is not None:
        dataset = dataset.map(mapper, num_parallel_calls=num_cpus)

    if repeat:
        dataset = dataset.repeat()

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=batch_size)

    if initializable:
        # An initializable iterator requires you to run an explicit
        # iterator.initializer operation before using it. In exchange for this
        # inconvenience, it enables you to parameterize the definition of the
        # dataset, using one or more tf.placeholder() tensors that can be fed
        # when you initialize the iterator.
        # NOTE(review): initializable iterators are a graph-mode feature --
        # confirm this branch is only used without eager execution.
        iterator = dataset.make_initializable_iterator()
        sess.run(iterator.initializer, feed_dict=feed_dict)
    else:
        # A one-shot iterator is the simplest form of iterator, which only
        # supports iterating once through a dataset, with no need for explicit
        # initialization. One-shot iterators handle almost all of the cases
        # that the existing queue-based input pipelines support, but they do
        # not support parameterization.
        iterator = dataset.make_one_shot_iterator()

    return iterator

In [7]:
# Collect every file under processed_data/, including nested directories.
filename_list = []
for dirname, dirnames, filenames in os.walk('processed_data/'):
    for f in filenames:
        # os.walk yields nested directory names without a trailing separator,
        # so plain string concatenation would glue directory and file name
        # together; os.path.join inserts the separator only when needed.
        filename_list.append(os.path.join(dirname, f))
print(filename_list)
dataset = tf.data.TFRecordDataset(filename_list)

num_cpus = os.cpu_count()
# Despite its name, `training_dataset` holds the iterator returned by dataset_config.
training_dataset = dataset_config(filename_list, mapper=unpack, num_cpus=num_cpus)

['processed_data/tf_record_covtype_train_2018-12-26 18:00:00', 'processed_data/tf_record_covtype_test_2018-12-26 17:00:00', 'processed_data/tf_record_covtype_train_2018-12-26 17:00:00']


In [8]:
# get_next() takes no element count -- its only optional argument is an op
# name -- so the stray positional `1` is dropped. Each call yields one batch.
features, label = training_dataset.get_next()