# Setup

- Contains setup functions (inherited from data_exploration.ipynb)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf

tf.enable_eager_execution()
import numpy as np
import os
import datetime
import tqdm
import sys
import pprint

In [5]:
# Dataset creation
# Dataset reading
# placeholder definition

class FeatureProto(object):
    from collections import namedtuple
    
    proto = namedtuple('prototype', ['name', 'dtype', 'shape'])
    
    features = [
        proto(name='Elevation', dtype=tf.float32, shape=1),
        proto(name='Aspect', dtype=tf.float32, shape=1),
        proto(name='Slope', dtype=tf.float32, shape=1),
        proto(name='Horizontal_Distance_To_Hydrology', dtype=tf.float32, shape=1),
        proto(name='Vertical_Distance_To_Hydrology', dtype=tf.float32, shape=1),
        proto(name='Horizontal_Distance_To_Roadways', dtype=tf.float32, shape=1),
        proto(name='Hillshade_9am', dtype=tf.float32, shape=1),
        proto(name='Hillshade_Noon', dtype=tf.float32, shape=1),
        proto(name='Hillshade_3pm', dtype=tf.float32, shape=1),
        proto(name='Horizontal_Distance_To_Fire_Points', dtype=tf.float32, shape=1),
        proto(name='Wilderness_Area', dtype=tf.float32, shape=4),
        proto(name='Soil_Type', dtype=tf.float32, shape=40),
        proto(name='Cover_Type', dtype=tf.float32, shape=1),
    ]
    
    
    def dataset_creation(self, data):
        idx = 0
        collection = {}
        for prototype in self.features:
            feat_name = prototype.name
            dtype = prototype.dtype
            shape = prototype.shape
            
            if dtype == tf.float32:
                if shape == 1:
                    datum = data[idx]
                    encoded_feature = _float_feature(datum)
                else:
                    datum = data[idx: idx+shape]
                    encoded_feature = _tensor_feature(datum, 'float_list', tf.train.FloatList)
            else:
                raise NotImplementedError('dataset creation for non-float32 not supported')
            
            collection[feat_name] = encoded_feature
            idx += shape
        return collection
    
    def dataset_parsing(self):
        if hasattr(self, 'parser_proto'):
            return self.parser_proto
        else:
            parser_proto = {}
            for prototype in self.features:
                feat_name = prototype.name
                dtype = prototype.dtype
                shape = prototype.shape
                parser_proto[feat_name] = tf.FixedLenFeature(() if shape == 1 else (shape), dtype)
            self.parser_proto = parser_proto
            return self.parser_proto

#     def placeholder_creation():
#         """
#         Used Temporarily because 
#         """
#         parser_proto = {}
#         for prototype in self.features:
#             feat_name = prototype.name
#             dtype = prototype.dtype
#             shape = prototype.shape
#             parser_proto[feat_name] = tf.FixedLenFeature(() if shape == 1 else (shape), dtype)

feature_proto = FeatureProto()

In [6]:
def proto_wrap(feature_proto):
    features = feature_proto.dataset_parsing()
    def unpack(example_proto):
        parsed_features = tf.parse_single_example(example_proto, features)
        labels = parsed_features['Cover_Type']
        parsed_features.pop('Cover_Type')
        # Then, convert the dataset into tensors which tensorflow expects?
        parsed_features['Soil_Type'] = tf.convert_to_tensor(parsed_features['Soil_Type'])
        parsed_features['Wilderness_Area'] = tf.cast(tf.argmax(parsed_features['Wilderness_Area'], axis=0), dtype=tf.float32)
        labels = tf.one_hot(tf.cast(labels, dtype=tf.uint8), 8, on_value=1, off_value=0, axis=-1)

        return parsed_features, labels
    return unpack
 


def dataset_config(filenames: list, mapper=None, repeat=False, batch_size=32,
                  initializable=False, sess=None, feed_dict=None, num_cpus=None):
    dataset = tf.data.TFRecordDataset(filenames)
    
    if mapper is not None:
        dataset = dataset.map(mapper, num_parallel_calls=num_cpus)
        
    if repeat:
        dataset = dataset.repeat()
        
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=batch_size)
    
    if initializable:
        """
        An initializable iterator requires you to run an explicit iterator.initializer operation before using it. 
        In exchange for this inconvenience, it enables you to parameterize the definition of the dataset, 
        using one or more tf.placeholder() tensors that can be fed when you initialize the iterator
        """
        # Creates an Iterator for enumerating the elements of this dataset
        if sess is None:
            raise Exception('Initializable dataset configuration specified but session not supplied')
        iterator = dataset.make_initializable_iterator()
    else:
        """
        A one-shot iterator is the simplest form of iterator, which only supports iterating once through a dataset, 
        with no need for explicit initialization. One-shot iterators handle almost all of the cases that the existing 
        queue-based input pipelines support, but they do not support parameterization
        """
        iterator = dataset.make_one_shot_iterator()
        
    if initializable:
        assert feed_dict is not None, 'Supply feed dict to initializable iterator'
        sess.run(iterator.initializer, feed_dict=feed_dict)
    
    next_element = iterator.get_next()
    return next_element

In [7]:
filename_list = []
for dirname, dirnames, filenames in os.walk('processed_data/'):
    # print path to all subdirectories first.
    for f in filenames:
        filename_list.append('{}{}'.format(dirname, f))
print(filename_list)
dataset = tf.data.TFRecordDataset(filename_list)

num_cpus = os.cpu_count()
contextual_unpacker = proto_wrap(feature_proto)
training_dataset_next = dataset_config(filename_list, mapper=contextual_unpacker, num_cpus=num_cpus)

['processed_data/tf_record_covtype_test_2018-12-27 21:00:00', 'processed_data/tf_record_covtype_train_2018-12-27 21:00:00']


# Actual Code

features: A mapping from key to tensors. _FeatureColumns look up via these keys. For example numeric_column('price') will look at 'price' key in this dict. Values can be a SparseTensor or a Tensor depends on corresponding _FeatureColumn.
    
feature_columns: An iterable containing the FeatureColumns to use as inputs to your model. All items should be instances of classes derived from _DenseColumn such as numeric_column, embedding_column, bucketized_column, indicator_column. If you have categorical features, you can wrap them with an embedding_column or indicator_column


"""
price = numeric_column('price')
keywords_embedded = embedding_column(
    categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
columns = [price, keywords_embedded, ...]
features = tf.parse_example(..., features=make_parse_example_spec(columns))
dense_tensor = input_layer(features, columns)
for units in [128, 64, 32]:
  dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
prediction = tf.layers.dense(dense_tensor, 1)
"""

In [8]:
columns = [tf.feature_column.numeric_column(feat) for feat in ]

SyntaxError: invalid syntax (<ipython-input-8-49a8c8cfc5b2>, line 1)

In [None]:
dense_tensor = input_layer(features, columns)
for units in [256, 16, 8]:
    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
prediction = tf.layers.dense(dense_tensor, 1, tf.nn.softmax)