In [None]:
# Ensure TensorFlow 1.13.1 is installed: the output of `pip3 freeze` is searched
# for the 1.13.1 pin, and the install after `||` only runs when grep finds no match.
!pip3 freeze | grep tensorflow==1.13.1 || pip3 install tensorflow==1.13.1

In [None]:
import tensorflow as tf

# Switch TF 1.x to eager execution, presumably so the feature-column outputs in
# the cells below can be inspected as concrete values.
tf.enable_eager_execution()
# Only emit TensorFlow log messages at ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)

## Intro

The `tf.feature_column` package provides several options for encoding categorical data. This mini-lab gives you an opportunity to explore and understand these options.

In [22]:
# Toy features dictionary: five example homes, with one list entry per home
# under each feature name.
features = {
    "sq_footage": [1000, 2000, 3000, 4000, 5000],
    "house_type": ["house", "house", "apt", "apt", "townhouse"],
}

#### Feature Column Definition

We have one continuous feature and one categorical feature.

Note that the category 'townhouse' is not in our vocabulary list, i.e. it is out-of-vocabulary (OOV for short).

In [None]:
# One continuous column plus an indicator (one-hot style) encoding of the
# categorical 'house_type' column.
feat_cols = [
    tf.feature_column.numeric_column(key='sq_footage'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key='house_type',
            vocabulary_list=['house', 'apt'],  # 'townhouse' is left out of the vocabulary
        )
    ),
]

#### Inspect Transformed Data

This is what the input to your model would be after the features are transformed by the feature column specification.

In [None]:
# Apply the feature columns to the toy features and display the resulting
# dense tensor as the cell output.
tf.feature_column.input_layer(features=features, feature_columns=feat_cols)

### Exercise 1

What is the current encoding behavior for the OOV value?

Modify the feature column to have OOV values default to the 'house' category.

In [None]:
# Exercise 1: rebuild the feature columns so that OOV house types are encoded
# as 'house'. The vocabulary column below is intentionally left without
# arguments for you to complete.
feat_cols = [
    tf.feature_column.numeric_column('sq_footage'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            # TODO: pass the feature key and the vocabulary list, and make
            # OOV values default to the 'house' category.
        ))
]

# Display the transformed features to check how 'townhouse' is now encoded.
tf.feature_column.input_layer(features,feat_cols)

### Exercise 2

Now modify the feature column to have OOV values be assigned to a separate 'catch-all' category.


In [None]:
# Exercise 2: rebuild the feature columns so that OOV house types get their
# own separate 'catch-all' category. The vocabulary column below is
# intentionally left without arguments for you to complete.
feat_cols = [
    tf.feature_column.numeric_column('sq_footage'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            # TODO: pass the feature key and the vocabulary list, and route
            # OOV values to a separate catch-all category.
        ))
]

# Display the transformed features to check how 'townhouse' is now encoded.
tf.feature_column.input_layer(features,feat_cols)

### Exercise 3

Assume we didn't have a vocabulary list available. Modify the feature column to one-hot encode house type based on a hash function.

What is the minimum hash size to ensure no collisions?

In [None]:
# Exercise 3: encode 'house_type' without a vocabulary list by hashing each
# value into a bucket. The hash-bucket column below is intentionally left
# without arguments for you to complete.
feat_cols = [
    tf.feature_column.numeric_column('sq_footage'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_hash_bucket(
            # TODO: pass the feature key and a hash bucket size; experiment to
            # find the smallest size at which no two house types collide.
        ))
]

# Display the transformed features to check which bucket each house type lands in.
tf.feature_column.input_layer(features,feat_cols)