## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)
* [Intro to Feature Engineering with TensorFlow](https://www.youtube.com/watch?v=d12ra3b_M-0) (Josh Gordon, YouTube)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from common import column_variants, load_dataset

# Seed NumPy's global RNG and TensorFlow's graph-level RNG with the same
# fixed value (0).
# NOTE(review): presumably meant to make runs repeatable; whether the
# estimators below honour this graph-level seed is not visible here -- confirm.
np.random.seed(0)
tf.set_random_seed(0)

In [None]:
# Load the train and test splits with the project helper from `common`.
# The test file is read with skiprows=1.
# NOTE(review): presumably the test CSV starts with one non-data line --
# confirm against load_dataset and the file itself.
data_train = load_dataset('data/train.csv')
data_test = load_dataset('data/test.csv', skiprows=1)

# Summary of the training data as loaded, before any cleaning.
data_train.info()

In [None]:
def drop_missing(data):
    """Drop every row of ``data`` that has a missing value, in place.

    After the rows are removed the index is replaced by a fresh 0..n-1
    range, so surviving rows are numbered consecutively. The frame is
    modified in place and nothing is returned.
    """
    data.dropna(inplace=True)
    # drop=True discards the old index instead of keeping it as a column.
    data.reset_index(drop=True, inplace=True)

# Clean both splits in place: rows with missing values are removed and the
# index is renumbered (see drop_missing).
drop_missing(data_train)
drop_missing(data_test)

# Re-inspect the training data after cleaning; head() is the cell's last
# expression, so the notebook displays the first rows.
data_train.info()
data_train.head()

In [None]:
def create_train_input(x, y, batch_size=32):
    """Build the input function used for training.

    Args:
        x: Features, passed to ``pandas_input_fn`` as ``x``.
        y: Labels, passed to ``pandas_input_fn`` as ``y``.
        batch_size: Number of rows per batch (default 32).

    Returns:
        The input function created by ``pandas_input_fn``, configured with
        shuffling enabled and ``num_epochs=None`` (no epoch limit).
    """
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=x,
        y=y,
        batch_size=batch_size,
        shuffle=True,
        num_epochs=None,
    )
    return train_input_fn

def create_test_input(x, y):
    """Build the input function used for evaluation.

    Args:
        x: Features, passed to ``pandas_input_fn`` as ``x``.
        y: Labels, passed to ``pandas_input_fn`` as ``y``.

    Returns:
        The input function created by ``pandas_input_fn``, configured for a
        single pass over the data (``num_epochs=1``) without shuffling.
    """
    test_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=x,
        y=y,
        shuffle=False,
        num_epochs=1,
    )
    return test_input_fn

In [None]:
# Split each frame into features and a boolean label.
# x_* is the same object as data_*, so popping 'Income' removes the label
# column from the features; the label is True where Income equals 'High'.
x_train = data_train
y_train = x_train.pop('Income').apply(lambda label: label == 'High')

x_test = data_test
y_test = x_test.pop('Income').apply(lambda label: label == 'High')

In [None]:
# Feature columns for the linear model.

# Age as a plain numeric value.
age = tf.feature_column.numeric_column(key='Age')

# Age bucketized at the listed boundaries.
age_bucket = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[30, 40, 50, 60, 70],
)

# Education is categorical; its vocabulary comes from the project helper
# column_variants().
education = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Education',
    vocabulary_list=column_variants().get('Education'),
)

# Native country is hashed into 1000 buckets rather than given a vocabulary.
country = tf.feature_column.categorical_column_with_hash_bucket(
    key='NativeCountry',
    hash_bucket_size=1000,
)

# Cross of age bucket and education, hashed into 10000 buckets.
age_bucket_education = tf.feature_column.crossed_column(
    keys=[age_bucket, education],
    hash_bucket_size=10000,
)

feature_columns = [age, age_bucket, education, country, age_bucket_education]

In [None]:
# Two-class linear classifier over the feature columns defined above;
# model_dir is set to 'model/linear'.
estimator = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
    n_classes=2,
    model_dir='model/linear',
)

# Train for 1000 steps. The trailing semicolon stops the notebook from
# echoing the value returned by train().
estimator.train(
    input_fn=create_train_input(x_train, y_train),
    steps=1000,
);

In [None]:
# Evaluate the current estimator on the test split; as the cell's last
# expression, the result is displayed by the notebook.
estimator.evaluate(input_fn=create_test_input(x_test, y_test))

In [None]:
# Additional categorical columns for the deep model. Vocabularies come from
# the project helper column_variants(); occupation is hashed into 100 buckets.
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    key='MaritalStatus',
    vocabulary_list=column_variants().get('MaritalStatus'),
)

occupation = tf.feature_column.categorical_column_with_hash_bucket(
    key='Occupation',
    hash_bucket_size=100,
)

relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Relationship',
    vocabulary_list=column_variants().get('Relationship'),
)

work_class = tf.feature_column.categorical_column_with_vocabulary_list(
    key='WorkClass',
    vocabulary_list=column_variants().get('WorkClass'),
)

# Rebinds feature_columns for the deep model: each categorical column is
# wrapped as an indicator column, except occupation, which is wrapped as a
# 10-dimensional embedding.
feature_columns = [
    age,
    tf.feature_column.indicator_column(categorical_column=education),
    tf.feature_column.indicator_column(categorical_column=marital_status),
    tf.feature_column.embedding_column(
        categorical_column=occupation, dimension=10),
    tf.feature_column.indicator_column(categorical_column=relationship),
    tf.feature_column.indicator_column(categorical_column=work_class),
]

In [None]:
# Two-class DNN classifier with hidden layers of 256, 128 and 64 units;
# model_dir is set to 'model/deep'. Rebinds `estimator`.
estimator = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[256, 128, 64],
    n_classes=2,
    model_dir='model/deep',
)

# Train for 2000 steps. The trailing semicolon stops the notebook from
# echoing the value returned by train().
estimator.train(
    input_fn=create_train_input(x_train, y_train),
    steps=2000,
);

In [None]:
# Evaluate the current estimator on the test split; as the cell's last
# expression, the result is displayed by the notebook.
estimator.evaluate(input_fn=create_test_input(x_test, y_test))