## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)
* [Intro to Feature Engineering with TensorFlow](https://www.youtube.com/watch?v=d12ra3b_M-0) (Josh Gordon, YouTube)

In [1]:
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import tensorflow as tf

from common import feature_variants, load_dataset

np.random.seed(0)
tf.set_random_seed(0)
%matplotlib inline

In [2]:
data_train = load_dataset('data/major.csv')
data_test = load_dataset('data/minor.csv', skiprows=1)

data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
Age                    32561 non-null int64
WorkClass              30725 non-null object
FinalSamplingWeight    32561 non-null int64
Education              32561 non-null object
EducationNumber        32561 non-null int64
MaritalStatus          32561 non-null object
Occupation             30718 non-null object
Relationship           32561 non-null object
Race                   32561 non-null object
Sex                    32561 non-null object
CapitalGain            32561 non-null int64
CapitalLoss            32561 non-null int64
HoursPerWeek           32561 non-null int64
NativeCountry          31978 non-null object
Income                 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [3]:
def drop_missing(data):
    data.dropna(inplace=True)
    data.index = pd.RangeIndex(len(data.index))

drop_missing(data_train)
drop_missing(data_test)

data_train.info()
data_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30162 entries, 0 to 30161
Data columns (total 15 columns):
Age                    30162 non-null int64
WorkClass              30162 non-null object
FinalSamplingWeight    30162 non-null int64
Education              30162 non-null object
EducationNumber        30162 non-null int64
MaritalStatus          30162 non-null object
Occupation             30162 non-null object
Relationship           30162 non-null object
Race                   30162 non-null object
Sex                    30162 non-null object
CapitalGain            30162 non-null int64
CapitalLoss            30162 non-null int64
HoursPerWeek           30162 non-null int64
NativeCountry          30162 non-null object
Income                 30162 non-null object
dtypes: int64(6), object(9)
memory usage: 3.5+ MB


Unnamed: 0,Age,WorkClass,FinalSamplingWeight,Education,EducationNumber,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,Low
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,Low
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,Low
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,Low
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,Low


In [4]:
def create_train_input(x, y, batch_size=32): 
    return tf.estimator.inputs.pandas_input_fn(
        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)

def create_test_input(x, y):
    return tf.estimator.inputs.pandas_input_fn(
        x=x, y=y, num_epochs=1, shuffle=False)

In [5]:
y_train = data_train.pop('Income').apply(lambda y: y == 'High')
x_train = data_train

y_test = data_test.pop('Income').apply(lambda y: y == 'High')
x_test = data_test

In [6]:
age = tf.feature_column.numeric_column('Age')

age_bucket = tf.feature_column.bucketized_column(
    age, boundaries=[30, 40, 50, 60, 70])

education = tf.feature_column.categorical_column_with_vocabulary_list(
    'Education', feature_variants().get('Education'))

country = tf.feature_column.categorical_column_with_hash_bucket(
    'NativeCountry', 1000)

age_bucket_education = tf.feature_column.crossed_column(
    [age_bucket, education], hash_bucket_size=int(1e4))

feature_columns = [
    age,
    age_bucket,
    education,
    country,
    age_bucket_education,
]

In [7]:
estimator = tf.estimator.LinearClassifier(
    feature_columns, model_dir='model/linear', n_classes=2)

estimator.train(create_train_input(x_train, y_train), steps=1000);

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_save_summary_steps': 100, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_is_chief': True, '_save_checkpoints_secs': 600, '_session_config': None, '_num_ps_replicas': 0, '_master': '', '_task_type': 'worker', '_num_worker_replicas': 1, '_task_id': 0, '_keep_checkpoint_max': 5, '_log_step_count_steps': 100, '_model_dir': 'model/linear', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f1b9634ea90>}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from model/linear/model.ckpt-2000
INFO:tensorflow:Saving checkpoints for 2001 into model/linear/model.ckpt.
INFO:tensorflow:loss = 12.4873, step = 2001
INFO:tensorflow:global_step/sec: 291.039
INFO:tensorflow:loss = 14.3849, step = 2101 (0.347 sec)
INFO:tensorflow:global_step/sec: 388.313
INFO:tensorflow:loss = 12.6078, step = 2201 (0.256 sec

In [8]:
estimator.evaluate(create_test_input(x_test, y_test))

INFO:tensorflow:Starting evaluation at 2018-01-12-14:18:39
INFO:tensorflow:Restoring parameters from model/linear/model.ckpt-3000
INFO:tensorflow:Finished evaluation at 2018-01-12-14:18:40
INFO:tensorflow:Saving dict for global step 3000: accuracy = 0.776428, accuracy_baseline = 0.754316, auc = 0.787269, auc_precision_recall = 0.541287, average_loss = 0.463707, global_step = 3000, label/mean = 0.245684, loss = 59.1816, prediction/mean = 0.190615


{'accuracy': 0.77642763,
 'accuracy_baseline': 0.75431609,
 'auc': 0.78726882,
 'auc_precision_recall': 0.54128659,
 'average_loss': 0.46370745,
 'global_step': 3000,
 'label/mean': 0.24568394,
 'loss': 59.181644,
 'prediction/mean': 0.1906146}

In [9]:
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    'MaritalStatus', feature_variants().get('MaritalStatus'))

occupation = tf.feature_column.categorical_column_with_hash_bucket(
    'Occupation', 100)
     
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    'Relationship', feature_variants().get('Relationship'))

work_class = tf.feature_column.categorical_column_with_vocabulary_list(
    'WorkClass', feature_variants().get('WorkClass'))

feature_columns = [
    age,
    tf.feature_column.indicator_column(education),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.embedding_column(occupation, 10),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(work_class),
]

In [10]:
estimator = tf.estimator.DNNClassifier(
    hidden_units=[256, 128, 64], feature_columns=feature_columns, 
    n_classes=2, model_dir='model/deep')

estimator.train(create_train_input(x_train, y_train), steps=2000);

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_save_summary_steps': 100, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_is_chief': True, '_save_checkpoints_secs': 600, '_session_config': None, '_num_ps_replicas': 0, '_master': '', '_task_type': 'worker', '_num_worker_replicas': 1, '_task_id': 0, '_keep_checkpoint_max': 5, '_log_step_count_steps': 100, '_model_dir': 'model/deep', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f1b95687c88>}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from model/deep/model.ckpt-2000
INFO:tensorflow:Saving checkpoints for 2001 into model/deep/model.ckpt.
INFO:tensorflow:loss = 10.5445, step = 2001
INFO:tensorflow:global_step/sec: 244.512
INFO:tensorflow:loss = 20.4067, step = 2101 (0.412 sec)
INFO:tensorflow:global_step/sec: 342.704
INFO:tensorflow:loss = 11.0904, step = 2201 (0.291 sec)
INFO

In [11]:
estimator.evaluate(create_test_input(x_test, y_test))

INFO:tensorflow:Starting evaluation at 2018-01-12-14:18:48
INFO:tensorflow:Restoring parameters from model/deep/model.ckpt-4000
INFO:tensorflow:Finished evaluation at 2018-01-12-14:18:49
INFO:tensorflow:Saving dict for global step 4000: accuracy = 0.830544, accuracy_baseline = 0.754316, auc = 0.883057, auc_precision_recall = 0.698147, average_loss = 0.358379, global_step = 4000, label/mean = 0.245684, loss = 45.7389, prediction/mean = 0.24402


{'accuracy': 0.83054447,
 'accuracy_baseline': 0.75431609,
 'auc': 0.88305688,
 'auc_precision_recall': 0.69814718,
 'average_loss': 0.35837924,
 'global_step': 4000,
 'label/mean': 0.24568394,
 'loss': 45.738911,
 'prediction/mean': 0.24401987}