In [30]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [31]:
import tensorflow as tf
import pandas as pd

In [32]:
print('Using TensorFlow version %s' % (tf.__version__))

Using TensorFlow version 2.4.0


In [33]:
train_data_path = '../data/adult.data.csv'
test_data_path = '../data/adult.test.csv'

export_dir = 'export'

In [34]:
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country"]
NUMERIC_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
CSV_COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "gender", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"]

In [35]:
X_train = pd.read_csv(train_data_path, header=None, names=CSV_COLUMNS)
X_test = pd.read_csv(test_data_path, header=None, names=CSV_COLUMNS)

In [36]:
income_train = X_train["income_bracket"]
income_test = X_test["income_bracket"]

X_train['income_bracket'] = (X_train['income_bracket'] == ' >50K').astype(int)
X_test['income_bracket'] = (X_test['income_bracket'] == ' >50K').astype(int)

Y_train = X_train["income_bracket"]
Y_test = X_test["income_bracket"]

X_train = X_train.drop('fnlwgt', axis=1)
X_test = X_test.drop('fnlwgt', axis=1)

X_train = X_train.drop('income_bracket', axis=1)
X_test = X_test.drop('income_bracket', axis=1)

Y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: income_bracket, dtype: int64

In [37]:
feature_columns = []
categorical_columns = []
numeric_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = X_train[feature_name].unique()
    col = tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary)
    categorical_columns.append(col)
    feature_columns.append(col)

for feature_name in NUMERIC_COLUMNS:
    col = tf.feature_column.numeric_column(feature_name, dtype=tf.float32)
    numeric_columns.append(col)
    feature_columns.append(col)

print(feature_columns)

[VocabularyListCategoricalColumn(key='workclass', vocabulary_list=(' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='education', vocabulary_list=(' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th', ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th', ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th', ' Preschool', ' 12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=(' Never-married', ' Married-civ-spouse', ' Divorced', ' Married-spouse-absent', ' Separated', ' Married-AF-spouse', ' Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='occupation', vocabulary_list=(' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners', ' Prof-specialty', ' Other-service', ' Sal

In [38]:
def make_input_function(features, labels, training=True, batch_size = 256):
    def input_function():        
        dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

        if training:
            dataset = dataset.shuffle(1000).repeat()

        return dataset.batch(batch_size)

    return input_function


In [39]:
train_input_function = make_input_function(X_train, Y_train)
eval_input_function = make_input_function(X_test, Y_test, training=False)

In [40]:
deep_columns = []
for col in categorical_columns:
    deep_columns.append(tf.feature_column.indicator_column(col))

for col in numeric_columns:
    deep_columns.append(col)

classifier = tf.estimator.DNNClassifier(
    feature_columns=deep_columns,
    hidden_units = [30, 10],
    n_classes=2
)




INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpzf_2sf45', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [41]:
classifier.train(input_fn = make_input_function(X_train, Y_train, training=True), steps=5000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpzf_2sf45/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 11.742542, step = 0
INFO:tensorflow:global_step/sec: 189.873
INFO:tensorflow:loss = 0.566441, step = 100 (0.528 sec)
INFO:tensorflow:global_step/sec: 224.005
INFO:tensorflow:loss = 0.47907874, step = 200 (0.446 sec)
INFO:tensorflow:global_step/sec: 221.607
INFO:tensorflow:loss = 0.46038243, step = 300 (0.451 sec)
INFO:tensorflow:global_step/sec: 226.513
INFO:tensorflow:loss = 0.5277344, step = 400 (0.442 sec)
INFO:tensorflow:global_step/sec: 226.97
INFO:tensorflow:loss = 0.4818145, step = 500 (0.440 sec)
INFO:t

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f7c0802e4c0>

In [42]:
result = classifier.evaluate(input_fn=make_input_function(X_test, Y_test, training=False))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-01-05T01:50:32Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpzf_2sf45/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.06802s
INFO:tensorflow:Finished evaluation at 2021-01-05-01:50:34
INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.8056272, accuracy_baseline = 0.7637916, auc = 0.8238, auc_precision_recall = 0.6379304, average_loss = 0.4244011, global_step = 5000, label/mean = 0.23620838, loss = 0.42447194, precision = 0.7241606, prediction/mean = 0.2587335, recall = 0.2860858
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: /tmp/tmpzf_2sf45/model.ckpt-5000


In [43]:
print("Accuracy: {accuracy:0.3f}\n".format(**result))

Accuracy: 0.806



In [44]:
def prediction_input_function(features, batch_size = 256):
    return tf.data.Dataset.from_tensor_slices(dict(features)).batch(batch_size)

In [45]:
income_type = ['<=50K', '>50K']

i = 53
newX = X_test.iloc[i:i+1]
newY = income_test[i]
predictions = classifier.predict(input_fn = lambda: prediction_input_function(newX))

for pred_dict in predictions: 
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]   

    print('Prediction is "{}" ({:.1f}%), Real value: "{}""'.format(income_type[class_id], 100 * probability, newY))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpzf_2sf45/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Prediction is ">50K" (99.9%), Real value: " >50K""


In [46]:
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)

# Build receiver function, and export.
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
model_dir = classifier.export_saved_model(export_dir, serving_input_receiver_fn)
print('model dir:', model_dir)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: ['regression']
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from /tmp/tmpzf_2sf45/model.ckpt-5000
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: export/temp-1609811435/saved_model.pb
model dir: b'export/1609811435'


In [47]:
imported = tf.saved_model.load(model_dir)

In [48]:
def predict(i):
    inputs = X_test[i:i+1]
    example = tf.train.Example()
    for col in inputs:  
        if col in CATEGORICAL_COLUMNS:  
            cat_value = inputs[col].tolist()[0][0].encode('utf8')
            example.features.feature[col].bytes_list.value.extend([cat_value]) 
        if col in NUMERIC_COLUMNS:
            example.features.feature[col].float_list.value.extend(inputs[col])
    return imported.signatures["predict"](examples=tf.constant([example.SerializeToString()]))


In [49]:
predictions = predict(3)

In [50]:
class_ids = predictions['class_ids'].numpy()[0][0]
probability = predictions['probabilities'].numpy()[0][class_ids]
print('Prediction is "{}" ({:.1f}%)'.format(income_type[class_ids], 100 * probability))

Prediction is ">50K" (93.9%)
