In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals


In [2]:
import tensorflow as tf
import pandas as pd

In [3]:
# Record the TensorFlow version this notebook was executed with.
print('Using TensorFlow version %s' % tf.__version__)

Using TensorFlow version 2.4.0


In [4]:
# Relative paths of the CSV files loaded into the training and test frames.
train_data_path = '../data/adult.data.csv'
test_data_path = '../data/adult.test.csv'

# Directory handed to export_saved_model when the trained estimator is exported.
export_dir = 'export'

In [5]:
# String-valued features; each becomes a vocabulary-list categorical column.
CATEGORICAL_COLUMNS = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country",
]

# Numeric features; each becomes a float32 numeric column.
NUMERIC_COLUMNS = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Column names applied to the header-less CSV files, in file order.
CSV_COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

In [6]:
# Both files have no header row, so the column names are supplied explicitly.
X_train, X_test = (
    pd.read_csv(csv_path, header=None, names=CSV_COLUMNS)
    for csv_path in (train_data_path, test_data_path)
)

In [7]:
# Keep the raw string labels; a later cell prints them next to a prediction.
income_train = X_train["income_bracket"]
income_test = X_test["income_bracket"]

# Binary target: 1 where the raw label equals ' >50K' (leading space included), else 0.
Y_train = (income_train == ' >50K').astype(int)
Y_test = (income_test == ' >50K').astype(int)

# Features are everything except the 'fnlwgt' column and the label itself.
X_train = X_train.drop(['fnlwgt', 'income_bracket'], axis=1)
X_test = X_test.drop(['fnlwgt', 'income_bracket'], axis=1)

Y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: income_bracket, dtype: int64

In [8]:
# One vocabulary-list column per categorical feature; the vocabulary is the set of
# distinct values observed in the training frame.
categorical_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(name, X_train[name].unique())
    for name in CATEGORICAL_COLUMNS
]

# One float32 numeric column per numeric feature.
numeric_columns = [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
]

# Categorical columns first, then numeric ones.
feature_columns = categorical_columns + numeric_columns

print(feature_columns)

[VocabularyListCategoricalColumn(key='workclass', vocabulary_list=(' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='education', vocabulary_list=(' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th', ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th', ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th', ' Preschool', ' 12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=(' Never-married', ' Married-civ-spouse', ' Divorced', ' Married-spouse-absent', ' Separated', ' Married-AF-spouse', ' Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='occupation', vocabulary_list=(' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners', ' Prof-specialty', ' Other-service', ' Sal

In [9]:
def make_input_function(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    """Return a zero-argument function that builds a (features, labels) dataset.

    Args:
        data_df: Feature table; converted with ``dict(data_df)`` into a
            column-name -> values mapping.
        label_df: Labels sliced row-by-row alongside the features.
        num_epochs: Number of times the batched dataset is repeated.
        shuffle: When true, shuffle with a buffer of 1000 elements before batching.
        batch_size: Number of rows per batch.

    Returns:
        A callable taking no arguments and returning a ``tf.data.Dataset``.
    """
    def input_function():
        dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        dataset = dataset.shuffle(1000) if shuffle else dataset
        return dataset.batch(batch_size).repeat(num_epochs)

    return input_function


In [10]:
# Training uses the factory defaults (10 epochs, shuffled, batches of 32).
train_input_function = make_input_function(data_df=X_train, label_df=Y_train)

# Evaluation makes a single, unshuffled pass over the test rows.
eval_input_function = make_input_function(
    data_df=X_test,
    label_df=Y_test,
    num_epochs=1,
    shuffle=False,
)

In [11]:
# Linear classifier estimator over the categorical and numeric feature columns.
# No model_dir is passed; the logged config below shows a temporary directory in use.
linear_model = tf.estimator.LinearClassifier(feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmplyv1puih', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [12]:
# Fit on the training input function, then score on the evaluation one.
# Estimator.train returns the estimator itself, so the two calls can be chained.
result = linear_model.train(train_input_function).evaluate(eval_input_function)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmplyv1puih/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 179.628
INFO:tensorflow:loss = 5.7943354, step = 100 (0.558 sec)
INFO:tensorflow:global_step/sec: 329.987
INFO:tensorflow:loss = 2.9879627, step = 200 (0.303 sec)
INFO:tensorflow:global_step/sec

In [13]:
# Accuracy metric from the evaluation run over the test input function.
print(result['accuracy'])

0.8470328


In [43]:
def prediction_input_function(features, batch_size=256):
    """Build a label-free, batched dataset for prediction.

    Args:
        features: Feature table; converted with ``dict(features)`` into a
            column-name -> values mapping.
        batch_size: Number of rows per batch.

    Returns:
        A ``tf.data.Dataset`` of feature dictionaries, batched by ``batch_size``.
    """
    feature_map = dict(features)
    dataset = tf.data.Dataset.from_tensor_slices(feature_map)
    return dataset.batch(batch_size)

In [44]:
# Display names indexed by predicted class id: the label column was encoded as 1 for
# ' >50K' and 0 otherwise, so index 0 is '<=50K' and index 1 is '>50K'.
income_type = ['<=50K', '>50K']
i = 3
# One-row frame (positional slice) so the estimator receives a batch of size 1.
newX = X_test.iloc[i:i+1]
# Positional lookup, consistent with the .iloc slice used for the features above.
newY = income_test.iloc[i]
predictions = linear_model.predict(input_fn=lambda: prediction_input_function(newX))

for pred_dict in predictions:
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]
    # The format string previously ended with a stray double quote after the real value.
    print('Prediction is "{}" ({:.1f}%), Real value: "{}"'.format(income_type[class_id], 100 * probability, newY))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpviv3z1d1/model.ckpt-10180
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Prediction is ">50K" (100.0%), Real value: " >50K""


In [45]:
# Parsing spec derived from the same feature columns the estimator was built with.
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)

# Build receiver function, and export.
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
# export_saved_model returns the path of the written export; the recorded output
# below shows it as a bytes value under export_dir.
model_dir = linear_model.export_saved_model(export_dir, serving_input_receiver_fn)
print('model dir:', model_dir)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: ['regression']
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from /tmp/tmpviv3z1d1/model.ckpt-10180
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: export/temp-1609809296/saved_model.pb
model dir: b'export/1609809296'


In [46]:
# Load the exported SavedModel back; its "predict" signature is called by predict() below.
imported = tf.saved_model.load(model_dir)

In [47]:
def predict(i):
    """Run the exported model's "predict" signature on one row of ``X_test``.

    The row is packed into a ``tf.train.Example``: categorical columns as UTF-8
    encoded bytes features and numeric columns as float features. Columns that
    are in neither list are skipped.

    Args:
        i: Positional (0-based) row index into ``X_test``.

    Returns:
        Whatever ``imported.signatures["predict"]`` returns for the single
        serialized example; later cells read its 'class_ids' and
        'probabilities' entries.
    """
    inputs = X_test.iloc[i:i+1]
    example = tf.train.Example()
    for col in inputs:
        value = inputs[col].iloc[0]
        if col in CATEGORICAL_COLUMNS:
            # Encode the whole category string. Indexing the string with [0] would
            # keep only its first character and never match the vocabulary.
            example.features.feature[col].bytes_list.value.append(value.encode('utf8'))
        elif col in NUMERIC_COLUMNS:
            example.features.feature[col].float_list.value.append(float(value))
    return imported.signatures["predict"](examples=tf.constant([example.SerializeToString()]))


In [48]:

# Query the exported model for test row 3, the same row index (i = 3) used for the
# estimator-based prediction earlier.
predictions = predict(3)

In [49]:
# Single example in the batch: take row 0 of each output tensor.
class_ids = predictions['class_ids'].numpy()[0, 0]
probability = predictions['probabilities'].numpy()[0, class_ids]
print('Prediction is "{}" ({:.1f}%)'.format(income_type[class_ids], 100 * probability))

Prediction is ">50K" (100.0%)
