# Prediction of salary class using Canned estimators with Tensorflow

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
print("Tensorflow version is : ", tf.__version__)

Tensorflow version is :  1.10.0


#### fetching Train and test Data

In [19]:
census_train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
census_test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

In [109]:
census_train_path = tf.contrib.keras.utils.get_file("census_train", census_train_url)
census_test_path = tf.contrib.keras.utils.get_file("census_test", census_test_url)

In [110]:
column_names= ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
  'marital-status', 'occupation', 'relationship', 'race', 'gender',
  'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
  'income']

In [111]:
census_train =pd.read_csv(census_train_path, names = column_names, index_col = False)
census_test =pd.read_csv(census_test_path, names = column_names, index_col = False, skiprows = 1)


##### Drop missing rows

In [112]:
census_train = census_train.dropna(how = "any", axis = 0)
census_test = census_test.dropna(how = "any", axis = 0)

In [113]:
from collections import Counter 
Counter (census_train)

Counter({'age': 1,
         'workclass': 1,
         'fnlwgt': 1,
         'education': 1,
         'education-num': 1,
         'marital-status': 1,
         'occupation': 1,
         'relationship': 1,
         'race': 1,
         'gender': 1,
         'capital-gain': 1,
         'capital-loss': 1,
         'hours-per-week': 1,
         'native-country': 1,
         'income': 1})

In [114]:
Counter(census_train.income)

Counter({' <=50K': 24720, ' >50K': 7841})

In [115]:
census_train_label= census_train.pop("income").apply(lambda x: ">50K" in x)
census_test_label= census_test.pop("income").apply(lambda x: ">50K" in x)


In [116]:
print(Counter(census_train_label))
print(Counter(census_test_label))

Counter({False: 24720, True: 7841})
Counter({False: 12435, True: 3846})


#### Define train and test imput functions

In [117]:
def create_train_input_fn():
    return (tf.estimator.inputs.pandas_input_fn(x=census_train,
                                                y=census_train_label,
                                               batch_size=32,
                                               num_epochs= None, # loop forever until estimator finish training
                                                                 # till it stops asking for batches
                                               shuffle= True))

In [118]:
def create_test_input_fn():
    return(tf.estimator.inputs.pandas_input_fn(x=census_test,
                                              y=census_test_label,
                                              batch_size=32,
                                              num_epochs=1, # we want outputs for each sample test
                                              shuffle= False)) # Do not shuffle. We need to compare pred and y test labels

##### feature Engineering

Now we'll specify the features we'll use and how we'd like them represented.Here are five different types we'll use in our Linear model:



* A numeric_column. This is just a real-valued attribute.
* A bucketized_column. TensorFlow automatically buckets a numeric column for us.
* A categorical_column_with_vocabulary_list. This is just a categorical column, where you know the possible values in advance. This is useful when you have a small number of possibilities.
* A categorical_column_with_hash_bucket. This is a useful way to represent categorical features when you have a large number of values. Beware of hash collisions.
* A crossed_column. Linear models cannot consider interactions between features, so we'll ask TensorFlow to cross features for us.

In the Deep model, we'll also use:

* An embedding column(!). This automatically creates an embedding for categorical data.

In [119]:
feature_columns = []
#numeric column
age =  tf.feature_column.numeric_column("age")
feature_columns.append(age)
#bucketize the numerical column
age_buckets= tf.feature_column.bucketized_column(tf.feature_column.numeric_column("age"),boundaries=[31,46,60,75,90])
feature_columns.append(age_buckets)
feature_columns

[_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(31, 46, 60, 75, 90))]

In [120]:
#categorical column with vocabulary list
education = tf.feature_column.categorical_column_with_vocabulary_list("education",["Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"])
feature_columns.append(education)

In [121]:
#for a cat column if vocab can't be specified in advanc, (i.e. 1000 categories ) hashing is a better approach
native_country = tf.feature_column.categorical_column_with_hash_bucket("native-country",1000)
feature_columns.append(native_country)

In [122]:
native_country

_HashedCategoricalColumn(key='native-country', hash_bucket_size=1000, dtype=tf.string)

In [123]:
#cross column feature creation
age_cross_function = tf.feature_column.crossed_column([age_buckets,education],hash_bucket_size=int(1e4))
feature_columns.append(age_cross_function)

##### Train an canned linear estimator

In [124]:
census_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [125]:
feature_columns

[_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(31, 46, 60, 75, 90)),
 _VocabularyListCategoricalColumn(key='education', vocabulary_list=('Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 _HashedCategoricalColumn(key='native-country', hash_bucket_size=1000, dtype=tf.string),
 _CrossedColumn(keys=(_BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(31, 46, 60, 75, 90)), _VocabularyListCategoricalColumn(key='education', vocabulary_list=('Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc

In [126]:
train_input_fn = create_test_input_fn()
estimator = tf.estimator.LinearClassifier(feature_columns, n_classes=2,model_dir='graphs/linear')
estimator.train(train_input_fn,steps =1000) # 1000 epochs

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'graphs/linear', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000231A11C03C8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into graphs/linear\model.ckpt.
INFO:tenso

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x231a1153e48>

##### evaluation of model on test data

In [127]:
test_input_fn = create_test_input_fn()
estimator.evaluate(test_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-02-11-07:30:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from graphs/linear\model.ckpt-509
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-02-11-07:30:27
INFO:tensorflow:Saving dict for global step 509: accuracy = 0.76377374, accuracy_baseline = 0.76377374, auc = 0.68360966, auc_precision_recall = 0.36514795, average_loss = 0.5093252, global_step = 509, label/mean = 0.23622628, loss = 16.2914, precision = 0.0, prediction/mean = 0.16753425, recall = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 509: graphs/linear\model.ckpt-509


{'accuracy': 0.76377374,
 'accuracy_baseline': 0.76377374,
 'auc': 0.68360966,
 'auc_precision_recall': 0.36514795,
 'average_loss': 0.5093252,
 'label/mean': 0.23622628,
 'loss': 16.2914,
 'precision': 0.0,
 'prediction/mean': 0.16753425,
 'recall': 0.0,
 'global_step': 509}

##### How to get predictions for individual samples

In [137]:
test_input_fn = create_test_input_fn()
predictions = estimator.predict(test_input_fn)

i = 0
for pred in predictions:
    true_label = census_test_label[i]
    pred_label = pred["class_ids"][0]
    print("Example %d. Actual: %d, Predicted: %d" % (i, true_label, pred_label))
    i += 1
    if i == 5: break

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from graphs/linear\model.ckpt-509
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Example 0. Actual: 0, Predicted: 0
Example 1. Actual: 0, Predicted: 0
Example 2. Actual: 1, Predicted: 0
Example 3. Actual: 1, Predicted: 0
Example 4. Actual: 0, Predicted: 0


model performance :

accuracy = 0.76377374, accuracy_baseline = 0.76377374, auc = 0.68360966, auc_precision_recall = 0.36514795, average_loss = 0.5093252, global_step = 509, label/mean = 0.23622628, loss = 16.2914, precision = 0.0, prediction/mean = 0.16753425, recall = 0.0

### improve model performance, Train a deep network

In [139]:
census_train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [143]:
Counter(census_train["workclass"]).keys()

dict_keys([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'])

In [144]:
# We'll provide vocabulary lists for features with just a few terms
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    'workclass',
    [' Self-emp-not-inc', ' Private', ' State-gov', ' Federal-gov',
     ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'])

education = tf.feature_column.categorical_column_with_vocabulary_list(
    'education',
    [' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th', ' Some-college',
     ' Assoc-acdm', ' Assoc-voc', ' 7th-8th', ' Doctorate', ' Prof-school',
     ' 5th-6th', ' 10th', ' 1st-4th', ' Preschool', ' 12th'])

marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    'marital-status',
    [' Married-civ-spouse', ' Divorced', ' Married-spouse-absent',
     ' Never-married', ' Separated', ' Married-AF-spouse', ' Widowed'])
     
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    'relationship',
    [' Husband', ' Not-in-family', ' Wife', ' Own-child', ' Unmarried',
     ' Other-relative'])

In [146]:
feature_columns = [
    #indicator matrix for dimensional vocabularies
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(education),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(relationship),
    
    #embeddings for high dimensional vocabularies
    tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket("occupation", 1000),10),
    
    # numeric features
    tf.feature_column.numeric_column('age'),
    tf.feature_column.numeric_column('education-num'),
    tf.feature_column.numeric_column('capital-gain'),
    tf.feature_column.numeric_column('capital-loss'),
    tf.feature_column.numeric_column('hours-per-week')
    
]

##### train a DNN

In [150]:
estimator = tf.estimator.DNNClassifier(hidden_units=[256,128,64],
                                       feature_columns=feature_columns,
                                       n_classes=2,
                                      model_dir="graphs/dnn")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'graphs/dnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000231A60027B8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [151]:
train_input_fn = create_train_input_fn()
estimator.train(train_input_fn,steps =2000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into graphs/dnn\model.ckpt.
INFO:tensorflow:loss = 128.25418, step = 1
INFO:tensorflow:global_step/sec: 187.968
INFO:tensorflow:loss = 16.363972, step = 101 (0.536 sec)
INFO:tensorflow:global_step/sec: 258.063
INFO:tensorflow:loss = 15.151709, step = 201 (0.392 sec)
INFO:tensorflow:global_step/sec: 225.732
INFO:tensorflow:loss = 21.3796, step = 301 (0.443 sec)
INFO:tensorflow:global_step/sec: 228.57
INFO:tensorflow:loss = 15.638337, step = 401 (0.445 sec)
INFO:tensorflow:global_step/sec: 256.081
INFO:tensorflow:loss = 14.02853, step = 501 (0.383 sec)
INFO:tensorflow:global_step/sec: 258.062
INFO:tensorflow:loss = 29.52417, step = 601 (0.391 sec)
INFO:tensorflow:global_step/sec: 253.484
INFO:tensorflow:loss = 6.82

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x231a58ca3c8>

In [152]:
test_input_fn = create_test_input_fn()
estimator.evaluate(test_input_fn,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-02-11-08:04:43
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from graphs/dnn\model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Evaluation [200/1000]
INFO:tensorflow:Evaluation [300/1000]
INFO:tensorflow:Evaluation [400/1000]
INFO:tensorflow:Evaluation [500/1000]
INFO:tensorflow:Finished evaluation at 2019-02-11-08:04:48
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.84896505, accuracy_baseline = 0.76377374, auc = 0.9002356, auc_precision_recall = 0.75125724, average_loss = 0.3295633, global_step = 2000, label/mean = 0.23622628, loss = 10.541493, precision = 0.7113685, prediction/mean = 0.25303918, recall = 0.6068643
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: graphs/dnn\model.ckpt-2000


{'accuracy': 0.84896505,
 'accuracy_baseline': 0.76377374,
 'auc': 0.9002356,
 'auc_precision_recall': 0.75125724,
 'average_loss': 0.3295633,
 'label/mean': 0.23622628,
 'loss': 10.541493,
 'precision': 0.7113685,
 'prediction/mean': 0.25303918,
 'recall': 0.6068643,
 'global_step': 2000}

The performance has improved