In [4]:
import tensorflow as tf
import pandas as pd

In [7]:
## Column names, in file order; passed as `names=` when the CSV files are read
COLUMNS = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_week',
    'native_country',
    'label',
]
## Locations of the training and test CSV files
PATH = "d:/adult_data.csv"
PATH_test = "d:/adult_test.csv"

In [8]:
## Load both splits with the explicit column names from COLUMNS.
df_train = pd.read_csv(
    PATH,
    names=COLUMNS,
    skipinitialspace=True,
    index_col=False,
)
## Same options for the test file, except that its first row is skipped.
df_test = pd.read_csv(
    PATH_test,
    names=COLUMNS,
    skiprows=1,
    skipinitialspace=True,
    index_col=False,
)

In [9]:
## Sanity check: (rows, columns) of each split, then the inferred column types
train_shape, test_shape = df_train.shape, df_test.shape
print(train_shape, test_shape)
print(df_train.dtypes)

(32561, 15) (16281, 15)
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital           object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_week         int64
native_country    object
label             object
dtype: object


In [10]:
## Recode the string label as 0/1.
## The test file spells the classes with a trailing dot, hence the second mapping.
label = {'<=50K': 0,'>50K': 1}
label_t = {'<=50K.': 0,'>50K.': 1}

## Only encode while the column still holds the raw strings: re-running this
## cell is then a no-op instead of a KeyError on the already-encoded integers.
## A value missing from the mapping becomes NaN in `.map`, and the cast to
## int64 then raises instead of silently keeping a bad label.
if not pd.api.types.is_integer_dtype(df_train['label']):
    df_train['label'] = df_train['label'].map(label).astype('int64')
if not pd.api.types.is_integer_dtype(df_test['label']):
    df_test['label'] = df_test['label'].map(label_t).astype('int64')

In [11]:
## Class balance of the training split, then of the test split.
## Unbalanced label: in the recorded run class 0 makes up 24720 of the 32561
## training rows (about 76%), so always predicting 0 already reaches that accuracy.
for split in (df_train, df_test):
    print(split["label"].value_counts())
## Column types after the label has been recoded
print(df_train.dtypes)

0    24720
1     7841
Name: label, dtype: int64
0    12435
1     3846
Name: label, dtype: int64
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital           object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_week         int64
native_country    object
label              int64
dtype: object


In [12]:
## Feature names grouped by kind
### Continuous (numeric) columns
CONTI_FEATURES = [
    'age',
    'fnlwgt',
    'capital_gain',
    'education_num',
    'capital_loss',
    'hours_week',
]
### Categorical (string) columns
CATE_FEATURES = [
    'workclass',
    'education',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

In [13]:
def print_transformation(feature = "age", continuous = True, size = 2): 
    """Print how a feature column encodes one column of ``df_train``.

    Reads the module-level ``df_train`` frame, wraps ``feature`` in a
    TensorFlow feature column, evaluates the resulting input layer in a
    session and prints the dense array.

    Args:
        feature: Name of the ``df_train`` column to transform.
        continuous: If true, the column is used as a numeric column.
            Otherwise it is hashed into ``size`` buckets and shown as an
            indicator (one-hot style) column.
        size: Number of hash buckets; only used when ``continuous`` is false.

    Returns:
        None. The evaluated array is printed.
    """
    ## Map the feature name to its data
    features = {feature: df_train[feature]}

    ## Pick the feature column matching the kind of variable
    if continuous:
        feature_columns = [tf.feature_column.numeric_column(feature)]
    else:
        hashed = tf.feature_column.categorical_column_with_hash_bucket(feature, hash_bucket_size=size)
        ## A categorical column must be wrapped before it can feed a dense input layer
        feature_columns = [tf.feature_column.indicator_column(hashed)]

    ## Use input_layer to materialise the transformed values
    input_layer = tf.feature_column.input_layer(
        features=features,
        feature_columns=feature_columns
        )
    ## Run inside a context manager so the session is closed after printing
    with tf.Session() as sess:
        print(sess.run(input_layer))
## Show the numeric encoding of `age`
print_transformation("age", continuous=True)

[[ 39.]
 [ 50.]
 [ 38.]
 ..., 
 [ 58.]
 [ 22.]
 [ 52.]]


In [14]:
## One numeric feature column per continuous variable
continuous_features = list(map(tf.feature_column.numeric_column, CONTI_FEATURES))

In [15]:
## Show the two-bucket hashed indicator encoding of `sex`
print_transformation("sex", continuous=False, size=2)

[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


In [16]:
## Categorical column for `relationship` built from an explicit vocabulary
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    key='relationship',
    vocabulary_list=[
        'Husband',
        'Not-in-family',
        'Wife',
        'Own-child',
        'Unmarried',
        'Other-relative',
    ],
)

In [17]:
## One hashed categorical column (1000 buckets) per categorical variable
categorical_features = [
    tf.feature_column.categorical_column_with_hash_bucket(name, hash_bucket_size=1000)
    for name in CATE_FEATURES
]

In [18]:
## Baseline linear classifier over all categorical and continuous columns
model = tf.estimator.LinearClassifier(
    feature_columns=categorical_features + continuous_features,
    n_classes=2,
    model_dir="ongoing/train",
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000015448785CC0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [19]:
## Input columns handed to the estimator, and the name of the target column
FEATURES = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_week',
    'native_country',
]
LABEL = 'label'
def get_input_fn(data_set, num_epochs=None, n_batch = 128, shuffle=True, features=None):
    """Build an estimator input function from a pandas DataFrame.

    Args:
        data_set: DataFrame holding the feature columns and the ``LABEL`` column.
        num_epochs: Passed through to ``pandas_input_fn`` as ``num_epochs``.
        n_batch: Batch size.
        shuffle: Passed through to ``pandas_input_fn`` as ``shuffle``.
        features: Names of the columns to feed to the model. Defaults to the
            module-level ``FEATURES`` list, so a different feature set can be
            used without redefining this function.

    Returns:
        The input function produced by ``tf.estimator.inputs.pandas_input_fn``.
    """
    if features is None:
        features = FEATURES
    return tf.estimator.inputs.pandas_input_fn(
       # Rebuild the frames from raw values so the original index is dropped
       x=pd.DataFrame({k: data_set[k].values for k in features}),
       y = pd.Series(data_set[LABEL].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

In [20]:
## Train the baseline model for 1000 steps on unshuffled batches of 128 rows
train_input_fn = get_input_fn(df_train, num_epochs=None, n_batch=128, shuffle=False)
model.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ongoing/train\model.ckpt.
INFO:tensorflow:loss = 88.7229, step = 1
INFO:tensorflow:global_step/sec: 128.915
INFO:tensorflow:loss = 52583.6, step = 101 (0.780 sec)
INFO:tensorflow:global_step/sec: 256.835
INFO:tensorflow:loss = 25203.8, step = 201 (0.388 sec)
INFO:tensorflow:global_step/sec: 256.837
INFO:tensorflow:loss = 54924.3, step = 301 (0.390 sec)
INFO:tensorflow:global_step/sec: 253.576
INFO:tensorflow:loss = 68509.3, step = 401 (0.393 sec)
INFO:tensorflow:global_step/sec: 260.859
INFO:tensorflow:loss = 9151.75, step = 501 (0.383 sec)
INFO:tensorflow:global_step/sec: 233.433
INFO:tensorflow:loss = 34576.1, step = 601 (0.428 sec)
INFO:tensorflow:global_step/sec: 257.498
INFO:tensorflow:loss = 36184.2, s

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x15448735668>

In [21]:
## Evaluate the baseline model on one pass over the test split (at most 1000 steps)
test_input_fn = get_input_fn(df_test, num_epochs=1, n_batch=128, shuffle=False)
model.evaluate(input_fn=test_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-14-14:41:19
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ongoing/train\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Finished evaluation at 2019-05-14-14:41:21
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.796081, accuracy_baseline = 0.763774, auc = 0.615321, auc_precision_recall = 0.555618, average_loss = 123.137, global_step = 1000, label/mean = 0.236226, loss = 15662.4, precision = 0.66773, prediction/mean = 0.0963189, recall = 0.272231


{'accuracy': 0.7960813,
 'accuracy_baseline': 0.76377374,
 'auc': 0.61532056,
 'auc_precision_recall': 0.55561829,
 'average_loss': 123.13694,
 'global_step': 1000,
 'label/mean': 0.23622628,
 'loss': 15662.441,
 'precision': 0.66772962,
 'prediction/mean': 0.096318893,
 'recall': 0.27223089}

In [22]:
def square_var(df_t, df_te, var_name = 'age', new_name = 'new'):
    """Add the square of ``var_name`` as a new column on both frames.

    Both DataFrames are modified in place: the column is written onto the
    objects that were passed in, and those same objects are returned.

    Args:
        df_t: Training DataFrame; must contain ``var_name``.
        df_te: Test DataFrame; must contain ``var_name``.
        var_name: Name of the column to square.
        new_name: Name of the column that receives the squared values.

    Returns:
        The tuple ``(df_t, df_te)``.

    Raises:
        KeyError: If ``var_name`` is missing from either frame.
    """
    for frame in (df_t, df_te):
        frame[new_name] = frame[var_name].pow(2)
    return df_t, df_te

In [23]:
## Add the squared-age column `new` to both splits
df_train_new, df_test_new = square_var(df_train, df_test, 'age')

In [24]:
## Both frames should now have one more column
new_train_shape, new_test_shape = df_train_new.shape, df_test_new.shape
print(new_train_shape, new_test_shape)

(32561, 16) (16281, 16)


In [25]:
## Continuous variables including the squared-age column `new`
CONTI_FEATURES_NEW = [
    'age',
    'fnlwgt',
    'capital_gain',
    'education_num',
    'capital_loss',
    'hours_week',
    'new',
]
continuous_features_new = list(map(tf.feature_column.numeric_column, CONTI_FEATURES_NEW))

In [26]:
## Linear classifier that additionally sees the squared-age column
model_1 = tf.estimator.LinearClassifier(
    feature_columns=categorical_features + continuous_features_new,
    model_dir="ongoing/train1",
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001544D3DF080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [27]:
## Input columns for the model that includes the squared-age column `new`
FEATURES_NEW = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_week',
    'native_country',
    'new',
]
def get_input_fn(data_set, num_epochs=None, n_batch = 128, shuffle=True, features=None):
    """Build an estimator input function from a pandas DataFrame.

    Args:
        data_set: DataFrame holding the feature columns and the ``LABEL`` column.
        num_epochs: Passed through to ``pandas_input_fn`` as ``num_epochs``.
        n_batch: Batch size.
        shuffle: Passed through to ``pandas_input_fn`` as ``shuffle``.
        features: Names of the columns to feed to the model. Defaults to the
            module-level ``FEATURES_NEW`` list, so a different feature set can
            be used without redefining this function.

    Returns:
        The input function produced by ``tf.estimator.inputs.pandas_input_fn``.
    """
    if features is None:
        features = FEATURES_NEW
    return tf.estimator.inputs.pandas_input_fn(
       # Rebuild the frames from raw values so the original index is dropped
       x=pd.DataFrame({k: data_set[k].values for k in features}),
       y = pd.Series(data_set[LABEL].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

In [28]:
## Train on the frame returned by square_var, which is the one that carries
## the `new` column, matching the evaluate cell that uses df_test_new.
## Passing df_train only worked because square_var writes the column in place.
model_1.train(input_fn=get_input_fn(df_train_new, 
                                      num_epochs=None,
                                      n_batch = 128,
                                      shuffle=False),
                                      steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ongoing/train1\model.ckpt.
INFO:tensorflow:loss = 88.7229, step = 1
INFO:tensorflow:global_step/sec: 113.792
INFO:tensorflow:loss = 70077.7, step = 101 (0.884 sec)
INFO:tensorflow:global_step/sec: 238.446
INFO:tensorflow:loss = 49522.1, step = 201 (0.416 sec)
INFO:tensorflow:global_step/sec: 220.993
INFO:tensorflow:loss = 107121.0, step = 301 (0.453 sec)
INFO:tensorflow:global_step/sec: 228.625
INFO:tensorflow:loss = 12814.2, step = 401 (0.437 sec)
INFO:tensorflow:global_step/sec: 238.446
INFO:tensorflow:loss = 19573.9, step = 501 (0.421 sec)
INFO:tensorflow:global_step/sec: 229.676
INFO:tensorflow:loss = 26382.0, step = 601 (0.433 sec)
INFO:tensorflow:global_step/sec: 243.088
INFO:tensorflow:loss = 23417.7,

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1544d3df4a8>

In [29]:
## Evaluate the squared-age model on one pass over the test split (at most 1000 steps)
test_input_fn = get_input_fn(df_test_new, num_epochs=1, n_batch=128, shuffle=False)
model_1.evaluate(input_fn=test_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-14-14:44:46
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ongoing/train1\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Finished evaluation at 2019-05-14-14:44:47
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.794423, accuracy_baseline = 0.763774, auc = 0.609376, auc_precision_recall = 0.548858, average_loss = 111.005, global_step = 1000, label/mean = 0.236226, loss = 14119.3, precision = 0.66824, prediction/mean = 0.0911626, recall = 0.25767


{'accuracy': 0.79442292,
 'accuracy_baseline': 0.76377374,
 'auc': 0.60937554,
 'auc_precision_recall': 0.54885805,
 'average_loss': 111.0046,
 'global_step': 1000,
 'label/mean': 0.23622628,
 'loss': 14119.265,
 'precision': 0.66824007,
 'prediction/mean': 0.091162622,
 'recall': 0.25767031}

In [30]:
## Bucketize age at the listed boundaries instead of using the raw value
age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)

In [31]:
## Crossed columns, each hashed into 1000 buckets and wrapped in a list so
## they can be concatenated with the other feature-column lists
education_x_occupation = [
    tf.feature_column.crossed_column(
        keys=['education', 'occupation'],
        hash_bucket_size=1000,
    )
]
age_buckets_x_education_x_occupation = [
    tf.feature_column.crossed_column(
        keys=[age_buckets, 'education', 'occupation'],
        hash_bucket_size=1000,
    )
]

In [32]:
## Bucketized age replaces the raw continuous columns in this model
base_columns = [age_buckets]

## Linear classifier over categorical, bucketized and crossed columns
model_imp = tf.estimator.LinearClassifier(
    feature_columns=(
        categorical_features
        + base_columns
        + education_x_occupation
        + age_buckets_x_education_x_occupation
    ),
    model_dir="ongoing/train3",
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001544B6EEBE0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [33]:
## Input columns fed to the models built on bucketized and crossed columns
FEATURES_imp = [
    'age',
    'workclass',
    'education',
    'education_num',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'new',
]

def get_input_fn(data_set, num_epochs=None, n_batch = 128, shuffle=True, features=None):
    """Build an estimator input function from a pandas DataFrame.

    Args:
        data_set: DataFrame holding the feature columns and the ``LABEL`` column.
        num_epochs: Passed through to ``pandas_input_fn`` as ``num_epochs``.
        n_batch: Batch size.
        shuffle: Passed through to ``pandas_input_fn`` as ``shuffle``.
        features: Names of the columns to feed to the model. Defaults to the
            module-level ``FEATURES_imp`` list, so a different feature set can
            be used without redefining this function.

    Returns:
        The input function produced by ``tf.estimator.inputs.pandas_input_fn``.
    """
    if features is None:
        features = FEATURES_imp
    return tf.estimator.inputs.pandas_input_fn(
       # Rebuild the frames from raw values so the original index is dropped
       x=pd.DataFrame({k: data_set[k].values for k in features}),
       y = pd.Series(data_set[LABEL].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

In [34]:
## Train the crossed-column model for 1000 steps on unshuffled batches of 128 rows
train_input_fn = get_input_fn(df_train_new, num_epochs=None, n_batch=128, shuffle=False)
model_imp.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ongoing/train3\model.ckpt.
INFO:tensorflow:loss = 88.7229, step = 1
INFO:tensorflow:global_step/sec: 117.402
INFO:tensorflow:loss = 50.3345, step = 101 (0.856 sec)
INFO:tensorflow:global_step/sec: 270.758
INFO:tensorflow:loss = 56.1532, step = 201 (0.368 sec)
INFO:tensorflow:global_step/sec: 303.675
INFO:tensorflow:loss = 45.792, step = 301 (0.329 sec)
INFO:tensorflow:global_step/sec: 293.851
INFO:tensorflow:loss = 37.4857, step = 401 (0.340 sec)
INFO:tensorflow:global_step/sec: 286.272
INFO:tensorflow:loss = 56.4845, step = 501 (0.350 sec)
INFO:tensorflow:global_step/sec: 287.095
INFO:tensorflow:loss = 32.5289, step = 601 (0.347 sec)
INFO:tensorflow:global_step/sec: 286.273
INFO:tensorflow:loss = 37.4381, s

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1544b6eeac8>

In [35]:
## Evaluate the crossed-column model on one pass over the test split (at most 1000 steps)
test_input_fn = get_input_fn(df_test_new, num_epochs=1, n_batch=128, shuffle=False)
model_imp.evaluate(input_fn=test_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-14-14:46:05
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ongoing/train3\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Finished evaluation at 2019-05-14-14:46:07
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.835821, accuracy_baseline = 0.763774, auc = 0.884016, auc_precision_recall = 0.695996, average_loss = 0.351227, global_step = 1000, label/mean = 0.236226, loss = 44.6744, precision = 0.689867, prediction/mean = 0.233207, recall = 0.554082


{'accuracy': 0.83582091,
 'accuracy_baseline': 0.76377374,
 'auc': 0.88401639,
 'auc_precision_recall': 0.69599575,
 'average_loss': 0.35122654,
 'global_step': 1000,
 'label/mean': 0.23622628,
 'loss': 44.67437,
 'precision': 0.68986726,
 'prediction/mean': 0.23320661,
 'recall': 0.55408216}

In [36]:
## Same feature columns as model_imp, trained with an FTRL optimizer that
## applies both L1 and L2 regularization
model_regu = tf.estimator.LinearClassifier(
    feature_columns=(
        categorical_features
        + base_columns
        + education_x_occupation
        + age_buckets_x_education_x_occupation
    ),
    model_dir="ongoing/train4",
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.9,
        l2_regularization_strength=5,
    ),
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001544BD601D0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [37]:
## Train the regularized model for 1000 steps on unshuffled batches of 128 rows
train_input_fn = get_input_fn(df_train_new, num_epochs=None, n_batch=128, shuffle=False)
model_regu.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ongoing/train4\model.ckpt.
INFO:tensorflow:loss = 88.7229, step = 1
INFO:tensorflow:global_step/sec: 117.678
INFO:tensorflow:loss = 50.3878, step = 101 (0.853 sec)
INFO:tensorflow:global_step/sec: 290.435
INFO:tensorflow:loss = 55.3801, step = 201 (0.344 sec)
INFO:tensorflow:global_step/sec: 287.922
INFO:tensorflow:loss = 46.8067, step = 301 (0.347 sec)
INFO:tensorflow:global_step/sec: 296.467
INFO:tensorflow:loss = 38.6827, step = 401 (0.338 sec)
INFO:tensorflow:global_step/sec: 281.435
INFO:tensorflow:loss = 56.994, step = 501 (0.354 sec)
INFO:tensorflow:global_step/sec: 283.029
INFO:tensorflow:loss = 33.2636, step = 601 (0.354 sec)
INFO:tensorflow:global_step/sec: 273.724
INFO:tensorflow:loss = 37.7902, s

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1544bd60eb8>

In [38]:
## Evaluate the regularized model on one pass over the test split (at most 1000 steps)
test_input_fn = get_input_fn(df_test_new, num_epochs=1, n_batch=128, shuffle=False)
model_regu.evaluate(input_fn=test_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-14-14:46:31
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ongoing/train4\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Finished evaluation at 2019-05-14-14:46:32
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.838339, accuracy_baseline = 0.763774, auc = 0.886979, auc_precision_recall = 0.701491, average_loss = 0.346914, global_step = 1000, label/mean = 0.236226, loss = 44.1258, precision = 0.697206, prediction/mean = 0.236621, recall = 0.557982


{'accuracy': 0.83833915,
 'accuracy_baseline': 0.76377374,
 'auc': 0.8869794,
 'auc_precision_recall': 0.70149052,
 'average_loss': 0.34691378,
 'global_step': 1000,
 'label/mean': 0.23622628,
 'loss': 44.125809,
 'precision': 0.69720596,
 'prediction/mean': 0.2366209,
 'recall': 0.55798233}