# Classification On California Census Data

Predict who earns less than or equal to $50K per year.

In [1]:
import pandas as pd
import tensorflow as tf

In [2]:
# Load the California census extract into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="census_data.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
education_num,32561.0,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
capital_gain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0
capital_loss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0
hours_per_week,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0


In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
age               32561 non-null int64
workclass         32561 non-null object
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
gender            32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
income_bracket    32561 non-null object
dtypes: int64(5), object(9)
memory usage: 3.5+ MB


In [6]:
# List the distinct target labels (the raw strings carry a leading space).
pd.unique(df["income_bracket"])

array([' <=50K', ' >50K'], dtype=object)

In [7]:
# Encode the target as integers: 0 for ' <=50K', 1 for anything else (' >50K').
# The raw labels carry a leading space, so the comparison string must keep it.
#
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# comparing it to ' <=50K' again would be False for every row and would
# silently turn all labels into 1.
if not pd.api.types.is_numeric_dtype(df["income_bracket"]):
    df["income_bracket"] = (df["income_bracket"] != ' <=50K').astype("int64")

In [8]:
# Split the data into a training set and a held-out test set.

from sklearn.model_selection import train_test_split

# The label is the encoded income bracket; the features are every other column.
y = df["income_bracket"]
X = df.drop(columns=["income_bracket"])

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Capture the feature column labels of the training frame.
# Note: this is a pandas Index rather than a plain list, and the name
# `feat_cols` is rebound further down to the list of TensorFlow feature columns.
feat_cols = X_train.columns

In [10]:
# Partition the feature names by dtype: "object" columns are treated as
# categorical, everything else as numerical. Column order is preserved.

# Categorical
cat_cols = [name for name in feat_cols if X_train[name].dtype == "object"]

# numerical
num_cols = [name for name in feat_cols if not (X_train[name].dtype == "object")]

print("Numerical:", num_cols)
print("\n")
print("Categorical:", cat_cols)

Numerical: ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']


Categorical: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country']


In [11]:
# Build the TensorFlow feature columns, one per input column, and collect them
# in `feat_cols` (categorical columns first, then numerical ones).
#
# The columns are created with direct API calls and live only in `feat_cols`;
# no per-column variables are generated, so nothing depends on evaluating
# generated source code or on a loop variable being in scope at that moment.
feat_cols = []

for col in cat_cols:

    # Number of distinct values seen in the training split; used both as the
    # hash bucket count and as the embedding width.
    uniqueValue = len(X_train[col].unique())

    # Hash the raw string values into a fixed number of buckets.
    hashed_col = tf.feature_column.categorical_column_with_hash_bucket(
        col, hash_bucket_size=uniqueValue
    )

    # Wrap the hashed column in a dense embedding so the DNN can consume it.
    feat_cols.append(
        tf.feature_column.embedding_column(hashed_col, dimension=uniqueValue)
    )

for col in num_cols:

    # Numerical columns are fed to the network as-is.
    feat_cols.append(tf.feature_column.numeric_column(col))

In [12]:
feat_cols

[_EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='workclass', hash_bucket_size=9, dtype=tf.string), dimension=9, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x1a1120dda0>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True),
 _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='education', hash_bucket_size=16, dtype=tf.string), dimension=16, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x1a1120d518>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True),
 _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='marital_status', hash_bucket_size=7, dtype=tf.string), dimension=7, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x1a1120ddd8>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True),
 _EmbeddingColumn(categorical_column=_HashedCategor

In [13]:
# Hyper-parameters used by the input functions, the classifier and training.

# Input pipeline
batchSize = 100      # rows per mini-batch (training and evaluation input functions)
numEpochs = 20000    # num_epochs handed to the training input function

# Optimisation
numSteps = 10000     # number of training steps requested from clf.train

# Network architecture
hiddenUnit = [10, 20, 20, 20, 10]   # units per hidden layer
dropOut = 0.01                      # dropout value passed to the classifier

In [14]:
# Training input function: serves shuffled mini-batches of the training split.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=batchSize,
    num_epochs=numEpochs,
    shuffle=True,
)

In [15]:
# Instantiate the DNNClassifier with the configured layers, feature columns and dropout.

clf = tf.estimator.DNNClassifier(
    feature_columns=feat_cols,
    hidden_units=hiddenUnit,
    dropout=dropOut,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/gf/bvnvfh0s21x1yrbw8vsz0mt00000gn/T/tmpytew64dv', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [16]:
# Train the model on the training data (input_func wraps X_train / y_train),
# running for numSteps steps.
clf.train(input_fn=input_func, steps=numSteps)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/gf/bvnvfh0s21x1yrbw8vsz0mt00000gn/T/tmpytew64dv/model.ckpt.
INFO:tensorflow:loss = 412.073, step = 1
INFO:tensorflow:global_step/sec: 135.19
INFO:tensorflow:loss = 46.1827, step = 101 (0.741 sec)
INFO:tensorflow:global_step/sec: 146.649
INFO:tensorflow:loss = 32.3489, step = 201 (0.683 sec)
INFO:tensorflow:global_step/sec: 148.378
INFO:tensorflow:loss = 43.0974, step = 301 (0.674 sec)
INFO:tensorflow:global_step/sec: 142.263
INFO:tensorflow:loss = 50.5656, step = 401 (0.703 sec)
INFO:tensorflow:global_step/sec: 141.862
INFO:tensorflow:loss = 41.3097, step = 501 (0.705 sec)
INFO:tensorflow:global_step/sec: 140.565
INFO:tensorflow:loss = 32.788, step = 601 (0.711 sec)
INFO:tensorflow:global_step/sec: 148.468
INFO:tensorflow:loss = 34.0014, step = 701 (0.673 sec)
INFO:tensorflow:global_step/sec: 152.286
INFO:tensorflow:loss = 49.3322, step = 801 (0.657 sec)
INFO:tensorflow:global_step/se

INFO:tensorflow:global_step/sec: 162.275
INFO:tensorflow:loss = 37.0871, step = 8401 (0.616 sec)
INFO:tensorflow:global_step/sec: 167.24
INFO:tensorflow:loss = 39.0764, step = 8501 (0.598 sec)
INFO:tensorflow:global_step/sec: 168.162
INFO:tensorflow:loss = 35.0023, step = 8601 (0.595 sec)
INFO:tensorflow:global_step/sec: 167.251
INFO:tensorflow:loss = 37.2984, step = 8701 (0.598 sec)
INFO:tensorflow:global_step/sec: 167.511
INFO:tensorflow:loss = 33.2957, step = 8801 (0.597 sec)
INFO:tensorflow:global_step/sec: 167.713
INFO:tensorflow:loss = 45.7001, step = 8901 (0.596 sec)
INFO:tensorflow:global_step/sec: 167.566
INFO:tensorflow:loss = 41.9354, step = 9001 (0.597 sec)
INFO:tensorflow:global_step/sec: 167.803
INFO:tensorflow:loss = 36.6156, step = 9101 (0.596 sec)
INFO:tensorflow:global_step/sec: 168.201
INFO:tensorflow:loss = 40.3436, step = 9201 (0.595 sec)
INFO:tensorflow:global_step/sec: 165.597
INFO:tensorflow:loss = 31.1461, step = 9301 (0.604 sec)
INFO:tensorflow:global_step/sec

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1a1120e6d8>

In [17]:
# Prediction input function for the test split: features only (no labels),
# a single pass over the data, and no shuffling so the predictions stay
# aligned with the row order of y_test.
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=batchSize,
    num_epochs=1,
    shuffle=False,
)

In [18]:
# Collect the predicted class id for every test row into a list
# (0 if salary <=50K$ else 1).

y_pred = [
    prediction['class_ids'][0]
    for prediction in clf.predict(input_fn=eval_input_func)
]

INFO:tensorflow:Restoring parameters from /var/folders/gf/bvnvfh0s21x1yrbw8vsz0mt00000gn/T/tmpytew64dv/model.ckpt-10000


In [19]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the held-out labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.85      0.94      0.89      8196
          1       0.72      0.47      0.57      2550

avg / total       0.82      0.83      0.82     10746

