#Tensorflow Classification Exercise

In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
census_data = pd.read_csv("census_data.csv")

In [0]:
census_data.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [0]:
census_data.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [0]:
census_data.income_bracket.unique()

array([' <=50K', ' >50K'], dtype=object)

In [0]:
census_data.income_bracket = census_data.groupby(census_data['income_bracket']).ngroup()

In [0]:
census_data.head(20)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,0
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,1
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,1
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1


In [0]:
census_data['income_bracket'].unique()

array([0, 1])

In [0]:
labels = census_data['income_bracket']

In [0]:
from sklearn.model_selection import train_test_split 

In [0]:
X_train, X_test, y_train, y_test = train_test_split(census_data.drop('income_bracket', axis = 1), labels, test_size=0.3, random_state=100)

In [0]:
X_train.head(1)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country
29313,45,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States


In [0]:
len(X_train.workclass.unique())

9

In [0]:
#Categorical columns
workclass = tf.feature_column.categorical_column_with_hash_bucket("workclass", hash_bucket_size=25)
embedded_workclass = tf.feature_column.embedding_column(workclass, dimension=len(X_train.workclass.unique()))

education = tf.feature_column.categorical_column_with_hash_bucket("education", hash_bucket_size=25)
embedded_education = tf.feature_column.embedding_column(education, dimension=len(X_train.education.unique()))

marital_status = tf.feature_column.categorical_column_with_hash_bucket("marital_status", hash_bucket_size=25)
embedded_marital_status = tf.feature_column.embedding_column(marital_status, dimension=len(X_train.marital_status.unique()))

occupation = tf.feature_column.categorical_column_with_hash_bucket("occupation", hash_bucket_size=25)
embedded_occupation = tf.feature_column.embedding_column(occupation, dimension=len(X_train.occupation.unique()))

relationship = tf.feature_column.categorical_column_with_hash_bucket("relationship", hash_bucket_size=25)
embedded_relationship = tf.feature_column.embedding_column(relationship, dimension=len(X_train.relationship.unique()))

race = tf.feature_column.categorical_column_with_hash_bucket("race", hash_bucket_size=25)
embedded_race = tf.feature_column.embedding_column(race, dimension=len(X_train.race.unique()))

gender = tf.feature_column.categorical_column_with_hash_bucket("gender", hash_bucket_size=25)
embedded_gender = tf.feature_column.embedding_column(gender, dimension=len(X_train.gender.unique()))

native = tf.feature_column.categorical_column_with_hash_bucket("native_country", hash_bucket_size=25)
embedded_native = tf.feature_column.embedding_column(native, dimension=len(X_train.native_country.unique()))

#Numerical Columns
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
captial_gain = tf.feature_column.numeric_column("capital_gain")
captial_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

In [0]:
boundaries = [x for x in range(1,101) if x%5==0]
print(boundaries)

[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]


In [0]:
age_bucket = tf.feature_column.bucketized_column(age, boundaries=boundaries)

In [0]:
feat_col = [embedded_workclass, embedded_education, embedded_marital_status, embedded_occupation, embedded_relationship, embedded_race, embedded_gender, embedded_native,
            age_bucket, education_num, captial_gain, captial_loss, hours_per_week]

In [0]:
input_func = tf.estimator.inputs.pandas_input_fn(x = X_train, y=y_train, batch_size=10, num_epochs=10, shuffle=True)

In [0]:
model = tf.estimator.DNNClassifier(hidden_units=[3,3,3], feature_columns=feat_col, n_classes=2)

W0726 15:43:15.948004 140119488165760 estimator.py:1811] Using temporary folder as model directory: /tmp/tmpdzkjhm8u


In [0]:
model.train(input_fn=input_func)

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x7f6fd019ff98>

In [0]:
test_input_func = tf.estimator.inputs.pandas_input_fn(x = X_test, y = y_test, batch_size=10, num_epochs=1, shuffle=True)

In [0]:
model.evaluate(input_fn=test_input_func)

W0726 15:44:53.825125 140119488165760 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/metrics_impl.py:2027: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
W0726 15:44:54.528877 140119488165760 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.
W0726 15:44:54.555316 140119488165760 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.
W0726 15:44:54.792201 140119488165760 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with th

{'accuracy': 0.7603644,
 'accuracy_baseline': 0.7603644,
 'auc': 0.5359128,
 'auc_precision_recall': 0.42770436,
 'average_loss': 0.54175055,
 'global_step': 22792,
 'label/mean': 0.23963559,
 'loss': 5.416951,
 'precision': 0.0,
 'prediction/mean': 0.23722254,
 'recall': 0.0}

In [0]:
pred_input_fn = tf.estimator.inputs.pandas_input_fn(x = X_test, batch_size=10, num_epochs=1, shuffle=False)
pred_gen = model.predict(input_fn = pred_input_fn)
predicitions = list(pred_gen)

In [0]:
final_preds = [pred['class_ids'][0] for pred in predicitions]
final_preds

In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report
print(classification_report(y_test, final_preds))