In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded = drive.CreateFile({'id':'1CruDuyKVuEcb5QwdGvOTnV8GfIP8OUd4'}) # replace the id with id of file you want to access
downloaded.GetContentFile('adult_dataset.csv')

In [1]:
import time

import numpy as np
import tensorflow as tf
import pandas as pd

from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('adult_dataset.csv')
df.head(15)

In [15]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [0]:
BATCH_SIZE = 40

num_epochs = 1
shuffle = True

In [0]:
y = df["income"].apply(lambda x: ">50K" in x).astype(int)

In [0]:
del df["fnlwgt"] 
del df["income"]
X = df

In [22]:
X

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,?,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,?,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States
32557,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32558,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32559,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [0]:
# Make data input ready
train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_train,
        y=y_train,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [0]:
test_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_test,
        y=y_test,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [0]:
def generate_input_fn(filename, num_epochs=None, shuffle=True, batch_size=BATCH_SIZE):
    df = pd.read_csv(filename)#, header=None, names=COLUMNS)
    labels = df["income"].apply(lambda x: ">50K" in x).astype(int)
    del df["fnlwgt"] # Unused column
    del df["income"] # Labels column, already saved to labels variable
    
    type(df['age'].iloc[3])
    
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [0]:
sex = tf.feature_column.categorical_column_with_vocabulary_list(
    key="sex",                                                           
    vocabulary_list=["female", "male"])

race = tf.feature_column.categorical_column_with_vocabulary_list(
    key="race",                                                             
    vocabulary_list=["Amer-Indian-Eskimo",
                     "Asian-Pac-Islander",
                     "Black", "Other", "White"])

education = tf.feature_column.categorical_column_with_hash_bucket(
  "education", hash_bucket_size=1000)

marital_status = tf.feature_column.categorical_column_with_hash_bucket(
  "marital.status", hash_bucket_size=100)

relationship = tf.feature_column.categorical_column_with_hash_bucket(
  "relationship", hash_bucket_size=100)

workclass = tf.feature_column.categorical_column_with_hash_bucket(
  "workclass", hash_bucket_size=100)

occupation = tf.feature_column.categorical_column_with_hash_bucket(
  "occupation", hash_bucket_size=1000)

native_country = tf.feature_column.categorical_column_with_hash_bucket(
  "native.country", hash_bucket_size=1000)

In [0]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education.num")
capital_gain = tf.feature_column.numeric_column("capital.gain")
capital_loss  = tf.feature_column.numeric_column("capital.loss")
hours_per_week = tf.feature_column.numeric_column("hours.per.week")

In [0]:
# Wide columns and deep columns.
wide_columns = [sex, race, native_country,
      education, occupation, workclass,
      marital_status, relationship]

In [0]:
deep_columns = [
    # Multi-hot indicator columns for columns with fewer possibilities
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(sex),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(race),
    # Embeddings for categories with more possibilities. Should have at least (possibilties)**(0.25) dims
    tf.feature_column.embedding_column(education, dimension=8),
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    # Numerical columns
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]

In [0]:
def create_model_dir(model_type):
    return 'models/model_' + model_type + '_' + str(int(time.time()))

In [0]:
# If new_model=False, pass in the desired model_dir 
def get_model(model_type, wide_columns=None, deep_columns=None, new_model=False, model_dir=None):
    if new_model or model_dir is None:
        model_dir = create_model_dir(model_type) # Comment out this line to continue training a existing model
    print("Model directory = %s" % model_dir)
    
    m = None
    
    # Linear Classifier
    if model_type == 'WIDE':
        m = tf.estimator.LinearClassifier(
            model_dir=model_dir, 
            feature_columns=wide_columns)

    # Deep Neural Net Classifier
    if model_type == 'DEEP':
        m = tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100, 50])

    # Combined Linear and Deep Classifier
    if model_type == 'WIDE_AND_DEEP':
        m = tf.estimator.DNNLinearCombinedClassifier(
                model_dir=model_dir,
                linear_feature_columns=wide_columns,
                dnn_feature_columns=deep_columns,
                dnn_hidden_units=[100, 70, 50, 25])#4 hidden layers
        
    print('estimator built')
    
    return m, model_dir

In [33]:
MODEL_TYPE = 'WIDE_AND_DEEP'
model_dir = create_model_dir(model_type=MODEL_TYPE)
m, model_dir = get_model(model_type = MODEL_TYPE, wide_columns=wide_columns, deep_columns=deep_columns, model_dir=model_dir)

Model directory = models/model_WIDE_AND_DEEP_1583342999
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/model_WIDE_AND_DEEP_1583342999', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f84a01ef2b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_

In [35]:
m.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_WIDE_AND_DEEP_1583342999/model.ckpt-652
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 652 into models/model_WIDE_AND_DEEP_1583342999/model.ckpt.
INFO:tensorflow:loss = 15.09136, step = 653
INFO:tensorflow:global_step/sec: 102.063
INFO:tensorflow:loss = 84.88582, step = 753 (0.983 sec)
INFO:tensorflow:global_step/sec: 218.463
INFO:tensorflow:loss = 19.048126, step = 853 (0.457 sec)
INFO:tensorflow:global_step/sec: 216.566
INFO:tensorflow:loss = 17.764381, step = 953 (0.461 sec)
INFO:tensorflow:global_step/sec: 213.882
INFO:tensorflow:loss = 13.015619, step = 1053 (0.469 sec)
INFO:tensorflow:global_step/sec: 211.612
INFO:tensorflow:loss = 

<tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier at 0x7f84a01ef240>

In [38]:
results = m.evaluate(input_fn=test_input_fn)
print('evaluate done')
print('\nAccuracy: %s' % results['accuracy'])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-03-04T17:31:40Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_WIDE_AND_DEEP_1583342999/model.ckpt-1304
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-03-04-17:31:42
INFO:tensorflow:Saving dict for global step 1304: accuracy = 0.8228159, accuracy_baseline = 0.76124674, auc = 0.87142855, auc_precision_recall = 0.6982479, average_loss = 0.37864637, global_step = 1304, label/mean = 0.23875326, loss = 15.129594, precision = 0.7655629, prediction/mean = 0.23643577, recall = 0.3717042
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1304: models/model_WIDE_AND_DEEP_1583342999/model.ckpt-1304
evaluate done

Accuracy: 0.8228159
