## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)

In [None]:
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb
import tensorflow as tf

from common import column_variants, drop_missing, load_dataset, plot_confusion

# Fix the seeds of NumPy's global generator and of TensorFlow's graph-level
# generator so that the random draws made further down (row sampling,
# shuffling, weight initialisation) are more repeatable between runs.
np.random.seed(0)
tf.set_random_seed(0)

In [None]:
# Load the training and test splits through the project helper.
# NOTE(review): skiprows=1 is forwarded for the test file only, presumably to
# skip a leading non-data line in that file — confirm in load_dataset.
data_train = load_dataset('data/train.csv')
data_test = load_dataset('data/test.csv', skiprows=1)

# NOTE(review): the return values are ignored, so drop_missing presumably
# modifies the frames in place — confirm in common.py.
drop_missing(data_train)
drop_missing(data_test)

# Print a structural summary of the training frame, then let the notebook
# render its first rows as the cell output.
data_train.info()
data_train.head()

In [None]:
# Plot how many training rows fall into each income class. The column is
# passed as `x=` because newer seaborn releases only accept `data` as a
# positional argument; the keyword form works on older releases as well.
sb.countplot(x=data_train['Income'])

In [None]:
def extract_that(data, column, value):
    """Return the rows of ``data`` whose ``column`` equals ``value``.

    Args:
        data: pandas DataFrame to filter.
        column: label of the column to compare.
        value: value a row must hold in ``column`` to be kept.

    Returns:
        A DataFrame with the matching rows, keeping their original index
        labels and column layout.
    """
    matches = data[column] == value
    return data.loc[matches]

def choose_that(data, column, value, count):
    """Randomly pick ``count`` rows of ``data`` whose ``column`` equals ``value``.

    The rows are drawn without replacement using NumPy's global random
    generator, so the selection depends on the current ``np.random`` state.

    Args:
        data: pandas DataFrame to sample from.
        column: label of the column to compare.
        value: value a row must hold in ``column`` to be a candidate.
        count: number of rows to draw.

    Returns:
        A DataFrame holding the drawn rows, in the order they were drawn.
    """
    matches = data[column] == value
    # Index labels of every candidate row.
    candidates = data.index[matches.values]
    picked = np.random.choice(candidates, count, replace=False)
    return data.loc[picked]

# Balance the training set: keep every 'High' row and draw an equally large
# random subset of the 'Low' rows.
data_train_high = extract_that(data_train, 'Income', 'High')
data_train_low = choose_that(
    data_train, 'Income', 'Low', count=len(data_train_high))

# Stack both halves, shuffle all rows (frac=1 keeps every row) and replace the
# old index labels with a fresh 0..n-1 index.
data_train = (
    pd.concat([data_train_high, data_train_low])
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Plot the income class counts of the training set again, now that the 'Low'
# rows have been downsampled. The column is passed as `x=` because newer
# seaborn releases only accept `data` as a positional argument; the keyword
# form works on older releases as well.
sb.countplot(x=data_train['Income'])

In [None]:
def create_train_input(x, y, batch_size=32):
    """Build the estimator input function used for training.

    Args:
        x: pandas DataFrame with the feature columns.
        y: pandas Series with the labels.
        batch_size: number of rows per batch.

    Returns:
        The input function produced by ``pandas_input_fn``, configured to
        shuffle the rows and with ``num_epochs=None`` (no epoch limit is set
        here; the amount of training is bounded by the caller).
    """
    train_input = tf.estimator.inputs.pandas_input_fn(
        x=x,
        y=y,
        batch_size=batch_size,
        shuffle=True,
        num_epochs=None,
    )
    return train_input

def create_test_input(x, y):
    """Build the estimator input function used for evaluation.

    Args:
        x: pandas DataFrame with the feature columns.
        y: pandas Series with the labels.

    Returns:
        The input function produced by ``pandas_input_fn``, configured for a
        single unshuffled pass over the data.
    """
    test_input = tf.estimator.inputs.pandas_input_fn(
        x=x,
        y=y,
        shuffle=False,
        num_epochs=1,
    )
    return test_input

def create_predict_input(x):
    """Build the estimator input function used for prediction.

    Args:
        x: pandas DataFrame with the feature columns; no labels are supplied.

    Returns:
        The input function produced by ``pandas_input_fn``, configured for a
        single unshuffled pass so predictions keep the row order of ``x``.
    """
    predict_input = tf.estimator.inputs.pandas_input_fn(
        x=x,
        shuffle=False,
        num_epochs=1,
    )
    return predict_input

# Split each frame into features and labels. `pop` removes the 'Income' column
# in place, so x_train / x_test are the original frame objects without the
# label column.
x_train = data_train
y_train = x_train.pop('Income')

x_test = data_test
y_test = x_test.pop('Income')

In [None]:
# Raw numeric feature; it is bucketized below rather than fed in directly.
age = tf.feature_column.numeric_column(key='Age')

# Categorical features. Each vocabulary is looked up by column name in the
# mapping returned by the project helper column_variants().
education = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Education',
    vocabulary_list=column_variants().get('Education'))

marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    key='MaritalStatus',
    vocabulary_list=column_variants().get('MaritalStatus'))

occupation = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Occupation',
    vocabulary_list=column_variants().get('Occupation'))

race = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Race',
    vocabulary_list=column_variants().get('Race'))

relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Relationship',
    vocabulary_list=column_variants().get('Relationship'))

sex = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Sex',
    vocabulary_list=column_variants().get('Sex'))

work_class = tf.feature_column.categorical_column_with_vocabulary_list(
    key='WorkClass',
    vocabulary_list=column_variants().get('WorkClass'))

# Columns handed to the model: age split into buckets at the listed
# boundaries, followed by an indicator column for every categorical feature
# (same order as the definitions above).
feature_columns = [
    tf.feature_column.bucketized_column(
        source_column=age, boundaries=[30, 40, 50, 60, 70]),
] + list(map(tf.feature_column.indicator_column, [
    education,
    marital_status,
    occupation,
    race,
    relationship,
    sex,
    work_class,
]))

In [None]:
# Fully connected classifier with three hidden layers (256, 128 and 64 units)
# trained with the Adam optimizer. The label vocabulary comes from the
# project's column_variants() mapping.
# NOTE(review): model_dir is a fixed path, so re-running this cell presumably
# picks up checkpoints left there by earlier runs — confirm, and clear the
# directory when a fresh training run is wanted.
model = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[256, 128, 64],
    optimizer='Adam',
    label_vocabulary=column_variants().get('Income'),
    model_dir='/tmp/model',
)

# Train for 2000 steps on the training set, then evaluate on the test set;
# `metrics` is read again when the summary figures are printed.
model.train(input_fn=create_train_input(x_train, y_train), steps=2000)
metrics = model.evaluate(input_fn=create_test_input(x_test, y_test))

In [None]:
# Predict a class for every test row. Each prediction's 'classes' entry holds
# the label as bytes, which is why the mapping for y_pred uses bytes keys
# while the mapping for the real labels uses str keys.
y_pred = [prediction['classes'][0] for prediction in model.predict(create_predict_input(x_test))]
y_pred = pd.Series(y_pred).map({b'Low': 0, b'High': 1})
y_real = y_test.map({'Low': 0, 'High': 1})

# Rows are the real classes, columns the predicted ones. num_classes pins the
# result to 2x2 even if one class never occurs; otherwise the labelled
# DataFrame below could not be built.
confusion = tf.confusion_matrix(list(y_real), list(y_pred), num_classes=2)
# Evaluate the op inside a context manager so the session is closed and its
# resources are released once the matrix has been computed.
with tf.Session() as session:
    confusion = session.run(confusion)
confusion = pd.DataFrame(confusion, index=['Low', 'High'], columns=['Low', 'High'])

display(confusion)
plot_confusion(confusion)

In [None]:
def print_metric(name, value):
    """Print a metric as a percentage with two decimal places.

    Args:
        name: label printed in front of the value.
        value: metric as a fraction (e.g. 0.85 is printed as 85.00%).
    """
    percentage = 100 * value
    line = '{}: {:.2f}%'.format(name, percentage)
    print(line)

# Read the four cells of the confusion matrix. Rows are the real classes and
# columns the predicted ones, with 'High' (position 1) treated as the positive
# class.
true_positive = confusion.iloc[1, 1]
true_negative = confusion.iloc[0, 0]
false_positive = confusion.iloc[0, 1]
false_negative = confusion.iloc[1, 0]

# Number of really positive / really negative test rows.
positive = true_positive + false_negative
negative = true_negative + false_positive

# Named `total` rather than `all` so the builtin all() is not shadowed for the
# rest of the notebook session.
total = positive + negative

print_metric('Error', (false_positive + false_negative) / total)
print_metric('Accuracy', (true_positive + true_negative) / total)
print_metric('Precision', true_positive / (true_positive + false_positive))
print_metric('Recall', true_positive / (true_positive + false_negative))
# The AUC figures are taken from the evaluation metrics, not from the matrix.
print_metric('AUC (true/false positive)', metrics['auc'])
print_metric('AUC (precision/recall)', metrics['auc_precision_recall'])