## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)

In [None]:
import shutil

import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb
import tensorflow as tf

from common import balance, column_variants, drop_missing, load_dataset
from common import plot_confusion, print_confusion
from common import plot_precision_recall, plot_roc

# One seed shared by NumPy and TensorFlow.
SEED = 0
np.random.seed(SEED)
tf.set_random_seed(SEED)

# Raise TensorFlow's logging threshold to ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
class Experiment:
    """Train a model on one train/test split and report how well it predicts.

    Class attributes:
        batch_size: Batch size handed to the training input function.
        steps: Number of training steps passed to ``model.train``.
        labels: Variants of the 'Income' column from ``column_variants()``.
        negative_index: Position of 'Low' within ``labels``.
        positive_index: Position of 'High' within ``labels``.
    """

    batch_size = 32
    steps = 2000
    labels = column_variants()['Income']
    negative_index = labels.index('Low')
    positive_index = labels.index('High')

    def __init__(self, name, data_train, data_test):
        """Split both frames into features and the 'Income' target.

        The 'Income' column is popped from ``data_train`` and ``data_test``,
        so both frames are modified in place; pass copies if the originals
        are still needed.

        Args:
            name: Title used for the figure and the printed report.
            data_train: Training frame containing an 'Income' column.
            data_test: Test frame containing an 'Income' column.
        """
        self.name = name
        self.y_train = data_train.pop('Income')
        self.x_train = data_train
        self.y_test = data_test.pop('Income')
        self.x_test = data_test

    def run(self, model):
        """Train ``model`` and predict on the test features.

        Args:
            model: Object exposing ``train(input_fn, steps=...)`` and
                ``predict(input_fn)``; each prediction must provide
                'classes' and 'probabilities' entries.

        Returns:
            A tuple ``(y_pred, y_score)``: ``y_pred`` is a ``pd.Series`` with
            the first entry of every prediction's 'classes', ``y_score`` is a
            list with every prediction's probability at ``positive_index``.
        """
        model.train(self.create_train_input(), steps=self.steps)
        predictions = list(model.predict(self.create_test_input()))
        y_pred = pd.Series([prediction['classes'][0] for prediction in predictions])
        y_score = [prediction['probabilities'][self.positive_index] for prediction in predictions]
        return y_pred, y_score

    def assess(self, y_pred, y_score):
        """Plot and print the confusion matrix, precision-recall and ROC.

        Args:
            y_pred: Predicted classes as returned by ``run``.
            y_score: Positive-class scores as returned by ``run``.
        """
        # NOTE(review): this 0/1 encoding lines up with the row/column labels
        # below only if ``labels`` is ['Low', 'High'] - confirm.
        y_test = self.y_test.map({'Low': 0, 'High': 1})
        # The predicted classes are looked up with bytes keys.
        y_pred = y_pred.map({b'Low': 0, b'High': 1})
        # Fix the matrix size explicitly: without ``num_classes`` it is
        # inferred from the largest value seen, so a run in which only class 0
        # occurs would yield a 1x1 matrix that cannot take the labels below.
        confusion = tf.confusion_matrix(y_test, y_pred,
                                        num_classes=len(self.labels))
        # Close the session once the matrix has been evaluated.
        with tf.Session() as session:
            confusion = session.run(confusion)
        confusion = pd.DataFrame(confusion, index=self.labels,
                                 columns=self.labels)
        _, axes = pp.subplots(1, 3, figsize=(15, 4))
        pp.suptitle(self.name)
        pp.sca(axes[0])
        plot_confusion(confusion)
        pp.sca(axes[1])
        plot_precision_recall(y_test, y_score)
        pp.sca(axes[2])
        plot_roc(y_test, y_score)
        print('{}:'.format(self.name))
        print_confusion(confusion)

    def create_train_input(self):
        """Return a shuffled input function over the training data.

        ``num_epochs=None`` is passed, so the amount of training is set by
        the ``steps`` argument given to ``model.train``.
        """
        return tf.estimator.inputs.pandas_input_fn(
            x=self.x_train, y=self.y_train,
            batch_size=self.batch_size,
            num_epochs=None, shuffle=True)

    def create_test_input(self):
        """Return an unshuffled, single-epoch input function over the test features."""
        return tf.estimator.inputs.pandas_input_fn(
            x=self.x_test, num_epochs=1, shuffle=False)

In [None]:
# Load both splits.
data_train = load_dataset('data/train.csv')
# NOTE(review): skiprows=1 presumably skips an extra leading line in the
# test file - confirm against the data.
data_test = load_dataset('data/test.csv', skiprows=1)

# Apply the same missing-value cleanup to both splits.
for frame in (data_train, data_test):
    drop_missing(frame)

# Summarise the training split and preview its first rows.
data_train.info()
data_train.head()

In [None]:
# Class balance of the target, training split on the left, test on the right.
_, axes = pp.subplots(1, 2, figsize=(10, 4))
# Pass the series as ``x=``: newer seaborn releases no longer accept the
# plotted variable as a positional argument.
sb.countplot(x=data_train['Income'], ax=axes[0])
sb.countplot(x=data_test['Income'], ax=axes[1])
# Title the panels so the two otherwise identical plots can be told apart.
axes[0].set_title('Train')
axes[1].set_title('Test')

In [None]:
# Fetch the vocabularies once instead of once per column.
# NOTE(review): assumes column_variants() returns the same mapping on every
# call - confirm.
variants = column_variants()


def categorical_column(name):
    """Return a categorical column for `name` built from its vocabulary.

    Raises:
        KeyError: If `variants` has no vocabulary for `name`.
    """
    # Index with [] rather than .get() so that a missing vocabulary fails
    # here instead of passing None on to TensorFlow.
    return tf.feature_column.categorical_column_with_vocabulary_list(
        name, variants[name])


age = tf.feature_column.numeric_column('Age')

education = categorical_column('Education')
marital_status = categorical_column('MaritalStatus')
occupation = categorical_column('Occupation')
race = categorical_column('Race')
relationship = categorical_column('Relationship')
sex = categorical_column('Sex')
work_class = categorical_column('WorkClass')

# Age is bucketized at the given boundaries; every categorical column is
# wrapped in an indicator column.
feature_columns = [
    tf.feature_column.bucketized_column(age, boundaries=[30, 40, 50, 60, 70]),
] + [
    tf.feature_column.indicator_column(column)
    for column in (education, marital_status, occupation, race,
                   relationship, sex, work_class)
]

In [None]:
data_train_balanced = balance(data_train, 'Income', *Experiment.labels)

# Experiment pops the target column from the frames it receives, so each
# experiment gets its own copies.
experiments = [
    Experiment('DNN (imbalanced)', data_train.copy(), data_test.copy()),
    Experiment('DNN (balanced)', data_train_balanced.copy(), data_test.copy()),
]

for experiment in experiments:
    model_dir = '/tmp/model/{}'.format(experiment.name)
    # An estimator continues from checkpoints found in its model_dir. Clear
    # the directory so re-running this cell trains from scratch rather than
    # adding more steps to an earlier run's weights.
    shutil.rmtree(model_dir, ignore_errors=True)
    model = tf.estimator.DNNClassifier(
        hidden_units=[256, 128, 64], feature_columns=feature_columns,
        label_vocabulary=Experiment.labels, optimizer='Adam',
        model_dir=model_dir)
    experiment.assess(*experiment.run(model))