## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)

In [None]:
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb
import tensorflow as tf

from common import balance_dataset, column_variants, load_dataset
from common import plot_confusion, print_confusion
from sklearn.model_selection import train_test_split as split

# Seed NumPy's and TensorFlow's random number generators with the same fixed
# value before any data shuffling or model construction happens.
np.random.seed(0)
tf.set_random_seed(0)
# Only let TensorFlow log messages of ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Combine both CSV files into a single frame; the test file is loaded with
# its first row skipped.
data = pd.concat([load_dataset('data/train.csv'),
                  load_dataset('data/test.csv', skiprows=1)])

# Drop rows with missing values, then shuffle every row and renumber the
# index from zero.
data.dropna(inplace=True)
data = data.sample(frac=1).reset_index(drop=True)

# Quick look at the result: schema, first rows and label distribution.
data.info()
display(data.head())
# The column is passed by keyword so the call does not depend on which
# parameter seaborn binds its first positional argument to.
sb.countplot(x=data['Income'])

In [None]:
class Dataset:
    """Train/test partition of a frame whose label column is 'Income'.

    The frame is partitioned with `split` using ``random_state=0``. When
    `balance` is set, only the training part is passed through
    `balance_dataset`; the test part is left as produced by the split.

    Attributes set by the constructor:
        x_train, y_train: training features and their 'Income' labels.
        x_test, y_test: test features and their 'Income' labels.
    """

    def __init__(self, data, balance=False, test_size=0.3):
        train_frame, test_frame = split(
            data, test_size=test_size, random_state=0)
        if balance:
            train_frame = balance_dataset(train_frame, 'Income')
        self.x_train, self.y_train = self._separate_label(train_frame)
        self.x_test, self.y_test = self._separate_label(test_frame)

    @staticmethod
    def _separate_label(frame):
        """Remove 'Income' from `frame`; return (remaining frame, labels)."""
        labels = frame.pop('Income')
        return frame, labels

    def create_train_input_fn(self, batch_size=32):
        """Return an input function over the training features and labels.

        The function is created with ``num_epochs=None`` and
        ``shuffle=True``.
        """
        options = dict(batch_size=batch_size, num_epochs=None, shuffle=True)
        return tf.estimator.inputs.pandas_input_fn(
            x=self.x_train, y=self.y_train, **options)

    def create_test_input_fn(self, batch_size=32):
        """Return an input function over the test features only.

        No labels are supplied; the function is created with
        ``num_epochs=1`` and ``shuffle=False``.
        """
        options = dict(batch_size=batch_size, num_epochs=1, shuffle=False)
        return tf.estimator.inputs.pandas_input_fn(x=self.x_test, **options)

In [None]:
# Two variants built from the same frame; they differ only in the `balance`
# flag passed to Dataset.
datasets = {
    'Original': Dataset(data),
    'Balanced': Dataset(data, balance=True),
}

# Sanity check: every variant must expose exactly the same test features as
# the 'Original' one.
for name in datasets:
    assert(datasets['Original'].x_test.equals(datasets[name].x_test))

In [None]:
def _vocabulary_column(key):
    """Return a categorical column for `key`.

    The vocabulary is whatever `column_variants()` holds under the same key.
    """
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key, column_variants().get(key))


# The only numeric input.
age = tf.feature_column.numeric_column('Age')

# Categorical inputs, one vocabulary-backed column each.
education = _vocabulary_column('Education')
marital_status = _vocabulary_column('MaritalStatus')
occupation = _vocabulary_column('Occupation')
race = _vocabulary_column('Race')
relationship = _vocabulary_column('Relationship')
sex = _vocabulary_column('Sex')
work_class = _vocabulary_column('WorkClass')

In [None]:
def create_deep_model_fn(feature_columns, hidden_units=None):
    """Build a factory for `tf.estimator.DNNClassifier` instances.

    Args:
        feature_columns: Feature columns given to every classifier the
            factory creates.
        hidden_units: Layer sizes of the network. When omitted or ``None``,
            ``[256, 128, 64]`` is used.

    Returns:
        A function that forwards its keyword arguments to `DNNClassifier`
        (together with the captured columns, layer sizes and the 'Adam'
        optimizer) and returns the new estimator.
    """
    # A None sentinel stands in for the list default so that no list object
    # is shared between separate calls of this function.
    if hidden_units is None:
        hidden_units = [256, 128, 64]

    def _function(**arguments):
        return tf.estimator.DNNClassifier(
            hidden_units=hidden_units, feature_columns=feature_columns,
            optimizer='Adam', **arguments)
    return _function

def create_linear_model_fn(feature_columns, hidden_units=None):
    """Build a factory for `tf.estimator.LinearClassifier` instances.

    Args:
        feature_columns: Feature columns given to every classifier the
            factory creates.
        hidden_units: Ignored. The parameter is accepted so existing callers
            that pass it keep working; its default is ``None`` rather than a
            shared list object.

    Returns:
        A function that forwards its keyword arguments to `LinearClassifier`
        (together with the captured columns and the 'Adam' optimizer) and
        returns the new estimator.
    """
    def _function(**arguments):
        return tf.estimator.LinearClassifier(
            feature_columns=feature_columns,
            optimizer='Adam', **arguments)
    return _function

In [None]:
# Both factories receive a bucketized age column followed by the seven
# categorical columns. For the deep model each categorical column is wrapped
# in an indicator column; the linear model takes them unwrapped.
models = {
    'Deep': create_deep_model_fn(
        [tf.feature_column.bucketized_column(
            age, boundaries=[30, 40, 50, 60, 70])]
        + [tf.feature_column.indicator_column(categorical)
           for categorical in (education, marital_status, occupation, race,
                               relationship, sex, work_class)]
    ),
    'Linear': create_linear_model_fn(
        [tf.feature_column.bucketized_column(
            age, boundaries=[30, 40, 50, 60, 70])]
        + [education, marital_status, occupation, race,
           relationship, sex, work_class]
    ),
}

In [None]:
def assess(dataset, y_predicted, y_score):
    """Report predictions against the test labels of `dataset`.

    The same three arguments (`dataset.y_test`, `y_predicted`, `y_score`)
    are handed first to `print_confusion` and then to `plot_confusion`.
    """
    for report in (print_confusion, plot_confusion):
        report(dataset.y_test, y_predicted, y_score)

def train_predict(model, dataset, steps=2000):
    model.train(dataset.create_train_input_fn(), steps=steps)
    predictions = list(model.predict(dataset.create_test_input_fn()))
    y_predicted = pd.Series([prediction['classes'][0] == b'1' for prediction in predictions])
    y_score = [prediction['probabilities'][1] for prediction in predictions]
    return y_predicted, y_score

In [None]:
# Train and assess every dataset/model pairing, printing a heading before
# and a blank line after each one.
for dataset_name, dataset in datasets.items():
    for model_name, build_model in models.items():
        title = '{} dataset and {} model'.format(dataset_name, model_name)
        print(title)
        # One directory per pairing.
        # NOTE(review): the path is fixed across runs; if the estimator picks
        # up checkpoints already stored there, a rerun would not start from
        # scratch -- confirm.
        model_dir = '/tmp/model/dataset({})-model({})'.format(dataset_name, model_name)
        model = build_model(model_dir=model_dir)
        y_predicted, y_score = train_predict(model, dataset)
        assess(dataset, y_predicted, y_score)
        pp.suptitle(title)
        print()