## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)

In [None]:
import common
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb
import tensorflow as tf

from common import column_variants, load_data
from common import compute_confusion, plot_confusion

# Seed NumPy's global generator and TensorFlow's graph-level generator with
# the same fixed value.
SEED = 0
np.random.seed(SEED)
tf.set_random_seed(SEED)

# Restrict TensorFlow's logger to the ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Stack the training and test CSV files into one frame. load_data is asked to
# skip the first row of the test file (presumably a non-data line -- confirm
# against the file).
data = pd.concat([load_data('data/train.csv'),
                  load_data('data/test.csv', skiprows=1)])

# Drop rows with missing values, then shuffle all rows and renumber the index
# from zero.
data.dropna(inplace=True)
data = data.sample(frac=1).reset_index(drop=True)

data.info()
display(data.head())
# Pass the plotted variable by keyword: positional use was deprecated in
# seaborn 0.11, and from 0.12 the first positional parameter is `data`.
sb.countplot(x=data['Income'])

In [None]:
class Dataset(common.Dataset):
    """common.Dataset with factories for Estimator input functions.

    The methods read ``x_train``, ``y_train`` and ``x_test`` from the
    instance; these are presumably set up by ``common.Dataset`` (confirm).
    """

    def create_train_input_fn(self, batch_size=32):
        """Return a pandas input function over the training features/labels.

        Rows are shuffled and ``num_epochs=None`` is passed, so the number
        of training steps is decided by the caller.
        """
        make_input_fn = tf.estimator.inputs.pandas_input_fn
        return make_input_fn(
            x=self.x_train,
            y=self.y_train,
            batch_size=batch_size,
            num_epochs=None,
            shuffle=True,
        )

    def create_test_input_fn(self, batch_size=32):
        """Return a pandas input function over the test features.

        No labels are passed; rows are read once, in their stored order.
        """
        make_input_fn = tf.estimator.inputs.pandas_input_fn
        return make_input_fn(
            x=self.x_test,
            batch_size=batch_size,
            num_epochs=1,
            shuffle=False,
        )

In [None]:
def create_deep_model_fn(feature_columns, hidden_units=None, **arguments1):
    """Return a factory that builds DNN classifiers over *feature_columns*.

    Args:
        feature_columns: Feature columns passed to every classifier.
        hidden_units: Iterable of layer sizes. ``None`` (the default)
            selects ``[256, 128, 64]``.
        **arguments1: Keyword arguments fixed for every classifier the
            factory builds (for example ``weight_column``).

    Returns:
        A function that takes further keyword arguments (for example
        ``model_dir``) and returns a new ``tf.estimator.DNNClassifier``
        using the Adagrad optimizer and a run config with
        ``tf_random_seed=0``.

    Raises:
        TypeError: From the returned function, when the same keyword is
            given both here and at call time.
    """
    # A list literal as the default would be a single object shared by every
    # call; build a private list per factory instead. Copying also shields the
    # factory from later mutation of a caller-supplied list.
    if hidden_units is None:
        hidden_units = [256, 128, 64]
    else:
        hidden_units = list(hidden_units)

    def _function(**arguments2):
        return tf.estimator.DNNClassifier(
            feature_columns=feature_columns, optimizer='Adagrad',
            config=tf.estimator.RunConfig(tf_random_seed=0),
            hidden_units=hidden_units, **arguments1, **arguments2)
    return _function

def create_linear_model_fn(feature_columns, **arguments1):
    """Return a factory that builds linear classifiers over *feature_columns*.

    Keyword arguments given here are fixed for every classifier; the
    returned function accepts further keyword arguments (for example
    ``model_dir``). Supplying the same keyword in both places raises
    ``TypeError`` when the factory is called.
    """
    def _build_classifier(**call_arguments):
        # Every classifier uses Adagrad and a run config seeded with 0.
        run_config = tf.estimator.RunConfig(tf_random_seed=0)
        return tf.estimator.LinearClassifier(
            feature_columns=feature_columns,
            optimizer='Adagrad',
            config=run_config,
            **arguments1,
            **call_arguments)

    return _build_classifier

In [None]:
# The same frame wrapped three ways: with Dataset's defaults, with its
# `balance` option, and with its `weight` option.
datasets = dict(
    Original=Dataset(data),
    Balanced=Dataset(data, balance=True),
    Weighted=Dataset(data, weight=True),
)

In [None]:
def _vocabulary_column(key):
    """Return a categorical column for *key* with its column_variants() list."""
    # NOTE(review): column_variants() is called once per column, exactly as
    # before; consider hoisting the call if it turns out to be expensive.
    vocabulary = column_variants().get(key)
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key, vocabulary)


# Numeric features.
age = tf.feature_column.numeric_column('Age')
hours_per_week = tf.feature_column.numeric_column('HoursPerWeek')

# Categorical features, each with a fixed vocabulary.
education = _vocabulary_column('Education')
marital_status = _vocabulary_column('MaritalStatus')
native_country = _vocabulary_column('NativeCountry')
occupation = _vocabulary_column('Occupation')
race = _vocabulary_column('Race')
relationship = _vocabulary_column('Relationship')
sex = _vocabulary_column('Sex')
work_class = _vocabulary_column('WorkClass')

In [None]:
def _deep_columns():
    """Build the DNN inputs: numeric columns as-is, categorical ones wrapped
    in indicator columns."""
    one_hot = tf.feature_column.indicator_column
    return [
        age,
        one_hot(education),
        hours_per_week,
        one_hot(marital_status),
        one_hot(native_country),
        one_hot(occupation),
        one_hot(race),
        one_hot(relationship),
        one_hot(sex),
        one_hot(work_class),
    ]


def _linear_columns():
    """Build the linear-model inputs: numeric columns bucketized, categorical
    ones passed through unchanged."""
    bucketize = tf.feature_column.bucketized_column
    return [
        bucketize(age, boundaries=[30, 40, 50, 60, 70]),
        education,
        bucketize(hours_per_week, boundaries=[20, 40, 60, 80]),
        marital_status,
        native_country,
        occupation,
        race,
        relationship,
        sex,
        work_class,
    ]


# Model factories by name. The "Weighted" variants differ from their plain
# counterparts only in passing weight_column='Weight'. Each entry gets its own
# freshly built column list.
models = {
    'Deep': create_deep_model_fn(_deep_columns()),
    'DeepWeighted': create_deep_model_fn(
        _deep_columns(), weight_column='Weight'),
    'Linear': create_linear_model_fn(_linear_columns()),
    'LinearWeighted': create_linear_model_fn(
        _linear_columns(), weight_column='Weight'),
}

In [None]:
# (dataset name, model name) pairs, evaluated in this order.
scenarios = [
    ('Original', 'Linear'),
    ('Original', 'Deep'),
    ('Balanced', 'Linear'),
    ('Balanced', 'Deep'),
    ('Weighted', 'DeepWeighted'),
    ('Weighted', 'LinearWeighted'),
]

summaries = []

for dataset_name, model_name in scenarios:
    dataset = datasets[dataset_name]

    # Every scenario gets its own model directory.
    # NOTE(review): the path is fixed, so re-running this cell presumably
    # resumes from checkpoints left by an earlier run -- confirm that this
    # is intended.
    model_dir = '/tmp/model/dataset({})-model({})'.format(dataset_name, model_name)
    model = models[model_name](model_dir=model_dir)
    model.train(dataset.create_train_input_fn(), steps=5000)

    # Materialize the predictions once; they are read twice below.
    predictions = list(model.predict(dataset.create_test_input_fn()))
    # True where the predicted class is b'1'.
    y_predicted = pd.Series([entry['classes'][0] == b'1' for entry in predictions])
    # Probability at index 1 of each prediction's probability vector.
    y_score = [entry['probabilities'][1] for entry in predictions]

    summary = dict(Dataset=dataset_name, Model=model_name)
    summary.update(compute_confusion(dataset.y_test, y_predicted, y_score))
    summaries.append(summary)

    plot_confusion(dataset.y_test, y_predicted, y_score)
    pp.suptitle('{} dataset and {} model'.format(dataset_name, model_name))
    # Progress marker: one dot per finished scenario.
    print('.', end='')

# Table layout: identifying columns first, then the remaining keys sorted by
# name.
head = ['Dataset', 'Model']
columns = head + sorted(set(summaries[0]) - set(head))
summaries = pd.DataFrame(summaries, columns=columns)

# Show every row of the summary table.
summaries.head(len(summaries))