## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)

In [None]:
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb

from common import Dataset, column_variants, load_data
from common import compute_confusion, plot_confusion

# Seed NumPy's global random generator so that steps drawing from it (such as
# the `data.sample(frac=1)` shuffle, which is given no random_state of its own)
# produce the same result each time the notebook is run from the top.
np.random.seed(0)

In [None]:
# Stack the train and test CSVs into a single frame; the first row of the
# test file is skipped on load (skiprows=1).
data = pd.concat([load_data('data/train.csv'),
                  load_data('data/test.csv', skiprows=1)])

# Discard rows with missing values, then shuffle all rows and renumber the
# index so it is contiguous again after the concat/drop/shuffle.
data.dropna(inplace=True)
data = data.sample(frac=1).reset_index(drop=True)

data.info()
display(data.head())
# Pass the column explicitly as `x=`. Recent seaborn releases accept only
# `data` positionally, so a bare positional Series is no longer treated as
# the variable to count; the keyword form works on old and new releases.
sb.countplot(x=data['Income'])

In [None]:
def make_dummy(data, column, drop=None, keep=None):
    """Replace a categorical column with lower-cased indicator columns.

    Args:
        data: Frame containing ``column``; it is not modified.
        column: Name of the column to one-hot encode.
        drop: Indicator labels (category values as they appear in the data,
            before lower-casing) to discard. Ignored when ``keep`` is given.
        keep: When non-empty, every indicator whose label is not listed here
            is discarded; this takes precedence over ``drop``.

    Returns:
        A new frame with ``column`` removed and the remaining indicator
        columns appended under lower-cased names.
    """
    indicators = pd.get_dummies(data[column])

    if keep:
        wanted = set(keep)
        drop = [label for label in indicators.columns if label not in wanted]
    if drop:
        indicators = indicators.drop(drop, axis=1)

    indicators.columns = [label.lower() for label in indicators.columns]
    return data.join(indicators).drop([column], axis=1)

# Columns removed outright before the categorical columns are encoded.
unused_columns = ['FinalSamplingWeight', 'EducationNumber',
                  'CapitalGain', 'CapitalLoss']
data.drop(unused_columns, axis=1, inplace=True)

# Categorical columns to one-hot encode, in the order they are processed,
# paired with any extra make_dummy options. For NativeCountry only the
# 'Mexico' and 'United-States' indicators are retained.
encodings = [
    ('Education', {}),
    ('MaritalStatus', {}),
    ('NativeCountry', {'keep': ['Mexico', 'United-States']}),
    ('Occupation', {}),
    ('Race', {}),
    ('Relationship', {}),
    ('Sex', {}),
    ('WorkClass', {}),
]
for column, options in encodings:
    data = make_dummy(data, column, **options)

data.info()
data.head()

In [None]:
# Dataset variants to evaluate, keyed by display name. 'Original' is built
# before 'Balanced' so the construction order stays fixed.
# NOTE(review): `balance=True` presumably rebalances the classes inside
# common.Dataset -- confirm against that helper.
datasets = {}
datasets['Original'] = Dataset(data)
datasets['Balanced'] = Dataset(data, balance=True)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Zero-argument factories keyed by display name. Each call builds a fresh,
# unfitted estimator, so no fitted state is shared between uses.
models = {
    'Adaptive boosting': lambda: AdaBoostClassifier(n_estimators=50),
    'Decision tree': lambda: DecisionTreeClassifier(max_depth=10),
    'Gaussian naive Bayes': lambda: GaussianNB(),
    'K-nearest neighbors': lambda: KNeighborsClassifier(n_neighbors=10),
    'Logistic regression': lambda: LogisticRegression(penalty='l2', C=1.0),
    'Multi-layer perceptron': lambda: MLPClassifier(
        activation='logistic', solver='adam', alpha=0.0001),
    'Random forest': lambda: RandomForestClassifier(
        n_estimators=50, max_depth=20),
}

In [None]:
# Fit every model on every dataset variant (both in sorted name order),
# collecting one summary row and drawing one confusion plot per pair.
summaries = []

for dataset_name in sorted(datasets):
    dataset = datasets[dataset_name]
    for model_name in sorted(models):
        # Build a fresh estimator for this (dataset, model) pair.
        model = models[model_name]()
        model.fit(dataset.x_train, dataset.y_train)

        # The hard prediction is the most probable class for each test row;
        # the score passed on is the probability column of the class `True`.
        probabilities = model.predict_proba(dataset.x_test)
        winners = np.argmax(probabilities, axis=1)
        y_predicted = [model.classes_[winner] for winner in winners]
        y_score = probabilities[:, list(model.classes_).index(True)]

        summary = {
            'Dataset': dataset_name,
            'Model': model_name,
            # NOTE(review): this score is computed on the training split,
            # while the confusion statistics below use the test split.
            'Score': model.score(dataset.x_train, dataset.y_train),
        }
        summary.update(compute_confusion(dataset.y_test, y_predicted, y_score))
        summaries.append(summary)

        plot_confusion(dataset.y_test, y_predicted, y_score)
        pp.suptitle('{} dataset and {} model'.format(dataset_name, model_name))
        print('.', end='')  # progress marker, one dot per fitted model

# Fixed leading columns first, then the remaining keys of the first summary
# row in alphabetical order.
head = ['Dataset', 'Model', 'Score']
columns = head + sorted(key for key in summaries[0] if key not in head)
summaries = pd.DataFrame(summaries, columns=columns)

# Show every row of the summary table.
summaries.head(len(summaries))