## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)

In [None]:
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb

from common import Dataset, column_variants, encode_categorical, load_data
from common import compute_confusion, plot_confusion

# Fix NumPy's global random seed so repeated runs of the notebook draw the
# same random numbers.
np.random.seed(0)

# A threshold of 0 turns off matplotlib's "too many open figures" warning.
pp.rcParams['figure.max_open_warning'] = 0

In [None]:
# Load both CSV files and stack them into a single frame.
# NOTE(review): skiprows=1 presumably skips a non-data first line in the
# test file -- confirm against load_data and the file itself.
data = pd.concat([
    load_data('data/train.csv'),
    load_data('data/test.csv', skiprows=1),
])

# Drop every row with a missing value, then renumber the rows so the index
# is contiguous and free of the duplicates left by the concatenation.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

data.info()

# Pass the column as `x=` explicitly: in recent seaborn releases the first
# positional parameter of countplot is `data`, not `x`, so a positional
# Series is no longer read as the variable to count.
sb.countplot(x=data['Income'])

In [None]:
# Columns removed from the frame before modelling.
redundant = ['CapitalGain', 'CapitalLoss', 'EducationNumber', 'FinalSamplingWeight']

# Columns passed through encode_categorical, in this order.
categorical = [
    'Education', 'MaritalStatus', 'NativeCountry', 'Occupation',
    'Race', 'Relationship', 'Sex', 'WorkClass',
]

data.drop(columns=redundant, inplace=True)

# encode_categorical returns the updated frame, so rebind `data` each time.
for name in categorical:
    data = encode_categorical(data, name)

data.info()

In [None]:
# Three variants of the same frame, keyed by the name used in reports:
# as loaded, with oversample=True, and with undersample=True.
datasets = dict(
    Original=Dataset(data),
    Oversampled=Dataset(data, oversample=True),
    Undersampled=Dataset(data, undersample=True),
)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Display name -> zero-argument factory. Storing factories rather than
# instances lets each (dataset, model) run build its own fresh estimator.
models = dict([
    ('Adaptive boosting',
     lambda: AdaBoostClassifier(n_estimators=50)),
    ('Decision tree',
     lambda: DecisionTreeClassifier(max_depth=10)),
    ('Gaussian naive Bayes',
     lambda: GaussianNB()),
    ('K-nearest neighbors',
     lambda: KNeighborsClassifier(n_neighbors=10)),
    ('Linear SVC',
     lambda: LinearSVC(penalty='l2', C=1.0, dual=False)),
    ('Logistic regression',
     lambda: LogisticRegression(penalty='l2', C=1.0)),
    ('Multi-layer perceptron',
     lambda: MLPClassifier(activation='logistic', solver='adam', alpha=0.0001)),
    ('Random forest',
     lambda: RandomForestClassifier(n_estimators=50, max_depth=20)),
])

In [None]:
# Fit every model on every dataset variant, collect one summary row per
# (dataset, model) pair, and draw a confusion plot for each pair.
summaries = []

for dataset_name in sorted(datasets):
    # The dataset does not depend on the model, so look it up once per
    # outer iteration.
    dataset = datasets[dataset_name]
    for model_name in sorted(models):
        # Entries of `models` are factories: calling one yields a new,
        # unfitted estimator.
        model = models[model_name]()
        model.fit(dataset.x_train, dataset.y_train)

        # hasattr() actually resolves the attribute, whereas dir() can list
        # a conditionally available method that raises AttributeError on
        # access.
        score = hasattr(model, 'predict_proba')
        if score:
            y_score = model.predict_proba(dataset.x_test)
            # Most probable class per row, looked up with a NumPy index
            # array so the result is an ndarray like in the other branch.
            y_predicted = model.classes_[np.argmax(y_score, axis=1)]
            # Keep only the probability column that belongs to class True.
            y_score = y_score[:, list(model.classes_).index(True)]
        else:
            # No probability estimates available: use the hard predictions
            # and an all-zero placeholder score.
            y_predicted = model.predict(dataset.x_test)
            y_score = np.zeros(y_predicted.shape)

        summary = {
            'Dataset': dataset_name,
            'Model': model_name,
            # NOTE(review): this is computed on the training split, while
            # the metrics below use the test split -- confirm this is
            # intended.
            'Score': model.score(dataset.x_train, dataset.y_train),
        }
        summary.update(compute_confusion(dataset.y_test, y_predicted, y_score))
        summaries.append(summary)

        plot_confusion(dataset.y_test, y_predicted, y_score)
        pp.suptitle('{} dataset and {} model'.format(dataset_name, model_name))
        # Progress marker: one dot per finished (dataset, model) pair.
        print('.', end='')

# Fixed leading columns, followed by the remaining summary keys in
# alphabetical order.
head = ['Dataset', 'Model', 'Score']
columns = head + sorted(summaries[0].keys() - head)
summaries = pd.DataFrame(summaries, columns=columns)

# Show every row of the summary table.
summaries.head(len(summaries))