In [1]:
import data_loader as dl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

train_df, valid_df = dl.load_train_data("adult.data")
test_df = dl.load_test_data("adult.test")
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'martial.status',
                   'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss',
                   'hours.per.week', 'native.country', 'income']
train_df.columns = column_names

for col in ['workclass', 'education', 'martial.status', 'occupation', 'relationship', 'race', 
            'sex', 'native.country', 'income']:
    train_df[col] = train_df[col].astype('category')
num_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
cat_features = ['workclass', 'education', 'martial.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

for category in cat_features:
    train_df[category] = train_df[category].cat.codes
train_df['income.prediction'] = train_df.income.cat.codes
train_df.drop(labels=["income"], axis = 1, inplace = True)

valid_df.columns = column_names

for col in ['workclass', 'education', 'martial.status', 'occupation', 'relationship', 'race', 
            'sex', 'native.country', 'income']:
    valid_df[col] = valid_df[col].astype('category')
num_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
cat_features = ['workclass', 'education', 'martial.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

for category in cat_features:
    valid_df[category] = valid_df[category].cat.codes
valid_df['income.prediction'] = valid_df.income.cat.codes
valid_df.drop(labels=["income"], axis = 1, inplace = True)
missing_cols = set( train_df.columns ) - set( valid_df.columns )
for c in missing_cols:
    valid_df[c] = 0
valid_df = valid_df[train_df.columns]

test_df.columns = column_names

for col in ['workclass', 'education', 'martial.status', 'occupation', 'relationship', 'race', 
            'sex', 'native.country', 'income']:
    test_df[col] = test_df[col].astype('category')
num_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
cat_features = ['workclass', 'education', 'martial.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

for category in cat_features:
    test_df[category] = test_df[category].cat.codes
test_df['income.prediction'] = test_df.income.cat.codes
test_df.drop(labels=["income"], axis = 1, inplace = True)
missing_cols = set( train_df.columns ) - set( test_df.columns )
for c in missing_cols:
    test_df[c] = 0
test_df = test_df[train_df.columns]

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from collections import namedtuple

Model = namedtuple('Model', 'name model')

train = train_df.values
X, Y= train[:,:-1], train[:,-1]
valid = valid_df.values
Xval, Yval = valid[:,:-1], valid[:,-1]
test = test_df.values
Xtst, Ytst = test[:,:-1], test[:,-1]
models = []
models.append(Model('LR', LogisticRegression()))
models.append(Model('KNN', KNeighborsClassifier()))
# evalutate each model in turn
Yhat = np.zeros(Yval.shape)
print("All under 50K baseline model")
auc = roc_auc_score(Yval, Yhat)
accuracy = accuracy_score(Yval, Yhat)
print("AUC: {}".format(auc))
print("Accuracy: {}".format(accuracy))
for m in models:
    m.model.fit(X, Y)
    print(m.name)
    Yhat = m.model.predict(Xval)
    auc = roc_auc_score(Yval, Yhat)
    accuracy = accuracy_score(Yval, Yhat)
    print("AUC: {}".format(auc))
    print("Accuracy: {}".format(accuracy))

All under 50K baseline model
AUC: 0.5
Accuracy: 0.7674060382008626




LR
AUC: 0.6273285815691456
Accuracy: 0.8034504004929144
KNN
AUC: 0.6097237472118169
Accuracy: 0.7686383240911892
