In [1]:
#import libraries
import pandas as pd
import numpy as np


In [2]:

# Load the raw CSV. header=None tells pandas not to treat the first line
# as column names; proper names are assigned in a later step.
raw_path = 'adult.data'
data = pd.read_csv(raw_path, header=None)


In [3]:

# Attach human-readable column names, in the same order as the file's fields.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'salary',
]
data.columns = column_names


In [4]:
# Remove rows with missing values. A missing field is matched as ' ?'
# (note the leading space), so keep only rows where no column equals it.
# .copy() makes the filtered result an independent frame, so adding a column
# below does not raise pandas' SettingWithCopyWarning.
data = data[(data.astype(str) != ' ?').all(axis=1)].copy()


# Encode the salary label as an integer target: 1 when the label contains
# '>50K', otherwise 0. The vectorized literal substring match (regex=False)
# replaces a slow row-by-row DataFrame.apply.
data['salary_bi'] = data['salary'].str.contains('>50K', regex=False).astype(int)

# Drop the raw label (now encoded in salary_bi) together with the columns
# that are not used as model features.
data = data.drop(columns=['salary', 'education', 'native-country', 'fnlwgt'])



#data_num = data.drop(['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'], axis=1)

In [5]:

# One-hot encode the categorical columns; each is replaced by one indicator
# column per distinct value, named '<column>_<value>'.
categorical_cols = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
]
data = pd.get_dummies(data, columns=categorical_cols)

In [6]:

# Min-max scale each numeric column to the [0, 1] range:
#     scaled = (value - column_min) / (column_max - column_min)
# NOTE(review): min and max are taken over the full dataset, i.e. before the
# train/test split performed later in the notebook.
numeric_cols = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
for col in numeric_cols:
    col_min = data[col].min()
    col_max = data[col].max()
    data[col] = (data[col] - col_min) / (col_max - col_min)



In [7]:
# Pairwise correlation between all columns of the encoded frame.
corr_matrix = data.corr()

# Correlation of every column with the target, largest value first.
salary_corr = corr_matrix['salary_bi']
salary_corr.sort_values(ascending=False)

salary_bi                                1.000000
marital-status_ Married-civ-spouse       0.445418
relationship_ Husband                    0.401236
education-num                            0.335286
age                                      0.241998
hours-per-week                           0.229480
capital-gain                             0.221196
sex_ Male                                0.216699
occupation_ Exec-managerial              0.213442
occupation_ Prof-specialty               0.181458
capital-loss                             0.150053
workclass_ Self-emp-inc                  0.137646
relationship_ Wife                       0.125126
race_ White                              0.084735
workclass_ Federal-gov                   0.057394
workclass_ Local-gov                     0.028673
occupation_ Protective-serv              0.026360
workclass_ Self-emp-not-inc              0.025575
occupation_ Tech-support                 0.022829
occupation_ Sales                        0.018450


In [8]:

# Separate the target column (y) from the remaining feature columns (X).
target_col = 'salary_bi'
y = data[target_col]
X = data.drop(columns=target_col)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Hyperparameters shared by the tree-based ensembles, named once so the
# models stay comparable and the values are easy to tune.
N_TREES = 200
TREE_DEPTH = 16
SEED = 42

# Bagged, depth-limited decision trees.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_depth=TREE_DEPTH),
    n_estimators=N_TREES,
    bootstrap=True,
    n_jobs=-1,
    random_state=SEED,
)
rnd_clf = RandomForestClassifier(
    n_estimators=N_TREES,
    max_depth=TREE_DEPTH,
    bootstrap=True,
    n_jobs=-1,
    random_state=SEED,
)
ext_clf = ExtraTreesClassifier(
    n_estimators=N_TREES,
    max_depth=TREE_DEPTH,
    bootstrap=True,
    n_jobs=-1,
    random_state=SEED,
)
svm_clf = SVC(random_state=SEED)
log_clf = LogisticRegression(max_iter=1000, random_state=SEED)

# Hard voting: the ensemble predicts the class chosen by the majority of
# its member models.
voting_clf = VotingClassifier(
    estimators=[
        ('bag', bag_clf),
        ('rnd', rnd_clf),
        ('svc', svm_clf),
        ('log', log_clf),
        ('ext', ext_clf),
    ],
    voting='hard',
)

# voting_clf is intentionally not fitted here: the evaluation loop in the
# next cell fits every model, including voting_clf, so fitting it in this
# cell as well would train the whole ensemble twice.


In [11]:
from sklearn.metrics import accuracy_score

# Fit each model on the training split and report its accuracy on the
# held-out test split, one line per model.
models = [bag_clf, rnd_clf, svm_clf, log_clf, ext_clf, voting_clf]
for clf in models:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    model_name = type(clf).__name__
    test_accuracy = accuracy_score(y_test, y_pred)
    print(model_name, test_accuracy)

BaggingClassifier 0.8639151334327864
RandomForestClassifier 0.8612630532073595
SVC 0.8340792308967346
LogisticRegression 0.8506547323056523
ExtraTreesClassifier 0.8461793469252444
VotingClassifier 0.8529753025029008


In [12]:
#from sklearn.metrics import mean_squared_error

#data_predictions = rnd_clf.predict(X)
#tree_mse = mean_squared_error(y, data_predictions)
#tree_rmse = np.sqrt(tree_mse)  # RMSE is the square root of the MSE, not its square
#tree_rmse

In [13]:
#from sklearn.model_selection import GridSearchCV

#params = {'max_depth': list(range(2, 6)), 'max_leaf_nodes': list(range(2, 10)), 'min_samples_split': [20, 40, 60]}
#grid_search_cv = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=3)

#grid_search_cv.fit(X_train, y_train)