In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/Users/markespina/Downloads/census.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 14 columns):
age                45222 non-null int64
workclass          45222 non-null object
education_level    45222 non-null object
education-num      45222 non-null float64
marital-status     45222 non-null object
occupation         45222 non-null object
relationship       45222 non-null object
race               45222 non-null object
sex                45222 non-null object
capital-gain       45222 non-null float64
capital-loss       45222 non-null float64
hours-per-week     45222 non-null float64
native-country     45222 non-null object
income             45222 non-null object
dtypes: float64(4), int64(1), object(9)
memory usage: 4.8+ MB


In [5]:
df[['education_level', 'education-num']].sort_values('education-num').drop_duplicates()

Unnamed: 0,education_level,education-num
24651,Preschool,1.0
28591,1st-4th,2.0
15348,5th-6th,3.0
20263,7th-8th,4.0
17185,9th,5.0
39334,10th,6.0
1958,11th,7.0
39722,12th,8.0
20433,HS-grad,9.0
95,Some-college,10.0


In [6]:
# categories in 'education level correpond 1 to 1 to values in education num', remove redudant column
df.drop('education_level', 1, inplace=True)

In [7]:
# binarize sex 
df['gender'] = np.where(df['sex'] == " Male", 1, 0)

In [8]:
df['gender'].unique()

array([1, 0])

In [9]:
non_numeric_cols = list(df.select_dtypes([np.object]).columns)
non_numeric_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'income']

In [10]:
# convert remaining nominal variable to dummy variables for intial models
workclass = pd.get_dummies(df['workclass'])
marital_status = pd.get_dummies(df['marital-status'])
occupation = pd.get_dummies(df['occupation'])
relationship = pd.get_dummies(df['relationship'])
race = pd.get_dummies(df['race'])
country = pd.get_dummies(df['native-country'])

In [11]:
X = df.drop(non_numeric_cols, 1)

X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, workclass, marital_status, occupation, relationship, race, country], axis=1)

y = np.where(df['income']=='<=50K', 1, 0)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state=42)

In [13]:
X_train.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,gender,Federal-gov,Local-gov,Private,Self-emp-inc,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
7963,32,13.0,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
26402,45,12.0,0.0,0.0,66.0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
31411,45,7.0,0.0,0.0,50.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
13367,39,9.0,0.0,0.0,70.0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
38742,51,9.0,0.0,0.0,38.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [15]:
print('baseline dominant class is \n{}'.format(y.mean()))


baseline dominant class is 
0.7521560302507629


In [16]:
mlp = MLPClassifier(hidden_layer_sizes=(500,500))
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 500), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [17]:
mlp.score(X_test, y_test)

0.8118297401879492

In [18]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
rfc.score(X_test, y_test)

0.8420121614151465

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [21]:
mlp_5_folds = cross_val_score(mlp, X_test, y_test, cv=5)


In [22]:
print(mlp_5_folds)
print('multilayer perceptron, with relu function, hidden layers (500, 500), avg accuracy achieved {}'.format(mlp_5_folds.mean()))



[0.77998894 0.81370923 0.72802653 0.80652294 0.77667219]
multilayer perceptron, with relu function, hidden layers (500, 500), avg accuracy achieved 0.7809839690436706


In [23]:
rfc_5_folds = cross_val_score(rfc, X_test, y_test, cv=5)

In [24]:
print(rfc_5_folds)
print('random forest, with defualts, avg accuracy achieved {}'.format(rfc_5_folds.mean()))



[0.84245439 0.83360973 0.82642344 0.83139856 0.8352681 ]
random forest, with defualts, avg accuracy achieved 0.8338308457711442


In [25]:
mlp_sigmoid = MLPClassifier(activation='logistic', hidden_layer_sizes=(500,500))
mlp_sigmoid.fit(X_train, y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 500), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [26]:
mlp_sigmoid.score(X_test, y_test)

0.8360420121614152

In [30]:
mlp_tanh = MLPClassifier(activation='tanh', hidden_layer_sizes=(500,500))
mlp_tanh.fit(X_train, y_train)

MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 500), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [31]:
mlp_tanh.score(X_test, y_test)

0.8202321724709785

In [29]:
X.corrwith(pd.Series(y)).sort_values(ascending=False)


 Never-married            0.319536
 Own-child                0.222635
 Not-in-family            0.195561
 Other-service            0.165428
 Unmarried                0.147262
 Divorced                 0.133982
 Private                  0.116721
 Adm-clerical             0.096389
 Handlers-cleaners        0.091692
 Black                    0.090405
 Other-relative           0.085604
 Machine-op-inspct        0.076726
 Separated                0.073853
 Mexico                   0.064730
 Widowed                  0.060443
 Farming-fishing          0.056077
 Priv-house-serv          0.039071
 Married-spouse-absent    0.038624
 Amer-Indian-Eskimo       0.028762
 Other                    0.024727
 El-Salvador              0.022884
 Transport-moving         0.022306
 Guatemala                0.021530
 Dominican-Republic       0.021079
 Craft-repair             0.020658
 Columbia                 0.019651
 Puerto-Rico              0.019280
 Vietnam                  0.016239
 Nicaragua          