In [1]:
#import libraries
import pandas as pd
import numpy as np


In [2]:

# read in the data
data = pd.read_csv('adult.data', header=None)


In [3]:

# assign column names
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'salary']


In [4]:
# remove rows with missing values
data = data[(data.astype(str) != ' ?').all(axis=1)]


# Change the salary column to binary data instead of boolean conditions
data['salary_bi'] = data.apply(lambda row: 1 if '>50K'in row['salary'] else 0, axis=1)
data = data.drop(['salary', 'education', 'native-country', 'fnlwgt'], axis=1)



#data_num = data.drop(['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'], axis=1)

In [5]:

# convert discrete variables to dummy variables
data = pd.get_dummies(data, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex'])

In [6]:

# normalize numerical variables
data['age'] = (data['age'] - data['age'].min()) / (data['age'].max() - data['age'].min())
data['education-num'] = (data['education-num'] - data['education-num'].min()) / (data['education-num'].max() - data['education-num'].min())
data['capital-gain'] = (data['capital-gain'] - data['capital-gain'].min()) / (data['capital-gain'].max() - data['capital-gain'].min())
data['capital-loss'] = (data['capital-loss'] - data['capital-loss'].min()) / (data['capital-loss'].max() - data['capital-loss'].min())
data['hours-per-week'] = (data['hours-per-week'] - data['hours-per-week'].min()) / (data['hours-per-week'].max() - data['hours-per-week'].min())



In [7]:
corr_matrix = data.corr()

corr_matrix['salary_bi'].sort_values(ascending=False)

salary_bi                                1.000000
marital-status_ Married-civ-spouse       0.445418
relationship_ Husband                    0.401236
education-num                            0.335286
age                                      0.241998
hours-per-week                           0.229480
capital-gain                             0.221196
sex_ Male                                0.216699
occupation_ Exec-managerial              0.213442
occupation_ Prof-specialty               0.181458
capital-loss                             0.150053
workclass_ Self-emp-inc                  0.137646
relationship_ Wife                       0.125126
race_ White                              0.084735
workclass_ Federal-gov                   0.057394
workclass_ Local-gov                     0.028673
occupation_ Protective-serv              0.026360
workclass_ Self-emp-not-inc              0.025575
occupation_ Tech-support                 0.022829
occupation_ Sales                        0.018450


In [8]:

# split data into features and target
X = data.drop(columns='salary_bi')
y = data['salary_bi']

In [9]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, bootstrap=True, oob_score=True, n_jobs=-1, random_state=42, max_depth=5, max_leaf_nodes=9, min_samples_split=20)
rnd_clf.fit(X,y)

In [10]:
rnd_clf.oob_score_

0.8336980306345733

In [11]:
rnd_clf.feature_importances_

array([6.40225839e-02, 1.33124766e-01, 1.47280753e-01, 2.43965378e-02,
       3.88086784e-02, 3.37982411e-04, 1.92847708e-05, 1.09917584e-03,
       2.56268703e-03, 6.38915866e-04, 0.00000000e+00, 0.00000000e+00,
       1.02035329e-02, 0.00000000e+00, 1.99232795e-01, 1.49987329e-05,
       7.08622298e-02, 1.72489034e-04, 2.94641821e-04, 4.01873710e-04,
       0.00000000e+00, 1.97086431e-03, 2.82577731e-02, 1.78205729e-03,
       1.72238836e-03, 1.52314629e-03, 7.69776815e-03, 1.00244722e-05,
       2.15055277e-02, 0.00000000e+00, 2.54750825e-04, 1.60664362e-04,
       5.38459816e-04, 1.48391886e-01, 1.77028255e-02, 3.48456142e-04,
       1.79777883e-02, 9.19398082e-03, 1.14278521e-02, 2.93894829e-05,
       0.00000000e+00, 2.02707809e-04, 1.72079453e-05, 1.40559767e-04,
       1.88354368e-02, 1.68345587e-02])

In [12]:
from sklearn.metrics import mean_squared_error

data_predictions = rnd_clf.predict(X)
tree_mse = mean_squared_error(y, data_predictions)
tree_rmse = np.square(tree_mse)
tree_rmse

0.027392324139125544

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': list(range(2, 6)), 'max_leaf_nodes': list(range(2, 10)), 'min_samples_split': [20, 40, 60]}
grid_search_cv = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=3)

grid_search_cv.fit(X_train, y_train)

In [15]:
grid_search_cv.best_estimator_