In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
import graphviz
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('adult-training.csv', sep=',')
len(df)

32561

In [3]:
df = df[(df.astype(str) != ' ?').all(axis=1)]
len(df)

30162

In [4]:
df.columns

Index(['age', 'workclass', 'fniwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race ', 'gender',
       'gain', 'loss', 'hours', 'native-country', 'income'],
      dtype='object')

In [5]:
df['income_bi'] = df.apply(lambda row: 1 if '>50K'in row['income'] else 0, axis=1)
# Remove redundant columns
df = df.drop(['income','fniwgt','gain','loss','native-country'], axis=1)

In [6]:
df = pd.get_dummies(df, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race ', 'gender'])

In [7]:
df

Unnamed: 0,age,educational-num,hours,income_bi,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race _ Amer-Indian-Eskimo,race _ Asian-Pac-Islander,race _ Black,race _ Other,race _ White,gender_ Female,gender_ Male
0,39,13,40,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
1,50,13,13,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
2,38,9,40,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,53,7,40,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,28,13,40,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5,37,14,40,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,1,0
6,49,5,16,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
7,52,9,45,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
8,31,14,50,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
9,42,13,40,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [8]:
df=df.sample(frac=1)

In [9]:
df

Unnamed: 0,age,educational-num,hours,income_bi,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race _ Amer-Indian-Eskimo,race _ Asian-Pac-Islander,race _ Black,race _ Other,race _ White,gender_ Female,gender_ Male
323,29,7,40,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
26138,51,15,40,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
16598,18,7,20,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
6054,43,9,24,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
15254,21,9,40,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
8318,53,9,40,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
14386,58,13,37,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
29690,44,13,45,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
2494,38,9,50,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
18489,66,13,40,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1


In [10]:
dtrain=df[:25000]
dtest=df[25000:]

In [11]:
dtrain_att=dtrain.drop(['income_bi'],axis=1)

In [12]:
dtrain_gt50=dtrain['income_bi']

In [13]:
dtrain.columns

Index(['age', 'educational-num', 'hours', 'income_bi',
       'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th',
       'education_ 11th', 'education_ 12th', 'education_ 1st-4th',
       'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th',
       'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors',
       'education_ Doctorate', 'education_ HS-grad', 'education_ Masters',
       'education_ Preschool', 'education_ Prof-school',
       'education_ Some-college', 'marital-status_ Divorced',
       'marital-status_ Married-AF-spouse',
       'marital-status_ Married-civ-spouse',
       'marital-status_ Married-spouse-absent',
       'marital-status_ Never-married', 'marital-status_ Separated',
       'marital-status_ Widowed', 'occupation_ Adm-clerical',
       'occupation_ Armed-Forces', 'occupation_

In [14]:
dtrain['income_bi']

323      0
26138    0
16598    0
6054     0
15254    0
8318     0
14386    0
29690    0
2494     0
18489    1
13537    0
29856    0
5398     0
4135     0
16147    0
18347    0
25127    0
17651    0
14512    0
20444    1
20121    0
17891    0
30817    0
15722    0
1016     1
15428    0
29069    0
30339    1
13476    0
24705    0
        ..
32087    0
16650    0
27110    1
9875     0
2563     0
26693    0
8633     1
26120    0
4737     0
10685    0
1111     1
15211    1
26359    0
6318     0
8971     0
11834    0
11549    0
2878     0
3520     0
20939    0
9769     1
15247    0
16680    0
3952     1
10447    0
13596    0
5948     1
26583    0
4494     0
23833    0
Name: income_bi, Length: 25000, dtype: int64

In [15]:
d_test_att = dtest.drop(['income_bi'], axis=1)
d_test_gt50 = dtest['income_bi']
d_att = df.drop(['income_bi'], axis=1)
d_gt50 = df['income_bi']

In [16]:
print("Income >50K: %d out of %d (%.2f%%)" % (np.sum(d_gt50), len(d_gt50), 100*float(np.sum(d_gt50)) / len(d_gt50)))

Income >50K: 7508 out of 30162 (24.89%)


In [17]:
t = tree.DecisionTreeClassifier(criterion='entropy', max_depth=7)
t = t.fit(dtrain_att, dtrain_gt50)

In [19]:
t.score(d_test_att, d_test_gt50)

0.8211933359163115

In [20]:
scores = cross_val_score(t, d_att, d_gt50, cv=5)
# Show avarage score and +/- two standard deviations away (covering 95% or scores)
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

Accuracy: 0.83 (+/- 0.01)


In [21]:
for max_depth in range(1, 20):
    t = tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    scores = cross_val_score(t, d_att, d_gt50, cv=5)
    print("Max depth: %d, Accuracy: %0.2f (+/- %0.2f)" % (max_depth, scores.mean(), scores.std()*2))

Max depth: 1, Accuracy: 0.75 (+/- 0.00)
Max depth: 2, Accuracy: 0.82 (+/- 0.01)
Max depth: 3, Accuracy: 0.82 (+/- 0.01)
Max depth: 4, Accuracy: 0.82 (+/- 0.01)
Max depth: 5, Accuracy: 0.82 (+/- 0.01)
Max depth: 6, Accuracy: 0.82 (+/- 0.01)
Max depth: 7, Accuracy: 0.83 (+/- 0.01)
Max depth: 8, Accuracy: 0.83 (+/- 0.01)
Max depth: 9, Accuracy: 0.83 (+/- 0.01)
Max depth: 10, Accuracy: 0.83 (+/- 0.01)
Max depth: 11, Accuracy: 0.82 (+/- 0.01)
Max depth: 12, Accuracy: 0.82 (+/- 0.01)
Max depth: 13, Accuracy: 0.82 (+/- 0.00)
Max depth: 14, Accuracy: 0.82 (+/- 0.01)
Max depth: 15, Accuracy: 0.81 (+/- 0.00)
Max depth: 16, Accuracy: 0.81 (+/- 0.00)
Max depth: 17, Accuracy: 0.81 (+/- 0.01)
Max depth: 18, Accuracy: 0.80 (+/- 0.01)
Max depth: 19, Accuracy: 0.80 (+/- 0.00)
