In [71]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [3]:
ci_train = pd.read_csv(r'census_income.csv')

In [4]:
ci_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
ci_train.shape

(32561, 15)

In [7]:
pd.crosstab(ci_train['education.num'], ci_train['education'])

education,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
education.num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,51,0,0
2,0,0,0,168,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,333,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,646,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,514,0,0,0,0,0,0,0,0,0
6,933,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,1175,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,433,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,10501,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7291


Each education class has a one-to-one relationship to education.num

In [9]:
ci_train.drop(['education'], axis=1, inplace=True)

In [15]:
ci_train['Y'].value_counts().index

Index([' <=50K', ' >50K'], dtype='object')

In [16]:
ci_train['Y'] = (ci_train['Y']==' >50K').astype(int)

In [17]:
ci_train['Y'].value_counts()

0    24720
1     7841
Name: Y, dtype: int64

In [19]:
cat_cols = ci_train.select_dtypes('object').columns
cat_cols

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')

In [22]:
ci_train.shape

(32561, 14)

In [32]:
for cols in cat_cols:
    freqs = ci_train[cols].value_counts()
    k = freqs.index[freqs>500][:-1]
    for cat in k:
        name = cols+'_'+cat
        ci_train[name] = (ci_train[cols]==cat).astype(int)
    del ci_train[cols]
    print(cols)

workclass
marital.status
occupation
relationship
race
sex
native.country


In [33]:
ci_train.shape

(32561, 39)

In [34]:
ci_train.isnull().sum()

age                                   0
fnlwgt                                0
education.num                         0
capital.gain                          0
capital.loss                          0
hours.per.week                        0
Y                                     0
workclass_ Private                    0
workclass_ Self-emp-not-inc           0
workclass_ Local-gov                  0
workclass_ ?                          0
workclass_ State-gov                  0
workclass_ Self-emp-inc               0
marital.status_ Married-civ-spouse    0
marital.status_ Never-married         0
marital.status_ Divorced              0
marital.status_ Separated             0
occupation_ Prof-specialty            0
occupation_ Craft-repair              0
occupation_ Exec-managerial           0
occupation_ Adm-clerical              0
occupation_ Sales                     0
occupation_ Other-service             0
occupation_ Machine-op-inspct         0
occupation_ ?                         0


In [35]:
x_train = ci_train.drop(['Y'],axis=1)
y_train = ci_train['Y']

# Hyperparameter Tuning using RandomizedSearchCV for DecisionTreeClassifier

In [37]:
clf = DecisionTreeClassifier()

In [50]:
params = { 'class_weight':[None,'balanced'],
           'criterion':['gini', 'entropy'],
           'max_depth':[None, 5,10,20,30,50,70],
           'min_samples_leaf':[1,2,5,10,15,20],
           'min_samples_split':[2,5,10,15,20]
         }

In [51]:
random_search = RandomizedSearchCV(clf, cv=10,
                                  param_distributions=params,
                                  scoring='roc_auc',
                                  random_state=12,
                                  n_jobs=-1)

In [52]:
random_search.fit(x_train, y_train)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=DecisionTreeClassifier(class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort=False,
                                                    random_state=None,
                                                    splitter='best'

In [53]:
random_search.best_estimator_

DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=70,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [54]:
dtree = random_search.best_estimator_

In [55]:
dtree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=70,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

# Implementation using RandomForests

In [78]:
clf = RandomForestClassifier()

In [79]:
param_dist = {'class_weight':[None,'balanced'],
           'criterion':['gini', 'entropy'],
           'max_depth':[None, 5,10,20,30,50,70],
           'min_samples_leaf':[1,2,5,10,15,20],
           'min_samples_split':[2,5,10,15,20],
              'n_estimators' : [100,200,300,500,700,1000],
              'max_features' : [5,10,15,20,25,30,35],
              'bootstrap' : [True, False]
             }

max_features should be less than the number of columns

In [80]:
x_train.shape

(32561, 38)

In [81]:
n_iter_search = 10

In [82]:
random_search1 = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, scoring = 'roc_auc',
                                   cv=5, n_jobs = -1)
random_search1.fit(x_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [83]:
random_search1.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=50, max_features=15,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=5,
                       min_samples_split=15, min_weight_fraction_leaf=0.0,
                       n_estimators=700, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [84]:
rf = RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=50, max_features=15,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=5,
                       min_samples_split=15, min_weight_fraction_leaf=0.0,
                       n_estimators=700, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [85]:
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=50, max_features=15,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=5,
                       min_samples_split=15, min_weight_fraction_leaf=0.0,
                       n_estimators=700, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

# Feature Immportance

In [87]:
feat_imp_df = pd.DataFrame({'features':x_train.columns, 'importance':rf.feature_importances_})
feat_imp_df.sort_values('importance', ascending = False)

Unnamed: 0,features,importance
12,marital.status_ Married-civ-spouse,0.225804
2,education.num,0.136545
0,age,0.11193
3,capital.gain,0.105273
28,relationship_ Husband,0.082885
1,fnlwgt,0.074361
5,hours.per.week,0.058408
13,marital.status_ Never-married,0.043305
4,capital.loss,0.02493
32,relationship_ Wife,0.018079
