In [2]:
# -----------------------------------------------------------------
# Decision Tree Classifier
# Predict the income of an adult based on the census data
# -----------------------------------------------------------------

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
%matplotlib inline

In [4]:
# Read dataset
data = pd.read_csv('decisiontreeAdultIncome.csv')
data.head()

Unnamed: 0,age,wc,education,marital status,race,gender,hours per week,IncomeClass
0,38,Private,HS-grad,Divorced,White,Male,40,<=50K
1,28,Private,Bachelors,Married,Black,Female,40,<=50K
2,37,Private,Masters,Married,White,Female,40,<=50K
3,31,Private,Masters,Never-married,White,Female,50,>50K
4,42,Private,Bachelors,Married,White,Male,40,>50K


In [5]:
# Check for Null values
data.isnull().sum()

age               0
wc                0
education         0
marital status    0
race              0
gender            0
hours per week    0
IncomeClass       0
dtype: int64

In [7]:
# Create Dummy variables
data.dtypes
data=pd.get_dummies(data,drop_first=True)
data

Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Never-worked,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male,IncomeClass_ >50K
0,38,40,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,28,40,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,37,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
3,31,50,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
4,42,40,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19782,53,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1
19783,22,40,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0
19784,40,40,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,1
19785,58,40,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [11]:
# Create X and Y Variables
Y=data[['IncomeClass_ >50K']]
Y

Unnamed: 0,IncomeClass_ >50K
0,0
1,0
2,0
3,1
4,1
...,...
19782,1
19783,0
19784,1
19785,0


In [17]:
X=data.drop(['IncomeClass_ >50K'],axis=1)
X

Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Never-worked,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male
0,38,40,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1
1,28,40,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0
2,37,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
3,31,50,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0
4,42,40,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19782,53,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1
19783,22,40,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1
19784,40,40,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1
19785,58,40,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0


In [19]:
# Split the X and Y dataset into training and testing set
x_train,x_test,y_train,y_test = \
train_test_split(X,Y,test_size=0.3,random_state=1234)

In [21]:
# Import and train classifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()


In [22]:
# Test the model
dtc.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [23]:
# Evaluate the model
y_pred=dtc.predict(x_test)

In [24]:
print("Classification Report for Decision Tree:""\n",metrics.classification_report(y_test, y_pred))

Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       0.83      0.88      0.86      4378
           1       0.61      0.51      0.55      1559

    accuracy                           0.78      5937
   macro avg       0.72      0.70      0.71      5937
weighted avg       0.78      0.78      0.78      5937

