In [1]:
# -----------------------------------------------------------------
# Decision Tree Classifier
# Predict the income of an adult based on the census data
# -----------------------------------------------------------------

# Import libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Read dataset
df = pd.read_csv("decisiontreeAdultIncome.csv")
df

Unnamed: 0,age,wc,education,marital status,race,gender,hours per week,IncomeClass
0,38,Private,HS-grad,Divorced,White,Male,40,<=50K
1,28,Private,Bachelors,Married,Black,Female,40,<=50K
2,37,Private,Masters,Married,White,Female,40,<=50K
3,31,Private,Masters,Never-married,White,Female,50,>50K
4,42,Private,Bachelors,Married,White,Male,40,>50K
...,...,...,...,...,...,...,...,...
19782,53,Private,Masters,Married,White,Male,40,>50K
19783,22,Private,Some-college,Never-married,White,Male,40,<=50K
19784,40,Private,HS-grad,Married,White,Male,40,>50K
19785,58,Private,HS-grad,Widowed,White,Female,40,<=50K


In [3]:
# Check for Null values
df.isnull().sum(axis=0)

age               0
wc                0
education         0
marital status    0
race              0
gender            0
hours per week    0
IncomeClass       0
dtype: int64

In [4]:
# Create Dummy variables
df.dtypes
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Never-worked,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male,IncomeClass_ >50K
0,38,40,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,28,40,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,37,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
3,31,50,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
4,42,40,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19782,53,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1
19783,22,40,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0
19784,40,40,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,1
19785,58,40,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [20]:
# Create X and Y Variables
X = df['age']
Y = df['education_ HS-grad']
df['age'].values.reshape(-1,1)
X = df['age'].values.reshape(-1,1)
y = df['education_ HS-grad'].values.reshape(-1,1)

In [21]:
# Split the X and Y dataset into training and testing set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=101)

In [22]:
# Import and train classifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,Y_train)

In [23]:
# Test the model
Y_pred = clf.predict(X_test)

In [24]:
# Evaluate the model
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("Classification Report for Decision Tree:""\n",metrics.classification_report(Y_test, Y_pred))

Accuracy: 0.5787434731345797
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       0.58      0.97      0.73      3453
           1       0.46      0.04      0.07      2484

    accuracy                           0.58      5937
   macro avg       0.52      0.50      0.40      5937
weighted avg       0.53      0.58      0.45      5937

