# Decision Tree Classifier with Employee Attrition Dataset

In this notebook, we will build a decision tree classifier using the scikit-learn library. We will use a hypothetical employee attrition dataset for this example.

## Import Libraries
First, let's import the necessary libraries.

In [182]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


## Load and Explore the Dataset
Next, we will load the employee attrition dataset ('employee_attrition_small.csv') and explore its contents.

In [183]:
# Read the attrition CSV (path is relative to the notebook's working directory)
# and preview the first rows.
csv_path = 'employee_attrition_small.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,EducationField,Gender,HourlyRate,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,TotalWorkingYears
0,41,Yes,Travel_Rarely,1102,Sales,Life Sciences,Female,94,Sales Executive,4,Single,5993,19479,8,Yes,8
1,49,No,Travel_Frequently,279,Research & Development,Life Sciences,Male,61,Research Scientist,2,Married,5130,24907,1,No,10
2,37,Yes,Travel_Rarely,1373,Research & Development,Other,Male,92,Laboratory Technician,3,Single,2090,2396,6,Yes,7
3,33,No,Travel_Frequently,1392,Research & Development,Life Sciences,Female,56,Research Scientist,3,Married,2909,23159,1,Yes,8
4,27,No,Travel_Rarely,591,Research & Development,Medical,Male,40,Laboratory Technician,2,Married,3468,16632,9,No,6


## Preprocess the Data
We need to preprocess the data, including handling categorical variables and missing values.

In [184]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,DailyRate,HourlyRate,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,TotalWorkingYears
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,65.891156,2.728571,6502.931293,14313.103401,2.693197,11.279592
std,9.135373,403.5091,20.329428,1.102846,4707.956783,7117.786044,2.498009,7.780782
min,18.0,102.0,30.0,1.0,1009.0,2094.0,0.0,0.0
25%,30.0,465.0,48.0,2.0,2911.0,8047.0,1.0,6.0
50%,36.0,802.0,66.0,3.0,4919.0,14235.5,2.0,10.0
75%,43.0,1157.0,83.75,4.0,8379.0,20461.5,4.0,15.0
max,60.0,1499.0,100.0,4.0,19999.0,26999.0,9.0,40.0


In [185]:
# clean the data
# check for missing values: count the NaN entries in each column
df.isna().sum()

Age                   0
Attrition             0
BusinessTravel        0
DailyRate             0
Department            0
EducationField        0
Gender                0
HourlyRate            0
JobRole               0
JobSatisfaction       0
MaritalStatus         0
MonthlyIncome         0
MonthlyRate           0
NumCompaniesWorked    0
OverTime              0
TotalWorkingYears     0
dtype: int64

In [186]:
# For each categorical column, list its distinct values side by side in one table.

# Table label -> source column in df.
# NOTE(review): the labels 'MaritialStatus' and 'Overtime' are spelled differently
# from the columns they read ('MaritalStatus', 'OverTime'); kept as-is so the
# displayed table is unchanged.
label_to_column = {
    'BusinessTravel': 'BusinessTravel',
    'Department': 'Department',
    'EducationField': 'EducationField',
    'Gender': 'Gender',
    'JobRole': 'JobRole',
    'MaritialStatus': 'MaritalStatus',
    'Overtime': 'OverTime',
}

discrete_dict = {
    label: df[column].unique()
    for label, column in label_to_column.items()
}

# A DataFrame needs equal-length columns, so find the longest list of values...
max_len = max(map(len, discrete_dict.values()))

# ...and right-pad every shorter one with NaN up to that length.
padded_dict = {
    label: np.concatenate([values, [np.nan] * (max_len - len(values))])
    for label, values in discrete_dict.items()
}

df_unique = pd.DataFrame(padded_dict)
df_unique

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritialStatus,Overtime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,Travel_Frequently,Research & Development,Other,Male,Research Scientist,Married,No
2,Non-Travel,Human Resources,Medical,,Laboratory Technician,Divorced,
3,,,Marketing,,Manufacturing Director,,
4,,,Technical Degree,,Healthcare Representative,,
5,,,Human Resources,,Manager,,
6,,,,,Sales Representative,,
7,,,,,Research Director,,
8,,,,,Human Resources,,


In [187]:
# Ordinal encoding for BusinessTravel: a larger code means more frequent travel.
map_dict = {
    "Non-Travel":0,
    "Travel_Rarely":1,
    "Travel_Frequently":2
}

# Encode only while the column still holds the raw labels. Series.map turns every
# value that is not a key of map_dict into NaN, so running this cell a second time
# (when the column already holds 0/1/2) would otherwise wipe the column out.
if not df['BusinessTravel'].isin(list(map_dict.values())).all():
    travel_codes = df['BusinessTravel'].map(map_dict)

    # Fail loudly on an unexpected label instead of silently introducing NaN.
    # Values that were already missing before mapping are left alone.
    newly_missing = travel_codes.isna() & df['BusinessTravel'].notna()
    if newly_missing.any():
        unmapped = df.loc[newly_missing, 'BusinessTravel'].unique().tolist()
        raise ValueError(f"Unmapped BusinessTravel values: {unmapped}")

    #map to only one column
    df['BusinessTravel'] = travel_codes

print("BusinessTravel\n", df['BusinessTravel'].head())


BusinessTravel
 0    1
1    2
2    1
3    2
4    1
Name: BusinessTravel, dtype: int64


In [188]:
# Encode the nominal (unordered) columns as integer category codes and build a
# reference table that lists the original labels of each column.

nominal_cols = ['Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

all_mappings = {}

for col in nominal_cols:
    as_category = df[col].astype('category')
    # Position i in this list is the label that is encoded as code i.
    all_mappings[col] = as_category.cat.categories.tolist()
    df[col] = as_category.cat.codes

max_len = max(map(len, all_mappings.values()))

# Pad the shorter label lists with None so every table column has the same length.
for code_labels in all_mappings.values():
    code_labels.extend([None] * (max_len - len(code_labels)))

print(df[nominal_cols].tail())

reference_df = pd.DataFrame(all_mappings)
reference_df

      Department  EducationField  Gender  JobRole  MaritalStatus  OverTime
1465           1               3       1        2              1         0
1466           1               3       1        0              1         0
1467           1               1       1        4              1         1
1468           2               3       1        7              1         0
1469           1               3       1        2              1         0


Unnamed: 0,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Human Resources,Human Resources,Female,Healthcare Representative,Divorced,No
1,Research & Development,Life Sciences,Male,Human Resources,Married,Yes
2,Sales,Marketing,,Laboratory Technician,Single,
3,,Medical,,Manager,,
4,,Other,,Manufacturing Director,,
5,,Technical Degree,,Research Director,,
6,,,,Research Scientist,,
7,,,,Sales Executive,,
8,,,,Sales Representative,,


In [189]:
# 1) Quantile binning for Age, DailyRate, HourlyRate, MonthlyIncome, MonthlyRate,
#    TotalWorkingYears: each value is replaced by its quartile index (0-3).
q_bin = ["Age", "DailyRate", "HourlyRate", "MonthlyIncome","MonthlyRate", "TotalWorkingYears"]
for name in q_bin:
    quartile_index = pd.qcut(df[name], q=4, labels=False)
    df[name] = quartile_index

# 2) Threshold binning for NumCompaniesWorked:
#    [0, 1] -> 0, (1, 4] -> 1, (4, 10] -> 2
#    (include_lowest=True closes the first interval on the left so 0 is kept)
bins = [0,1,4,10]
labels = [0,1,2]
df['NumCompaniesWorked'] = pd.cut(
    df['NumCompaniesWorked'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,EducationField,Gender,HourlyRate,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,TotalWorkingYears
0,2,Yes,1,2,2,1,0,3,7,4,2,2,2,2,1,1
1,3,No,2,0,1,1,1,1,6,2,1,2,3,0,0,1
2,2,Yes,1,3,1,4,1,3,2,3,2,0,0,2,1,1
3,1,No,2,3,1,1,0,1,6,3,1,0,3,0,1,1
4,0,No,1,1,1,3,1,0,2,2,1,1,2,2,0,0


## Split the Dataset
We will split the dataset into training and testing sets.

In [190]:
# Target: the Attrition label. Features: every other column.
target = df['Attrition']
data = df.drop(columns=['Attrition'])

# Hold out 25% of the rows for testing (this is also train_test_split's default);
# the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.25,
    random_state=42,
)


## Train and evaluate the Decision Tree Model
## Please note that the maximum depth shouldn't be greater than 3

In [191]:
# Build a decision tree limited to depth 3 (fixed seed for reproducibility)
# and fit it on the training split.
dt = DecisionTreeClassifier(
    max_depth=3,
    random_state=0,
)
dt.fit(x_train, y_train)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [192]:
# (rows, columns) of the training feature matrix
x_train.shape

(1102, 15)

In [193]:
# make predictions on test set: one predicted Attrition label per row of x_test
y_pred = dt.predict(x_test)

In [194]:
# calculate accuracy: fraction of test rows whose predicted label equals the true label
# NOTE(review): accuracy can look flattering if 'Attrition' is imbalanced — confirm
# the class balance and consider per-class metrics as well.
accuracy_score(y_test,y_pred)

0.8695652173913043

In [195]:
# Side-by-side table of true vs. predicted labels for the test rows.
# The dict is passed inline rather than bound to `data`, because `data` already
# names the feature DataFrame created in the split step; rebinding it to a dict
# here would leave later uses of `data` pointing at a different kind of object.
df_tp = pd.DataFrame({
    'y_test': y_test,
    'y_pred': y_pred,
})
df_tp

Unnamed: 0,y_test,y_pred
1041,No,No
184,No,No
1222,Yes,No
67,No,No
220,No,No
...,...,...
571,No,No
1163,No,No
243,No,No
1411,No,No


In [196]:
# Optional: render the fitted tree `dt` as a graphviz diagram (left disabled).
# NOTE(review): needs the third-party `graphviz` package — confirm it is
# installed before enabling.
# from sklearn import tree
# import graphviz
# from graphviz import Source

# Source(tree.export_graphviz(dt, out_file=None, class_names=True, feature_names= x_train.columns))