In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the employee attrition CSV into a DataFrame and preview the first rows.
attrition_csv = '/kaggle/input/trainings/employees_attrition.csv'
employees = pd.read_csv(attrition_csv)
employees.head()

In [None]:
# Dataset dimensions, followed by the share of rows in each Attrition class.
n_rows, n_cols = employees.shape
print((n_rows, n_cols))
print(employees['Attrition'].value_counts() / n_rows)

In [None]:
## Gini impurity at the root node (all rows of `employees`)
## gi = 1 - sum_k p(k)^2, which is 1 - p(0)^2 - p(1)^2 for a binary target
# Class shares are read from the data instead of hand-copied counts, so the
# value stays correct if the dataset changes.
class_shares = employees['Attrition'].value_counts(normalize=True)
gi_bf = 1 - np.square(class_shares).sum()
gi_bf

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Target and feature columns used throughout the rest of the notebook.
target_col = 'Attrition'
input_cols = ['MonthlyIncome', 'Age']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
features = employees[input_cols]
labels = employees[target_col]
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=1,
)

# Shapes of the four resulting pieces, in the order they were returned.
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

In [None]:
# Training-set dimensions, then the number of training rows per target class.
for summary in (train_x.shape, train_y.value_counts()):
    print(summary)

In [None]:
# Gini impurity of the training target at the root: 1 - sum_k p(k)^2.
# Class shares come from `train_y` itself rather than counts copied by hand
# from the previous cell's output.
train_class_shares = train_y.value_counts(normalize=True)
gi_train_root = 1 - np.square(train_class_shares).sum()
gi_train_root

In [None]:
# Distinct MonthlyIncome values in the training set, in ascending order.
income_sorted = train_x['MonthlyIncome'].sort_values()
income_sorted.unique().tolist()

In [None]:
# Number of distinct incomes plus the smallest and largest value in the training set.
income = train_x['MonthlyIncome']
income.nunique(), income.min(), income.max()

In [None]:
col = 'MonthlyIncome'


def gini_impurity(labels):
    """Return the Gini impurity ``1 - sum_k p_k**2`` of a label Series.

    Class shares are taken from ``value_counts(normalize=True)``, so the
    result does not depend on how the classes are encoded or how many there
    are. An empty Series yields 0.0 (it carries zero weight in a split).
    """
    if len(labels) == 0:
        return 0.0
    shares = labels.value_counts(normalize=True)
    return 1 - np.square(shares).sum()


# Candidate thresholds: midpoints between consecutive distinct values of `col`.
cuts1 = train_x[col].sort_values().unique().tolist()
cuts2 = cuts1[1:]
cuts = [np.mean(pair) for pair in zip(cuts1, cuts2)]

feature = train_x[col]
n_total = train_x.shape[0]
# Impurity before splitting, computed from the training target rather than
# a rounded constant.
gi_parent = gini_impurity(train_y)

gains = []
for cut in cuts:
    # Rows strictly below the threshold go left, the rest go right.
    goes_left = feature < cut
    goes_right = feature >= cut

    # train_x and train_y come from the same split and share an index, so the
    # boolean masks select the matching targets directly.
    targets_left = train_y.loc[goes_left]
    targets_right = train_y.loc[goes_right]

    # Impurity after the split: child impurities weighted by child size.
    gi_overall = (goes_left.sum() / n_total * gini_impurity(targets_left)
                  + goes_right.sum() / n_total * gini_impurity(targets_right))

    # Gain = reduction in impurity achieved by this threshold.
    gains.append(gi_parent - gi_overall)

In [None]:
# Tabulate every candidate threshold with its gain and show the best one.
df = pd.DataFrame({'cut': cuts, 'gain': gains})
ranked = df.sort_values(by='gain', ascending=False)
ranked.head(1)

In [None]:

model = DecisionTreeClassifier(random_state=1, max_depth=3).fit(train_x, train_y)
draw_tree(model, train_x.columns)

In [None]:
test_x[['MonthlyIncome']].head()

In [None]:
probs = pd.DataFrame(model.predict_proba(test_x), columns=['Prob0', 'Prob1'])
probs.head()


In [None]:
model.predict(test_x)

In [None]:
!pip install pydotplus

In [None]:
def draw_tree(model, columns):
    import pydotplus
    from sklearn.externals.six import StringIO
    from IPython.display import Image
    import os
    from sklearn import tree
    
    graphviz_path = 'C:\Program Files (x86)\Graphviz2.38/bin/'
    os.environ["PATH"] += os.pathsep + graphviz_path

    dot_data = StringIO()
    tree.export_graphviz(model,
                         out_file=dot_data,
                         feature_names=columns)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    return Image(graph.create_png())

In [None]:
# Number of distinct ages in the training set (missing values are not counted).
train_x['Age'].nunique(dropna=True)

In [None]:
# Distinct ages in the training set, in ascending order.
ages_sorted = train_x['Age'].sort_values()
ages_sorted.unique()