In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Toy training set with six rows: one binary feature column ('sarjana') and
# one binary label column ('diterima').
#
# Decision-tree vocabulary used in this notebook:
#   - stump         : a tree with 1 root and 2 leaves
#   - root node     : the starting question
#   - internal node : a follow-up question (branch)
#   - leaf          : a conclusion
_stump_rows = [
    (0, 0), (0, 0), (0, 1),
    (1, 0), (1, 1), (1, 1),
]
df = pd.DataFrame(_stump_rows, columns=['sarjana', 'diterima'])
df

### Gini Impurity

- Hitung Gini untuk tiap leaf dalam feature: __gini S1True__ & __gini S1False__
- $\displaystyle \text{Gini} = 1 - P(\text{TRUE})^2 - P(\text{FALSE})^2$
- Hitung Gini total untuk 1 feature (rata-rata Gini tiap leaf, dibobot dengan proporsi data di leaf tersebut): __gini S1__

In [None]:
def _gini(p_first, p_second):
    """Return the Gini impurity 1 - p_first**2 - p_second**2 of a two-class node."""
    return 1 - p_first**2 - p_second**2


# Each leaf of the 'sarjana' split holds 3 rows with a 2-vs-1 class mix, so
# both leaves get the same impurity.
giniS1True = _gini(2/3, 1/3)
giniS1False = _gini(2/3, 1/3)

# Gini of the split: leaf impurities weighted by leaf size (3 of 6 rows each).
giniS1 = 0.5 * giniS1True + 0.5 * giniS1False
giniS1True, giniS1False, giniS1

In [None]:
from sklearn import tree

# Fit a decision tree (scikit-learn's default split criterion) on the single
# 'sarjana' feature. random_state is fixed so re-running the cell always
# builds the same tree.
model = tree.DecisionTreeClassifier(random_state=0)
model.fit(df[['sarjana']], df['diterima'])

# Predict the label for sarjana == 0. The query is passed as a DataFrame with
# the same column name used in fit(); a bare list would make scikit-learn warn
# that X has no valid feature names.
model.predict(pd.DataFrame({'sarjana': [0]}))[0]

In [None]:
# Draw the tree fitted in the previous cell. The model is already trained, so
# it is passed directly instead of being refitted; the trailing ';' hides the
# list-of-annotations repr that plot_tree returns.
tree.plot_tree(model);

In [None]:
# Write the fitted tree to a Graphviz .dot file.
# class_names must follow the classes in ascending order: label 0 is
# "Tidak diterima" (not accepted) and label 1 is "Diterima" (accepted).
# The already-fitted model is exported as-is; refitting here is unnecessary.
tree.export_graphviz(
    model,
    out_file = 'decision.dot',
    feature_names = ['Sarjana'],
    class_names = ['Tidak diterima', 'Diterima']
)

### Visualisasikan file `.dot` hasil export di [GraphvizOnline](https://dreampuf.github.io/GraphvizOnline)

### Entropy & Gain

- Hitung Entropy untuk tiap leaf di feature: E(s1True) & E(s1False)
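- $\displaystyle \text{Entropy} = -\sum_i p_i \log_2 (p_i)$
- $\displaystyle \text{Gain} = E_{\text{parent}} - \sum_k \frac{n_k}{n} \, E(\text{leaf}_k)$, yaitu entropy sebelum split dikurangi rata-rata berbobot entropy tiap leaf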

In [None]:
def _entropy_term(p):
    """Return -p * log2(p), the contribution of one class to a node's entropy."""
    return -1 * p * np.log2(p)


# Leaf sarjana == 1: class proportions are 2/3 (diterima == 1) and 1/3.
Es1TruedTrue = _entropy_term(2/3)
Es1TruedFalse = _entropy_term(1/3)

# Entropy of the leaf is the sum of the per-class terms.
Es1True = Es1TruedTrue + Es1TruedFalse
Es1TruedTrue, Es1TruedFalse, Es1True

In [None]:
# Leaf sarjana == 0: class proportions are 1/3 (diterima == 1) and 2/3.
_p_true, _p_false = 1/3, 2/3

Es1FalsedTrue = -_p_true * np.log2(_p_true)
Es1FalsedFalse = -_p_false * np.log2(_p_false)

# Entropy of the leaf is the sum of the per-class terms.
Es1False = Es1FalsedTrue + Es1FalsedFalse
Es1FalsedTrue, Es1FalsedFalse, Es1False

In [None]:
# Entropy of a node whose two classes each make up 3 of 6 rows: one
# -p * log2(p) term per class, summed.
Esarjana = sum(-1 * p * np.log2(p) for p in (3/6, 3/6))
Esarjana

In [None]:
# Information gain of splitting on 'sarjana':
#   gain = E(before split) - sum over leaves of (leaf size / total) * E(leaf)
# Each leaf holds 3 of the 6 rows. Subtracting only Es1True gives the same
# number here because both leaves have equal entropy, but the size-weighted
# average of both leaves is the formula that stays correct in general.
gains1true = Esarjana - ((3/6) * Es1True + (3/6) * Es1False)
gains1true

In [None]:
from sklearn import tree

# Same stump as before, but splits are scored with entropy (information gain).
# random_state is fixed so re-running the cell always builds the same tree.
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
model.fit(df[['sarjana']], df['diterima'])

# Predict the label for sarjana == 0. The query is passed as a DataFrame with
# the same column name used in fit(); a bare list would make scikit-learn warn
# that X has no valid feature names.
model.predict(pd.DataFrame({'sarjana': [0]}))[0]

In [None]:
# Draw the entropy-based tree fitted in the previous cell. The model is
# already trained, so it is passed directly instead of being refitted; the
# trailing ';' hides the list-of-annotations repr that plot_tree returns.
tree.plot_tree(model);

In [None]:
# Second toy data set: three binary feature columns plus the binary label
# 'diterima', five rows. Written column by column; row i of the frame is the
# i-th entry of every list.
df = pd.DataFrame({
    'sarjana':  [0, 1, 0, 0, 1],
    'sehat':    [1, 0, 1, 0, 0],
    'supel':    [1, 0, 1, 1, 1],
    'diterima': [1, 1, 0, 0, 0],
})
df

In [None]:
from sklearn import tree

# Fit a Gini-based tree on the three binary features.
# random_state is fixed so re-running the cell always builds the same tree.
model = tree.DecisionTreeClassifier(criterion='gini', random_state=0)
model.fit(df[['sarjana', 'sehat', 'supel']], df['diterima'])

# Predict the label for sarjana=0, sehat=1, supel=1. The query is passed as a
# DataFrame with the same column names used in fit(); a bare list would make
# scikit-learn warn that X has no valid feature names.
model.predict(
    pd.DataFrame([[0, 1, 1]], columns=['sarjana', 'sehat', 'supel'])
)[0]

In [None]:
# Draw the three-feature tree fitted in the previous cell. The model is
# already trained, so it is passed directly instead of being refitted; the
# trailing ';' hides the list-of-annotations repr that plot_tree returns.
tree.plot_tree(model);

In [None]:
# Write the fitted three-feature tree to a Graphviz .dot file.
# class_names must follow the classes in ascending order: label 0 is
# "Tidak diterima" (not accepted) and label 1 is "Diterima" (accepted).
# The already-fitted model is exported as-is; refitting here is unnecessary.
tree.export_graphviz(
    model,
    out_file = 'decision2.dot',
    feature_names = ['Sarjana', 'Sehat', 'Supel'],
    class_names = ['Tidak diterima', 'Diterima']
)

In [None]:
from sklearn.datasets import load_iris
# Load scikit-learn's bundled iris data set. The next cell reads its 'data'
# and 'target' entries (and 'target_names' in the optional export snippet).
x = load_iris()

In [None]:
# Short column names for the four iris measurements, in the column order of
# x['data'].
# NOTE(review): presumably sepal length/width and petal length/width — confirm
# against the data set's feature_names.
feature_cols = ['sl', 'sw', 'pl', 'pw']

df = pd.DataFrame(x['data'], columns=feature_cols)
df['target'] = x['target']

# random_state is fixed so every run of the notebook grows the same tree;
# scikit-learn permutes the features at each split, so an unseeded tree can
# differ between runs when several splits score equally well.
model = tree.DecisionTreeClassifier(random_state=0)
model.fit(df[feature_cols], df['target'])

# Optional: uncomment to export the fitted tree to a Graphviz .dot file.
# tree.export_graphviz(
#     model,
#     out_file = 'decisionIris.dot',
#     feature_names = ['SL', 'SW', 'PL', 'PW'],
#     class_names = x['target_names']
# )