In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from tree_models import DecisionTreeClassifier

# Explore DecisionTreeClassifier

In [2]:
# Load the iris dataset; its 'data' and 'target' entries feed the train/test split below.
iris = load_iris() 

In [3]:
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# shuffle deterministic, so Restart & Run All reproduces the same split.
# NOTE: outputs saved before the seed was added came from an unseeded split
# and will change the next time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(
    iris['data'], iris['target'], test_size=0.2, random_state=42
)

In [4]:
# Inspect the training features: one row per sample, four numeric columns.
X_train

array([[5.5, 2.5, 4. , 1.3],
       [7.2, 3.6, 6.1, 2.5],
       [5. , 3.3, 1.4, 0.2],
       [5.8, 2.7, 5.1, 1.9],
       [5.1, 3.5, 1.4, 0.2],
       [6. , 2.2, 5. , 1.5],
       [5.7, 4.4, 1.5, 0.4],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [6.9, 3.2, 5.7, 2.3],
       [6.1, 2.6, 5.6, 1.4],
       [5.4, 3. , 4.5, 1.5],
       [5.4, 3.9, 1.3, 0.4],
       [7.9, 3.8, 6.4, 2. ],
       [5.8, 2.7, 3.9, 1.2],
       [6.3, 3.3, 6. , 2.5],
       [7.7, 2.6, 6.9, 2.3],
       [4.6, 3.4, 1.4, 0.3],
       [5.1, 2.5, 3. , 1.1],
       [5.6, 2.7, 4.2, 1.3],
       [5.8, 4. , 1.2, 0.2],
       [6. , 3. , 4.8, 1.8],
       [5. , 2. , 3.5, 1. ],
       [4.5, 2.3, 1.3, 0.3],
       [6.6, 3. , 4.4, 1.4],
       [5.2, 4.1, 1.5, 0.1],
       [5. , 3. , 1.6, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [5.1, 3.5, 1.4, 0.3],
       [5.1, 3.7, 1.5, 0.4],
       [5.8, 2.6, 4. , 1.2],
       [5.1, 3.8, 1.6, 0.2],
       [6.3, 3.3, 4.7, 1.6],
       [7.2, 3.2, 6. , 1.8],
       [6.9, 3

In [5]:
# Inspect the training labels: integer class ids (0, 1, 2).
y_train

array([1, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 1, 0, 2, 1, 2, 2, 0, 1, 1, 0, 2,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 0, 1, 2, 0, 0, 2, 0, 1, 2,
       1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 2, 0, 2, 1, 2, 1, 1, 2, 1, 0,
       2, 2, 1, 0, 0, 0, 2, 1, 0, 2, 0, 2, 1, 2, 2, 2, 2, 1, 1, 0, 0, 0,
       1, 1, 1, 2, 2, 0, 2, 2, 2, 0, 0, 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 2,
       1, 1, 1, 1, 0, 1, 1, 0, 2, 2])

In [6]:
# NOTE(review): the two positional arguments are opaque at the call site. The
# instance exposes max_depth, min_samples_split, n_cuts and criterion — confirm
# in tree_models which parameters 10 and 2 bind to, and consider passing them
# by keyword.
dt = DecisionTreeClassifier(10, 2)

In [7]:
# Show the classifier's public surface: every name reported by dir() that
# does not start with a double underscore, one per line.
public_names = [name for name in dir(dt) if not name.startswith('__')]
for attr_name in public_names:
    print(attr_name)

build_tree
compute_entropy
compute_gini
compute_information_gain
compute_leaf_value
criterion
fit
get_best_split
make_prediction
make_split
max_depth
min_samples_split
n_cuts
predict
print_tree
root


In [8]:
# Fit the tree on the training split; later cells inspect dt.root and call predict.
dt.fit(X_train, y_train)

In [9]:
# Peek at the root's right child; the default repr only confirms it is a tree_models.Node.
dt.root.right_node

<tree_models.Node at 0x12fddf9e0>

In [10]:
# Call make_split directly to see what one split returns (four values, unpacked here).
# NOTE(review): 0 and 5.5 are presumably the feature index and the threshold —
# confirm against make_split's signature. The four results are not used in
# the cells shown in this notebook.
dataset_left, dataset_right, y_left, y_right =  dt.make_split(X_train, y_train, 0, 5.5)

In [11]:
# Run the split search on the full training set. The result is a dict (keys
# include 'feature_index', 'threshold' and 'dataset_left') that embeds whole
# data arrays, so the displayed output is long.
dt.get_best_split(X_train, y_train)

{'feature_index': 2,
 'threshold': 2.311111111111111,
 'dataset_left': array([[5. , 3.3, 1.4, 0.2],
        [5.1, 3.5, 1.4, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5.8, 4. , 1.2, 0.2],
        [4.5, 2.3, 1.3, 0.3],
        [5.2, 4.1, 1.5, 0.1],
        [5. , 3. , 1.6, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [5.1, 3.5, 1.4, 0.3],
        [5.1, 3.7, 1.5, 0.4],
        [5.1, 3.8, 1.6, 0.2],
        [4.3, 3. , 1.1, 0.1],
        [5.3, 3.7, 1.5, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [4.9, 3.6, 1.4, 0.1],
        [4.4, 3. , 1.3, 0.2],
        [5. , 3.5, 1.6, 0.6],
        [5.4, 3.4, 1.5, 0.4],
        [5.5, 4.2, 1.4, 0.2],
        [5. , 3.4, 1.5, 0.2],
        [4.8, 3. , 1.4, 0.3],
        [4.8, 3. , 1.4, 0.1],
        [4.8, 3.4, 1.9, 0.2],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [5. , 3.2, 1.2, 0.2],
        [5.7, 3.8, 1.7, 0.3],


In [12]:
# Print the fitted tree as indented text: each internal node reports its
# feature index, threshold and IG; each leaf reports its value.
dt.print_tree()

 Level 0 ---> Feature index 2 with threshold 2.311111111111111 and IG 0.3413247863247863.
   left
   Leaf value 0
   right
   Level 1 ---> Feature index 2 with threshold 4.733333333333333 and IG 0.4281149182759201.
     left
     Leaf value 1
     right
     Level 2 ---> Feature index 2 with threshold 5.033333333333333 and IG 0.06003244997295844.
       left
       Leaf value 2
       right
       Leaf value 2


In [13]:
# Predict class labels for the held-out test samples.
y_pred = dt.predict(X_test)

In [14]:
# Compare predictions with the true test labels.
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

array([[8, 0, 0],
       [0, 9, 3],
       [0, 1, 9]])