#Decision Tree

each item has two continuous features
class label is {0, 1}



In [1]:
import numpy as np
import pandas as pd
import math

df = pd.read_csv('./data/D1.txt', sep='\s+', names=['x1', 'x2', 'y'])
df.head()

Unnamed: 0,x1,x2,y
0,0.264185,0.178456,0
1,0.409499,0.213456,1
2,0.926224,0.540329,1
3,0.573685,0.282145,1
4,0.953159,0.608121,1


In [10]:
def determine_candidate_numeric_splits(D: pd.DataFrame, X: str) -> list((str, float)):
    C = []
    instances = D.sort_values(by=X)
    instances = instances.reset_index(drop=True)
    for i, row in instances.iterrows():
        if i < len(instances) - 1:
            y_i = row['y']
            y_next = instances.at[i + 1, 'y']
            x_next = instances.at[i + 1, X]
            if y_i != y_next:
                C.append((X, x_next))
    return C

In [11]:
def determine_candidate_splits(D: pd.DataFrame) -> list((str, float)):
    C = []
    for feature in D.columns[:-1]:
        instances = D[:]
        C.extend(determine_candidate_numeric_splits(instances, feature))
    return C

In [12]:
# GainRatio(D, Split) = (H(Y) - H(Y | Split)) / H(Split)

def entropy_Y(D: pd.DataFrame, Y: str) -> float:
    # H(Y) = -Sum(P(y)*log2(P(y)) for each y in Y)
    H = 0
    y = D[Y].unique()
    for y_i in y:
        Py_i = sum(D[Y] == y_i) / len(D[Y])
        H += Py_i if Py_i == 0 else Py_i * math.log2(Py_i)
    H *= -1
    return H

def entropy_S(D: pd.DataFrame, S: (str, float)) -> float:
    H = 0
    X, c = S
    Pgreater = sum(D[X] >= c) / len(D[X])
    H += Pgreater if Pgreater == 0 else Pgreater * math.log2(Pgreater)
    Plesser = sum(D[X] < c) / len(D[X])
    H += Plesser if Plesser == 0 else Plesser * math.log2(Plesser)
    H *= -1
    return H

def conditional_entropy(D: pd.DataFrame, Y: str, S: (str, float)) -> float:
    # H(Y | Split) = (P(X >= c) * H(Y | X >= c)) + (P(X < c) * H(Y | X < c))
    H = 0
    X, c = S
    HYgreater = entropy_Y(D[D[X] >= c], Y)
    Pgreater = sum(D[X] >= c) / len(D[X])
    HYlesser = entropy_Y(D[D[X] < c], Y)
    Plesser = sum(D[X] < c) / len(D[X])
    H += Pgreater * HYgreater
    H += Plesser * HYlesser
    return H

#print(entropy_Y(df, 'y'))
#print(entropy_S(df, ('x1', .5)))
#print(conditional_entropy(df, 'y', ('x2', .2)))

In [13]:
from dataclasses import dataclass
from typing import List

@dataclass
class Node:
    feature: str = None
    threshold: float = None
    children: List['Node'] = None
    class_label: float = None

    def __str__(self):
        if self.class_label != None:
            return 'y = ' + str(self.class_label)
        elif self.feature and self.threshold and self.children:
            return self.feature + ' >= ' + str(self.threshold) + '[' + \
                str(self.children[0]) + ', ' + str(self.children[1]) + ']'
        else:
            return '<node>'

In [21]:
def make_subtree(D: pd.DataFrame) -> Node:
    # print('BEGIN make_subtree')
    # print('Data: \n')
    # print(D)
    stop = False
    best_split = None
    candidate_splits = determine_candidate_splits(D[:])
    # print('Candidate splits: \n')
    # print(candidate_splits)

    # if node is empty or all splits have 0 info or any split has 0 entropy
    if len(D) == 0 or len(candidate_splits) == 0:
        stop = True
    else:
        gainRatios = []
        for split in candidate_splits:
            split_entropy = entropy_S(D, split)
            if split_entropy == 0:
                gainRatio = float('inf')
                gainRatios.append(gainRatio)
                break
            else:
                # GainRatio(D, Split) = (H(Y) - H(Y | Split)) / H(Split)
                gainRatio = (entropy_Y(D, 'y') - conditional_entropy(D, 'y', split)) / entropy_S(D, split)
                gainRatios.append(gainRatio)
        gainRatios = pd.Series(gainRatios)
        if len(gainRatios) == 0 or gainRatios.max() == 0:
            stop = True
        else:
            best_split = candidate_splits[gainRatios.idxmax()]
    if stop:
        # stopping criteria met
        # make leaf node with class label
        majority_class = D['y'].mode()
        if len(majority_class) > 1:
            majority_class = '1'
        else:
            majority_class = majority_class[0]
        print(str(sum(D['y'] == majority_class)) + ' / ' + str(len(D)) + ' = ' + str(majority_class))
        new_node = Node(class_label=majority_class)
        return new_node
    else:
        # make internal node and make children subtrees
        feature, threshold = best_split
        left_data = D[D[feature] < threshold]
        right_data = D[D[feature] >= threshold]
        print('split: ' + str(len(left_data)) + ' & ' + str(len(right_data)))
        left_child = make_subtree(left_data)
        right_child = make_subtree(right_data)
        new_node = Node(feature, threshold, [left_child, right_child])
        return new_node

In [22]:
dtree = make_subtree(df)
print(dtree)

TypeError: can only concatenate str (not "int") to str

In [18]:
d2 = pd.read_csv('./data/D2.txt', sep='\s+', names=['x1', 'x2', 'y'])
dtree2 = make_subtree(d2)
print(dtree2)

BEGIN make_subtree
Data: 

           x1        x2  y
0    0.315561  0.956006  1
1    0.344972  0.693422  1
2    0.398419  0.974354  1
3    0.347109  0.566740  0
4    0.985903  0.564388  1
..        ...       ... ..
995  0.418671  0.044160  0
996  0.794456  0.142425  0
997  0.384576  0.609262  0
998  0.843186  0.848278  1
999  0.691812  0.751075  1

[1000 rows x 3 columns]
Candidate splits: 

[('x1', 0.041245), ('x1', 0.043176), ('x1', 0.053427), ('x1', 0.054872), ('x1', 0.06542), ('x1', 0.066174), ('x1', 0.104043), ('x1', 0.104094), ('x1', 0.111076), ('x1', 0.112471), ('x1', 0.118056), ('x1', 0.118397), ('x1', 0.120891), ('x1', 0.121206), ('x1', 0.125506), ('x1', 0.125963), ('x1', 0.128628), ('x1', 0.129363), ('x1', 0.135573), ('x1', 0.135836), ('x1', 0.144594), ('x1', 0.147693), ('x1', 0.149399), ('x1', 0.15102), ('x1', 0.164889), ('x1', 0.165009), ('x1', 0.167782), ('x1', 0.168873), ('x1', 0.176701), ('x1', 0.176757), ('x1', 0.177071), ('x1', 0.17751), ('x1', 0.186236), ('x1', 0.186