In [9]:
import pandas as pd
import numpy as np
import math

In [10]:
def entropy_func(c, n):
    """
    The math formula
    """
    return -(c*1.0/n)*math.log(c*1.0/n, 2)

def entropy_cal(c1, c2):
    """
    Returns entropy of a group of data
    c1: count of one class
    c2: count of another class
    """
    if c1== 0 or c2 == 0:  # when there is only one class in the group, entropy is 0
        return 0
    return entropy_func(c1, c1+c2) + entropy_func(c2, c1+c2)

# get the entropy of one big circle showing above
def entropy_of_one_division(division): 
    """
    Returns entropy of a divided group of data
    Data may have multiple classes
    """
    s = 0
    n = len(division)
    if n ==0:
        return s,0
    classes = set(division)
    for c in classes:   # for each class, get entropy
        n_c = sum(division==c)
        e = n_c*1.0/n * entropy_cal(sum(division==c), sum(division!=c)) # weighted avg
        s += e
    return s, n

# The whole entropy of two big circles combined
def get_entropy(y_predict, y_real):
    """
    Returns entropy of a split
    y_predict is the split decision, True/Fasle, and y_true can be multi class
    """
    if len(y_predict) != len(y_real):
        print('They have to be the same length')
        return None
    n = len(y_real)
    s_true, n_true = entropy_of_one_division(y_real[y_predict]) # left hand side entropy
    s_false, n_false = entropy_of_one_division(y_real[~y_predict]) # right hand side entropy
    s = n_true*1.0/n * s_true + n_false*1.0/n * s_false # overall entropy, again weighted average
    return s

In [16]:
def find_best_split_of_all(X, y):
    """
    Find the best split from all features
    returns: the column to split on, the cutoff value, and the actual entropy
    """
    col = None
    min_entropy = 1
    cutoff = None
    for column in range(X.shape[1]):  # iterating through each feature
        entropy, cur_cutoff = find_best_split(X[column], y)  # find the best split of that feature
        if entropy == 0:    # find the first perfect cutoff. Stop Iterating
            return entropy, cur_cutoff, column
        elif entropy <= min_entropy:  # check if it's best so far
            min_entropy = entropy
            col = column
            cutoff = cur_cutoff
    return min_entropy,cutoff,col

In [17]:
def find_best_split(col, y):
    """
    col: the column we split on
    y: target var
    """
    min_entropy = 10    
    n = len(y)
    
    
    for value in set(col):  # iterating through each value in the column
        y_predict = col < value  # separate y into 2 groups
        my_entropy = get_entropy(y_predict, y)  # get entropy of this split
        if my_entropy <= min_entropy:  # check if it's the best one so far
            min_entropy = my_entropy
            cutoff = value
    return min_entropy, cutoff

In [18]:
class Node:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.is_leaf = True
        self.column = None
        self.split_point = None
        self.children = None
    
    def is_pure(self):
        p = self.probabilities()
        if p[0] == 1 or p[1] == 1:
            return True
        return False

    def split(self, depth=0):
        X, y = self.X, self.y
        if self.is_leaf and not self.is_pure():
           
            splits = find_best_split_of_all(X, y)
            print(splits)

            
            gini, split_point, column = splits
            self.is_leaf = False
            self.column = column
            self.split_point = split_point
            
            below = X.loc[:,column] <= split_point
            above = X.loc[:,column] > split_point 
            
            self.children = [
                Node(X[below], y[below]),
                Node(X[above], y[above])
            ]
            
            if depth:
                for child in self.children:
                    child.split(depth-1)
                    
                    
    def probabilities(self):
        return np.array([
        np.mean(self.y == 0),
        np.mean(self.y == 1),
    ])

    def predict_proba(self, row):
        if self.is_leaf:
            return self.probabilities()
        else:
            #print(row,self.column,self.split_point)
            if row[self.column] <= self.split_point:
                return self.children[0].predict_proba(row)
            else:
                return self.children[1].predict_proba(row)
    def formatted(self, indent=0):
        if self.is_leaf:
            s = "Leaf({p[0]:.3f}, {p[1]:.3f})".format(p=self.probabilities())
        else:
            s = "Branch(X{column} <= {split_point})\n{left}\n{right}".format(
                column=self.column, 
                split_point=self.split_point,
                left=self.children[0].formatted(indent+1),
                right=self.children[1].formatted(indent+1))

        return "    " * indent + s

    def __str__(self):
        return self.formatted()

    def __repr__(self):
        return str(self)

In [19]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=3):
        self.max_depth = int(max_depth)
        self.root = None
        
    def fit(self, X, y):
        self.root = Node(X, y)
        self.root.split(self.max_depth)
        
    def predict_proba(self, X):
        results = []
        for index, row in X.iterrows():
            #print(row)
            p = self.root.predict_proba(row)
            results += [p]
        return np.array(results)
            
    def predict(self, X):
        predictions = (self.predict_proba(X)[:, 1] > 0.5)
        #print(predictions)
        return predictions


In [20]:
# a small classification data set with 30 to get with. 
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()

X = pd.DataFrame(breast_cancer['data'])
y = pd.DataFrame(breast_cancer['target'])

model = DecisionTreeClassifier(max_depth=4)
model.fit(X, y)
y_hat = model.predict(X)
p_hat = model.predict_proba(X)[:,1]

(0.0, 14.81, 0)
(0.0, 10.03, 0)
(0.0, 14.2, 0)
(0.0, 12.75, 0)
(0.0, 11.36, 0)
(0.0, 13.15, 0)
(0.0, 14.69, 0)
(0.0, 14.6, 0)
(0.0, 14.74, 0)
(0.0, 17.29, 0)
(0.0, 16.24, 0)
(0.0, 15.75, 0)
(0.0, 15.5, 0)
(0.0, 16.14, 0)
(0.0, 16.3, 0)
(0.0, 16.25, 0)
(0.0, 17.08, 0)
(0.0, 17.54, 0)
(0.0, 20.47, 0)
(0.0, 17.6, 0)


In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(confusion_matrix(y, y_hat))
print('Accuracy:', accuracy_score(y, y_hat))


[[167  45]
 [ 22 335]]
Accuracy: 0.8822495606326889
