# In [36]:
import numpy as np
from collections import Counter


def Get_Entropy(target):
    """Return the Shannon entropy, in bits, of the labels in *target*.

    Labels are counted with ``np.unique``, so any label type that NumPy can
    sort works (negative integers, floats, strings), not only the
    non-negative integers that ``np.bincount`` accepts.

    Args:
        target: Array-like of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. An empty *target* yields ``0.0``.
    """
    _, counts = np.unique(np.asarray(target), return_counts=True)
    nr_Total = counts.sum()
    if nr_Total == 0:
        return 0.0

    # Every count returned by np.unique is >= 1, so log2 never sees a zero.
    ps = counts / nr_Total
    return -np.sum(ps * np.log2(ps))


class Node:
    """One node of a binary decision tree.

    A node either describes a split (``feature``, ``threshold`` and the two
    children ``left`` / ``right``) or carries a prediction in ``value``.
    ``value`` is keyword-only.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description.
        self.feature, self.threshold = feature, threshold
        # Child nodes.
        self.left, self.right = left, right
        # Stored prediction; anything other than None marks the node as a leaf.
        self.value = value

    def Is_Leaf_Node(self):
        """Return True when this node holds a value, i.e. ``value`` is not None."""
        return not (self.value is None)


class DecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    Each internal node tests ``sample[feature] <= threshold``; samples that
    pass go to the left child, the rest to the right child. Leaves store the
    most common training label that reached them.

    Args:
        nr_Min_Samples_Split: A node with fewer samples than this becomes a
            leaf.
        nr_Max_Depth: A node at this depth (root is depth 0) becomes a leaf.
        nr_Features: Number of feature columns sampled (without replacement)
            at each node. ``None`` (or 0) means "use every column"; the value
            is resolved against the training data in :meth:`fit`.
    """

    def __init__(self, nr_Min_Samples_Split=2, nr_Max_Depth=10, nr_Features=None):
        self.nr_Min_Samples_Split = nr_Min_Samples_Split
        self.nr_Max_Depth = nr_Max_Depth
        self.nr_Features = nr_Features
        self.root = None

    def fit(self, data, target):
        """Build the tree from *data* (samples x features) and *target* labels.

        Both arguments are converted with ``np.asarray`` so nested lists are
        accepted as well as arrays. ``self.nr_Features`` is overwritten with
        the number of columns actually sampled per node.
        """
        data = np.asarray(data)
        target = np.asarray(target)

        self.nr_Features = data.shape[1] if not self.nr_Features else min(
            self.nr_Features, data.shape[1])
        self.root = self.Grow_Tree(data, target)

    def predict(self, data):
        """Return an array with one predicted label per row of *data*."""
        return np.array([self.Get_Traverse_Tree(x, self.root) for x in data])

    def Get_Traverse_Tree(self, data, node):
        """Follow one sample (*data*) down from *node* and return the leaf value."""
        if node.Is_Leaf_Node():
            return node.value

        if data[node.feature] <= node.threshold:
            return self.Get_Traverse_Tree(data, node.left)

        return self.Get_Traverse_Tree(data, node.right)

    def Grow_Tree(self, data, target, nr_Depth=0):
        """Recursively build and return the subtree for *data* / *target*.

        A leaf is produced when the depth limit is reached, only one label is
        left, there are too few samples to split, or no split separates the
        samples into two non-empty groups.
        """
        nr_Samples, nr_Features = data.shape
        nr_Target = len(np.unique(target))

        if (nr_Depth >= self.nr_Max_Depth or nr_Target == 1
                or nr_Samples < self.nr_Min_Samples_Split):
            return Node(value=self.Most_Commom_Target(target))

        feature_Indexes = np.random.choice(
            nr_Features, self.nr_Features, replace=False)

        best_Feature, best_Threshold = self.Get_Best_Criteria(
            data, target, feature_Indexes)

        # No candidate at all (e.g. zero feature columns): nothing to split on.
        if best_Feature is None:
            return Node(value=self.Most_Commom_Target(target))

        left_Indexes, right_Indexes = self.Split(data[:, best_Feature], best_Threshold)

        # A zero-gain "split" can leave one side empty (for instance when every
        # sampled column is constant but the labels differ). Recursing into the
        # empty side would ask Most_Commom_Target for the mode of nothing and
        # raise IndexError, so stop here with a leaf instead.
        if len(left_Indexes) == 0 or len(right_Indexes) == 0:
            return Node(value=self.Most_Commom_Target(target))

        left = self.Grow_Tree(data[left_Indexes, :], target[left_Indexes], nr_Depth + 1)
        right = self.Grow_Tree(data[right_Indexes, :], target[right_Indexes], nr_Depth + 1)
        return Node(best_Feature, best_Threshold, left, right)

    def Get_Best_Criteria(self, data, target, feature_Indexes):
        """Return ``(feature index, threshold)`` with the highest information gain.

        Every distinct value of every column in *feature_Indexes* is tried as
        a threshold. On ties the first candidate encountered wins. Returns
        ``(None, None)`` when there is no candidate to evaluate.
        """
        nr_Best_Gain = -1
        nr_Split_Index, nr_Split_Threshold = None, None

        for index in feature_Indexes:
            column = data[:, index]

            for threshold in np.unique(column):
                nr_Gain = self.Get_Information_Gain(target, column, threshold)

                if nr_Gain > nr_Best_Gain:
                    nr_Best_Gain = nr_Gain
                    nr_Split_Index = index
                    nr_Split_Threshold = threshold

        return nr_Split_Index, nr_Split_Threshold

    def Get_Information_Gain(self, target, column, nr_Split_Threshold):
        """Return the entropy reduction from splitting *column* at the threshold.

        The gain is the parent entropy minus the size-weighted entropy of the
        two children. A split that leaves either side empty scores 0.
        """
        left_Indexes, right_Indexes = self.Split(column, nr_Split_Threshold)

        nr_Left_Length, nr_Right_Length = len(left_Indexes), len(right_Indexes)
        if nr_Left_Length == 0 or nr_Right_Length == 0:
            return 0

        nr_Length = len(target)
        nr_Parent_Entropy = Get_Entropy(target)
        nr_Entropy_Left = Get_Entropy(target[left_Indexes])
        nr_Entropy_Right = Get_Entropy(target[right_Indexes])
        nr_Child_Entropy = ((nr_Left_Length / nr_Length) * nr_Entropy_Left
                            + (nr_Right_Length / nr_Length) * nr_Entropy_Right)

        return nr_Parent_Entropy - nr_Child_Entropy

    def Split(self, column, nr_Split_Threshold):
        """Return the positions in *column* that are ``<=`` and ``>`` the threshold.

        Returns:
            ``(left_Indexes, right_Indexes)`` as 1-D integer index arrays.
        """
        left_Indexes = np.argwhere(column <= nr_Split_Threshold).flatten()
        right_Indexes = np.argwhere(column > nr_Split_Threshold).flatten()

        return left_Indexes, right_Indexes

    def Most_Commom_Target(self, target):
        """Return the most frequent label in *target*.

        Raises:
            IndexError: If *target* is empty.
        """
        return Counter(target).most_common(1)[0][0]


# In [40]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

def Get_Accuracy(trueTarget, target):
    """Return the fraction of predictions in *target* that equal *trueTarget*.

    Args:
        trueTarget: Array-like of reference labels.
        target: Array-like of predicted labels, same length as *trueTarget*.

    Returns:
        Number of matching positions divided by ``len(trueTarget)``.
    """
    # Convert first: ``==`` between two plain lists gives a single bool rather
    # than an elementwise comparison, which would make the count meaningless.
    trueTarget = np.asarray(trueTarget)
    target = np.asarray(target)
    return np.sum(trueTarget == target) / len(trueTarget)

# Load scikit-learn's breast-cancer dataset and pull out features and labels.
breast = datasets.load_breast_cancer()
data, target = breast.data, breast.target

# Hold out 20% of the samples for evaluation. No random_state is passed, so
# the split (and therefore the reported accuracy) changes from run to run.
(trainData, testData,
 trainTarget, testTarget) = train_test_split(data, target, test_size=0.2)

# Train the tree with its default settings.
classifier = DecisionTree()
classifier.fit(trainData, trainTarget)

# Score the held-out samples.
predictions = classifier.predict(testData)
nr_Accuracy = Get_Accuracy(testTarget, predictions)

print("Accuracy:", nr_Accuracy)





# Example output: Accuracy: 0.9122807017543859
