In [1]:
import numpy as np
import pandas as pd

In [2]:
class Node():
    def __init__(self, feature_index = None, threshold = None, left = None, right = None, info_gain = None, *, value = None):
        #decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        #leaf node
        self.value = value
        
    def is_leaf_node(self):
        if self.value is not None:
            return True
        return False

In [3]:
class DecisionTree():
    def __init__(self, min_sample_leaf = 2, max_depth = 2):
        #initialize the root of tree
        self.root = None
        #stoping conditions
        self.min_sample_leaf = min_sample_leaf
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth = 0):
        ''' recursive function to build tree'''
        X, Y = dataset[ : ,  :-1], dataset[ :  , -1]
        num_samples, num_features = np.shape(X)
        #split until stopping condition are met
        if num_samples >= self.min_sample_leaf and num_features <= self.max_depth:
            #find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            #check if the infomation gain is positive
            if best_split["info_gain"] > 0:
                #recursive left
                left_node = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                #recursive right
                right_node = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                #return decision node
                return Node(best_split["feature_index"], best_split["threshold"], left_node, right_node, best_split["inf_gain"])
        #compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        #return leaf node
        return Node(value = leaf_value)
            
    def getd_best_split(self, dataset, num_samples, num_features):
        #dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        for feature_index in range(num_features):
            feature_values = dataset[ :  , feature_index]
            possible_threshold = np.unique(feature_values)
            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:,-1], dataset_right[:,-1]
                    cur_info_gain = self.get_IG(y, left_y, right_y, "gini")
                    if cur_info_gain > max_info_gain:
                        best_split["feature"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = cur_info_gain

                            
        return best_split
            
    def split(self, dataset, feature_index, threshold):
        dataset_left = np.array([ row for row in dataset if row[feature_index] <= threshold])
        dataset_right = np.array([ row for row in dataset if row[feature_index] > threshold])
        return dataset_left, dataset_right
            
    def get_IG(self, parent, l_child, r_child, mode = "entropy" ):
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent) 
        if mode == "gini" :
            IG = self.gini_index(parent) - (weight_l * self.gini_index(l_child) +  weight_r * self.gini_index(r_child))
        else :
            IG = self.entropy(parent) - (weight_l * self.entropy(l_child) +  weight_r * self.entropy(r_child))
        return IG
            
    def entropy(self, y): 
        class_labels = np.unique(y)
        E = 0
        for label in class_labels:
            e = -len(y[y == label]) / len(y)
            E += e * np.log2()
        return E
            
    def gini(self, y):        
        class_labels = np.unique(y)
        G = 0
        for label in class_labels:
            g = len(y[y == label]) / len(y)
            G += g * g
        return 1 - G
            
    def calculate_leaf_value(self, Y):
        Y = list(Y)
        return max(Y, key = Y.count)
        
    def print_tree(self, root = None, indent = " "):
        if not root:
            root = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X_" + str(root.feature_index), "<=", tree.thresold, "?", tree.info_gain)
            print("%sleft:" %(indent), end = "")
            self.print_tree(root.left, indent + indent)
            print("%sright:" %(indent), end = "")
            self.print_tree(root.right, indent + indent)
                        
    def fit(self, X, Y):
        dataset = np.concatenate((X, Y), axis = 1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        return np.array([self.traverse_tree(x, self.root) for x in X])

    def traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
            
        if x[node.feature] <= node.threshold:
            return self.traverse_tree(x, node.left)
        else:
            return self.traverse_tree(X, node.right)

In [4]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = datasets.load_breast_cancer()

X, y = data.data, data.target.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 1234
)

clf =  DecisionTree()
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)

def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

acc = accuracy(y_test, prediction)
accs = accuracy_score(y_test, prediction)
print(acc,"   ", accs)

69.0     0.6052631578947368


In [14]:
data1 = pd.read_csv('weather.csv')

X1 = data1.iloc[: , :-1].values
y1 = data1.iloc[: , :-1].values.reshape(-1, 1)
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size = 0.2, random_state = 27
)
clf1 =  DecisionTree(3, 3)
clf1.fit(X1_train, y1_train)
prediction1 = clf.predict(X1_test)

acc1 = accuracy(y1_test, prediction1)
accs1 = accuracy_score(y1_test, prediction1)
print(acc1,"   ", accs1)

ValueError: Found input variables with inconsistent numbers of samples: [14, 70]