# Run all cells to compare the training/testing time and accuracy of the implemented decision tree against scikit-learn's built-in DecisionTreeClassifier

In [34]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
import time

In [35]:


class Node:
    """One vertex of the decision tree.

    An internal node stores the feature index and threshold it splits on
    together with its two children; a leaf stores only a prediction in
    ``value``.
    """

    def __init__(self, feature=None, thresh=None, left_child=None,
                 right_child=None, val=None):
        """Store the split description, both sub-trees and the leaf value.

        Args:
            feature: index of the feature this node splits on.
            thresh: threshold compared against that feature.
            left_child: sub-tree stored as ``left``.
            right_child: sub-tree stored as ``right``.
            val: prediction stored as ``value``; anything other than None
                marks the node as a leaf.
        """
        self.feature, self.threshold = feature, thresh
        self.left, self.right = left_child, right_child
        self.value = val

    def is_a_leaf(self):
        """Return True when the node carries a prediction value."""
        return not (self.value is None)


In [36]:

class Tree:
    """Binary decision-tree classifier that picks splits by entropy information gain.

    Class labels must be non-negative integers because class counts are
    obtained with ``np.bincount``.
    """

    def __init__(self, max_depth=15, min_samples_split=2):
        """Configure the stopping rules.

        Args:
            max_depth: nodes at this depth (root is depth 0) become leaves.
            min_samples_split: nodes holding fewer samples than this become leaves.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def reached_end(self, depth):
        """Return True when the node currently being built must become a leaf.

        Reads ``n_class_labels`` and ``n_samples``, which ``build_tree``
        refreshes on every call before consulting this method.
        """
        return (depth >= self.max_depth
                or self.n_class_labels == 1
                or self.n_samples < self.min_samples_split)

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and a label vector ``y``.

        Raises:
            ValueError: if ``y`` is empty.
        """
        # Coerce to arrays so lists and pandas objects are indexed positionally.
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.root = self.build_tree(X, y)

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label vector ``y``."""
        counts = np.bincount(y)
        # Classes with zero count are dropped so log2(0) is never evaluated.
        proportions = counts[counts > 0] / len(y)
        return -np.sum(proportions * np.log2(proportions))

    def split(self, X, thresh):
        """Partition a 1-D feature column around ``thresh``.

        Returns:
            (left_idx, right_idx): positions with value <= thresh and
            positions with value > thresh.
        """
        left_idx = np.flatnonzero(X <= thresh)
        right_idx = np.flatnonzero(X > thresh)
        return left_idx, right_idx

    def gain(self, X, y, thresh):
        """Return the information gain of splitting column ``X`` at ``thresh``.

        A split that leaves one side empty has a gain of 0.
        """
        parent_loss = self.entropy(y)
        left_idx, right_idx = self.split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)
        if n_left == 0 or n_right == 0:
            return 0
        child_loss = ((n_left / n) * self.entropy(y[left_idx])
                      + (n_right / n) * self.entropy(y[right_idx]))
        return parent_loss - child_loss

    def find_best_split(self, X, y, features):
        """Search the given feature indices for the split with the highest gain.

        Returns:
            (feature, threshold) of the best split, or (None, None) when no
            threshold separates the samples into two non-empty groups.
        """
        best = {'score': -1, 'feat': None, 'thresh': None}
        for feat in features:
            X_feat = X[:, feat]
            # np.unique returns sorted values; the largest one is dropped
            # because "value <= max" sends every sample left and would leave
            # an empty right child.
            thresholds = np.unique(X_feat)[:-1]
            for thresh in thresholds:
                score = self.gain(X_feat, y, thresh)
                if score > best['score']:
                    best['score'] = score
                    best['feat'] = feat
                    best['thresh'] = thresh
        return best['feat'], best['thresh']

    def _make_leaf(self, y):
        """Build a leaf predicting the most frequent label in ``y``."""
        return Node(val=np.argmax(np.bincount(y)))

    def build_tree(self, X, y, depth=0):
        """Recursively build and return the sub-tree for samples ``X``/``y``."""
        # Scratch state describing the current node; overwritten on every
        # recursive call and read by reached_end().
        self.n_samples, self.n_features = X.shape
        self.n_class_labels = len(np.unique(y))

        # Check the stopping rule first so leaves skip the split search.
        if self.reached_end(depth):
            return self._make_leaf(y)

        # Visit all features in random order; the order only affects which
        # split wins when several have the same gain.
        rnd_feats = np.random.choice(self.n_features,
                                     self.n_features,
                                     replace=False)
        best_feat, best_thresh = self.find_best_split(X, y, rnd_feats)
        if best_feat is None:
            # Every feature is constant here, so the samples cannot be separated.
            return self._make_leaf(y)

        left_idx, right_idx = self.split(X[:, best_feat], best_thresh)
        left_child = self.build_tree(X[left_idx, :], y[left_idx], depth + 1)
        right_child = self.build_tree(X[right_idx, :], y[right_idx],
                                      depth + 1)
        return Node(best_feat, best_thresh, left_child, right_child)

    def traverse_tree(self, x, node):
        """Follow sample ``x`` down from ``node`` and return the leaf value reached."""
        while not node.is_a_leaf():
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        # Iterate over rows of an array, not over whatever the container yields.
        X = np.asarray(X)
        predictions = [self.traverse_tree(x, self.root) for x in X]
        return np.array(predictions)


def find_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: sequence or array of reference labels.
        y_pred: sequence or array of predicted labels, same shape as ``y_true``.

    Returns:
        Share of matching positions as a float in [0, 1].

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Coerce first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{y_true.shape} and {y_pred.shape}")
    return np.mean(y_true == y_pred)


In [38]:
# Seed shared by the train/test split, the custom tree's feature shuffling
# and scikit-learn, so that re-running the cell repeats the same experiment.
SEED = 0
np.random.seed(SEED)  # Tree.build_tree draws its feature order from NumPy's global RNG

df = pd.read_csv("cardio_train.csv", sep=';')

#-------------------- Data Preprocessing ---------------#

#-------------------- Change age values from days to years -----------------#

df['age'] = df['age'].floordiv(365)

#-------------------- Round weight, ap_hi and ap_lo to multiples of 5 -----------------#

df['weight'] = (5 * (df['weight'] / 5).round()).astype(int)
df['ap_hi'] = 5 * (df['ap_hi'] / 5).round().astype(int)
df['ap_lo'] = 5 * (df['ap_lo'] / 5).round().astype(int)

# Features are every column except the first and the last; the target is `cardio`.
X = np.array(df[df.columns[1:-1]])
Y = np.array(df.cardio)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, train_size=0.8, random_state=SEED)

#-------------------- Implemented tree -----------------#
# time.perf_counter() is monotonic and intended for measuring short durations.

implemented_tree = Tree(max_depth=10)
start = time.perf_counter()
implemented_tree.fit(X_train, y_train)
end = time.perf_counter()
print("Implemented tree training time= ", end - start)

start = time.perf_counter()
implemented_y_pred = implemented_tree.predict(X_test)
end = time.perf_counter()
print("Implemented tree testing time= ", end - start)
tree_accuracy = np.round(find_accuracy(y_test, implemented_y_pred), 4)

#-------------------- scikit-learn tree -----------------#
# The classifier is constructed outside the timed region, mirroring the
# implemented tree above, so both timings cover only fit/predict.

clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=10,
                                  random_state=SEED)
start = time.perf_counter()
clf.fit(X_train, y_train)
end = time.perf_counter()
print("Scikit learn training time= ",  end - start)

start = time.perf_counter()
scikit_y_pred = clf.predict(X_test)
end = time.perf_counter()
print("Scikit learn testing time = ",  end - start)
scikit_accuracy = np.round(find_accuracy(y_test, scikit_y_pred), 4)

print("Implemented tree Accuracy= ", tree_accuracy)
print("Scikit learn Accuracy= ", scikit_accuracy)


Implemented tree training time=  4.8631181716918945
Implemented tree testing time=  0.06201934814453125
Scikit learn training time=  0.08701252937316895
Scikit learn testing time =  0.0023818016052246094
Implemented tree Accuracy=  0.7296
Scikit learn Accuracy=  0.7297
