In [1]:
# imports
import numpy as np
import pandas as pd

# class definitions
class TreeNode:
    """A node of a binary decision tree.

    An internal node stores a split test (``attribute_index`` and ``value``)
    plus a left and a right child. A leaf node stores only the predicted
    value in ``dependant``; a node is treated as a leaf whenever
    ``dependant`` is not ``None``.
    """

    # constructor
    def __init__(self, i = None, v = None, l = None, r = None, d = None):
        """Create a node.

        Args:
            i: index of the attribute tested by this node.
            v: threshold; inputs with ``x[i] <= v`` are routed to the left child.
            l: left child node.
            r: right child node.
            d: value returned by a leaf (leave as ``None`` for internal nodes).
        """
        self.attribute_index = i
        self.value = v
        self.left_child = l
        self.right_child = r
        self.dependant = d

    # run input through the node
    def runInput(self, x):
        """Route ``x`` down the tree and return the value of the leaf reached.

        Args:
            x: indexable collection of attribute values for one instance.
        """
        # a leaf answers directly
        if self.dependant is not None:
            return self.dependant

        # otherwise delegate to the child selected by the split test
        if x[self.attribute_index] <= self.value:
            return self.left_child.runInput(x)
        return self.right_child.runInput(x)

    # prints the nodes information
    def printNode(self, names, indent=0):
        """Print this node and, recursively, its children.

        Args:
            names: attribute names, indexed by ``attribute_index``.
            indent: nesting depth used to build the line prefix.
        """
        prefix = ("| " * indent) + "|-"

        # leaves only report the value they predict
        if self.dependant is not None:
            print(prefix, "actual_productivity -->" ,self.dependant)
            return

        print(prefix, names[self.attribute_index], " : ", self.value)

        # each child is guarded by its own presence; the right child used to
        # be guarded by the left child's presence
        if self.left_child is not None:
            self.left_child.printNode(names, indent+1)

        if self.right_child is not None:
            self.right_child.printNode(names, indent+1)

# the decision tree maker
class DecisionTreeMaker:
    
    # constructor
    def __init__(self, in_file_name):

        # initialise variables
        self.attribute_names = None
        self.dataset = None
        self.tree_root = None
        self.branch_count = 0
        self.node_count = 0

        print("reading in input file...")

        # read in the data from the csv
        csv_filename = in_file_name + ".csv"
        data = pd.read_csv(csv_filename)

        # read in the attribute names
        self.attribute_names = data.columns.values

        # split the data into independant and independant
        x_train = data.iloc[:, :-1].values
        y_train = data.iloc[:, -1].values.reshape(-1,1)
        
        # create dataset
        self.dataset = np.concatenate((x_train, y_train), axis=1)

        # report attribute count
        instances, attributes = x_train.shape

        # announce progress
        print("succesfully read in data set with ", instances, " instances and ", (attributes+1), "attributes")

    # main functions

    # starts off the process of building the tree
    def buildTree(self):
        print("beginning to build the tree...")
        self.tree_root = self.buildBranch(self.dataset)
        print("tree build complete making a tree containing", self.branch_count, " branches and ", self.node_count, " nodes")

    # runs given test data through the tree returning the results
    def runThroughTree(self, x):
        # initialise answer holder
        y = []
        for input in x:
            y.append(self.tree_root.runInput(input))
        return y
        
    # runs tests on the accuracy of the tree using the test data
    def testTree(self, in_file_name):
        
        csv_filename = in_file_name + ".csv"
        data = pd.read_csv(csv_filename)

        x_test = data.iloc[:, :-1].values
        y_test = data.iloc[:, -1].values.reshape(-1,1)

        # run the test data through the tree
        y_result = self.runThroughTree(x_test)
        
        # display the results for each class
        for label in np.unique(y_test):
            # inform which class the tests are on
            print("the following results are on the ", label, " class")

            # perform and report class accuracy
            correct = 0
            for x in range(len(y_test)): 
                # increment the correct count if the class is correctly
                # classified with regards to the given class
                if (((y_test[x] == label) and (y_result[x] == label)) or ((y_test[x] != label) and (y_result[x] != label))):
                    correct+=1
                    
            print("accuracy_score: ", correct/len(y_test))
            
            # perform and report precision test
            tp = 0
            fp = 0
            for x in range(len(y_test)): 
                # if correctly classed as label, increment
                if ((y_test[x] == label) and (y_result[x] == label)):
                    tp+=1
                # if classed as label when it shouldn't, increment
                if ((y_test[x] != label) and (y_result[x] == label)):
                    fp+=1
            
            precision = (tp/(tp+fp))
            print("precision: ", precision)

            # perform and report recall test
            tp = 0
            fn = 0
            for x in range(len(y_test)): 
                # if correctly classed as label, increment
                if ((y_test[x] == label) and (y_result[x] == label)):
                    tp+=1
                # if not classed as label when it should, increment
                if ((y_test[x] == label) and (y_result[x] != label)):
                    fn+=1
            
            recall = (tp/(tp+fn))
            print("recall: ", recall)
            
            # perform and report sklearn f1 test
            if (precision + recall != 0):
                f1_score = ((2 * precision * recall)/(precision + recall))
            else:
                f1_score = 0
                
            print("F1: ", f1_score)

    # recursively prints the tree
    def printTree(self):
        self.tree_root.printNode(self.attribute_names)
        
    # supporting functions

    # a recursive function which builds the (sub)tree for the given data and
    # returns its root node
    def buildBranch(self, dataset):

        # split the dataset into independant and dependant variables
        x, y = dataset[:,:-1],dataset[:,-1]

        # get instance count
        instances, attributes = np.shape(x)

        # increase node count
        self.node_count+=1

        # if there are is enough instances to split and the attributes have not run out
        if ((instances >= 4) and (attributes >=2)):
            
            # find the best split
            info, attribute_index, value, left_dataset, right_dataset = self.findBestSplit(dataset, attributes)
            
            # if this split results in an information gain
            # then perform the split
            if (info > 0):
                
                # increment branch count
                self.branch_count+=2
                
                # create child trees
                left_tree = self.buildBranch(left_dataset)
                right_tree = self.buildBranch(right_dataset)
                
                # return root node
                return TreeNode(attribute_index, value, left_tree, right_tree)

        # else compute leaf value (by finding the most frequently occuring value) and send it to the node
        dependant_variables = list(y)       
        return TreeNode(d=max(dependant_variables, key=dependant_variables.count))
            
    # finds the attribute and threshold that best split the dataset, i.e. that best distinguish between outputs
    def findBestSplit(self, dataset, attribute_count):
        
        # initialise
        out_info = 0
        out_attribute_index = None
        out_value = None
        out_left = None
        out_right = None    
        
        # iterate through the attributes looking for one which
        # contains an instance holding the best split
        for attribute_index in range(attribute_count):
            
            # seperate instances, and unique instances
            instances = dataset[:, attribute_index]
            unique_instances = np.unique(instances)
        
            # loop through the unique instance values for the given attribute
            # looking for a good split
            for value in unique_instances:
                
                # make current split
                left_split = np.array([row for row in dataset if row[attribute_index]<=value])
                right_split = np.array([row for row in dataset if row[attribute_index]>value])
                
                # check splits aren't null
                if ((len(left_split) > 0) and (len(right_split) > 0)):
                    
                    # measure info gain
                    tmp_info = self.measureInfoGain(dataset[:, -1], left_split[:, -1], right_split[:, -1])
                    
                    # if the info gain is higher
                    if (tmp_info > out_info):
                        
                        # replace all of the output values
                        out_attribute_index = attribute_index
                        out_value = value
                        out_left = left_split
                        out_right = right_split
                        out_info = tmp_info
        
        return out_info, out_attribute_index, out_value, out_left, out_right
            
    # measure the info gain of the given split over the given parent through the use of
    # the entropy formula
    def measureInfoGain(self, parent, new_left, new_right):
        
        # initialise variables
        left_entropy = 0
        right_entropy = 0
        parent_entropy = 0

        # calculate parent entropy
        for instance in np.unique(parent):
            probability = len(parent[parent==instance]) / len(parent)
            parent_entropy += -probability * np.log2(probability)
        
        # calculate child entropies
        for instance in np.unique(new_left):
            probability = len(new_left[new_left==instance]) / len(new_left)
            left_entropy += -probability * np.log2(probability)
            
        left_entropy = left_entropy * (len(new_left) / len(parent))
        
        for instance in np.unique(new_right):
            probability = len(new_right[new_right==instance]) / len(new_right)
            right_entropy += -probability * np.log(probability)
        
        right_entropy = right_entropy * (len(new_right) / len(parent))

        # return the gain
        return (parent_entropy - (left_entropy + right_entropy))

# beginning of program: train the hand-written tree, evaluate it, then print it

# base names (without the ".csv" suffix) of the training and test files
in_file = "continuousv2"
in_test_files = "continuousv2test"

# construct tree maker object (this reads the training csv)
tree_maker = DecisionTreeMaker(in_file)

# build the tree from the training data
tree_maker.buildTree()

print("performing tests")

# evaluate the tree on the test file and print per-class metrics
tree_maker.testTree(in_test_files)

# print the tree structure
tree_maker.printTree()

reading in input file...
succesfully read in data set with  1147  instances and  10 attributes
beginning to build the tree...
tree build complete making a tree containing 468  branches and  469  nodes
performing tests
the following results are on the  0.23 - 0.38  class
accuracy_score:  0.76
precision:  0.4
recall:  0.4
F1:  0.4000000000000001
the following results are on the  0.38 - 0.54  class
accuracy_score:  0.8
precision:  0.5
recall:  0.5
F1:  0.5
the following results are on the  0.54 - 0.69  class
accuracy_score:  0.88
precision:  0.6666666666666666
recall:  0.8
F1:  0.7272727272727272
the following results are on the  0.69 - 0.84  class
accuracy_score:  0.7
precision:  0.0
recall:  0.0
F1:  0
the following results are on the  0.84 - 1.00  class
accuracy_score:  0.78
precision:  0.46153846153846156
recall:  0.6
F1:  0.5217391304347826
|- targeted_productivity  :  0.35
| |- standard_minute_value  :  4.3
| | |- standard_minute_value  :  4.15
| | | |- standard_minute_value  :  2.9

In [2]:
# sklearn decision tree implementation
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn import tree

# this function runs tests on the results recieved from the tree using
# methods from the sklearn package
def testTree(tree, in_file_name):

    csv_filename = in_file_name + ".csv"
    data = pd.read_csv(csv_filename)

    x_test = data.iloc[:, :-1].values
    y_test = data.iloc[:, -1].values.reshape(-1,1)

    # run the test data through the tree
    y_result = tree.predict(x_test)

    # display the results for each class
    for label in np.unique(y_test):

        # inform which class the tests are on
        print("the following results are on the ", label, " class")

        # perform and report class accuracy
        correct = 0

        for x in range(len(y_test)): 

            # increment the correct count if the class is correctly
            # classified with regards to the given class
            if (((y_test[x] == label) and (y_result[x] == label)) or ((y_test[x] != label) and (y_result[x] != label))):
                correct+=1

        print("accuracy_score: ", correct/len(y_test))

        # perform and report precision test
        tp = 0
        fp = 0

        for x in range(len(y_test)): 

            # if correctly classed as label, increment
            if ((y_test[x] == label) and (y_result[x] == label)):
                tp+=1
            # if classed as label when it shouldn't, increment
            if ((y_test[x] != label) and (y_result[x] == label)):
                fp+=1
                
        if (tp + fp != 0):
            precision = (tp/(tp+fp))
        else:
            precision = 0

        print("precision: ", precision)

        # perform and report sklearn recall test
        tp = 0
        fn = 0

        for x in range(len(y_test)): 

            # if correctly classed as label, increment
            if ((y_test[x] == label) and (y_result[x] == label)):
                tp+=1
            # if not classed as label when it should, increment
            if ((y_test[x] == label) and (y_result[x] != label)):
                fn+=1

        recall = (tp/(tp+fn))
        print("recall: ", recall)

        # perform and report sklearn f1 test
        if (precision + recall != 0):
            f1_score = ((2 * precision * recall)/(precision + recall))
        else:
            f1_score = 0

        print("F1: ", f1_score)

# base names (without the ".csv" suffix) of the training and test files
in_file = "continuousv2"
in_test = "continuousv2test"

# read in the training dataset
data = pd.read_csv(in_file+".csv")

# split the dataset into independant (X) and dependant (Y) variables
X = data[data.columns[:-1]].values
Y = data['actual_productivity'].values

# initialise tree object; sklearn permutes the features at every split, so a
# fixed random_state is needed for the fitted tree (and therefore the metrics
# and the printout below) to be the same on every run
dt = DecisionTreeClassifier(random_state=0)

# build tree
dt.fit(X,Y)

# test the tree
testTree(dt, in_test)

# print the tree
text_representation = tree.export_text(dt, max_depth=99)
print(text_representation)

the following results are on the  0.23 - 0.38  class
accuracy_score:  0.8
precision:  0.5
recall:  0.3
F1:  0.37499999999999994
the following results are on the  0.38 - 0.54  class
accuracy_score:  0.82
precision:  0.5714285714285714
recall:  0.4
F1:  0.47058823529411764
the following results are on the  0.54 - 0.69  class
accuracy_score:  0.74
precision:  0.42105263157894735
recall:  0.8
F1:  0.5517241379310345
the following results are on the  0.69 - 0.84  class
accuracy_score:  0.66
precision:  0.0
recall:  0.0
F1:  0
the following results are on the  0.84 - 1.00  class
accuracy_score:  0.78
precision:  0.45454545454545453
recall:  0.5
F1:  0.47619047619047616
|--- feature_4 <= 29.50
|   |--- feature_2 <= 3.50
|   |   |--- feature_1 <= 3.92
|   |   |   |--- feature_8 <= 8.50
|   |   |   |   |--- feature_0 <= 0.78
|   |   |   |   |   |--- feature_1 <= 3.40
|   |   |   |   |   |   |--- feature_0 <= 0.65
|   |   |   |   |   |   |   |--- feature_0 <= 0.55
|   |   |   |   |   |   |   |  