# C4.5 Algorithm Implementation

## ID3: information gain 
## C4.5: information gain ratio

In [None]:
import numpy as np


# Works on numeric data only: non-numeric values must be encoded as numbers first.
class C45():
    """Decision tree classifier that splits on the C4.5 information gain ratio.

    Every feature is treated as discrete: a split creates one child per
    distinct value of the chosen feature, and a feature is not reused further
    down the same branch.

    The trained tree is a flat list of ``(depth, feature, value, class)``
    tuples stored in pre-order (each node is immediately followed by its
    subtree). The first entry is always ``(0, 'Root Node', None, class)``.
    For every other entry, ``feature``/``value`` is the test that leads from
    the parent to this node and ``class`` is the majority label of the
    training rows that reached it.
    """

    def __init__(self, x, maxDepth=3, minGain=0.5):
        """Set up an untrained tree.

        Args:
            x: 2-D array-like of training features; only its number of
                columns is used here, to enumerate the candidate features.
            maxDepth: deepest level at which a node may be created (the root
                is level 0).
            minGain: a node is split only when the best gain ratio is
                strictly greater than this threshold.
        """
        self.__maxDepth = maxDepth
        self.__minGain = minGain
        # Depth of the deepest node created by the last call to C45Train.
        self.__depth = 0
        self.__tree = []
        self.__features = np.arange(np.asarray(x).shape[1])

    @staticmethod
    def _plain(value):
        """Convert a NumPy scalar to the matching Python scalar."""
        return value.item() if isinstance(value, np.generic) else value

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the values in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return -np.sum(probs * np.log2(probs))

    def _conditEntropy(self, attribute, x, y):
        """Return the entropy of ``y`` conditioned on column ``attribute`` of ``x``."""
        column = x[:, attribute]
        total = len(y)
        parts = []
        for value in np.unique(column):
            labels = y[column == value]
            # Weight each subset's entropy by the share of rows it holds.
            parts.append(len(labels) / total * self._entropy(labels))
        return np.sum(parts)

    def _majorityClass(self, y):
        """Return the most frequent label in ``y`` (ties go to the smallest)."""
        values, counts = np.unique(y, return_counts=True)
        return self._plain(values[np.argmax(counts)])

    def _growTree(self, x, y, features, depth):
        """Recursively append the subtree below a node at level ``depth``.

        ``features`` lists the column indices still available on this branch.
        """
        if depth >= self.__maxDepth or not features:
            return
        if len(np.unique(y)) == 1:
            # Pure node: nothing left to separate.
            return

        h = self._entropy(y)
        ratios = []
        for feature in features:
            gain = h - self._conditEntropy(feature, x, y)
            splitInfo = self._entropy(x[:, feature])
            # A feature with a single value has zero split information (and
            # zero gain); score it 0 instead of dividing 0 by 0.
            ratios.append(gain / splitInfo if splitInfo > 0 else 0.0)

        best = int(np.argmax(ratios))
        if ratios[best] <= self.__minGain:
            return

        feature = features[best]
        remaining = features[:best] + features[best + 1:]
        column = x[:, feature]
        for value in np.unique(column):
            mask = column == value
            if not mask.any():
                # NaN never compares equal to itself, so it selects no rows.
                continue
            ysub = y[mask]
            self.__tree.append(
                (depth + 1, feature, self._plain(value), self._majorityClass(ysub)))
            self.__depth = max(self.__depth, depth + 1)
            # Recurse right away so the child's subtree directly follows it
            # in the list (pre-order); predict() relies on this ordering.
            self._growTree(x[mask], ysub, remaining, depth + 1)

    def C45Train(self, x, y, ):
        """Build the tree from scratch and return it.

        Args:
            x: 2-D array-like of features, one row per sample.
            y: array-like of labels, one per row of ``x``.

        Returns:
            The list of ``(depth, feature, value, class)`` tuples described
            in the class docstring.

        Raises:
            ValueError: if ``y`` is empty or ``x`` is not 2-D with one row
                per label.
        """
        x = np.asarray(x)
        y = np.asarray(y).reshape(-1)
        if len(y) == 0:
            raise ValueError("cannot train on an empty dataset")
        if x.ndim != 2 or x.shape[0] != len(y):
            raise ValueError("x must be 2-D with one row per label in y")

        # Start over on every call so retraining does not duplicate nodes.
        self.__depth = 0
        self.__tree = [(0, 'Root Node', None, self._majorityClass(y))]
        self._growTree(x, y, [int(f) for f in self.__features], 0)
        return self.__tree

    def _childIndex(self):
        """Map each node's list position to the positions of its children."""
        children = {}
        ancestors = []  # ancestors[d] is the latest node seen at level d
        for position, node in enumerate(self.__tree):
            depth = node[0]
            del ancestors[depth:]
            if ancestors:
                children.setdefault(ancestors[-1], []).append(position)
            ancestors.append(position)
        return children

    # tree structure: (tree depth,feature,value,class)
    def predict(self, x):
        """Predict a label for every row of ``x``.

        Each row is routed from the root through the child whose
        ``(feature, value)`` test it satisfies. Routing stops at a leaf or
        when no child matches (a value not seen during training), and the
        class stored on the last node reached is returned.

        Raises:
            ValueError: if the tree has not been trained.
        """
        if not self.__tree:
            raise ValueError("the tree has not been trained; call C45Train first")
        children = self._childIndex()
        predL = []
        for data in x:
            current = 0
            while True:
                following = None
                for child in children.get(current, ()):
                    _, feature, value, _ = self.__tree[child]
                    if data[feature] == value:
                        following = child
                        break
                if following is None:
                    break
                current = following
            predL.append(self.__tree[current][3])
        return predL

    def accuracy(self, x, y):
        """Return the fraction of rows in ``x`` whose prediction equals ``y``."""
        predL = np.asarray(self.predict(x))
        # Flatten y so a column vector is compared element-wise rather than
        # broadcast against the predictions.
        return np.mean(np.asarray(y).reshape(-1) == predL)


                    