# ID3 Implementation

In [1]:
import numpy as np
import pandas as pd

In [2]:
# For numeric data (categorical values must first be encoded as numbers).
class ID3v1():
    """ID3 decision tree for numerically encoded categorical features.

    The fitted tree is kept as a flat list of tuples
    ``(depth, feature, value, label)``:

    * the first entry is ``(0, 'Root Node', None, majority_label)``;
    * every other entry describes one child created by splitting on column
      ``feature`` for the attribute value ``value``; ``label`` is the majority
      class of the samples that fall into that child.

    Children of one node are appended together (as siblings) before any of
    them is split further.
    """

    def __init__(self, x, maxDepth=3, minGain=0.5):
        """Store the hyper-parameters and the candidate feature indices.

        x: 2-D array-like (samples x features); only its column count is used.
        maxDepth: deepest level at which a node may still be split, so
            recorded children can reach depth ``maxDepth + 1``.
        minGain: a node is split only when the best information gain is
            strictly greater than this threshold.
        """
        self.__maxDepth = maxDepth
        self.__minGain = minGain
        self.__tree = []
        # column indices that may be used for splitting
        self.__features = np.arange(np.asarray(x).shape[1])

    # given the y in that dataset, compute the entropy (in bits)
    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    # given attribute index and that dataset (x, y), compute conditional entropy
    def _conditEntropy(self, attribute, x, y):
        column = x[:, attribute]
        weighted = []
        # iterate each value in that attribute
        for value in np.unique(column):
            # labels of the data points that carry this attribute value
            label = y[column == value]
            weighted.append(len(label) / len(y) * self._entropy(label))
        return np.sum(weighted)

    def id3Train(self, x, y,):
        """Build the tree for (x, y) and return it as a list of tuples.

        x: 2-D array-like of encoded feature values.
        y: 1-D array-like of class labels (a column vector is flattened).

        Each call starts from an empty tree. An empty ``y`` yields ``[]``.
        """
        x = np.asarray(x)
        y = np.asarray(y).reshape(-1)
        self.__tree = []
        values, counts = np.unique(y, return_counts=True)
        if len(values) == 0:
            return self.__tree
        # (tree depth, feature, value, class)
        if len(values) == 1:
            rootLabel = values.item()
        else:
            rootLabel = values[np.argmax(counts)]
        self.__tree.append((0, 'Root Node', None, rootLabel))
        self._grow(x, y, list(self.__features), 0)
        return self.__tree

    # split the node holding (x, y) at `depth` and recurse into every child
    def _grow(self, x, y, features, depth):
        # stop when the node is pure, no feature is left, or it is too deep
        if len(np.unique(y)) <= 1 or len(features) == 0 or depth > self.__maxDepth:
            return
        # information gain of every remaining feature
        h = self._entropy(y)
        gain = h - np.array([self._conditEntropy(f, x, y) for f in features])
        best = int(np.argmax(gain))
        if gain[best] <= self.__minGain:
            return
        maxGainF = features[best]
        # the chosen feature is removed only for this node's descendants,
        # so sibling branches keep their own candidate list
        remaining = [f for f in features if f != maxGainF]
        column = x[:, maxGainF]
        subsets = []
        for attrValue in np.unique(column):
            mask = column == attrValue
            xsub, ysub = x[mask], y[mask]
            subvals, subcts = np.unique(ysub, return_counts=True)
            self.__tree.append((depth + 1, maxGainF, attrValue, subvals[np.argmax(subcts)]))
            subsets.append((xsub, ysub))
        # recurse only after all siblings have been recorded
        for xsub, ysub in subsets:
            self._grow(xsub, ysub, remaining, depth + 1)

In [None]:
# for text data
class ID3v2:
    """ID3 decision tree for categorical (text) data held in a pandas DataFrame.

    The last column of every DataFrame passed in is treated as the target.
    A fitted tree is a nested dict ``{attribute: {value: subtree_or_label}}``;
    anything that is not a dict is a leaf label.
    """

    # define entropy (in bits) of a column of labels
    @classmethod
    def entropy(cls, target_col):
        _, counts = np.unique(target_col, return_counts=True)
        probabilities = counts / np.sum(counts)
        return -np.sum(probabilities * np.log2(probabilities))

    # define information gain based on given split attribute
    @classmethod
    def infor_gain(cls, data, split_attribute_name):
        target = data[data.columns[-1]]
        column = data[split_attribute_name]
        elements, counts = np.unique(column, return_counts=True)
        weights = counts / np.sum(counts)
        # boolean indexing keeps dtypes and does not drop rows that merely
        # contain a missing value in some other column
        entropy_split_attribute = np.sum(
            [weight * cls.entropy(target[column == element]) for weight, element in zip(weights, elements)])
        return cls.entropy(target) - entropy_split_attribute

    # choose the attribute with the biggest information gain to split the dataset
    @classmethod
    def select_best_attribute(cls, data, attributes=None):
        """Return ``(gain, attribute)`` for the best candidate attribute.

        attributes: candidate column names; defaults to every column except
            the last (target) one. Ties on gain are broken by comparing the
            attribute names, as the tuples are compared as a whole.
        Raises ValueError when there is no candidate attribute.
        """
        if attributes is None:
            attributes = data.columns[:-1]
        return max((cls.infor_gain(data, attribute), attribute) for attribute in attributes)

    # get tree depth: a leaf counts as 1, every split level adds 1
    @classmethod
    def tree_depth(cls, data, decision_tree):
        """Return the number of levels of ``decision_tree``.

        ``data`` is unused; it is kept for signature compatibility.
        """
        # any non-dict value is a leaf label, whatever its spelling
        if not isinstance(decision_tree, dict):
            return 1
        deepest = 0
        for branches in decision_tree.values():
            if isinstance(branches, dict):
                for subtree in branches.values():
                    deepest = max(deepest, cls.tree_depth(data, subtree))
        return deepest + 1

    # id3 algorithm
    def id3(self, data, previous_data, attributes, max_tree_depth=5, threshold=0.1, parent_node_label=None):
        """Build a decision tree for ``data`` and return it.

        data: DataFrame whose last column is the target.
        previous_data: unused; kept for signature compatibility.
        attributes: column names that may still be used for splitting.
        max_tree_depth: upper bound on ``tree_depth`` of the returned tree
            (a lone leaf has depth 1).
        threshold: a node is split only when the best information gain is
            strictly greater than this value.
        parent_node_label: label returned when ``data`` is empty or no
            attribute is left.
        """
        # if data is empty, return its parent label
        if len(data) == 0:
            return parent_node_label
        labels, label_counts = np.unique(data[data.columns[-1]], return_counts=True)
        # if all data records belong to one class, return that label
        if len(labels) == 1:
            return labels[0]
        # if no attribute is left, return the parent label
        if len(attributes) == 0:
            return parent_node_label
        # majority label of this node; it is the parent label of its children
        majority_label = labels[np.argmax(label_counts)]
        # a split needs at least two levels; with no budget left, make a leaf
        if max_tree_depth <= 1:
            return majority_label
        # only the attributes still available are candidates
        best_information_gain, best_attribute = self.select_best_attribute(data, attributes)
        # if the information gain is not above the threshold, make a leaf
        if best_information_gain <= threshold:
            return majority_label
        # remove the splitting attribute so descendants do not consider it again
        remaining = [attribute for attribute in attributes if attribute != best_attribute]
        branches = {}
        for value in np.unique(data[best_attribute]):
            sub_data = data[data[best_attribute] == value]
            # pass the limits down; each level consumes one unit of depth
            branches[value] = self.id3(sub_data, data, remaining,
                                       max_tree_depth=max_tree_depth - 1,
                                       threshold=threshold,
                                       parent_node_label=majority_label)
        return {best_attribute: branches}

    # predict the label of a record in dataset
    @classmethod
    def fit_one_record(cls, row, decision_tree):
        """Walk ``decision_tree`` with the attribute values in ``row``.

        row: mapping of attribute name -> value.
        Returns the leaf label, or None when the tree tests an attribute that
        ``row`` does not have. Raises KeyError when the row's value has no
        branch in the tree.
        """
        node = decision_tree
        # a tree may be a bare label (e.g. when the root was never split)
        while isinstance(node, dict):
            for attribute in node:
                if attribute in row:
                    node = node[attribute][row[attribute]]
                    break
            else:
                return None
        return node

    # predict all records according to their features
    def fit(self, data, decision_tree):
        """Return a Series named 'predicted' with one prediction per row, indexed 0..n-1."""
        records = data.iloc[:, :-1].to_dict(orient='records')
        predictions = [self.fit_one_record(record, decision_tree) for record in records]
        return pd.Series(predictions, name='predicted', dtype=object)

    # get data accuracy: fraction of rows whose prediction equals the target
    def accuracy(self, data, decision_tree):
        predicted = self.fit(data, decision_tree)
        actual = data.iloc[:, -1]
        # compare by position: the predictions are indexed 0..n-1 while
        # ``data`` may carry any index
        accuracy_score = np.sum(predicted.to_numpy() == actual.to_numpy()) / len(data)
        return accuracy_score

