# Classification Tree Implementation with the Gini Index

### $Gini(p)=1-\sum_{k=1}^{K}{p_k^2}$
### $Gini(D)=1-\sum_{k=1}^{K}{(\frac{|C_k|}{|D|})^2}$, given a dataset $D$ in which $C_k$ is the subset of samples belonging to class $k$
### $Gini(D,A)=\frac{|D_1|}{|D|}Gini(D_1)+\frac{|D_2|}{|D|}Gini(D_2)$, where a test on feature $A$ splits $D$ into $D_1$ and $D_2$

In [1]:
import numpy as np

In [2]:
class CT:
    def __init__(self, maxDepth=5, maxGini=0.01):
        self.__tree = []
        self.__maxDepth = maxDepth
        self.__maxGini = maxGini

    def dataProcessing(self, data):
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        x = data[:, :-1]
        y = data[:, -1]
        return x, y

    def _gini(self, y):
        values, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        gini = 1 - np.sum(p ** 2)
        return gini

    def _condGini(self, feature, value, x, y):
        x = x[:, feature]
        subIndex1 = np.nonzero(x == value)
        subIndex2 = np.nonzero(x != value)
        y1 = y[subIndex1]
        y2 = y[subIndex2]
        condGini = len(y1) / len(y) * self._gini(y1) + len(y2) / len(y) * self._gini(y2)
        return condGini

    def train(self, x, y, depth=0, features=None):
        if np.any(features) == None:
            features = np.arange(x.shape[1])
        if len(y) == 0 or len(features) == 0 or depth >= self.__maxDepth:
            return self.__tree
        else:
            if depth == 0:
                values, counts = np.unique(y, return_counts=True)
                value = values[np.argmax(counts)]
                self.__tree.append((depth, 'Root Node', None, None, value))
            depth += 1
            # compute minimal conGini
            condGiniL = []
            for feature in features:
                for value in values:
                    condGini = self._condGini(feature, value, x, y)
                    condGiniL.append((feature, value, condGini))
            minGini = min(condGiniL, key=lambda i: i[2])
            if minGini[2]>=self.__maxGini:
                return self.__tree
            # partition data to two branches
            sf=x[:,minGini[0]]
            sv=minGini[1]
            x1=x[sf==sv]
            y1=y[sf==sv]
            values,counts=np.unique(y1,return_counts=True)
            value=values[np.argmax(counts)]
            self.__tree.append((depth,minGini[0],sv,value))
            x2=x[sf!=sv]
            y2=y[sf!=sv]
            values,counts=np.unique(y2,return_counts=True)
            value=values[np.argmax(counts)]
            self.__tree.append((depth,minGini[0],sv,value))
            leftFeatures=features[features!=minGini[0]]
            R1=self.train(x1,y1,depth,leftFeatures)
            R2=self.train(x2,y2,depth,leftFeatures)
            return R2