In [3]:
from numpy import *

def loadDataSet():
    """Return the built-in sample data set and its feature names.

    Returns:
        A ``(dataSet, labels)`` pair.  ``dataSet`` is a list of 9 rows of the
        form ``[feature0, feature1, class_label]``; ``labels`` holds the
        names of the two feature columns.  Fresh lists are built on every
        call, so callers may mutate the result freely.
    """
    samples = (
        (0, 0, 0),
        (0, 1, 1),
        (0, 2, 0),
        (1, 0, 1),
        (1, 1, 1),
        (1, 2, 1),
        (2, 0, 0),
        (2, 1, 1),
        (2, 2, 0),
    )
    featureNames = ['color', 'shape']
    return [list(sample) for sample in samples], featureNames

def calcGini(dataSet):
    """Return the Gini index of the class labels found in ``dataSet``.

    The class label of a row is its last element.  The index is the sum of
    ``p * (1 - p)`` over every distinct label, where ``p`` is the fraction
    of rows carrying that label.  An empty data set yields ``0``.
    """
    rowCount = shape(dataSet)[0]

    # Tally how many rows carry each label (first-seen order is kept).
    tally = {}
    for row in dataSet:
        target = row[-1]
        tally[target] = tally.get(target, 0) + 1

    impurity = 0
    for count in tally.values():
        share = count / rowCount
        impurity += share * (1 - share)

    return impurity

def chooseBestFeatVal2Split(dataSet):
    """Choose the (feature index, value) pair giving the lowest-Gini binary split.

    Every row of ``dataSet`` holds the feature values followed by the class
    label in the last position.  Each distinct value of each feature column
    is tried as an equality test via ``splitByFeature``; the candidate whose
    size-weighted Gini index (see ``calcGini``) is smallest wins.  On ties the
    first candidate encountered is kept.

    Returns:
        ``(feature, value)`` for the best split, or ``(None, None)`` when the
        node should not be split: the data set is empty, the rows contain no
        feature columns, all labels are equal, no candidate separates the
        rows into two non-empty parts, or the best split lowers the Gini
        index by less than ``1e-5``.
    """
    # 1. Nothing to split on: no rows, or rows that hold only the label.
    if len(dataSet) == 0 or len(dataSet[0]) == 1:
        return None, None

    # 2. Every row already carries the same label.
    if len({row[-1] for row in dataSet}) == 1:
        return None, None

    # 3. Search for the best split point.
    bestFeature = None
    bestValue = None
    lowestGini = float("inf")
    totalGini = calcGini(dataSet)
    totalNum = len(dataSet)

    # 4. Walk over every feature column (the last column is the label).
    for feature in range(len(dataSet[0]) - 1):
        # 5. Try every distinct value the feature takes in this node.
        for value in {row[feature] for row in dataSet}:
            leftChild, rightChild = splitByFeature(feature, value, dataSet)
            leftNum = len(leftChild)
            rightNum = len(rightChild)

            # 6. A candidate that leaves one side empty does not split
            #    anything; skip it but keep evaluating the other candidates.
            if leftNum == 0 or rightNum == 0:
                continue

            # 7. Gini index after the split, weighted by child size.
            curGini = (leftNum / totalNum * calcGini(leftChild)
                       + rightNum / totalNum * calcGini(rightChild))

            if curGini < lowestGini:
                bestFeature = feature
                bestValue = value
                lowestGini = curGini

    # 8. Give up when no usable candidate exists or the impurity reduction
    #    relative to the unsplit node is negligible.
    if bestFeature is None or totalGini - lowestGini < 0.00001:
        return None, None

    return bestFeature, bestValue

def splitByFeature(feature, value, dataSet):
    """Partition ``dataSet`` on the test ``row[feature] == value``.

    Args:
        feature: Column index of the feature to test.
        value: Value the feature is compared against.
        dataSet: Sequence of rows (each row an indexable sequence).

    Returns:
        ``(leftChild, rightChild)``: the rows whose feature equals ``value``
        and the rows whose feature differs, each as a list of new lists in
        their original order.  Either list may be empty.
    """
    leftChild = []
    rightChild = []

    # Plain-Python partition: no dependency on the numpy matrix type, no
    # dtype coercion of mixed-type rows, and an empty data set is handled.
    for row in dataSet:
        target = leftChild if row[feature] == value else rightChild
        target.append(list(row))

    return leftChild, rightChild

def checkIsOneCateg(newDataSet):
    """Tell whether every row of ``newDataSet`` carries the same class label.

    The class label of a row is its last element.  Returns ``True`` only
    when exactly one distinct label is present, so an empty data set gives
    ``False``.
    """
    distinctLabels = {row[-1] for row in newDataSet}
    return len(distinctLabels) == 1

def majorityCateg(newDataSet):
    """Return the most frequent class label in ``newDataSet``.

    The class label of a row is its last element.  When several labels are
    equally frequent, the one that appears first in ``newDataSet`` wins.

    Returns:
        The majority label, or ``None`` when ``newDataSet`` is empty.
    """
    categCount = {}
    for row in newDataSet:
        categ = row[-1]
        categCount[categ] = categCount.get(categ, 0) + 1

    if not categCount:
        return None

    # sorted() is stable, also with reverse=True, so labels with equal
    # counts keep their first-seen order.
    sortedCateg = sorted(categCount.items(), key=lambda kv: kv[1], reverse=True)

    return sortedCateg[0][0]

def createClassifTree(dataSet):
    """Recursively build a binary classification tree from ``dataSet``.

    Each row of ``dataSet`` holds the feature values followed by the class
    label in the last position.

    Returns:
        A leaf or an internal node.  When ``chooseBestFeatVal2Split`` finds
        no split, the node is a leaf: the shared label if all rows agree,
        otherwise the result of ``majorityCateg``.  An internal node is a
        dict with the keys ``"featIndex"`` and ``"value"`` (the split test)
        and ``"leftChild"`` / ``"rightChild"`` (the subtrees built from the
        two parts returned by ``splitByFeature``).
    """
    feature, value = chooseBestFeatVal2Split(dataSet)

    # No split available: this node becomes a leaf.
    if feature is None:
        if checkIsOneCateg(dataSet):
            return dataSet[0][-1]
        return majorityCateg(dataSet)

    leftChild, rightChild = splitByFeature(feature, value, dataSet)

    return {
        "featIndex": feature,
        "value": value,
        "leftChild": createClassifTree(leftChild),
        "rightChild": createClassifTree(rightChild),
    }
    
# Script entry point: build a classification tree from the sample data
# returned by loadDataSet() and print it.  The feature names in `labels`
# are loaded but not used below.
if __name__ == '__main__':
    dataSet, labels = loadDataSet()
    classifTree = createClassifTree(dataSet)
    print(classifTree)

{'featIndex': 0, 'value': 1, 'leftChild': 1, 'rightChild': {'featIndex': 1, 'value': 1, 'leftChild': 1, 'rightChild': 0}}
