In [13]:
#计算数据集的香农熵
from math import log
def calcShannonEnt(dataset):
    numEntries = len(dataset)
    labelCounts = {}
    for featVec in dataset:
        print (featVec)
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel]=0
        labelCounts[currentLabel] +=1
        print (labelCounts)
    shannonEnt =0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob*log(prob,2)
    return shannonEnt

dataset = [[1,1,'yes'],
           [1,1,'yes'],
           [1,0,'no'],
           [0,1,'no'],
          [0,1,'no']]
labels = ['no surfacing','flippers']
calcShannonEnt(dataset)


[1, 1, 'yes']
{'yes': 1}
[1, 1, 'yes']
{'yes': 2}
[1, 0, 'no']
{'yes': 2, 'no': 1}
[0, 1, 'no']
{'yes': 2, 'no': 2}
[0, 1, 'no']
{'yes': 2, 'no': 3}


0.9709505944546686

In [14]:
#往数据里增加新的分类，熵变大
dataset1 = dataset.copy()
dataset1[0][-1] = 'maybe'
dataset1
calcShannonEnt(dataset1)

[1, 1, 'maybe']
{'maybe': 1}
[1, 1, 'yes']
{'maybe': 1, 'yes': 1}
[1, 0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
[0, 1, 'no']
{'maybe': 1, 'yes': 1, 'no': 2}
[0, 1, 'no']
{'maybe': 1, 'yes': 1, 'no': 3}


1.3709505944546687

ID3的做法是每次选取当前最佳的特征来分割数据，并按照该特征的所有可能取值来切分。ID3算法的问题是不能直接处理连续型特征，只有将连续型特征转换成离散型，才能在ID3算法中使用。

In [15]:
#按照给定特征划分数据集
def splitDataset(dataset,axis,value):
    retDataset = []
    for featVec in dataset:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataset.append(reducedFeatVec)
    return retDataset
        

In [16]:
#选择最好的数据集划分方式,找到最好的feature
def chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0])-1
    baseEntropy = calcShannonEnt(dataset)
    bestInfoGain = 0.0; bestFeature = -1
    for i  in range(numFeatures):
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataset = splitDataset(dataset,i,value)
            prob = len(subDataset)/float(len(dataset))
            newEntropy +=prob* calcShannonEnt(subDataset) #计算每种划分方式的信息熵
        infoGain = baseEntropy - newEntropy
        if (infoGain>bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

In [17]:
chooseBestFeatureToSplit(dataset)

[1, 1, 'maybe']
{'maybe': 1}
[1, 1, 'yes']
{'maybe': 1, 'yes': 1}
[1, 0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
[0, 1, 'no']
{'maybe': 1, 'yes': 1, 'no': 2}
[0, 1, 'no']
{'maybe': 1, 'yes': 1, 'no': 3}
[1, 'no']
{'no': 1}
[1, 'no']
{'no': 2}
[1, 'maybe']
{'maybe': 1}
[1, 'yes']
{'maybe': 1, 'yes': 1}
[0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
[1, 'no']
{'no': 1}
[1, 'maybe']
{'maybe': 1}
[1, 'yes']
{'maybe': 1, 'yes': 1}
[0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
[0, 'no']
{'maybe': 1, 'yes': 1, 'no': 2}


0

In [20]:
#多数表决决定叶子节点的分类
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]


import operator
#创建树的函数代码_ID3:
def createTree(dataset,labels):
    classList = [example[-1] for example in dataset]#最后一列是类别
    if classList.count(classList[0]) == len(classList):#类别完全相同停止划分
        return classList[0]
    if len(dataset[0]) == 1: #遍历完所有特征仍然不能讲数据集划分成仅包含唯一类别的分组，用投票决定
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataset)
    
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del (labels[bestFeat])
    featValues = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataset(dataset,bestFeat,value),subLabels)
    return myTree
createTree(dataset,labels.copy())

[1, 1, 'maybe']
{'maybe': 1}
[1, 1, 'yes']
{'maybe': 1, 'yes': 1}
[1, 0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
[0, 1, 'no']
{'maybe': 1, 'yes': 1, 'no': 2}
[0, 1, 'no']
{'maybe': 1, 'yes': 1, 'no': 3}
[1, 'no']
{'no': 1}
[1, 'no']
{'no': 2}
[1, 'maybe']
{'maybe': 1}
[1, 'yes']
{'maybe': 1, 'yes': 1}
[0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
[1, 'no']
{'no': 1}
[1, 'maybe']
{'maybe': 1}
[1, 'yes']
{'maybe': 1, 'yes': 1}
[0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
[0, 'no']
{'maybe': 1, 'yes': 1, 'no': 2}
[1, 'maybe']
{'maybe': 1}
[1, 'yes']
{'maybe': 1, 'yes': 1}
[0, 'no']
{'maybe': 1, 'yes': 1, 'no': 1}
['no']
{'no': 1}
['maybe']
{'maybe': 1}
['yes']
{'maybe': 1, 'yes': 1}


{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'maybe'}}}}

In [23]:
#使用决策树的分类函数
def classify(inputTree,featLabels,testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)#将标签字符串转换为索引
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__=='dict':
                classLabel = classify(secondDict[key],featLabels,testVec)
            else: classLabel = secondDict[key]
    return classLabel
myTree = createTree(dataset,labels.copy())
classify(myTree,labels,[1,1])

[1, 1, 'yes']
{'yes': 1}
[1, 1, 'yes']
{'yes': 2}
[1, 0, 'no']
{'yes': 2, 'no': 1}
[0, 1, 'no']
{'yes': 2, 'no': 2}
[0, 1, 'no']
{'yes': 2, 'no': 3}
[1, 'no']
{'no': 1}
[1, 'no']
{'no': 2}
[1, 'yes']
{'yes': 1}
[1, 'yes']
{'yes': 2}
[0, 'no']
{'yes': 2, 'no': 1}
[1, 'no']
{'no': 1}
[1, 'yes']
{'yes': 1}
[1, 'yes']
{'yes': 2}
[0, 'no']
{'yes': 2, 'no': 1}
[0, 'no']
{'yes': 2, 'no': 2}
[1, 'yes']
{'yes': 1}
[1, 'yes']
{'yes': 2}
[0, 'no']
{'yes': 2, 'no': 1}
['no']
{'no': 1}
['yes']
{'yes': 1}
['yes']
{'yes': 2}


'yes'

CART（分类回归树）既可以用于分类还可以用于回归。构建两种树，回归树和模型树。回归树有树剪枝技术，回归树与分类树的思路类似，叶节点的数据类型不是离散型而是连续型。模型树的做法是可以在每个叶节点上都构建出一个线性模型。

In [76]:
def binSplitDataset(dataset,feature,value):
    mat0 = dataset[nonzero(dataset[:,feature]>value)[0],:] #左子树，右子树
    mat1 = dataset[nonzero(dataset[:,feature]<=value)[0],:]
    return mat0,mat1
from numpy import *

def regLeaf(dataset):
    return mean(dataset[:,-1]) #在回归树中该模型就是目标变量的均值

def regErr(dataset):
    return var(dataset[:,-1])* shape(dataset)[0]#目标变量的总方差：方差*样本个数

def chooseBestSplit(dataset,leafType=regLeaf,errType=regErr,ops=(1,4)):#找到数据的最佳二元切分方式
    tolS = ops[0];tolN = ops[1] #tolS容许的误差下降值，tolN切分的最少样本数
    if len(set(dataset[:,-1].T.astype(int16).tolist()))==1:#如果所有值相等则退出
        return None,leafType(dataset)
    m,n = shape(dataset)
    S = errType(dataset)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):
        for splitVal in set(dataset[:,featIndex]):
            mat0,mat1 = binSplitDataset(dataset,featIndex,splitVal)
            if (shape(mat0)[0]<tolN) or (shape(mat1)[0]<tolN) :continue
            newS = errType(mat0) + errType(mat1)
            if newS <bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    if (S-bestS)<tolS: #如果误差减少不大则退出
        return None,leafType(dataset)
    mat0,mat1 = binSplitDataset(dataset,bestIndex,bestValue)
    if (shape(mat0)[0]<tolN) or (shape(mat1)[0]<tolN):#如果切分出的数据集很小则退出
        return None,leafType(dataset)
    return bestIndex,bestValue

def createTree(dataset,leafType=regLeaf,errType=regErr,ops=(1,4)):#leafType给出建立叶节点的函数（回归树，这里是一个常数，模型树，这里是一个线性方程），errTYpe代表误差计算函数，ops包含树构建所需其他参数的元组
    feat,val = chooseBestSplit(dataset,leafType,errType,ops)
    if feat == None: return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataset(dataset,feat,val)
    retTree['left'] = createTree(lSet,leafType,errType,ops)
    retTree['right'] = createTree(rSet,leafType,errType,ops)
    return retTree

In [25]:
testMat = mat(eye(4))
testMat

matrix([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

In [29]:
mat0,mat1 = binSplitDataset(testMat,1,0.5)
print ('mat0',mat0)
print ('mat1',mat1)

mat0 [[0. 1. 0. 0.]]
mat1 [[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [35]:
from sklearn.datasets import load_iris
iris = load_iris()

In [54]:
#irisdata = mat((shape(iris['data'])[0],shape(iris['data'])[1]+1))
iris['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [78]:

irisdata = c_[iris['data'],iris['target']] #np.c_[a,b]给数组增加一列
irisdata.dtype

dtype('float64')

In [77]:
createTree(irisdata)

{'spInd': 2,
 'spVal': 1.9,
 'left': {'spInd': 3,
  'spVal': 1.7,
  'left': 1.9782608695652173,
  'right': {'spInd': 2,
   'spVal': 4.9,
   'left': 1.6666666666666667,
   'right': 1.0208333333333333}},
 'right': 0.0}

In [165]:
#用树回归进行预测
def regTreeEval(model,inDat):
    return float(model)

def modelTreeEval(model,inData):
    n = shape(inData)[1]
    X = mat(ones((1,n+1)))
    
    X[:,1:n+1] = inData
    #print (X[0,:5])
    return float(X[0,:5]*model)


def treeForeCast(tree, inData,modelEval=regTreeEval):
    if not type(tree).__name__=='dict': return modelEval(tree,inData)
    
    if inData[0,tree['spInd']]>tree['spVal']:
        if type(tree['left']).__name__=='dict':
            return treeForeCast(tree['left'],inData,modelEval)
        else: 
            return modelEval(tree['left'],inData)
    else:
        if type(tree['right']).__name__=='dict':
            return treeForeCast(tree['right'],inData,modelEval)
        else:
            return modelEval(tree['right'],inData)
        
def createForeCast(tree,testData,modelEval=regTreeEval):
    m = len(testData)
    yHat = mat(zeros((m,1)))
    for i in range(m):
        yHat[i,0] = treeForeCast(tree,mat(testData[i]),modelEval)
    return yHat

In [173]:
#回归树剪枝函数
def isTree(obj):
    return (type(obj).__name__=='dict')
def getMean(tree):
    if isTree(tree['right']):tree['right']=getMean(tree['right'])#返回树的平均值，塌陷处理
    if isTree(tree['left']):tree['left']=getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0

def prune(tree,testData):
    if shape(testData)[0] == 0: return getMean(tree) #没有测试数据则对树进行塌陷处理
    if (isTree(tree['right']) or isTree(tree['left'])):
        lSet,rSet = binSplitDataset(testData,tree['spInd'],tree['spVal'])
    if isTree(tree['left']): tree['left']=prune(tree['left'],lSet)
    if isTree(tree['right']): tree['right']=prune(tree['right'],rSet)
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet,rSet = binSplitDataset(testData,tree['spInd'],tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1]-tree['left'],2)) + sum(power(rSet[:,-1]-tree['right'],2))
        treeMean = (tree['left']+tree['right'])/2.0
        errorMerge = sum(power(testData[:,-1]-treeMean,2))
        if errorMerge<errorNoMerge:
            print ("merging")
            return treeMean #如果两个分支已经不再是子树，就可以进行合并，具体的做法是对合并前后的误差进行比较，如果合并后的误差比不合并的
                            #误差小就进行合并
        else: return tree
    else: return tree
    

In [174]:
#使用回归树计算的结果
myTree = createTree(irisdata)
yHat_reg = createForeCast(myTree,irisdata)
#print (yHat_reg)
corrcoef(yHat_reg.A,irisdata[:,-1],rowvar=0)[0,1]

0.9834085576729287

In [175]:
myTree

{'spInd': 2,
 'spVal': 1.9,
 'left': {'spInd': 3,
  'spVal': 1.7,
  'left': 1.9782608695652173,
  'right': {'spInd': 2,
   'spVal': 4.9,
   'left': 1.6666666666666667,
   'right': 1.0208333333333333}},
 'right': 0.0}

In [176]:
prune(myTree,irisdata)

{'spInd': 2,
 'spVal': 1.9,
 'left': {'spInd': 3,
  'spVal': 1.7,
  'left': 1.9782608695652173,
  'right': {'spInd': 2,
   'spVal': 4.9,
   'left': 1.6666666666666667,
   'right': 1.0208333333333333}},
 'right': 0.0}

In [167]:
def linearSolve(dataset):
    m,n = shape(dataset)
    X = mat(ones((m,n))); Y = mat(ones((m,1)))#常数项X0=1
    X[:,1:n] = dataset[:,0:n-1]; Y = dataset[:,-1].T; Y = Y.reshape((m,1))
    xTx = X.T*X
    #print (Y.shape)
    if linalg.det(xTx)==0.0:
        raise NameError("This matrix is singular,cannot do inverse,trying increasing the second value of ops")
    ws = xTx.I*(X.T*Y)
    return ws,X,Y

def modelLeaf(dataset):
    ws,X,Y = linearSolve(dataset)
    return ws

def modelErr(dataset):
    ws,X,Y = linearSolve(dataset)
    yHat = X*ws
    return sum(power(Y-yHat,2))


In [177]:
#使用模型树计算的结果
myTree_reg = createTree(irisdata,modelLeaf,modelErr) 
yHat_reg = createForeCast(myTree_reg,irisdata,modelTreeEval)
#print (yHat_reg)
corrcoef(yHat_reg.A,irisdata[:,-1],rowvar=0)[0,1]

0.9891787551366312

In [178]:
prune(myTree_reg,irisdata)

{'spInd': 3, 'spVal': 1.7, 'left': matrix([[ 1.70003713],
         [ 0.03047756],
         [-0.12990224],
         [ 0.03044713],
         [ 0.14394673]]), 'right': {'spInd': 2,
  'spVal': 4.9,
  'left': matrix([[ 8.01596885],
          [ 1.41418804],
          [-0.73748028],
          [-0.64873196],
          [-6.42347923]]),
  'right': matrix([[ 0.59469563],
          [-0.11148697],
          [-0.1209064 ],
          [ 0.18957339],
          [ 0.46032827]])}}