In [1]:
from numpy import *
from pathlib import Path

In [2]:
#决策树是一种贪心算法。在给定时间点做出最佳选择，并不关心能否达到全局最优。

In [3]:
#构建树节点

In [4]:
class treeNode():
    """Node of a binary tree: split feature, split value and two branches.

    Attributes
    ----------
    featureToSplitOn : feature (column) index used for the split.
    valueOfSplit : threshold value of the split.
    rightBranch, leftBranch : child branches of this node.
    """

    def __init__(self,feat,val,right,left):
        # Bind the values to the instance; plain local assignments would be
        # discarded as soon as __init__ returns.
        self.featureToSplitOn = feat
        self.valueOfSplit = val
        self.rightBranch = right
        self.leftBranch = left

In [5]:
#CART算法

In [46]:
def regLeaf(dataSet):
    """Leaf value of a regression tree: the mean of the last (target) column."""
    targetColumn = dataSet[:, -1]
    return mean(targetColumn)

def regErr(dataSet):
    """Total squared error of the target column: its variance times the row count."""
    rowCount = shape(dataSet)[0]
    targetVariance = var(dataSet[:, -1])
    return targetVariance * rowCount

def chooseBestSplit(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
    """Pick the feature index and threshold that give the lowest split error.

    Parameters
    ----------
    dataSet : 2-D numpy matrix whose last column is the target. Columns are
        read through ``[:, j].T.tolist()[0]``, which relies on matrix
        semantics.
    leafType : callable building the leaf payload from a data set.
    errType : callable returning the error of a data set.
    ops : ``(tolS, tolN)`` -- the minimum error reduction a split must
        achieve, and the minimum number of rows allowed on each side.

    Returns
    -------
    ``(featureIndex, splitValue)`` for the best split, or
    ``(None, leafType(dataSet))`` when this node should become a leaf.
    """
    minErrorDrop = ops[0]
    minRows = ops[1]

    # Every target identical: splitting cannot reduce the error.
    distinctTargets = set(dataSet[:,-1].T.tolist()[0])
    if len(distinctTargets) == 1:
        return None,leafType(dataSet)

    _,columnCount = shape(dataSet)
    baseError = errType(dataSet)
    lowestError,bestIndex,bestValue = float(inf),0,0

    # Exhaustive search over every feature and every value it takes.
    for featIndex in range(columnCount-1):
        candidateValues = set(dataSet[:,featIndex].T.tolist()[0])
        for splitVal in candidateValues:
            above,below = binSplitDataSet(dataSet,featIndex,splitVal)

            # Reject splits that leave too few rows on either side.
            tooSmall = shape(above)[0] < minRows or shape(below)[0] < minRows
            if tooSmall:
                continue

            splitError = errType(above) + errType(below)
            if splitError < lowestError:
                lowestError,bestIndex,bestValue = splitError,featIndex,splitVal

    # Improvement below the threshold (or no admissible split, in which case
    # lowestError is still inf): make a leaf.
    if baseError - lowestError < minErrorDrop:
        return None,leafType(dataSet)

    above,below = binSplitDataSet(dataSet,bestIndex,bestValue)
    if shape(above)[0] < minRows or shape(below)[0] < minRows:
        return None,leafType(dataSet)

    return bestIndex,bestValue

In [47]:
def loadDataSet(fileName):
    """Read a tab-separated numeric text file into a list of float rows.

    Parameters
    ----------
    fileName : path accepted by ``open`` (``str`` or ``pathlib.Path``).

    Returns
    -------
    list of lists of float -- one inner list per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a field cannot be parsed by ``float``.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. trailing newlines) carry no record.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

def binSplitDataSet(dataSet,feature,value):
    """Split the rows of ``dataSet`` on one feature.

    Returns ``(above, atOrBelow)``: the rows whose ``feature`` column is
    strictly greater than ``value``, and the rows where it is less than or
    equal to ``value``.
    """
    featureColumn = dataSet[:,feature]
    aboveRows = nonzero(featureColumn > value)[0]
    atOrBelowRows = nonzero(featureColumn <= value)[0]
    return dataSet[aboveRows,:],dataSet[atOrBelowRows,:]

def createTree(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
    """Recursively build a tree as nested dicts.

    Parameters
    ----------
    dataSet : data matrix passed to ``chooseBestSplit``.
    leafType, errType, ops : forwarded unchanged to ``chooseBestSplit`` at
        every level of the recursion.

    Returns
    -------
    Either the leaf payload returned by ``chooseBestSplit`` or a dict with
    keys ``'spInd'`` (split feature index), ``'spVal'`` (split value),
    ``'left'`` (subtree for rows with feature > value) and ``'right'``
    (subtree for rows with feature <= value).
    """
    feat,val = chooseBestSplit(dataSet,leafType,errType,ops)
    # Identity test: None is a singleton, and `==` could be hijacked by
    # array-like operands.
    if feat is None:
        return val
    lSet,rSet = binSplitDataSet(dataSet,feat,val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet,leafType,errType,ops),
        'right': createTree(rSet,leafType,errType,ops),
    }

In [38]:
# Directory holding the sample data files read by the cells below.
# A raw string keeps the Windows backslashes literal; sequences such as "\p"
# or "\C" are invalid escapes and raise a SyntaxWarning on recent Pythons.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
data_path = Path(r'D:\python_algorithm\machinelearinginaction\《机器学习实战》Python3代码\Ch09')

In [39]:
# Load ex00.txt as a list of float rows.
myDat = loadDataSet(data_path / 'ex00.txt')

In [40]:
# asmatrix is the function the removed `mat` alias pointed to (NumPy 2.0
# dropped `mat`); the tree code relies on matrix semantics.
myMat = asmatrix(myDat)

In [48]:
# Build a tree with the defaults (regLeaf, regErr, ops=(1,4)); the resulting
# dict is displayed as the cell output.
createTree(myMat)

{'spInd': 0,
 'spVal': 0.48813,
 'left': 1.0180967672413792,
 'right': -0.04465028571428572}

In [49]:
##9.4 树剪枝

In [58]:
def isTree(obj):
    """Return True when ``obj`` is an internal tree node (a dict), else False.

    Leaves are stored as plain values, so anything that is not a dict is
    treated as a leaf. ``isinstance`` also accepts dict subclasses such as
    ``OrderedDict``, unlike a comparison on the type's name.
    """
    return isinstance(obj, dict)

def getMean(tree):
    """Collapse ``tree`` into one number by recursively averaging its branches.

    Subtrees are replaced in place by their collapsed value (right first,
    then left), after which the mean of the two branch values is returned.
    """
    for side in ('right', 'left'):
        child = tree[side]
        if isTree(child):
            tree[side] = getMean(child)
    return (tree['left'] + tree['right'])/2.0

def prune(tree,testData):
    """Post-prune ``tree`` against ``testData``, merging leaves when that helps.

    Parameters
    ----------
    tree : dict node as produced by ``createTree`` (modified in place).
    testData : matrix of held-out rows routed down the tree.

    Returns
    -------
    The (possibly modified) tree, or a single number when the node was
    collapsed: the mean of its two leaves if merging lowers the squared error
    on ``testData``, or ``getMean(tree)`` if no test rows reach this node.
    """
    # No test rows reach this node: collapse the whole subtree.
    if shape(testData)[0] == 0:
        return getMean(tree)

    # Route the test rows to the children and prune those first.
    hasSubtree = isTree(tree['right']) or isTree(tree['left'])
    if hasSubtree:
        lSet,rSet = binSplitDataSet(testData,tree['spInd'],tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'],lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'],rSet)

    # Merging is only considered once both children are leaves.
    if isTree(tree['right']) or isTree(tree['left']):
        return tree

    lSet,rSet = binSplitDataSet(testData,tree['spInd'], tree['spVal'])
    leftError = sum(power(lSet[:,-1] - tree['left'],2))
    rightError = sum(power(rSet[:,-1] - tree['right'],2))
    errorNoMerge = leftError + rightError

    treeMean = (tree['left'] + tree['right'])/2.0
    errorMerge = sum(power(testData[:,-1] - treeMean,2))

    if errorMerge < errorNoMerge:
        print('merging')
        return treeMean
    return tree

In [51]:
# Load ex2.txt, the training data for the pruning example.
myDat2 = loadDataSet(data_path / 'ex2.txt')

In [52]:
# asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
myMat2 = asmatrix(myDat2)

In [56]:
# ops=(0,1): no minimum error reduction and one row per side is enough, so
# the tree grows as large as the data allows before it is pruned below.
myTree = createTree(myMat2, ops=(0,1))

In [54]:
# Load ex2test.txt, the held-out data used for pruning.
myDatTest = loadDataSet(data_path / 'ex2test.txt')

In [55]:
# asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
myMat2Test = asmatrix(myDatTest)

In [60]:
# Prune against the held-out data. prune() rewrites branches of its argument
# in place, so myTree itself is modified by this call.
pruneTree = prune(myTree,myMat2Test)

In [61]:
##9.5 模型树

In [62]:
##用线性模型对数据进行拟合，然后计算真实的目标值与模型预测值间的差值。最后将这些差值的平方求和就得到所需的误差。

In [66]:
def linearSolve(dataSet):
    """Solve ordinary least squares for ``dataSet`` via the normal equations.

    Parameters
    ----------
    dataSet : matrix whose last column is the target and whose other columns
        are the features.

    Returns
    -------
    ``(ws, X, Y)`` -- the coefficient column vector, the design matrix (a
    leading column of ones followed by the features) and the target column.

    Raises
    ------
    NameError
        If the determinant of ``X.T * X`` is exactly zero, i.e. the matrix
        cannot be inverted.
    """
    m,n = shape(dataSet)
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    X = asmatrix(ones((m,n)))

    # Column 0 stays all ones (intercept); the features fill the rest.
    X[:,1:n] = dataSet[:,0:n-1]
    Y = dataSet[:,-1]

    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('this matrix is singular,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws,X,Y

def modelLeaf(dataSet):
    """Leaf payload of a model tree: the linear coefficients fitted to ``dataSet``."""
    return linearSolve(dataSet)[0]

def modelErr(dataSet):
    """Error of a model-tree node: sum of squared residuals of the linear fit."""
    ws,X,Y = linearSolve(dataSet)
    residuals = Y - X * ws
    return sum(power(residuals,2))

In [68]:
# Build a model tree: leaves hold linear coefficients (modelLeaf) and the
# split error is the squared residual of the linear fit (modelErr).
myTree2 = createTree(myMat2,modelLeaf,modelErr)

In [69]:
#9.6 树回归与标准回归的测试比较

In [73]:
def regTreeEval(model,inDat):
    """Evaluate a regression-tree leaf: the leaf value itself, as a float.

    ``inDat`` is unused; it is accepted so the signature matches
    ``modelTreeEval``.
    """
    prediction = float(model)
    return prediction

def modelTreeEval(model,inDat):
    """Evaluate a model-tree leaf: apply its linear coefficients to ``inDat``.

    Parameters
    ----------
    model : column vector of coefficients, intercept first.
    inDat : 1 x n matrix holding one sample's feature values.

    Returns
    -------
    float -- the linear prediction for the sample.
    """
    n = shape(inDat)[1]
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    X = asmatrix(ones((1,n+1)))
    # Column 0 stays 1.0 so it multiplies the intercept coefficient.
    X[:,1:n+1] = inDat
    # Take the single element explicitly: calling float() on a 1x1 matrix is
    # deprecated in recent NumPy releases.
    return float((X * model)[0,0])

def treeForeCast(tree,inData, modelEval = regTreeEval):
    """Predict one sample by walking ``tree`` down to a leaf.

    Parameters
    ----------
    tree : nested dict produced by ``createTree``, or a bare leaf payload.
    inData : one sample's feature values (1 x n matrix or 1-D sequence).
    modelEval : callable ``(leaf, inData) -> float`` used to evaluate the
        leaf that is reached.

    Returns
    -------
    Whatever ``modelEval`` returns for the leaf reached by ``inData``.
    """
    if not isTree(tree):
        return modelEval(tree,inData)

    # Address the feature by column position. Indexing a 1 x n matrix with a
    # single integer selects a ROW, which only happens to work when the split
    # feature index is 0.
    featureValue = asarray(inData).ravel()[tree['spInd']]

    # 'left' holds the samples with feature > spVal, 'right' the rest. The
    # recursive call evaluates leaves through the guard at the top.
    branch = tree['left'] if featureValue > tree['spVal'] else tree['right']
    return treeForeCast(branch,inData,modelEval)

def createForeCast(tree,testData,modelEval = regTreeEval):
    """Predict every row of ``testData`` with ``tree``.

    Parameters
    ----------
    tree : tree (or bare leaf) accepted by ``treeForeCast``.
    testData : sequence of samples; ``len(testData)`` predictions are made.
    modelEval : leaf evaluator forwarded to ``treeForeCast``.

    Returns
    -------
    An m x 1 matrix of predictions, one row per sample.
    """
    m = len(testData)
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    yHat = asmatrix(zeros((m,1)))
    for i in range(m):
        yHat[i,0] = treeForeCast(tree,asmatrix(testData[i]),modelEval)
    return yHat

In [71]:
# Load the bike-speed-vs-IQ train and test sets as matrices.
# asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
trainMat = asmatrix(loadDataSet(data_path / 'bikeSpeedVsIq_train.txt'))
testMat = asmatrix(loadDataSet(data_path / 'bikeSpeedVsIq_test.txt'))

In [72]:
## Build an ordinary regression tree (constant leaves).
# ops=(1,20): at least 20 rows on each side of every split.
myTree3 = createTree(trainMat,ops=(1,20))

In [74]:
# Predict from column 0 of the test set using the regression tree.
yHat = createForeCast(myTree3,testMat[:,0])

In [77]:
# Correlation coefficient between the predictions and column 1 of the test set.
corrcoef(yHat,testMat[:,1],rowvar=0)[0,1]

0.9640852318222141

In [78]:
## Build a model tree (linear model in each leaf).
# Same ops=(1,20) as the regression tree above, for a like-for-like comparison.
myTree4 = createTree(trainMat,modelLeaf,modelErr,(1,20))

In [79]:
# Predict with the model tree; modelTreeEval applies each leaf's coefficients.
# Note: this rebinds yHat, replacing the regression-tree predictions.
yHat = createForeCast(myTree4,testMat[:,0],modelTreeEval)

In [80]:
# Correlation coefficient between the model-tree predictions and column 1 of
# the test set.
corrcoef(yHat,testMat[:,1],rowvar=0)[0,1]

0.9760412191380604