In [1]:
from numpy import *

In [4]:
# 7.3 Building a weak classifier from a single-level decision tree (decision stump)

In [18]:
def loadSimpData():
    """Return the five-sample toy dataset used by the cells below.

    Returns:
        tuple: ``(datMat, classLabels)``. ``datMat`` is a 5x2 numpy
        ``matrix`` with one sample per row; ``classLabels`` is a plain list
        holding the five matching labels, each either ``1.0`` or ``-1.0``.
    """
    samples = [
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0],
    ]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(samples), labels

In [19]:
# Load the toy samples and labels once; the later cells reuse these two names.
dataMat, classLabels = loadSimpData()

In [7]:
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    """Label every row +1.0 or -1.0 by thresholding a single column.

    Args:
        dataMatrix: 2-D sample container indexable as ``dataMatrix[:, dimen]``
            (the notebook passes a numpy ``matrix``).
        dimen: index of the column to compare against the threshold.
        threshVal: the threshold value.
        threshIneq: ``'lt'`` marks rows whose value is ``<= threshVal`` as
            -1.0; any other value marks rows whose value is ``> threshVal``
            as -1.0.

    Returns:
        An ``(m, 1)`` float array of predictions, where ``m`` is the number
        of rows in ``dataMatrix``; rows not marked -1.0 stay at +1.0.
    """
    feature = dataMatrix[:, dimen]
    # Work out which rows fall on the "negative" side of the threshold.
    if threshIneq == 'lt':
        negativeSide = feature <= threshVal
    else:
        negativeSide = feature > threshVal

    # Start from all +1.0 and flip only the selected rows.
    predictions = ones((shape(dataMatrix)[0], 1))
    predictions[negativeSide] = -1.0
    return predictions

In [33]:
def buildStump(dataArr,classLabels,D,numSteps = 10.0):
    """Search for the decision stump with the lowest weighted error.

    Every column is scanned with ``int(numSteps) + 2`` evenly spaced
    thresholds (one step below the column minimum up to the column maximum),
    each tried with both inequality directions via ``stumpClassify``.

    Args:
        dataArr: 2-D samples, one row per sample (anything ``asmatrix``
            accepts).
        classLabels: sequence of per-sample labels; it is converted to a
            column so it can be compared element-wise with the predictions.
        D: column of per-sample weights; it must support ``D.T * errArr``
            as a matrix product (the notebook passes an ``(m, 1)`` matrix).
        numSteps: number of threshold steps across each column's value
            range. Defaults to 10.0, the value that used to be hard-coded.

    Returns:
        tuple: ``(bestStump, minError, bestClasEst)``.
        ``bestStump`` is a dict with keys ``'dim'``, ``'thresh'`` and
        ``'ineq'``; ``minError`` is the 1x1 matrix ``D.T * errArr`` of the
        winning stump; ``bestClasEst`` is that stump's ``(m, 1)``
        prediction array. If no candidate beats the initial ``inf`` error,
        ``bestStump`` is empty, ``minError`` is ``inf`` and ``bestClasEst``
        is all zeros.

    Raises:
        ValueError: if ``numSteps`` is not positive.
    """
    if numSteps <= 0:
        raise ValueError('numSteps must be positive')

    # asmatrix is the spelling that survives NumPy 2.0 (the `mat` alias was
    # removed from the numpy namespace there); behaviour is identical.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T

    m,n = shape(dataMatrix)

    bestStump = {}
    bestClasEst = asmatrix(zeros((m,1)))

    minError = inf

    for i in range(n):
        rangeMin = dataMatrix[:,i].min()
        rangeMax = dataMatrix[:,i].max()

        stepSize = (rangeMax - rangeMin)/numSteps

        # j starts at -1 so the first threshold lies below every sample.
        for j in range(-1,int(numSteps) + 1):
            # The threshold does not depend on the inequality direction,
            # so compute it once per step.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt','gt']:
                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)

                # 1 where the prediction is wrong, 0 where it is right.
                errArr = asmatrix(ones((m,1)))
                errArr[predictedVals == labelMat] = 0

                # Weighted error: total weight of the misclassified rows.
                weightedError = D.T * errArr

                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump,minError,bestClasEst
    

In [34]:
# Uniform starting weights (1/m per sample), sized from dataMat so the weight
# column always matches the number of rows. asmatrix replaces the `mat` alias,
# which is no longer available in NumPy 2.0.
D = asmatrix(ones((shape(dataMat)[0], 1)) / shape(dataMat)[0])

In [35]:
# Best single stump under the uniform weights D; the returned tuple is
# displayed as the cell output.
buildStump(dataMat, classLabels, D)

({'dim': 0, 'thresh': 1.3, 'ineq': 'lt'},
 matrix([[0.2]]),
 array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

In [36]:
# 7.4 The complete AdaBoost algorithm using multiple weak classifiers

In [37]:
def adaBoostTrainDS(dataArr,classLabels,numIt = 40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump for the current sample weights with
    ``buildStump``, gives it a vote ``alpha`` derived from its weighted
    error, re-weights the samples and prints the training error of the
    combined vote. Training stops early once that error reaches 0.

    Args:
        dataArr: 2-D samples, one row per sample.
        classLabels: sequence of per-sample labels (the notebook uses
            ``1.0`` / ``-1.0``).
        numIt: maximum number of boosting rounds. Defaults to 40.

    Returns:
        list: one dict per round, each holding the stump's ``'dim'``,
        ``'thresh'`` and ``'ineq'`` plus its vote weight ``'alpha'``.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # The label column never changes, so build it once instead of twice per
    # round. asmatrix replaces the `mat` alias removed in NumPy 2.0.
    labelMat = asmatrix(classLabels).T
    D = asmatrix(ones((m,1))/m)
    aggClassEst = asmatrix(zeros((m,1)))
    for i in range(numIt):
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)

        # buildStump hands back the error as a 1x1 matrix; pull out the plain
        # scalar explicitly, because float() on a non-0-d array is deprecated
        # since NumPy 1.25.
        error = asarray(error, dtype=float).item()
        # Clamp the denominator away from zero. Written as a conditional
        # rather than max() so a star-import of numpy can never swap in
        # numpy's max, which would treat 1e-16 as an axis argument.
        denom = 1e-16 if error < 1e-16 else error
        alpha = float(0.5*log((1.0-error)/denom))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)

        # exp(-alpha * label * prediction): the factor is < 1 for correctly
        # classified rows and > 1 for misclassified ones.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D,exp(expon))

        # Renormalise so the weights sum to 1 again.
        D = D/D.sum()

        aggClassEst += alpha * classEst

        # Fraction of rows where the sign of the running vote disagrees with
        # the label.
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m,1)))
        errorRate = aggErrors.sum()/m

        print('total error: ',errorRate,'\n')
        if errorRate == 0.0:
            break
    return weakClassArr

In [38]:
# Boost for at most 9 rounds on the toy data; the list of trained stumps is
# displayed as the cell output.
adaBoostTrainDS(dataMat, classLabels, numIt=9)

total error:  0.2 

total error:  0.2 

total error:  0.0 



[{'dim': 0, 'thresh': 1.3, 'ineq': 'lt', 'alpha': 0.6931471805599453},
 {'dim': 1, 'thresh': 1.0, 'ineq': 'lt', 'alpha': 0.9729550745276565},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.8958797346140273}]