In [1]:
import numpy as np  

In [2]:
def loadSimpData():  #一个简单的数据集和标签集
    datMat = np.matrix([[1.,2.1],
                        [2.,1.1],
                        [1.3,1.],
                        [1.,1.],
                        [2.,1.]])
    classLabels = [1.0,1.0,-1.0,-1.0,1.0]
    return datMat,classLabels

In [3]:
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): #分类函数
    retArray = np.ones((np.shape(dataMatrix)[0],1))  #初始化标签数组，令其全部为1
    if threshIneq == 'It':  #如果包含It
        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #数据集给定列小于给定值，则令其标签为-1
    else:   
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0  #其他情况下，数据集给定列大于给定值，令其标签为-1
    return retArray

In [4]:
def buildStump(dataArr,classLabels,D):  #创建二叉树
    dataMatrix = np.mat(dataArr);labelMat = np.mat(classLabels).T #提取数据集和标签集，并转换为矩阵
    m,n = np.shape(dataMatrix)  #数据集行列
    numSetps = 10.0  #
    bestStump  = {}
    bestClasEst = np.mat(np.zeros((m,1)))
    minError = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:,i].min();rangeMax = dataMatrix[:,i].max();
        stepSize = (rangeMax - rangeMin)/numSetps
        for j in range(-1,int(numSetps) + 1):
            for inequal in ['lt','gt']:
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)
                errArr = np.mat(np.ones((m,1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T*errArr
                #print("split: dim %d,thresh %.2f,thresh ineqal: %s,the weighted error is %.3f" % (i,threshVal,inequal,weightedError))
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump,minError,bestClasEst

In [5]:
dataMat,classLabel = loadSimpData()

In [6]:
D = np.mat(np.ones((5,1))/5);D

matrix([[0.2],
        [0.2],
        [0.2],
        [0.2],
        [0.2]])

In [7]:
buildStump(dataMat,classLabel,D)

({'dim': 0, 'thresh': 2.0, 'ineq': 'lt'}, matrix([[0.4]]), array([[1.],
        [1.],
        [1.],
        [1.],
        [1.]]))

In [18]:
def adaBoostTrainDS(dataArr,classLabels,numIt = 40):
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m,1))/m)
    aggClassEst = np.mat(np.zeros((m,1)))
    for i in range(numIt):
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)
        #print("D:",D.T)      
        alpha = float(0.5*np.log((1.0-error)/max(error,1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        #print("classEst: ",classEst.T)
        expon = np.multiply(-1*alpha*np.mat(classLabels).T,classEst)
        D = np.multiply(D,np.exp(expon))
        D = D/D.sum()
        aggClassEst += alpha * classEst
        #print("aggClassEst: ",aggClassEst.T)
        aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T,np.ones((m,1)))
        errorRate = aggErrors.sum()/m
        print("total error: ",errorRate,"\n")
        if errorRate == 0.0: break
    return weakClassArr

In [9]:
classifierArray = adaBoostTrainDS(dataMat,classLabel,9)

D: [[0.2 0.2 0.2 0.2 0.2]]
classEst:  [[1. 1. 1. 1. 1.]]
aggClassEst:  [[0.20273255 0.20273255 0.20273255 0.20273255 0.20273255]]
total error:  0.4 

D: [[0.16666667 0.16666667 0.25       0.25       0.16666667]]
classEst:  [[1. 1. 1. 1. 1.]]
aggClassEst:  [[0.20273255 0.20273255 0.20273255 0.20273255 0.20273255]]
total error:  0.4 

D: [[0.16666667 0.16666667 0.25       0.25       0.16666667]]
classEst:  [[-1. -1. -1. -1. -1.]]
aggClassEst:  [[0.20273255 0.20273255 0.20273255 0.20273255 0.20273255]]
total error:  0.4 

D: [[0.16666667 0.16666667 0.25       0.25       0.16666667]]
classEst:  [[-1. -1. -1. -1. -1.]]
aggClassEst:  [[0.20273255 0.20273255 0.20273255 0.20273255 0.20273255]]
total error:  0.4 

D: [[0.16666667 0.16666667 0.25       0.25       0.16666667]]
classEst:  [[-1. -1. -1. -1. -1.]]
aggClassEst:  [[0.20273255 0.20273255 0.20273255 0.20273255 0.20273255]]
total error:  0.4 

D: [[0.16666667 0.16666667 0.25       0.25       0.16666667]]
classEst:  [[-1. -1. -1. -1. -1.]

In [10]:
classifierArray

[{'dim': 0, 'thresh': 2.0, 'ineq': 'lt', 'alpha': 0.2027325540540821},
 {'dim': 0, 'thresh': 2.0, 'ineq': 'lt', 'alpha': 1.1102230246251564e-16},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.0},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.0},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.0},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.0},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.0},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.0},
 {'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.0}]

In [11]:
def adaClassify(datToClass,classifierArr):
    dataMatrix = np.mat(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((m,1)))
    for i in range(len(classifierArray)):
        classEst = stumpClassify(dataMatrix,classifierArr[i]['dim'],\
                                           classifierArr[i]['thresh'],\
                                           classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha']*classEst
        print(aggClassEst)
    return np.sign(aggClassEst)

In [12]:
adaClassify([0,0],classifierArray)

[[0.20273255]]
[[0.20273255]]
[[0.20273255]]
[[0.20273255]]
[[0.20273255]]
[[0.20273255]]
[[0.20273255]]
[[0.20273255]]
[[0.20273255]]


matrix([[1.]])

In [13]:
def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = [];labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat

In [19]:
datArr,labelArr = loadDataSet('horseColicTraining.txt')
classifierArray = adaBoostTrainDS(datArr,labelArr,10)

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 

total error:  0.40468227424749165 



In [21]:
testArr,testLabelArr = loadDataSet('horseColicTest.txt')
prediction10 = adaClassify(testArr,classifierArray)

[[0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]
 [0.1929965]]
[[0.28949475]
 [0.28949475]
 [0.28949475]
 [0.28949475]
 [0.28949475]
 [0.28949475]
 [0.28949475]
 [0.28949475]
 [0.28949475]
 [

In [22]:
errArr = np.mat(np.ones((67,1)))
errArr[prediction10 != np.mat(testLabelArr).T].sum()

20.0