In [9]:
from numpy import *
def loadSimpData():
    datMat = matrix([
        [1. ,2.1],
        [2. ,1.1],
        [1.3,1.],
        [1., 1.],
        [2. ,1.]
    ])
    classLabels = [1.0,1.0,-1.0,-1.0,1.0]
    return datMat,classLabels
#stumpClassify()是通过阈值比较对数据进行分类的。 所有在阈值一边的数据会分到类别-1，而在另外一边的数据分到类别+1。
#该函数可以通过数组过滤来实现，，然后将所有不满足不等式要求的元素设置为1。 
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):#just classify the data
    retArray = ones((shape(dataMatrix)[0],1))#首先将返回数组的全部元素设置为1
    if threshIneq == 'lt':
        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0
    return retArray

In [22]:
#这个函数是找到最合适的分类点（阈值），并确定大于阈值是1还是小于阈值是1（相反的是-1）
#实现对应于书本上P159的(a)和(b)步骤
def buildStump(dataArr,classLabels,D):
    dataMatrix = mat(dataArr); labelMat = mat(classLabels).T
    m,n = shape(dataMatrix) #m是样本数，n是特征
    numSteps = 10.0; #设定走多少步，用于在特征的所有可能值上进行遍历。
    bestStump = {};#这个字典用于存储给定权重向量D时，所得到的佳单层决策树的相关信息 
    bestClasEst = mat(zeros((m,1)))
    minError = inf #init error sum, to +infinity
    for i in range(n):#loop over all dimensions,遍历所有的特征
        rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max();
        #因为是数值型的特征，所以可以通过计算小值和大值来了解应该需要多大的步长
        stepSize = (rangeMax-rangeMin)/numSteps #步长
        for j in range(-1,int(numSteps)+1):#范围是-1到int(numStep) loop over all range in current dimension
            for inequal in ['lt', 'gt']: #go over less than and greater than
                threshVal = (rangeMin + float(j) * stepSize) #阈值前进一步
                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal) #call stump classify with i, j, lessThan
                errArr = mat(ones((m,1)))
                #python这个写法还真是妙，先把errArr全部赋值1，然后把预测对的位置改成0
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T*errArr  #calc total error multiplied by D
                #print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
                if weightedError < minError:
                    minError = weightedError #分类误差率e_m
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal #inequal是不等式的意思
    return bestStump,minError,bestClasEst


In [36]:
#基于决策树桩的AdaBoost训练过程 
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0] #样本数量
    D = mat(ones((m,1))/m)   #init D to all equal
    aggClassEst = mat(zeros((m,1)))
    for i in range(numIt):
        #返回的则是利用D而得到的具有小错误率的单层决策树、小的错误率、估计的类别向量
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)#build Stump
        #print "D:",D.T
        #实现对应于书本上P159的(c)步骤
        alpha = float(0.5*log((1.0-error)/max(error,1e-16)))#calc alpha, throw in max(error,eps) to account for error=0
        bestStump['alpha'] = alpha  
        weakClassArr.append(bestStump)      #store Stump Params in Array
        #print "classEst: ",classEst.T
        #实现对应于书本上P159的(d)步骤
        #对应于公式(8.3)，alpha相当于α_m，classLabels相当于y_i，classEst相当于G_m(x_i)
        expon = multiply(-1*alpha*mat(classLabels).T,classEst) #multiply让矩阵对应元素位置相乘
        #对应于公式(8.4)
        D = multiply(D,exp(expon))                              #Calc New D for next iteration
        D = D/D.sum() #D.sum()相当于Z_m
        #calc training error of all classifiers, if this is 0 quit for loop early (use break)
        #错误率累加计算
        aggClassEst += alpha*classEst #aggClassEst记录每个数据点的类别估计累计值，当aggClassEst所有值的符号和真实类别标签都完全吻合，
                                      #那么训练错误率为0，便可以结束了
        #print "aggClassEst: ",aggClassEst.T
        #sign()，x>0时 sign(x)=1；x=0时 sign(x)=0；x<0时 sign(x)=-1
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,ones((m,1))) #平均被判断错的数目
        errorRate = aggErrors.sum()/m #错误率
        print("total error: %f" % (errorRate))
        if errorRate == 0.0: break
    return weakClassArr#,aggClassEst
    #return bestStump

In [37]:
def adaClassify(datToClass,classifierArr):
    dataMatrix = mat(datToClass)#将datToClass转换成了一个NumPy矩阵
    m = shape(dataMatrix)[0] #得到datToClass中的待分类样例的个数m
    aggClassEst = mat(zeros((m,1))) #构建0向量
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix,\
                                 classifierArr[i]['dim'],\
                                 classifierArr[i]['thresh'],\
                                 classifierArr[i]['ineq'])#call stump classify
        aggClassEst += classifierArr[i]['alpha']*classEst
        print(aggClassEst)
    return sign(aggClassEst)

#读取数据
def loadDataSet(fileName):      #general function to parse tab -delimited floats
    numFeat = len(open(fileName).readline().split('\t')) #get number of fields 
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArray =[]
        curLine = line.strip().split('\t')
        for i in range(numFeat-1): #表示从0到numFeat-2
            lineArray.append(float(curLine[i])) 
        dataMat.append(lineArray)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat
datMat,classLabels=loadDataSet('horseColicTraining2.txt')
classifierArray = adaBoostTrainDS(datMat,classLabels,40)

total error: 0.284281
total error: 0.284281
total error: 0.247492
total error: 0.247492
total error: 0.254181
total error: 0.240803
total error: 0.240803
total error: 0.220736
total error: 0.247492
total error: 0.230769
total error: 0.240803
total error: 0.214047
total error: 0.227425
total error: 0.217391
total error: 0.220736
total error: 0.217391
total error: 0.224080
total error: 0.224080
total error: 0.230769
total error: 0.224080
total error: 0.214047
total error: 0.207358
total error: 0.224080
total error: 0.224080
total error: 0.214047
total error: 0.220736
total error: 0.204013
total error: 0.207358
total error: 0.210702
total error: 0.217391
total error: 0.210702
total error: 0.217391
total error: 0.207358
total error: 0.210702
total error: 0.207358
total error: 0.207358
total error: 0.197324
total error: 0.190635
total error: 0.200669
total error: 0.197324


In [38]:
testArr,testLabelArr=loadDataSet('horseColicTest2.txt')
predic = adaClassify(testArr,classifierArray)

KeyError: 0