In [2]:
import numpy as np
import os
import sys

In [221]:
def loadDataSet(fileName):
    """Load a tab-separated numeric data set into feature and label arrays.

    Each non-blank line is stripped and split on tabs. The number of fields
    on the first non-blank line fixes the expected field count ``numFeat``;
    on every line the first ``numFeat - 1`` fields are features and field
    ``numFeat - 1`` is the label (any extra trailing fields are ignored).

    Parameters
    ----------
    fileName : str
        Path of the text file to read.

    Returns
    -------
    dataMat : numpy.ndarray, shape (rows, numFeat - 1)
        Feature values as floats.
    labelMat : numpy.ndarray, shape (rows, 1)
        Label values as floats, as a column vector.
        A file with no data lines yields arrays of shape (0, 0) and (0, 1).

    Raises
    ------
    FileNotFoundError
        If ``fileName`` cannot be opened.
    IndexError
        If a line has fewer fields than the first data line.
    ValueError
        If a field cannot be converted to float.

    Line numbers in error messages are 0-based positions in the file.
    """
    featureRows = []
    labels = []
    numFeat = None
    # A single `with` block reads the file once and guarantees it is closed.
    with open(fileName) as fr:
        for index, line in enumerate(fr):
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no sample; skip them instead of failing.
                continue
            curLine = stripped.split('\t')
            if numFeat is None:
                numFeat = len(curLine)
            if len(curLine) < numFeat:
                raise IndexError(
                    'Line %d in Dataset only has %d fields not %d !!!'
                    % (index, len(curLine), numFeat))
            try:
                # Builtin float replaces the removed np.float alias.
                values = [float(field) for field in curLine[:numFeat]]
            except ValueError as err:
                raise ValueError(
                    'Line %d has incomplete characters: %r' % (index, line)
                ) from err
            featureRows.append(values[:-1])
            labels.append(values[-1])

    if numFeat is None:
        # No data lines at all: return empty, correctly ranked arrays.
        return np.empty((0, 0)), np.empty((0, 1))

    # Explicit row count avoids the ambiguous -1 when there are 0 feature columns.
    dataMat = np.array(featureRows, dtype=float).reshape(
        (len(featureRows), numFeat - 1))
    labelMat = np.array(labels, dtype=float).reshape(-1, 1)
    return dataMat, labelMat

In [92]:
# Load the horse-colic training file and print the shapes of the returned
# feature and label arrays.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# local copy of the data before re-running.
# NOTE(review): the recorded output below shows y as (299,), but loadDataSet
# as defined above returns its labels reshaped to (-1, 1); the output looks
# stale -- confirm by re-running on a fresh kernel.
filename = r'F:/GitHub/data-miner/Boost/horseColicTraining.txt'
x, y = loadDataSet(filename)
print(x.shape)
print(y.shape)

(299, 21)
(299,)


In [127]:
def stumpClassify(X, dimen, threshVal, threshIneq):
    """Classify samples with a one-feature threshold test (decision stump).

    Every sample starts with the vote +1.0. With ``threshIneq == 'lt'`` the
    samples whose feature ``dimen`` is <= ``threshVal`` are voted -1.0; with
    any other value of ``threshIneq`` the samples whose feature is
    > ``threshVal`` are voted -1.0 instead.

    Returns an array of shape (X.shape[0], 1) holding +1.0 / -1.0.
    """
    votes = np.ones((X.shape[0], 1))
    feature = X[:, dimen]
    # Pick which side of the threshold is sent to the negative class.
    if threshIneq == 'lt':
        negativeRows = feature <= threshVal
    else:
        negativeRows = feature > threshVal
    votes[negativeRows] = -1.0
    return votes

In [228]:
def buildStump(X, y, D):
    """Find the decision stump with the lowest weighted error.

    For every feature, candidate thresholds are laid out in ``numSteps``
    equal steps across the feature's [min, max] range (plus one step below
    the minimum), and both inequality directions ('lt', 'gt') are tried via
    ``stumpClassify``. The weighted error of a candidate is the sum of the
    weights ``D`` of the samples it misclassifies.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        Feature matrix.
    y : array-like with m elements
        Class labels; coerced to an (m, 1) column so that a flat label
        vector cannot broadcast against the (m, 1) predictions.
    D : array-like with m elements
        Sample weights; coerced to an (m, 1) column.

    Returns
    -------
    bestStump : dict
        Keys 'dim', 'thresh' and 'ineq' of the best stump. Empty if no
        candidate produced an error below infinity (e.g. X has no columns).
    minError : 0-d numpy.ndarray (or ``np.inf`` if no candidate was accepted)
        Weighted error of that stump.
    bestClasEst : numpy.ndarray, shape (m, 1)
        The stump's +1.0 / -1.0 predictions for X.
    """
    m, n = X.shape
    y = np.asarray(y).reshape(-1, 1)
    D = np.asarray(D).reshape(-1, 1)
    numSteps = 20  # number of threshold steps across each feature's range
    bestStump = {}
    bestClasEst = np.zeros((m, 1))
    minError = np.inf
    for dim_i in range(n):
        rangeMin = X[:, dim_i].min()
        rangeMax = X[:, dim_i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j = -1 places one threshold just below the smallest observed value.
        for j in range(-1, numSteps + 1):
            # Builtin float replaces the np.float alias removed from NumPy.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(X, dim_i, threshVal, inequal)
                # 1 where the stump is wrong, 0 where it is right.
                errArr = np.ones((m, 1))
                errArr[predictedVals == y] = 0
                weightedError = np.squeeze(np.dot(D.T, errArr))
                # Strict '<' keeps the first of several equally good stumps.
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = dim_i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal

    return bestStump, minError, bestClasEst

In [200]:
# Uniform weights for the five toy samples: each entry is 1/5.
D = np.full((5, 1), 1.0 / 5.0)

In [201]:
def loadSimpData():
    """Return a five-sample, two-feature toy data set.

    Returns
    -------
    X : numpy.ndarray, shape (5, 2)
        The sample coordinates.
    y : numpy.ndarray, shape (5, 1)
        The +1.0 / -1.0 class labels as a column vector.
    """
    points = [
        (1.0, 2.1),
        (2.0, 1.1),
        (1.3, 1.0),
        (1.0, 1.0),
        (2.0, 1.0),
    ]
    classes = [1.0, 1.0, -1.0, -1.0, 1.0]
    return np.array(points), np.array(classes).reshape(-1, 1)

In [202]:
# Sanity check: search for the best single stump on the toy data, using the
# uniform weight vector D defined in an earlier cell. The cell displays the
# returned (stump, weighted error, predictions) tuple.
X,y = loadSimpData()
buildStump(X, y, D)

({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, array(0.2), array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

In [203]:
# Scratch check: np.max over a two-element list returns the larger value
# (1.0 here).
np.max([1.0,1e-6])

1.0

In [233]:
def adaBoostTrainDS(X, y, numIt = 40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump under the current sample weights
    (``buildStump``), gives it the vote weight
    ``alpha = 0.5 * log((1 - error) / max(error, 1e-6))``, and re-weights
    the samples by ``exp(-alpha * y * prediction)`` followed by
    normalisation, so misclassified samples gain weight. Training stops
    early once the sign of the weighted vote matches every label.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        Feature matrix.
    y : array-like with m elements
        Class labels; coerced to an (m, 1) column so a flat label vector
        cannot broadcast into an (m, m) matrix in the updates below.
    numIt : int, optional
        Maximum number of boosting rounds (default 40).

    Returns
    -------
    aggErrors : float
        Fraction of samples misclassified by the final ensemble. With
        ``numIt <= 0`` this is the error of the empty ensemble, whose
        vote is zero for every sample.
    weakClassArr : list of dict
        One dict per round with keys 'dim', 'thresh', 'ineq' and 'alpha'.

    The training error is printed after every round.
    """
    weakClassArr = []
    m = X.shape[0]
    y = np.asarray(y).reshape(-1, 1)
    D = np.ones((m, 1)) / m          # start from uniform sample weights
    aggClassEst = np.zeros((m, 1))   # running alpha-weighted vote per sample
    # Defined before the loop so the return value exists even if numIt <= 0.
    aggErrors = np.mean(np.sign(aggClassEst) != y)
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(X, y, D)
        # The 1e-6 floor avoids division by zero for a perfect stump.
        alpha = 0.5 * np.log((1.0 - error) / np.max([error, 1e-6]))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # classEst * y is +1 for a correct prediction and -1 for a wrong one
        # (given +/-1 labels), so wrong samples are scaled up by exp(alpha).
        alpha_sign = -1.0 * alpha * classEst * y
        D_next = D * np.exp(alpha_sign)
        D = D_next / D_next.sum()
        aggClassEst += alpha * classEst
        aggErrors = np.mean(np.sign(aggClassEst) != y)
        print("total error:",aggErrors)
        if aggErrors == 0.0:
            break
    return aggErrors, weakClassArr

In [234]:
# Train on the toy data for at most 9 boosting rounds; the cell displays the
# returned (training error, list of stumps) tuple.
X,y = loadSimpData()
adaBoostTrainDS(X, y, 9)

total error: 0.2
total error: 0.2
total error: 0.0


(0.0,
 [{'alpha': 0.69314718055994529, 'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
  {'alpha': 0.9729550745276565, 'dim': 1, 'ineq': 'lt', 'thresh': 1.0},
  {'alpha': 0.89587973461402726,
   'dim': 0,
   'ineq': 'lt',
   'thresh': 0.94999999999999996}])

In [242]:
# Train on the horse-colic training file for at most 50 boosting rounds and
# print the final training error. X, y and weakClassArr are reused by the
# evaluation cell further down.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# local copy of the data before re-running.
filename = r'F:/GitHub/data-miner/Boost/horseColicTraining.txt'
X, y = loadDataSet(filename)
aggErrors, weakClassArr = adaBoostTrainDS(X, y, 50)
print(aggErrors)

total error: 0.284280936455
total error: 0.284280936455
total error: 0.257525083612
total error: 0.247491638796
total error: 0.274247491639
total error: 0.247491638796
total error: 0.244147157191
total error: 0.23745819398
total error: 0.240802675585
total error: 0.244147157191
total error: 0.250836120401
total error: 0.247491638796
total error: 0.234113712375
total error: 0.227424749164
total error: 0.220735785953
total error: 0.217391304348
total error: 0.217391304348
total error: 0.210702341137
total error: 0.220735785953
total error: 0.217391304348
total error: 0.224080267559
total error: 0.204013377926
total error: 0.23745819398
total error: 0.217391304348
total error: 0.227424749164
total error: 0.217391304348
total error: 0.227424749164
total error: 0.217391304348
total error: 0.227424749164
total error: 0.214046822742
total error: 0.224080267559
total error: 0.204013377926
total error: 0.217391304348
total error: 0.210702341137
total error: 0.224080267559
total error: 0.2040133

In [243]:
def adaClassify(X, CF_Arr):
    """Classify samples with a trained list of weighted decision stumps.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        Samples to classify.
    CF_Arr : iterable of dict
        Stumps with keys 'dim', 'thresh', 'ineq' and 'alpha'.

    Returns
    -------
    numpy.ndarray, shape (m, 1)
        Sign of the alpha-weighted sum of the stump votes: +1.0, -1.0, or
        0.0 where the weighted votes cancel exactly (or CF_Arr is empty).
    """
    weightedVotes = np.zeros((X.shape[0], 1))
    for stump in CF_Arr:
        feature = stump['dim']
        cutoff = stump['thresh']
        direction = stump['ineq']
        stumpVotes = stumpClassify(X, feature, cutoff, direction)
        # Each stump contributes its +/-1 vote scaled by its alpha.
        weightedVotes += stump['alpha'] * stumpVotes
    return np.sign(weightedVotes)

In [244]:
# Re-classify the arrays X, y loaded in the training cell with the learned
# stumps and print the fraction misclassified. Because X, y are the same
# arrays the ensemble was trained on, this is a training error, not a
# held-out test error.
pred = adaClassify(X, weakClassArr)
err = np.mean(pred != y)
print("err = ",err)

err =  0.183946488294
