## Load simulation data

In [2]:
import numpy as np 
import scipy 
import pandas
import sklearn 

def loadsimData():
    """Return the five-point toy data set used to try out the boosting code.

    Returns
    -------
    tuple
        ``(points, labels)``: ``points`` is a 5x2 ``np.matrix`` of feature
        values and ``labels`` is a list of five +1.0 / -1.0 floats, one per
        row of ``points``.
    """
    # Each entry is (feature 0, feature 1, class label).
    samples = (
        (1.0, 2.1, 1.0),
        (2.0, 1.1, 1.0),
        (1.3, 1.0, -1.0),
        (1.0, 1.0, -1.0),
        (2.0, 1.0, 1.0),
    )
    points = np.matrix([[first, second] for first, second, _ in samples])
    labels = [label for _, _, label in samples]
    return points, labels



In [4]:
# Load the toy data set into notebook globals and print it as a sanity check.
dataArr, classLabels = loadsimData()
print(dataArr,classLabels)

[[ 1.   2.1]
 [ 2.   1.1]
 [ 1.3  1. ]
 [ 1.   1. ]
 [ 2.   1. ]] [1.0, 1.0, -1.0, -1.0, 1.0]


## Load zillow data

In [264]:

import pandas as pd

def loadDataSet(path='binData.csv', lowerPct=5, upperPct=95):
    """Load a CSV and turn it into a feature array plus sign labels.

    The column at position 1 is treated as the signed target. Rows whose
    target is not strictly between the ``lowerPct`` and ``upperPct``
    percentiles of that column are dropped, the label is ``np.sign`` of the
    target, and every column from position 2 onward is used as a feature.
    Column 0 is ignored (presumably a row id -- confirm against the CSV).

    Parameters
    ----------
    path : str, optional
        CSV file to read; defaults to ``'binData.csv'`` in the working
        directory.
    lowerPct, upperPct : float, optional
        Percentiles (0-100) of the target column used as exclusive lower and
        upper cut-offs. Default to 5 and 95.

    Returns
    -------
    dataMat : np.ndarray
        2-D array with one row per kept record and one column per feature.
    labelMat : np.ndarray
        1-D array of ``np.sign(target)`` for the kept rows. A target of
        exactly 0 produces a label of 0, not +1 or -1.
    """
    zillow = pd.read_csv(path)
    target = zillow.iloc[:, 1]

    # Trim the tails of the target distribution; both bounds are exclusive.
    lower, upper = np.percentile(target, [lowerPct, upperPct])
    kept = zillow.loc[(target > lower) & (target < upper)]

    labelMat = np.array(np.sign(kept.iloc[:, 1]))
    # Positional slicing keeps this correct even if column names repeat.
    dataMat = np.array(kept.iloc[:, 2:])

    return dataMat, labelMat

In [265]:
# Load the Zillow features/labels into notebook globals, then show the label
# vector and the (rows, columns) shape of the feature array as a sanity check.
dataMat, labelMat = loadDataSet()
print(labelMat)
print(dataMat.shape)


[ 1.  1.  1. ...,  1. -1.  1.]
(81157, 61)


## Decision stump 

In [243]:
def stumpClassify(dataMatrix, dimen, thresVal, thresIneq):
    """Classify every row with a one-feature threshold rule.

    Parameters
    ----------
    dataMatrix : 2-D array or matrix
        One sample per row.
    dimen : int
        Index of the column the threshold is applied to.
    thresVal : float
        Threshold value.
    thresIneq : str
        ``'lt'`` labels rows with ``value <= thresVal`` as -1; any other
        string labels rows with ``value > thresVal`` as -1.

    Returns
    -------
    np.ndarray
        Column vector of shape (rows, 1) holding +1.0 / -1.0 predictions.
    """
    feature = dataMatrix[:, dimen]
    # Pick which side of the threshold becomes the negative class.
    negativeSide = (feature <= thresVal) if thresIneq == 'lt' else (feature > thresVal)

    rowCount = np.shape(dataMatrix)[0]
    predictions = np.ones((rowCount, 1))
    predictions[negativeSide] = -1.0
    return predictions

    

## Build best Stump

Set minError as $\infty$

``` python 
for f in features: 
    
    for every step:
    
       for every inequality:
       
           create a decision stump
           
           If error rate < minError, set it as the best decision stump
           
return the best decision stump
```


In [244]:

import numpy as np

def buildStump(dataArr, classLabels, D, numSteps=4):
    """Find the one-feature threshold rule with the lowest weighted error.

    For every feature column, ``numSteps + 2`` thresholds are tried, running
    from one step below the column minimum up to the column maximum, and each
    threshold is evaluated with both inequality directions ('lt' and 'gt').

    Parameters
    ----------
    dataArr : 2-D array-like
        One sample per row, one feature per column.
    classLabels : 1-D array-like
        One label per sample, compared for equality with the +1/-1 output of
        ``stumpClassify``.
    D : np.matrix of shape (rows, 1)
        Sample weights; the weighted error is ``D.T * errorVector``.
    numSteps : int, optional
        Number of equal-width steps between a column's min and max. Must be
        positive. Defaults to 4.

    Returns
    -------
    bestStump : dict
        Keys ``'dim'`` (column index), ``'thresVal'`` and ``'Ineq'``.
    minError : 1x1 matrix
        Weighted error of the best stump, as produced by ``D.T * errorArr``.
    bestClassEst : np.ndarray of shape (rows, 1)
        Predictions of the best stump.

    Raises
    ------
    ValueError
        If no candidate beats the initial infinite error (for example there
        are no feature columns, or every weighted error is NaN).
    """
    # np.asmatrix is the same conversion as np.mat, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    n, m = np.shape(dataMatrix)
    minError = np.inf
    bestStump = {}
    bestClassEst = None

    for i in range(m):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j = -1 puts the first threshold just below every value in the column.
        for j in range(-1, int(numSteps) + 1):
            thresVal = rangeMin + float(j) * stepSize
            for thresIneq in ['lt', 'gt']:
                predictLabels = stumpClassify(dataMatrix, i, thresVal, thresIneq)
                # 1 where the stump is wrong, 0 where it is right.
                errorArr = np.asmatrix(np.ones((n, 1)))
                errorArr[predictLabels == labelMat] = 0
                weightedError = D.T * errorArr

                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictLabels
                    bestStump['dim'] = i
                    bestStump['thresVal'] = thresVal
                    bestStump['Ineq'] = thresIneq

    if bestClassEst is None:
        raise ValueError("buildStump found no usable stump: check that the data "
                         "has at least one feature and that D contains no NaN")
    return bestStump, minError, bestClassEst
                             

In [245]:

import numpy as np

# Start from uniform sample weights (one per training label, summing to 1)
# and fit a single stump on the toy data. The weight count is derived from
# the labels so it cannot drift out of sync with the data set.
numSamples = len(classLabels)
D = np.asmatrix(np.ones((numSamples, 1)) / numSamples)
bestStump, minError, bestClassEst = buildStump(dataArr, classLabels, D)

## Adaboost
``` python
for iter in range(numIter):
    process buildStump()
    add bestStump in weakClassArr
    calculate alpha
    update D
    update aggClassEst
    if error rate = 0.0: break
```  

In [246]:
def adaboostTrainDS(dataArr, classLabels, numIter):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump for the current sample weights with
    ``buildStump``, gives it a vote ``alpha = 0.5 * ln((1 - err) / err)``,
    re-weights the samples so misclassified ones count more, and adds the
    weighted vote to the running ensemble score. Training stops early once
    the ensemble classifies every training sample correctly. Per-round
    diagnostics (weights, stump predictions, ensemble score, error rate) are
    printed.

    Parameters
    ----------
    dataArr : 2-D array-like
        One sample per row.
    classLabels : 1-D array-like
        One +1/-1 label per sample.
    numIter : int
        Maximum number of boosting rounds.

    Returns
    -------
    list of dict
        One entry per round actually run: the stump dict returned by
        ``buildStump`` (keys ``'dim'``, ``'thresVal'``, ``'Ineq'``) with its
        vote added under ``'alpha'``.
    """
    n = np.shape(dataArr)[0]
    # Label column vector, built once instead of on every use in the loop.
    labelCol = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((n, 1)) / n)
    weakClassArr = []
    aggClassEst = np.asmatrix(np.zeros((n, 1)))

    for _ in range(numIter):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D: ", D.T )

        # buildStump returns the error as a 1x1 matrix; unwrap to a scalar.
        error = np.asarray(error).item()
        # The floor on the denominator avoids division by zero for a perfect stump.
        alpha = float(0.5 * np.log((1.0 - error) / np.maximum(error, 1e-16)))
        bestStump['alpha'] = alpha
        # Keep the stump; without this the function would return an empty list.
        weakClassArr.append(bestStump)

        # exp(-alpha) for correctly classified samples, exp(+alpha) for mistakes.
        expon = np.multiply(-1 * labelCol, classEst)
        D = np.multiply(D, np.exp(alpha * expon))
        D = D / D.sum()

        aggClassEst += alpha * classEst
        misclassified = np.sign(aggClassEst) != labelCol
        errorRate = float(misclassified.sum()) / n

        print("classEst: ",classEst.T)
        print("aggClassEst: ", aggClassEst.T)
        print("errorRate: ", errorRate)

        if errorRate == 0.0:
            break
    return weakClassArr
        
    

In [251]:
# Boost on the toy data for at most 9 rounds (training stops early once the
# training error rate reaches 0) and keep whatever adaboostTrainDS returns.
classsifyArr = adaboostTrainDS(dataArr,classLabels,9)


D:  [[ 0.2  0.2  0.2  0.2  0.2]]
classEst:  [[-1.  1. -1. -1.  1.]]
aggClassEst:  [[-0.69314718  0.69314718 -0.69314718 -0.69314718  0.69314718]]
errorRate:  0.2
D:  [[ 0.5    0.125  0.125  0.125  0.125]]
classEst:  [[ 1.  1. -1. -1. -1.]]
aggClassEst:  [[ 0.27980789  1.66610226 -1.66610226 -1.66610226 -0.27980789]]
errorRate:  0.2
D:  [[ 0.28571429  0.07142857  0.07142857  0.07142857  0.5       ]]
classEst:  [[ 1.  1.  1.  1.  1.]]
aggClassEst:  [[ 1.17568763  2.56198199 -0.77022252 -0.77022252  0.61607184]]
errorRate:  0.0


In [266]:
# Boost on the Zillow data for at most 12 rounds; per-round diagnostics are
# printed by adaboostTrainDS.
zillowClassifyArr = adaboostTrainDS(dataMat,labelMat,12)

D:  [[  1.23217960e-05   1.23217960e-05   1.23217960e-05 ...,   1.23217960e-05
    1.23217960e-05   1.23217960e-05]]
classEst:  [[ 1.  1.  1. ...,  1. -1. -1.]]
aggClassEst:  [[ 0.1156352  0.1156352  0.1156352 ...,  0.1156352 -0.1156352 -0.1156352]]
errorRate:  0.442438729869
D:  [[  1.10639735e-05   1.10639735e-05   1.10639735e-05 ...,   1.10639735e-05
    1.10639735e-05   1.39428190e-05]]
classEst:  [[ 1.  1.  1. ...,  1.  1.  1.]]
aggClassEst:  [[ 0.13371419  0.13371419  0.13371419 ...,  0.13371419 -0.09755622
  -0.09755622]]
errorRate:  0.442438729869
D:  [[  1.08696076e-05   1.08696076e-05   1.08696076e-05 ...,   1.08696076e-05
    1.12698224e-05   1.36978791e-05]]
classEst:  [[ 1.  1.  1. ..., -1.  1.  1.]]
aggClassEst:  [[ 0.14456752  0.14456752  0.14456752 ...,  0.12286086 -0.08670289
  -0.08670289]]
errorRate:  0.442438729869
D:  [[  1.07541424e-05   1.07541424e-05   1.07541424e-05 ...,   1.09901309e-05
    1.13947834e-05   1.35523698e-05]]
classEst:  [[-1. -1. -1. ...,  1. -1

NameError: name 'aggClassEst' is not defined