In [1]:
import random
import numpy as np
import pandas as pd
import math

In [3]:
def bestPoint(Xiy):
    #输入参数与实际值的组合矩阵，Xiy为n*2矩阵，已经按照参数降序排序
    length = len(Xiy)
    #预备一个矩阵用于存储mse计算的结果，对应结果的下标，对应结果的分割指标
    outcome = np.zeros((2,length-1))
    #对于组合矩阵中的每一组，分别将其选为评价指标，计算mse并储存结果，下标，分割指标
    for i in range(length-1):
        index1 = np.linspace(0,i,i+1,dtype='int')
        index2 = np.linspace(i+1,length-1,length-i-1,dtype='int')
        sub1 = Xiy[index1]
        sub2 = Xiy[index2]
        l1 = sub1[:,-1]
        c1 = l1.mean()
        l2 = sub2[:,-1]
        c2 = l2.mean()
        outcome[0,i] = np.dot(l1-c1,l1-c1)+np.dot(l2-c2,l2-c2)
        outcome[1,i] = Xiy[i,0]
    #寻找使得mse最小的下标，返回对应结果
    minIndex = np.argmin(outcome,axis=1)[0]
    return minIndex, outcome[0,minIndex], outcome[1,minIndex]

In [4]:
#（改版）优化了时间复杂度，虽然还是很烂
def betterPoint(Xiy):
    length = len(Xiy)
    outcome = np.zeros((2,length-1))
    #判断该指标是否为OneHot生成的，仅包含0和1
    dif = len(set(Xiy[:,-1].tolist()))
    if dif == 2:
        #不再进行遍历，而是直接寻找0与1的跳变位置
        counter = 0
        while Xiy[counter,-1] != Xiy[counter+1,-1]:
            counter += 1
        index1 = np.linspace(0,counter,counter+1,dtype='int')
        index2 = np.linspace(counter+1,length-1,length-counter-1,dtype='int')
        sub1 = Xiy[index1]
        sub2 = Xiy[index2]
        l1 = sub1[:,-1]
        c1 = l1.mean()
        l2 = sub2[:,-1]
        c2 = l2.mean()
        outcome = np.dot(l1-c1,l1-c1)+np.dot(l2-c2,l2-c2)
        return counter, outcome, Xiy[counter,0]
    
    for i in range(length-1):
        index1 = np.linspace(0,i,i+1,dtype='int')
        index2 = np.linspace(i+1,length-1,length-i-1,dtype='int')
        sub1 = Xiy[index1]
        sub2 = Xiy[index2]
        l1 = sub1[:,-1]
        c1 = l1.mean()
        l2 = sub2[:,-1]
        c2 = l2.mean()
        outcome[0,i] = np.dot(l1-c1,l1-c1)+np.dot(l2-c2,l2-c2)
        outcome[1,i] = Xiy[i,0]
    minIndex = np.argmin(outcome,axis=1)[0]
    return minIndex, outcome[0,minIndex], outcome[1,minIndex]

In [7]:
#（初版）选择最佳分类特征
def chooseIndicator(sample,leaf, time=-1):
    if len(set(sample[:,-1].tolist())) == 1:
        return -1, leaf(sample)
    if time==0:
        return -1,leaf(sample)
    if len(sample)<=5:
        return -1,leaf(sample)
    X = sample[:,:-1]
    y = sample[:,-1].reshape((len(sample),1))
    vecLen = X.shape[-1]
    minInd = 0
    loss = np.inf
    for i in range(vecLen):
        bunch = np.hstack((X[:,i].reshape((len(y),1)),y))
        Xiy = bunch[bunch[:,0].argsort()]
        minIndex, loss, indicator = bestPointEdited(Xiy)
        prep[0,i] = i
        prep[1,i] = loss
        prep[2,i] = indicator
    #print('prep:{}'.format(prep))
    minInd = np.argmin(prep,axis=1)[1]
    return int(prep[0,minInd]), prep[2,minInd]

In [8]:
#将数据按照指定特征和取值一分为二
def binSplit(sample, indicator, value):
    mat0 = sample[np.nonzero(sample[:,indicator] <= value)[0],:]
    mat1 = sample[np.nonzero(sample[:,indicator] > value)[0],:]
    return mat0,mat1

In [9]:
def err(Xiy):
    return np.var(Xiy[:, -1]) * Xiy.shape[0]

In [5]:
#生成叶节点
def regLeaf(dataSet):
    return np.mean(dataSet[:, -1])

In [96]:
#（终版）集合了特征选择和分类指标选择
def bestPointEdited(Xiy,leaf=regLeaf, time=-1):
    if len(set(Xiy[:-1].T.tolist()[1])) == 1:
        return -1, leaf(Xiy)
    if time == 0:
        return -1, leaf(Xiy)
    n = Xiy.shape[-1]
    bestS = np.inf

    bestIndex = 0
    bestValue = 0
    for index in range(n - 1):
        for splitVal in set(Xiy[:, index].T.tolist()):
            mat0, mat1 = binSplit(Xiy, index, splitVal)
            if mat0.shape[0] < 5 or mat1.shape[0] < 5:
                continue
            newS = err(mat0) + err(mat1)
            if newS < bestS:
                bestS = newS
                bestIndex = index
                bestValue = splitVal
    return bestIndex, bestValue

In [95]:
#创建树
def createTree(sample, leaf=regLeaf, time=-1):
    #print('time:{}'.format(time))
    indicator, value = bestPointEdited(sample, leaf, time=time) 
    if indicator == -1: return value 
    retTree = {} 
    retTree['spInd'] = indicator 
    retTree['spVal'] = value
    lSet, rSet = binSplit(sample, indicator, value) 
    retTree['left'] = createTree(lSet, leaf, time-1) 
    retTree['right'] = createTree(rSet, leaf, time-1) 
    return retTree

In [34]:
#对输入的一条数据，进行预测
def forecast(tree, data):
    if data[tree['spInd']] <= tree['spVal']:
        if isinstance(tree['left'],dict):
            return forecast(tree['left'], data)
        else:
            return tree['left']
    else:
        if isinstance(tree['right'],dict):
            return forecast(tree['right'], data)
        else:
            return tree['right']

In [None]:
#对输入的一组数据进行预测，返回nparray类型的预测结果向量
def forecastSerie(tree, data):
    result = np.zeros(len(data))
    for i in range(len(data)):
        #print(i)
        result[i] = forecast(tree, data[i])
    return result

In [15]:
#计算绝对误差
def percentError(preds, y):
    error = abs(preds - y)
    errorp = np.mean(100 - 100*(error/ytest))
    return errorp

In [114]:
#计算mse
def mError(preds, y):
    error = preds - y
    length = len(preds)
    return np.dot(error, error)/length

In [24]:
df = pd.read_csv('D:/不会学习/avocado.csv')
df.head()

Unnamed: 0,index,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015/12/27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015/12/20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015/12/13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015/12/6,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015/11/29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [25]:
df = df.drop(['Date','index'],axis=1)
df.head()

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [26]:
#数据预处理
from sklearn.preprocessing import OneHotEncoder
s = (df.dtypes == 'object')
obj = list(s[s].index)
n = (df.dtypes == ('float64','int64'))
num = list(n[n].index)
#print(obj)
oneHot = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
enc_col = pd.DataFrame(oneHot.fit_transform(df[obj]))
#print(enc_col)
enc_col.index = df.index

In [30]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

y = df['AveragePrice']
data = df.drop('AveragePrice',axis=1)
trainflights, testflights, ytrain, ytest = train_test_split(data, y, train_size=0.7,test_size=0.3, random_state=0)
s = (trainflights.dtypes == 'object')
object_cols = list(s[s].index)

n = (trainflights.dtypes == ('float64','int64'))
numerical_cols = list(n[n].index)

oneHot = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
oneHottrain = pd.DataFrame(oneHot.fit_transform(trainflights[object_cols]))
oneHottest = pd.DataFrame(oneHot.transform(testflights[object_cols]))

#reattaching index since OneHotEncoder removes them:
oneHottrain.index = trainflights.index
oneHottest.index = testflights.index 

#dropping the old categorical columns:
cattraincol = trainflights.drop(object_cols, axis=1)
cattestcol = testflights.drop(object_cols, axis=1)

#concatenating the new columns:
trainflights = pd.concat([cattraincol, oneHottrain], axis=1)
testflights = pd.concat([cattestcol, oneHottest], axis=1)

In [31]:
train = np.array(pd.concat([trainflights,ytrain],axis=1))
test = np.array(pd.concat([testflights,ytest],axis=1))
train.shape

(12774, 66)

In [116]:
#生成一系列深度不同的树，比较时间开销和误差
import time
for i in range(1,16):
    t1 = time.time()
    print('depth:{}'.format(i))
    tree = createTree(train, time=i)
    m = forecastSerie(tree, test)
    pErr = percentError(m,test[:,-1])
    print('pErr:{}'.format(pErr))
    mErr = mError(m, test[:,-1])
    print('mErr:{}'.format(mErr))
    t2 = time.time()
    print('time_elapse:{}'.format(t2-t1))

depth:1
pErr:81.35022039237349
mErr:0.10144699735850095
time_elapse:235.24500632286072
depth:2
pErr:82.50131254555707
mErr:0.09154570034792396
time_elapse:360.3196105957031
depth:3
pErr:83.50382053340094
mErr:0.08376438438906401
time_elapse:399.89800000190735
depth:4
pErr:84.14398741671098
mErr:0.07883165582866658
time_elapse:441.8126583099365
depth:5
pErr:84.70152265949137
mErr:0.07366804094580946
time_elapse:464.53108406066895
depth:6


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


pErr:85.49976503609476
mErr:0.06823837091827296
time_elapse:475.5716552734375
depth:7
pErr:85.98468792169484
mErr:0.06533055782163404
time_elapse:478.231050491333
depth:8
pErr:86.6822337229531
mErr:0.059821293543658234
time_elapse:492.6998176574707
depth:9
pErr:87.24835377495745
mErr:0.056065531415603935
time_elapse:511.20485043525696
depth:10
pErr:87.74754584468346
mErr:0.052952647651423844
time_elapse:529.1404714584351
depth:11
pErr:88.15277548287646
mErr:0.05161824499607215
time_elapse:529.4262487888336
depth:12
pErr:88.46561766356228
mErr:0.04883899844888639
time_elapse:518.1155443191528
depth:13
pErr:88.65772624942798
mErr:0.04836151263691902
time_elapse:536.8717489242554
depth:14
pErr:88.86447797353657
mErr:0.04753384674505505
time_elapse:554.5224933624268
depth:15
pErr:89.03459654901548
mErr:0.04682019865399181
time_elapse:562.4687962532043


In [89]:
#实现bootstrap抽样
def bootstrap(sample, size):
    ret = np.zeros((size,len(sample[0,:])))
    sampLen = len(sample)
    #print('sample:{}'.format(sampLen))
    for i in range(size):
        ind = random.randint(0,sampLen-1)
        #print('ind:{}'.format(ind))
        ret[i,:] = sample[ind, :]
    return ret

In [87]:
#生成一系列树，将预测结果取平均值
def forestForecast(train, test, num, time, batch):
    store = np.zeros([len(test),num])
    for i in range(num):
        print(i)
        data = bootstrap(train,batch)
        #print(len(data))
        tree = createTree(data, time=time)
        f = forecastSerie(tree,test)
        store[:,i] = f
    return store.mean(axis=1)

In [117]:
for i in range(1,11,2):
    ta = time.time()
    print('tree_num:{}'.format(i))
    result = forestForecast(train,test,i,7,4000)
    pErr = percentError(result,test[:,-1])
    print('pErr:{}'.format(pErr))
    mErr = mError(result, test[:,-1])
    print('mErr:{}'.format(mErr))
    tb = time.time()
    print('time_elapse:{}'.format(tb-ta))

tree_num:1
0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


pErr:85.77990120997086
mErr:0.06743738735415333
time_elapse:31.540224075317383
tree_num:3
0
1
2
pErr:86.49198544895978
mErr:0.06130418809386594
time_elapse:95.03527021408081
tree_num:5
0
1
2
3
4
pErr:86.82467562555034
mErr:0.05955160937616326
time_elapse:155.09567499160767
tree_num:7
0
1
2
3
4
5
6
pErr:87.29575997292685
mErr:0.05550667253885058
time_elapse:210.7071807384491
tree_num:9
0
1
2
3
4
5
6
7
8
pErr:86.95631474546137
mErr:0.057233445812760145
time_elapse:269.2562174797058


In [118]:
rv = pd.read_csv('D:/不会学习/avocado.csv')
rv.describe()

Unnamed: 0,index,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
count,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0
mean,24.232232,1.405978,850644.0,293008.4,295154.6,22839.74,239639.2,182194.7,54338.09,3106.426507,2016.147899
std,15.481045,0.402677,3453545.0,1264989.0,1204120.0,107464.1,986242.4,746178.5,243966.0,17692.894652,0.939938
min,0.0,0.44,84.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,10.0,1.1,10838.58,854.07,3008.78,0.0,5088.64,2849.42,127.47,0.0,2015.0
50%,24.0,1.37,107376.8,8645.3,29061.02,184.99,39743.83,26362.82,2647.71,0.0,2016.0
75%,38.0,1.66,432962.3,111020.2,150206.9,6243.42,110783.4,83337.67,22029.25,132.5,2017.0
max,52.0,3.25,62505650.0,22743620.0,20470570.0,2546439.0,19373130.0,13384590.0,5719097.0,551693.65,2018.0
