In [33]:
# use Computer Hardware Data Set link http://archive.ics.uci.edu/ml/datasets/Computer+Hardware 
# related papaers 《learning with continuous vlaues》
import pandas as pd
import numpy as np


In [14]:
names = ['vendor_name','model_name','myct','mmin','mmax','cach','chmin','chmax','prp','erp']

In [59]:
frame = pd.read_csv('./data/data.csv',header = None,names = names ,usecols = names[2:-1])

In [67]:
X = frame[names[2:-2]]
y = frame[names[-2]]

In [69]:
print(X.columns)
print(y.shape)

Index(['myct', 'mmin', 'mmax', 'cach', 'chmin', 'chmax'], dtype='object')
(209,)


In [70]:
cols = names[2:]
sd = np.std(frame['prp'].values)

![选择属性的条件](img/error.png)

- 这里最大化 error
- 和 CART 不同的是，CART使用的标准差或者偏差

## 先回顾一下线性回归
- 二元线性回归使用最小二乘法来算损失函数,得到参数的值为
$$\begin{split}
a &= \frac{\sum_{i}^{m}{x_{i}y_{i}-m\bar{x}\bar{y}}}{\sum_{i}^{m}x_{i}^{2}-m\bar{x}^{2}} \\ 
b &= \bar{y} - a\bar{x}
\end{split}
$$
- 评判标准
    * 均方误差(MSE)
        ![mse](img\mse.png)
    * 根均方误差(RMSE),其是均方误差的开方
    * 平均绝对误差(MAE)
        ![mae](img\mae.png)
    * R-square 
        ![r-square](img\R-square.png)
- 多元线性回归求解
    * 可以使用矩阵求导一步求得
        ![multivariate_linear_model](img\multivariate_linear_model.png)

### M5 model tree 特点
- 在每一个点使用标准回归技术建立一个多元回归模型
- 使用(n+v)/(n-v)进行模型的平均残差进行修正。n是训练，v是模型参数的个数
- 在每个叶子节点对线性模型进行简化，此处使用的是贪心的方法。(此处使用的是测试集)
- 对每一个非节点进行剪枝判断，根据评估误差，来决定该节点的子节点是一个叶子还是不变(后剪枝)。
- 对模型预测到的值进行从叶子往上进行一个平滑( 这个应该是对未知数据的)
    * 平滑方程
        ![smooth_function](img\smooth.png)
        其中PV($S_{i}$)是在$S_{i}$处的预测值$n_{i}$是包含的测试数据总数。
        
- 使用(1) relative error来做为而性能的比较，就是1 - $R^2$ (2) correlation 是指预测值和真实值的相关性 (3) 是percentage deviation 残差和目标比值的平均(不适用于目标为0的情况)。


In [504]:
## 先使用决策树看一下效果
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

In [749]:
# 自定义分数值
def score_func(reg, X, y):
    pred_y = reg.predict(X)
    # 这里应该是取绝对值
    return np.mean(abs(y - pred_y) / y) * 100

In [750]:
# 使用 1 - R^2 作为
# relative_error
def re(reg, X, y):
    pred_y = reg.predict(X)
    return np.sum((pred_y - y)**2) / np.sum((np.mean(y) - y)**2) * 100

In [751]:
kfold = KFold(n_splits=10, shuffle = True,random_state = 0) # 
# shuffle 决定划分前是否进行洗牌，而random_state决定划分是否一致

In [752]:
scores_cart = []
for train_index,test_index in kfold.split(X):
#     print((train_index).shape,(test_index).shape)
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y[train_index],y[test_index]
    regressor = DecisionTreeRegressor(criterion = 'mse',max_depth = 4)
    regressor.fit(X_train,y_train)
    scores_cart.append(re(regressor,X_test,y_test))
#     scores_cart.append(score_func(regressor,X_test,y_test))

In [753]:
np.min(scores_cart)

4.747008134245207

In [754]:
# 线性回归的效果
from sklearn.linear_model import LinearRegression
from numpy.linalg import *

# score = cross_val_score(lr,X,y,scoring = score_func,cv = 10)

In [755]:
lr = LinearRegression()
scores_lr = []
for train_index,test_index in kfold.split(X):
#     print((train_index).shape,(test_index).shape)
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y[train_index],y[test_index]
    lr.fit(X_train,y_train)
    scores_lr.append(re(lr,X_test,y_test))
#     scores_lr.append(score_func(lr,X_test,y_test))

In [756]:
np.min(scores_lr)

3.285910530311411

In [757]:
# just try
# 只用深度来进行预剪枝，只在叶子中训练一个多元线性模型，不进行后剪枝，而且没有对模型进行属性选择，不对其进行平滑，不进行残差修正
class Node(object):
    def __init__(self,w,split_feature,split_value,is_leaf):
        self.w = None
        self.split_index = split_feature # 划分特征的所以
        self.split_value = split_value # 划分值
        self.lef = None
        self.rig = None
        self.is_leaf = is_leaf
    def set(self,is_leaf,w):
        self.is_leaf = is_leaf
        self.w = w
        
class simpleM5(object):
    def __init__(self,max_depth):
        self.max_depth = max_depth
    def fit(self,train_data,train_label):
        self.n = train_data.shape[0]
        self.m = train_data.shape[1]
        self.attri = np.array(range(train_data.shape[1]))
        self.root = self._build_tree(0,train_data.values,train_label.values)
    def _build_tree(self, dep,data,label): #（np.ndarrays,np.ndarrays) 
        # 在该层确定划分的属性索引，还有属性值
        if(dep > self.max_depth):
            return None # 当达到最大深度时返回
        # 计算train_label的标准差，找到使得切分后，左右子空间标准差的期望 最小的最佳切分点
        mi_index = -1;
        mi_value = 1e9
        mi_index_value = -1
        for col in self.attri:
            data_col = data[:,col]
            sort_index = np.argsort(data_col)
            data_col_sort = data_col[sort_index]
            label_sort = label[sort_index]
            for value in (np.unique(data_col_sort)):
                index = data_col_sort < value
                less_data = data_col_sort[index]
                less_label = label_sort[index]
                tmp = len(less_data) / self.n * np.std(less_label)
                
                index = data_col_sort >= value
                more_data = data_col_sort[index]
                more_label = label_sort[index]
                tmp += len(more_label) / self.n * np.std(more_label)
                if(tmp < mi_value):
                    mi_value = tmp
                    mi_index = col
                    mi_index_value = value
        # 得到划分点，将数据集按照最佳划分点进行划分
        node = Node(None,mi_index,mi_index_value,False)
        index = data[:,mi_index] < mi_index_value
        left_data = data[index]
        left_label = label[index]
        index = data[:,mi_index] >= mi_index_value
        right_data = data[index]
        right_label = label[index]
        tmp = dep + 1
        node.left = self._build_tree(tmp,left_data,left_label)
        node.right = self._build_tree(tmp, right_data,right_label)
        if node.left == None and node.right == None: # 是叶子节点
            inv_data = 
            oneStepTheta = np.dot(np.dot(inv(np.dot(X2.transpose(), X2)), X2.transpose()), Y)
            data = data + np.diag(np.ones(()))
            node.set(True,lr) # 一步算法多元回归的的参数
        return node
    def predict(self,test):
        result = []
        root = self.root
        for t in test.values:
           result.append(self._dfs(root,t)[0])
        return np.array(result)

    def _dfs(self,root,test):
#         print(root.is_leaf)
        if root.is_leaf:
            return root.w.predict(test.reshape(1,-1))
        if test[root.split_index] < root.split_value:
            return self._dfs(root.left,test)
        else:
            return self._dfs(root.right,test) #important

In [758]:
from sklearn.model_selection import KFold

In [759]:
scores_m5 = []
for train_index,test_index in kfold.split(X):
#     print((train_index).shape,(test_index).shape)
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y[train_index],y[test_index]
    m5 = simpleM5(max_depth = 3)
    m5.fit(X_train, y_train)
    scores_m5.append(re(m5,X_test,y_test))
#     scores_m5.append(score_func(m5,X_test,y_test))

  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [760]:
np.min(scores_m5)

3.0947273192690123

In [761]:
import xgboost as xgb

In [762]:
scores_xgb = []
for train_index,test_index in kfold.split(X):
#     print((train_index).shape,(test_index).shape)
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y[train_index],y[test_index]
    xgb_model = xgb.XGBRegressor(objective="reg:linear", random_state=0)
    xgb_model.fit(X_train, y_train)
    scores_xgb.append(re(xgb_model,X_test,y_test))
#     scores_m5.append(score_func(m5,X_test,y_test))

In [763]:
np.min(scores_xgb)

4.264912466167563

### conclusion(针对Computer_HardWare)
- 使用不同的评判标准时产生的结论是不一样的，比如当使用平均绝对误差时，决策树回归最好，但是当使用relative error 时，Model 树表现最好。
- 模型树在指标上都是比普通的回归要好，因为Model树可以能加关注局部线性，这点还是可以肯定的，但是模型树和决策树到底哪个好，就不知道了。

In [None]:
### 