# 4.决策树

## 4.1 数据生成

In [None]:
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Synthetic 2-D dataset: 100 points drawn uniformly from [0, 10) x [0, 10).
rng = np.random.RandomState(1)
X = rng.rand(100, 2) * 10
# Linear score that depends on both features (weight 1.5 on X[:, 0] and
# 1 on X[:, 1]); it is thresholded later to build the class labels.
feature_weights = [1.5, 1]
y_val = X.dot(feature_weights)

In [None]:
# Show the mean of the linear score; the next cell uses it as the class threshold.
y_val.mean()

In [None]:
# Binary class label derived from y_val: 1 where the score is above its mean,
# 0 otherwise, so the label depends on both X[:, 0] and X[:, 1].
y = (y_val > y_val.mean()).astype(int)
# Label noise: flip every 20th label (indices 0, 20, 40, 60, 80, i.e. 5 of
# the 100 points).
y[::20] = 1 - y[::20]

In [None]:
# Scatter the two classes on a square canvas: label 1 in red, label 0 in blue.
plt.figure(figsize=(6, 6))
is_pos = y > 0.5
is_neg = y < 0.5
plt.scatter(X[is_pos, 0], X[is_pos, 1], c='red')
plt.scatter(X[is_neg, 0], X[is_neg, 1], c='blue')

## 4.2 决策树分类

In [None]:
from sklearn.tree import DecisionTreeClassifier
#导入决策树的类

In [None]:
# Create a decision-tree classifier with default hyper-parameters and train it
# on the noisy 2-D data (fit returns the estimator itself).
DTC = DecisionTreeClassifier().fit(X, y)
# Echo the fitted estimator so the notebook still displays it.
DTC

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay
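# Decision-boundary plotting helper. DecisionBoundaryDisplay was added in scikit-learn 1.1; if this import fails, upgrade scikit-learn (e.g. `pip install -U scikit-learn` or `conda update scikit-learn`). Reinstalling Anaconda also works but is rarely necessary.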

In [None]:

# Draw the decision regions of the default tree, then overlay the training
# points on the same axes.
boundary_kwargs = dict(
    response_method="predict",
    alpha=0.5,
    cmap=plt.cm.coolwarm,  # colour map used for the predicted regions
)
DecisionBoundaryDisplay.from_estimator(DTC, X, **boundary_kwargs)
is_pos, is_neg = y > 0.5, y < 0.5
plt.scatter(X[is_pos, 0], X[is_pos, 1], c='red')
plt.scatter(X[is_neg, 0], X[is_neg, 1], c='blue')

In [None]:
# Same data, but the tree is limited to a maximum depth of 1.
DC_1 = DecisionTreeClassifier(max_depth=1)
DC_1.fit(X, y)

# Decision regions of the depth-1 tree with the training points on top.
stump_boundary_kwargs = dict(response_method="predict", alpha=0.5, cmap=plt.cm.coolwarm)
DecisionBoundaryDisplay.from_estimator(DC_1, X, **stump_boundary_kwargs)
is_pos, is_neg = y > 0.5, y < 0.5
plt.scatter(X[is_pos, 0], X[is_pos, 1], c='red')
plt.scatter(X[is_neg, 0], X[is_neg, 1], c='blue')

In [None]:
from sklearn.tree import plot_tree
#导入决策树画图类

# Render the structure of the depth-1 tree on an 8x8 inch figure.
fig = plt.figure(figsize=(8, 8))
tree_ax = fig.gca()
plot_tree(DC_1, filled=True, ax=tree_ax)
tree_ax.set_title("Decision tree-1")
plt.show()

In [None]:
# Depth-2 tree: fit it, then draw its decision regions with the training
# points on top.
DC_2 = DecisionTreeClassifier(max_depth=2).fit(X,y)
# from_estimator opens its own figure unless an Axes is supplied, so a bare
# plt.figure(...) beforehand only leaves an empty figure behind and its
# figsize is ignored. Create the Axes explicitly and pass it in so the
# requested size is actually applied to the plot.
fig, ax = plt.subplots(figsize=(25, 12))
DecisionBoundaryDisplay.from_estimator(
        DC_2,
        X,
        response_method="predict",
        alpha=0.5,
        cmap=plt.cm.coolwarm,
        ax=ax,
    )
# Draw the points on the same Axes as the regions: label 1 red, label 0 blue.
ax.scatter(X[y>0.5, 0], X[y>0.5, 1], c='red')
ax.scatter(X[y<0.5, 0], X[y<0.5, 1], c='blue')
plt.show()

In [None]:
# Render the structure of the depth-2 tree on an 8x8 inch figure.
fig = plt.figure(figsize=(8, 8))
tree_ax = fig.gca()
plot_tree(DC_2, filled=True, ax=tree_ax)
tree_ax.set_title("Decision tree-2")
plt.show()

## 4.3 决策树回归

In [None]:
# 1-D regression data: 120 sorted x values in [0, 5) with y = sin(x).
rng = np.random.RandomState(1)
X = 5 * rng.rand(120, 1)
X.sort(axis=0)
y = np.sin(X).reshape(-1)
# Perturb every 5th target (24 of the 120 points) with uniform noise in
# (-1.5, 1.5].
noise = 3 * (0.5 - rng.rand(24))
y[::5] += noise

In [None]:
# Quick look at the noisy sine samples; the figure is also written to
# raw_pois.png before being shown.
plt.figure(figsize=(12, 8))
point_style = dict(s=20, c="red", label="data")
plt.scatter(X, y, **point_style)
plt.savefig('raw_pois.png')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor
#导入用于回归的决策树

In [None]:
# Fit one regressor per max_depth in 1..7 (range(1, 8) stops before 8) and
# keep the fitted models in lists ordered by depth.
dtrs = []      # trees using the default split criterion
dtrs_mae = []  # same depths, but with criterion='absolute_error' (MAE)
for dep in range(1, 8):
    treg = DecisionTreeRegressor(max_depth=dep).fit(X, y)
    dtrs.append(treg)

    treg_mae = DecisionTreeRegressor(max_depth=dep, criterion='absolute_error').fit(X, y)
    dtrs_mae.append(treg_mae)

In [None]:
# For every fitted regressor, plot its prediction curve over the training
# scatter and save one PNG per depth.
# The dense evaluation grid is the same for every model, so build it once
# instead of on each iteration. A dense grid is needed so the piecewise-constant
# prediction is drawn as a step curve.
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
for model in dtrs:
    # Read the depth from the model itself rather than re-deriving it from a
    # second hard-coded range that must stay in sync with the training loop.
    dep = model.max_depth
    plt.figure(figsize=(12, 8))
    # Original training data.
    plt.scatter(X, y, s=20, c="red", label="data")
    # Model prediction on the dense grid.
    yt = model.predict(X_test)
    plt.plot(X_test, yt, color="cornflowerblue", linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title(f"Decision Tree Regression Max_depth={dep}")

    # Save the figure before showing it.
    plt.savefig(f'DT_reg_{dep}.png')
    plt.show()
    

## 4.4 高维空间回归

In [None]:
# Three feature matrices with 500 samples each and 5 / 20 / 50 features,
# drawn in that order from one seeded generator, uniform on [0, 10).
rng = np.random.RandomState(1)
X_5, X_20, X_50 = (10 * rng.rand(500, n_features) for n_features in (5, 20, 50))

In [None]:
# Geometrically decaying weights 0.95**0, 0.95**1, ..., 0.95**49.
weight_l = [0.95 ** power for power in range(50)]
weight_l

In [None]:
# Targets: weighted sum of the features. A matrix with d columns (d = 5, 20,
# 50) uses the first d decaying weights.
y_5, y_20, y_50 = (
    features.dot(weight_l[:features.shape[1]])
    for features in (X_5, X_20, X_50)
)

In [None]:
# Add noise to every 5th target (100 of the 500) of each problem, in place.
# The noise is uniform and scaled by half of that target vector's mean, which
# is taken before the update.
for target in (y_5, y_20, y_50):
    target[::5] += 0.5 * target.mean() * (0.5 - rng.rand(100))

In [None]:
train_sp = 400
# 500 samples in total: the first 400 rows are used for training and the
# remaining 100 for testing.

In [None]:
# Sweep the leaf budget (max_leaf_nodes = 2..99) and fit one regressor per
# budget on the training rows of each problem. The fitted models are collected
# in one list per problem, ordered by leaf budget.
dt_regs_5, dt_regs_20, dt_regs_50 = [], [], []
training_sets = (
    (X_5, y_5, dt_regs_5),
    (X_20, y_20, dt_regs_20),
    (X_50, y_50, dt_regs_50),
)
for n_leaves in range(2, 100):
    for features, target, fitted in training_sets:
        model = DecisionTreeRegressor(max_leaf_nodes=n_leaves)
        model.fit(features[:train_sp], target[:train_sp])
        fitted.append(model)

In [None]:
def _score_all(models, features, target):
    """Return ``model.score(features, target)`` for every model, in order."""
    return [model.score(features, target) for model in models]


# In-sample scores use the first train_sp rows; out-of-sample scores use the
# remaining rows. Note that a regressor's score is R^2 (higher is better), not
# an error.
tr_scores_5 = _score_all(dt_regs_5, X_5[:train_sp], y_5[:train_sp])
test_scores_5 = _score_all(dt_regs_5, X_5[train_sp:], y_5[train_sp:])

tr_scores_20 = _score_all(dt_regs_20, X_20[:train_sp], y_20[:train_sp])
test_scores_20 = _score_all(dt_regs_20, X_20[train_sp:], y_20[train_sp:])

tr_scores_50 = _score_all(dt_regs_50, X_50[:train_sp], y_50[:train_sp])
test_scores_50 = _score_all(dt_regs_50, X_50[train_sp:], y_50[train_sp:])

In [None]:
# Sanity check: there should be one test score per fitted tree.
len(test_scores_50)

In [None]:
# Leaf budgets used as the index of the score table; this mirrors the
# range(2, 100) swept by the training loop.
nodes = list(range(2,100))

In [None]:
import pandas as pd
# Collect the six score curves into one table: one column per problem and
# split, one row per leaf budget.
scores = {
    'v5_train': tr_scores_5,
    'v5_test': test_scores_5,
    'v20_train': tr_scores_20,
    'v20_test': test_scores_20,
    'v50_train': tr_scores_50,
    'v50_test': test_scores_50,
}
sc_df = pd.DataFrame(data=scores, index=nodes)



In [None]:
# Plot the train/test score curves of the 5-, 20- and 50-feature problems
# against the leaf budget.
ax = sc_df.plot(figsize=(14, 10))
ax.set_xlabel('number of Leafs')
# The plotted values come from DecisionTreeRegressor.score, which is R^2
# rather than an accuracy, so label the axis accordingly.
ax.set_ylabel('R^2 score');