# 5.随机森林

## 5.1 分类问题

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Seeded generator so every random draw below is reproducible across runs.
rng = np.random.RandomState(1)
# 100 samples with 2 features each, stretched from [0, 1) to [0, 10).
X = rng.rand(100, 2) * 10
# Linear score 1.5*x0 + 1*x1; the class labels are derived from it later.
y_val = X.dot([1.5, 1])

In [None]:
# Class 1 where the linear score exceeds its mean, class 0 otherwise.
y = (y_val > y_val.mean()).astype(int)
# Flip every 20th label (0 <-> 1) to inject label noise.
y[::20] ^= 1

In [None]:
# Scatter the two classes on a square canvas: class 1 in red, class 0 in blue.
plt.figure(figsize=(6, 6))
positive = y > 0.5
negative = y < 0.5
plt.scatter(X[positive, 0], X[positive, 1], c='red')
plt.scatter(X[negative, 0], X[negative, 1], c='blue')

In [None]:
from sklearn.ensemble import RandomForestClassifier
#随机森林分类器
from sklearn.model_selection import cross_val_score
#用于计算样本外误差的cross validation
from sklearn.inspection import DecisionBoundaryDisplay
# Decision-boundary plotting helper (not a decision-tree plot). If this import fails, upgrade scikit-learn, e.g. by reinstalling Anaconda.

In [None]:
# Forest sizes (number of trees) to train and compare.
num_trees = [
    1, 2, 3, 4, 5, 7, 10,
    15, 20, 30, 40, 50,
    75, 100, 125, 150,
]

In [None]:
# Mean 5-fold cross-validation score, one entry per forest size in num_trees.
rf_eval = []
# The class masks do not depend on the forest, so compute them once.
positive = y > 0.5
negative = y < 0.5
for ntr in num_trees:
    # Shallow forest (depth-2 trees) with a fixed seed so runs are repeatable.
    clf = RandomForestClassifier(random_state=0, max_depth=2, n_estimators=ntr)
    # Out-of-sample estimate obtained through 5-fold cross validation.
    scores = cross_val_score(clf, X, y, cv=5)
    # Fit on the full data set; this fitted model is the one that gets drawn.
    clf.fit(X, y)

    # Shade the plane by predicted class.
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.coolwarm,
        alpha=0.5,
        response_method="predict",
    )

    # Overlay the samples, add a title, and save one image per forest size.
    plt.scatter(X[positive, 0], X[positive, 1], c='red')
    plt.scatter(X[negative, 0], X[negative, 1], c='blue')
    plt.title(f'Random Forest with {ntr} trees')
    plt.savefig(f'RF_{ntr}.png')

    # Record the mean cross-validation score (a score, not an error).
    rf_eval.append(scores.mean())

In [None]:
# One-column table of mean cross-validation scores, indexed by forest size.
cv5_ac = pd.DataFrame(rf_eval, index=num_trees, columns=['cv5_ac'])

In [None]:
# Plot the mean 5-fold cross-validation score against the number of trees.
cv5_ac.plot(figsize=(12,8))
plt.xlabel('number of Trees')
# The axis label reads "accuracy" (the previous text "accuration" was a typo).
# The trailing semicolon suppresses the notebook echo of the returned object.
plt.ylabel('5-fold cross validation accuracy');

## 5.2 随机森林用于回归

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Re-seed so the regression data does not depend on how many draws came before.
rng = np.random.RandomState(1)
# Three feature matrices of 500 samples with 5, 20 and 50 features, each scaled
# to [0, 10). They are drawn in this order from the same generator.
X_5, X_20, X_50 = (
    10 * rng.rand(500, n_features) for n_features in (5, 20, 50)
)

In [None]:
# Geometrically decaying feature weights: 0.95**0, 0.95**1, ..., 0.95**49.
weight_l = [0.95 ** exponent for exponent in range(50)]
weight_l

In [None]:
# Generate the targets: each one is the weighted sum of a sample's features,
# using as many leading weights as the matrix has columns.
y_5, y_20, y_50 = (
    features.dot(weight_l[:features.shape[1]])
    for features in (X_5, X_20, X_50)
)

In [None]:
# Add noise to every fifth target: a uniform perturbation of at most a quarter
# of the mean target in either direction. The arrays are modified in place, in
# the order y_5, y_20, y_50, so the generator is consumed in a fixed sequence.
for targets in (y_5, y_20, y_50):
    targets[::5] += 0.5 * targets.mean() * (0.5 - rng.rand(100))

In [None]:
train_sp = 400
# Split point: rows [:train_sp] are used for training and rows [train_sp:] for
# testing. Original note: data generation and the train/test split are the same as above.

In [None]:
# NOTE: this cell takes a while to run.
def _fit_forest(n_trees, features, targets):
    """Fit a random-forest regressor on the first ``train_sp`` rows and return it.

    Note ``random_state``: the algorithm is randomised, so the seed is fixed.
    Note ``max_features``: it is set to ``'sqrt'``.
    """
    forest = RandomForestRegressor(
        n_estimators=n_trees,
        random_state=0,
        max_features='sqrt',
    )
    forest.fit(features[:train_sp], targets[:train_sp])
    return forest


# Fitted models for the 5-, 20- and 50-feature data sets, one per forest size.
rf_regs_5, rf_regs_20, rf_regs_50 = [], [], []
for dpi in range(1, 20):
    print(dpi)
    # Forest size grows in steps of 50 trees: 50, 100, ..., 950.
    n_trees = 50 * dpi

    rf_reg_5 = _fit_forest(n_trees, X_5, y_5)
    rf_regs_5.append(rf_reg_5)

    rf_reg_20 = _fit_forest(n_trees, X_20, y_20)
    rf_regs_20.append(rf_reg_20)

    rf_reg_50 = _fit_forest(n_trees, X_50, y_50)
    rf_regs_50.append(rf_reg_50)

In [None]:
def _split_scores(models, features, targets):
    """Return ``(train_scores, test_scores)`` for each fitted model in *models*.

    Rows before ``train_sp`` form the training part; the remaining rows form
    the test part.
    """
    train_X, test_X = features[:train_sp], features[train_sp:]
    train_y, test_y = targets[:train_sp], targets[train_sp:]
    on_train = [model.score(train_X, train_y) for model in models]
    on_test = [model.score(test_X, test_y) for model in models]
    return on_train, on_test


tr_scores_5, test_scores_5 = _split_scores(rf_regs_5, X_5, y_5)

tr_scores_20, test_scores_20 = _split_scores(rf_regs_20, X_20, y_20)

tr_scores_50, test_scores_50 = _split_scores(rf_regs_50, X_50, y_50)

In [None]:
# Forest sizes 50, 100, ..., 950, used as the index of the score table.
ntress = list(range(50, 1000, 50))

In [None]:
# Collect train/test scores per feature count (5, 20, 50) into one table whose
# rows are indexed by the number of trees.
rf_scores = {
    'v5_train': tr_scores_5,
    'v5_test': test_scores_5,
    'v20_train': tr_scores_20,
    'v20_test': test_scores_20,
    'v50_train': tr_scores_50,
    'v50_test': test_scores_50,
}
rf_sc_df = pd.DataFrame(data=rf_scores, index=ntress)

In [None]:
# Plot the train and test scores of every forest against its number of trees.
rf_sc_df.plot(figsize=(10,6))
# The font size is passed to the labels directly: assigning `plt.fontsize`
# merely sets an unused attribute on the pyplot module and changes nothing.
plt.xlabel('number of Trees', fontsize=24)
# The plotted values come from the regressors' `.score`, which is R^2 rather
# than an accuracy ratio. The trailing semicolon suppresses the notebook echo.
plt.ylabel('R^2 score', fontsize=24);
# This plot differs from the one in the course slides because the slides were
# produced with the trees limited to max_depth=4. That limit is loose, but
# compare the two results: does a random forest need regularisation?