# 随机森林的简单实现
> numpy, pandas和scikit-learn来实现随机森林
>
> 使用iris数据集

In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

  from numpy.core.umath_tests import inner1d


## 读取并构建数据集

In [2]:
iris_data = load_iris()  # 加载数据集
df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75  # 划分数据集和测试集
df['species'] = pd.Categorical.from_codes(iris_data.target, iris_data.target_names)  # 分类标签

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),is_train,species
0,5.1,3.5,1.4,0.2,True,setosa
1,4.9,3.0,1.4,0.2,True,setosa
2,4.7,3.2,1.3,0.2,True,setosa
3,4.6,3.1,1.5,0.2,True,setosa
4,5.0,3.6,1.4,0.2,False,setosa


## 切分数据集

In [3]:
train, test = df[df['is_train'] == True], df[df['is_train'] == False]  # 训练集，测试集
features = df.columns[:4]  # 选取特征

## 开始训练

In [4]:
clf = RandomForestClassifier(n_jobs=2)  # 并行数量为2
y, _ = pd.factorize(train['species'])  # 转换为整数编码
clf.fit(train[features], y)  # 训练

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## 预测

In [5]:
preds = iris_data.target_names[clf.predict(test[features])]  # 预测结果
pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds'])  # 输出结果

preds,setosa,versicolor,virginica
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,8,0,0
versicolor,0,14,1
virginica,0,3,10
