theory

랜덤 포레스트는 여러 개의 결정 트리들을 임의적으로 학습하는 방식의 앙상블 방법이다.

가장 핵심적인 특징은 임의성(randomness)에 의해 서로 조금씩 다른 특성을 갖는 트리들로 구성된다는 점이다.

### Importing Libraries


In [1]:
import pandas as pd
import numpy as np

np.random.seed(0)

### Importing the Dataset

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Assign colum names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe
dataset = pd.read_csv(url, names=names) 

### Preprocessing

In [4]:
X = dataset.iloc[:, :-1].values  
y = dataset.iloc[:, 4].values 

### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)  

In [7]:
print('Number of observations in the training data:', len(X_train))
print('Number of observations in the test data:',len(X_test))

Number of observations in the training data: 120
Number of observations in the test data: 30


In [8]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)  

In [12]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(X_train, y_train)  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [13]:
y_pred = clf.predict(X_test)  

In [14]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

[[11  0  0]
 [ 0 13  0]
 [ 0  1  5]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       0.93      1.00      0.96        13
 Iris-virginica       1.00      0.83      0.91         6

    avg / total       0.97      0.97      0.97        30



### MNIST

In [22]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

In [23]:
import numpy as np

X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [24]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(X_train, y_train)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
y_pred = clf.predict(X_test)

In [28]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

[[ 971    1    1    2    0    2    2    1    0    0]
 [   0 1123    2    1    1    1    5    0    2    0]
 [  10    1  993    4    3    1    2   10    7    1]
 [   3    0   14  945    0   19    2    9   13    5]
 [   6    4    5    1  930    0    6    1    7   22]
 [   9    1    3   33    6  821    7    2    6    4]
 [  11    3    5    0    9    6  921    0    3    0]
 [   2    8   24    3    5    0    0  971    7    8]
 [   7    4    8   28   10   12    9    4  881   11]
 [   9    4    3   12   25    6    2    6    8  934]]
             precision    recall  f1-score   support

        0.0       0.94      0.99      0.97       980
        1.0       0.98      0.99      0.98      1135
        2.0       0.94      0.96      0.95      1032
        3.0       0.92      0.94      0.93      1010
        4.0       0.94      0.95      0.94       982
        5.0       0.95      0.92      0.93       892
        6.0       0.96      0.96      0.96       958
        7.0       0.97      0.94      0.96  