# Random Forest 사용해보기

- Decision Tree의 단점, **Overfitting(과적합)!!**

    학습 데이터를 과하게 학습한 나머지 학습 상황에서는 오차가 줄지만 실제 데이터들에 대해 오차가 증가하는 현상


- 과적합 문제 해결을 위한 **Random Forest**
    
    여러 개의 Decision Tree를 만들고 연결하여 결과를 취합한 후 평균을 내어 성능을 높인 모델. 즉, Ensemble의 원리를 이용한 방법

## Stage1. featured data 불러오기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# train data 불러오기
df = pd.read_csv('data/titanic_train_featured.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,FamilySize,IsAlone
0,0,0,3,0,2.0,7.25,0,0,1,0
1,1,1,1,1,3.0,71.2833,0,1,1,0
2,2,1,3,1,2.0,7.925,0,0,0,1
3,3,1,1,1,3.0,53.1,0,0,1,0
4,4,0,3,0,3.0,8.05,0,0,0,1


In [3]:
# test data 불러오기
df_tst = pd.read_csv('data/titanic_test_featured.csv')
df_tst.head()

Unnamed: 0.1,Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,FamilySize,IsAlone
0,0,3,0,3.0,7.8292,0,2,0,1
1,1,3,1,4.0,7.0,0,0,1,0
2,2,2,0,5.0,9.6875,0,2,0,1
3,3,3,0,2.0,8.6625,0,0,0,1
4,4,3,1,2.0,12.2875,0,0,2,0


## Stage2-1. Decision Tree를 Ensemble 해보기

### training set과 validation set 나누기

In [4]:
from sklearn.model_selection import train_test_split

# train data에는 Survived 없애주기
train_data = df.drop('Survived', axis=1)
# target으로 삼는 데이터는 Survived
target_data = df['Survived']

#자동으로 75%, 25%로 나눠줌
x_train, x_valid, y_train, y_valid = train_test_split(train_data, target_data)

print(train_data.shape, x_train.shape, x_valid.shape)
print(train_data.shape, y_train.shape, y_valid.shape)

(891, 9) (668, 9) (223, 9)
(891, 9) (668,) (223,)


### 여러 개의 Decision Tree 만들기

In [5]:
from sklearn.tree import DecisionTreeClassifier

tree1=DecisionTreeClassifier() 
tree1.fit(x_train, y_train) 
print('tree1')
print('train set accuracy:', tree1.score(x_train, y_train))
print('valid set accuracy:', tree1.score(x_valid, y_valid)) 
print('-'*50)

tree2=DecisionTreeClassifier() 
tree2.fit(x_train, y_train) 
print('tree2')
print('train set accuracy:', tree2.score(x_train, y_train))
print('valid set accuracy:', tree2.score(x_valid, y_valid)) 
print('-'*50)

tree3=DecisionTreeClassifier() 
tree3.fit(x_train, y_train) 
print('tree3')
print('train set accuracy:', tree3.score(x_train, y_train))
print('valid set accuracy:', tree3.score(x_valid, y_valid)) 

tree1
train set accuracy: 1.0
valid set accuracy: 0.7668161434977578
--------------------------------------------------
tree2
train set accuracy: 1.0
valid set accuracy: 0.7757847533632287
--------------------------------------------------
tree3
train set accuracy: 1.0
valid set accuracy: 0.7847533632286996


### 여러 개의 Decision Tree를 Ensemble하여 결과 예측하기

In [6]:
prediction1 = tree1.predict(df_tst)
prediction2 = tree2.predict(df_tst)
prediction3 = tree3.predict(df_tst)

prediction_ensemble = (prediction1 + prediction2 + prediction3) / 3

# ensemble한 결과가 0.5보다 크면 1로, 작으면 0으로 계산
prediction_ensemble[prediction_ensemble > 0.5] = 1
prediction_ensemble[prediction_ensemble <= 0.5] = 0

# 평균을 내면서 소수형태로 바뀐 데이터형식을 다시 정수로 바꿈
# 생존자는 1.0이 아니라 1로 표현돼야 채점이 가능하므로
prediction_ensemble = prediction_ensemble.astype(int)

## Stage2-2. Scikit-learn으로 Random Forest 구현하기

In [7]:
from sklearn.ensemble import RandomForestClassifier
forest1=RandomForestClassifier(n_estimators=10) 
forest1.fit(x_train, y_train) 
print('train set accuracy:', forest1.score(x_train, y_train))
print('valid set accuracy:', forest1.score(x_valid, y_valid))

train set accuracy: 0.9805389221556886
valid set accuracy: 0.8475336322869955


In [8]:
prediction1  = forest1.predict(df_tst)
prediction1

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [9]:
from sklearn.ensemble import RandomForestClassifier
forest2=RandomForestClassifier(n_estimators=3000) 
forest2.fit(x_train, y_train) 
print('train set accuracy:', forest2.score(x_train, y_train))
print('valid set accuracy:', forest2.score(x_valid, y_valid))

train set accuracy: 1.0
valid set accuracy: 0.8654708520179372


In [10]:
prediction2  = forest2.predict(df_tst)
prediction2

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,