## 데이터 로드

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Read the Otto training CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="./data/otto_train.csv")
data.head(n=5)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
# Column glossary, kept as a bare string literal so the cell echoes it as output.
#   id               - unique row identifier
#   feat_1..feat_93  - explanatory variables (features)
#   target           - target variable; the raw column holds strings such as
#                      'Class_1', which mapping_dict later converts to the
#                      integers 1-9
'''
id: 고유 아이디
feat_1 ~ feat_93: 설명변수
target: 타겟변수 (1~9)
'''

'\nid: 고유 아이디\nfeat_1 ~ feat_93: 설명변수\ntarget: 타겟변수 (1~9)\n'

In [6]:
# data.shape is (rows, columns): nCar receives the row count, nVar the column count.
nCar, nVar = data.shape
print(nCar, nVar)

61878 95


* 의미가 없다고 판단되는 변수 제거

In [7]:
# 'id' is only a row identifier and was judged uninformative, so it is removed.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been dropped no longer raises KeyError.
data = data.drop(columns=['id'], errors='ignore')

* 타겟 변수의 문자열을 숫자로 변환

In [9]:
# Lookup table from the raw label strings 'Class_1' .. 'Class_9' to the
# integers 1 .. 9, inserted in ascending order.
mapping_dict = {f'Class_{class_no}': class_no for class_no in range(1, 10)}

In [10]:
# Convert every label string to its integer code through mapping_dict.
# Passing the dict's __getitem__ (not the dict itself) keeps the strict
# behaviour: a label missing from mapping_dict raises KeyError.
after_mapping_target = data['target'].map(mapping_dict.__getitem__)

In [13]:
# Spot-check five randomly drawn encoded labels; no seed is passed, so the
# rows shown differ from run to run.
after_mapping_target.sample(n=5)

45534    6
18599    3
13067    2
35159    6
29484    5
Name: target, dtype: int64

* 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리

In [15]:
# Every column except the label is a feature. The column order produced by
# Index.difference is kept as-is because it fixes the order of features in X.
feature_columns = [*data.columns.difference(['target'])]

# X: feature matrix, y: integer-encoded target from the mapping step.
X, y = data[feature_columns], after_mapping_target

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(49502, 93) (12376, 93) (49502,) (12376,)


## 학습 데이터를 랜덤포레스트 모형에 적합 후 평가 데이터로 검증

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline forest: 20 trees, each capped at depth 5, with a fixed seed.
# fit() hands back the fitted estimator, so it is bound to clf directly and
# echoed as the cell output.
clf = RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    random_state=0,
).fit(train_x, train_y)
clf

RandomForestClassifier(max_depth=5, n_estimators=20, random_state=0)

In [22]:
# Predict labels for the held-out rows and report the share predicted correctly.
predict1 = clf.predict(test_x)
accuracy_score(y_true=test_y, y_pred=predict1)

0.5929217840982547

* 트리를 증가시킬 경우

In [23]:
# Experiment: 300 trees, depth still capped at 5, same seed as the baseline.
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    random_state=0,
).fit(train_x, train_y)

# Score on the held-out rows.
predict1 = clf.predict(test_x)
accuracy_score(y_true=test_y, y_pred=predict1)

0.608678086619263

성능이 크게 상승하지는 않았다.

* 트리의 깊이를 늘릴 경우

In [24]:
# Experiment: depth cap raised to 20, using 100 trees and the same seed.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=0,
).fit(train_x, train_y)

# Score on the held-out rows.
predict1 = clf.predict(test_x)
accuracy_score(y_true=test_y, y_pred=predict1)

0.7798157724628313

트리의 깊이를 늘릴 경우(max_depth 5 → 20), 성능이 크게 개선되었다. 단, 이 실험에서는 트리 수(n_estimators)도 300에서 100으로 함께 바뀌었다.

* 트리의 깊이 제한을 매우 크게(max_depth=100) 늘릴 경우

In [27]:
# Experiment: depth cap raised to 100, using 100 trees and the same seed.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    random_state=0,
).fit(train_x, train_y)

# Score on the held-out rows.
predict1 = clf.predict(test_x)
accuracy_score(y_true=test_y, y_pred=predict1)

0.8119747899159664