In [None]:
import pandas as pd
# 데이터 시각화
import matplotlib.pyplot as plt
import matplotlib
# 데이터 분할:train, test
from sklearn.model_selection import train_test_split
# 회귀 Decision Tree
from sklearn.tree import DecisionTreeRegressor
# 최적 모델, 파라미터 탐색
from sklearn.model_selection import GridSearchCV
# 나무구조 시각화
from sklearn.tree import plot_tree

In [None]:
# Global matplotlib text settings, written straight into rcParams.
# "Malgun Gothic" is a Korean-capable font, needed for the Korean axis labels used later.
# NOTE(review): the font must be installed on the machine running the notebook.
matplotlib.rcParams["font.family"] = "Malgun Gothic"
# Do not use the Unicode minus glyph for negative numbers
# (presumably because the chosen font cannot render it -- confirm).
matplotlib.rcParams["axes.unicode_minus"] = False

In [None]:
# Load the raw data set from a path relative to the notebook.
# NOTE(review): the original note recommends encoding="euc-kr" when the file name,
# column names or values contain Korean text; no encoding is passed here, so confirm
# the file reads correctly with the default.
df_raw = pd.read_csv("../data/body.csv")
# Preview the first rows. A bare `df_raw.head()` in the middle of a cell is evaluated
# and thrown away (only a cell's last expression is displayed), so print it explicitly.
print(df_raw.head())

# Separate the target column ("FAT") from the explanatory variables.
df_raw_y = df_raw["FAT"]
df_raw_x = df_raw.drop(columns="FAT")

# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(
    df_raw_x, df_raw_y, test_size=0.3, random_state=1234
)

# Report the shapes of the four resulting pieces.
print("학습용 데이터의 X 크기 : {}".format(df_train_x.shape))
print("학습용 데이터의 Y 크기 : {}".format(df_train_y.shape))
print("평가용 데이터의 X 크기 : {}".format(df_test_x.shape))
print("평가용 데이터의 Y 크기 : {}".format(df_test_y.shape))

In [None]:
# Baseline model: a regression tree with default hyper-parameters (only the seed is fixed).
# fit() returns the estimator itself, so construction and training can be chained.
tree_uncustomized = DecisionTreeRegressor(random_state=1234).fit(df_train_x, df_train_y)

# Score the baseline on both partitions; a large train/test gap indicates over-fitting.
baseline_train_score = tree_uncustomized.score(df_train_x, df_train_y)
baseline_test_score = tree_uncustomized.score(df_test_x, df_test_y)
print("학습용 데이터의 설명력 : {:.3f}".format(baseline_train_score))
print("평가용 데이터의 설명력 : {:.3f}".format(baseline_test_score))

In [None]:
# Sweep min_samples_leaf (minimum number of samples allowed in a leaf) over 1..19
# and record the train/test score of the tree fitted at each setting.
para_leaf = list(range(1, 20))
train_score = []
test_score = []
for v_min_samples_leaf in para_leaf:
    tree = DecisionTreeRegressor(min_samples_leaf=v_min_samples_leaf, random_state=1234)
    tree.fit(df_train_x, df_train_y)
    train_score.append(tree.score(df_train_x, df_train_y))
    test_score.append(tree.score(df_test_x, df_test_y))

# Gather the sweep results into one table (one row per candidate value).
df_score_leaf = pd.DataFrame(
    {
        "MinSamplesLeaf": para_leaf,
        "TrainScore": train_score,
        "TestScore": test_score,
    }
)
# Display the scores rounded to three decimals.
df_score_leaf.round(3)

In [None]:
# Plot train vs. test score against min_samples_leaf.
# The values are read from df_score_leaf rather than from the train_score/test_score
# lists: those list names are reassigned by the later sweep cells, so re-running this
# cell after them would fail or plot the wrong sweep.
plt.plot(df_score_leaf["MinSamplesLeaf"], df_score_leaf["TrainScore"],
         linestyle="-", label="Train Score")
plt.plot(df_score_leaf["MinSamplesLeaf"], df_score_leaf["TestScore"],
         linestyle="--", label="Test Score")
# Label the axes so the figure can be read on its own.
plt.xlabel("min_samples_leaf")
plt.ylabel("Score")
# Trailing semicolon suppresses the Legend object's repr in the cell output.
plt.legend();

In [None]:
# Feature names used to label the nodes of every tree plot in this notebook.
# Stored as a plain list instead of a pandas Index.
# NOTE(review): some scikit-learn releases appear to validate `feature_names` strictly
# as a list and reject an Index -- a list is the safe choice; confirm against the
# installed version.
v_feature_name = list(df_train_x.columns)

# Fit and draw the tree with at least 8 samples per leaf.
tree_graph = DecisionTreeRegressor(random_state=1234, min_samples_leaf=8)
tree_graph.fit(df_train_x, df_train_y)
plt.figure(figsize=(10, 7))
# Trailing semicolon hides the list of annotation objects that plot_tree returns.
plot_tree(tree_graph, feature_names=v_feature_name, filled=True);

In [None]:
# Same visualisation with a coarser tree: at least 16 samples per leaf.
# fit() returns the estimator, so the model is built and trained in one expression.
tree_graph = DecisionTreeRegressor(min_samples_leaf=16, random_state=1234).fit(
    df_train_x, df_train_y
)
plt.figure(figsize=(10, 7))
# Colour-filled nodes labelled with the feature names; the semicolon hides the return value.
plot_tree(tree_graph, filled=True, feature_names=v_feature_name);

In [None]:
# Sweep min_samples_split (minimum number of samples a node needs before it may be split)
# over the even values 4, 6, ..., 38, keeping min_samples_leaf fixed at 8.
para_split = list(range(4, 40, 2))
train_score = []
test_score = []
for v_min_samples_split in para_split:
    tree = DecisionTreeRegressor(
        min_samples_split=v_min_samples_split,
        min_samples_leaf=8,
        random_state=1234,
    )
    tree.fit(df_train_x, df_train_y)
    train_score.append(tree.score(df_train_x, df_train_y))
    test_score.append(tree.score(df_test_x, df_test_y))

# Gather the sweep results into one table (one row per candidate value).
df_score_split = pd.DataFrame(
    {
        "MinSamplesSplit": para_split,
        "TrainScore": train_score,
        "TestScore": test_score,
    }
)
# Display the scores rounded to three decimals.
df_score_split.round(3)

In [None]:
# Sweep max_depth over 1..10, keeping min_samples_leaf=8 and min_samples_split=20 fixed.
para_depth = list(range(1, 11))
train_score = []
test_score = []
for v_max_depth in para_depth:
    tree = DecisionTreeRegressor(
        max_depth=v_max_depth,
        min_samples_split=20,
        min_samples_leaf=8,
        random_state=1234,
    )
    tree.fit(df_train_x, df_train_y)
    train_score.append(tree.score(df_train_x, df_train_y))
    test_score.append(tree.score(df_test_x, df_test_y))

# Gather the sweep results into one table (one row per depth).
df_score_depth = pd.DataFrame(
    {
        "Depth": para_depth,
        "TrainScore": train_score,
        "TestScore": test_score,
    }
)
# Display the scores rounded to three decimals.
df_score_depth.round(3)

In [None]:
# Final model. min_samples_leaf=8 and min_samples_split=20 match the values held fixed
# in the sweeps; max_depth=4 was presumably chosen from the depth table -- confirm.
tree_final = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=20,
    min_samples_leaf=8,
    random_state=1234,
)
# Train on the training partition; as the cell's last expression, the fitted estimator is displayed.
tree_final.fit(df_train_x, df_train_y)

In [None]:
# Draw the final tree once, with feature names and colour-filled nodes.
# The earlier extra `plot_tree(tree_final)` call drew a second, unlabeled tree onto the
# same axes underneath this one, so it has been removed.
plt.figure(figsize=(10, 7))
# list(...) accepts v_feature_name whether it is a pandas Index or already a list.
# The trailing semicolon hides the list of annotation objects that plot_tree returns.
plot_tree(tree_final, feature_names=list(v_feature_name), filled=True);

In [None]:
# Pair each explanatory variable with tree_final.feature_importances_ and
# order the table from most to least important.
df_importance = pd.DataFrame(
    {
        "Feature": df_train_x.columns,
        "Importance": tree_final.feature_importances_,
    }
).sort_values("Importance", ascending=False)
# Display the importances rounded to three decimals.
df_importance.round(3)

In [None]:
# Horizontal bar chart of feature importance.
# Re-sort ascending so the most important feature ends up at the top of the chart
# (barh draws position 0 at the bottom).
df_importance = df_importance.sort_values("Importance", ascending=True)
coordinates = range(len(df_importance))
plt.barh(coordinates, df_importance["Importance"])
# Replace the numeric bar positions with the feature names.
plt.yticks(coordinates, df_importance["Feature"])
plt.xlabel("변수 중요도")
plt.ylabel("변수")

In [None]:
# Exhaustive hyper-parameter search for the regression tree.
# The seed is fixed like every other tree in this notebook: with max_features set to
# "sqrt"/"log2" the candidate features are sub-sampled, so an unseeded search would not
# be reproducible.
estimator = DecisionTreeRegressor(random_state=1234)

# Parameters to search and their candidate values.
# - criterion: "squared_error"/"absolute_error" are the current names of the former
#   "mse"/"mae", which recent scikit-learn releases no longer accept.
# - max_features: None (use all features) replaces the removed "auto" option, which meant
#   the same thing for regression trees.
# - the depth/split/leaf ranges reuse the lists built by the earlier sweep cells.
# NOTE(review): scikit-learn older than 1.0 only knows the old criterion names -- confirm
# the installed version.
param_grid = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error"],
    "max_features": [None, "sqrt", "log2"],
    "max_depth": para_depth,
    "min_samples_split": para_split,
    "min_samples_leaf": para_leaf,
}

# Pick the combination with the highest cross-validated R^2, using all CPU cores.
grid_dt = GridSearchCV(estimator, param_grid, scoring="r2", n_jobs=-1)
grid_dt.fit(df_train_x, df_train_y)

# Report the winning estimator, its parameters and its mean cross-validated score.
print("best estimator model: \n{}".format(grid_dt.best_estimator_))
print("\nbest parameter: \n{}".format(grid_dt.best_params_))
print("\nbest score: \n{}".format(grid_dt.best_score_.round(3)))