<a href="https://colab.research.google.com/github/HyeonhoonLee/KIOM_KDC/blob/master/09_KDC_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KIOM 한의임상정보은행 data 분석하기

- RandomizedSearchCV를 통해 최적의 model과 parameter를 찾아 **Regression**을 수행하여 저밀도콜레스테롤의 결측치를 처리하고,
- RandomizedSearchCV를 통해 최적의 model과 parameter를 찾아 **Classification**을 수행하여, 한의사체질진단을 분류한다.

# 라이브러리 로드

In [1]:
# 분석에 사용할 pandas, 수치계산에 사용할 numpy, 시각화에 사용할 seaborn 을 불러옵니다.
# 또, 구 버전의 주피터 노트북에서는 %matplotlib inline 설정을 해야 노트북 안에서 그래프가 시각화됩니다.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

  import pandas.util.testing as tm


# 한글 폰트 설정

In [2]:
# Google Colab 사용 시 아래 주석을 풀고 폰트설정을 합니다.
# 로컬 아나콘다 사용 시에는 그대로 주석처리 해놓으시면 됩니다.
# 나눔고딕 설치
# 이 코드를 사용시 아래에 있는 폰트를 로드할 경우 colab에서는 오류가 발생하니
# 아래에 있는 폰트 설정은 꼭 주석처리를 해주세요. 
!apt -qq -y install fonts-nanum > /dev/null

import matplotlib.font_manager as fm

fontpath = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

fm._rebuild()

# Colab 의 한글 폰트 설정
plt.rc('font', family='NanumGothic') 
# 마이너스 폰트 깨지는 문제에 대한 대처
plt.rc("axes", unicode_minus=False)





In [3]:
# The retina setting makes text in figures look sharper.
# It keeps the edges of the font from looking blurry.
%config InlineBackend.figure_format = 'retina'

# 데이터 불러오기

- 데이터 로드 주소: https://www.data.go.kr/data/3072201/fileData.do
- 임상정보은행 사이트에서 상세이용방법 확인가능 (https://kdc.kiom.re.kr/html/?pmode=subpage&MMC_pid=200&spSeq=22)

In [4]:
# Read the downloaded file with pandas read_csv.
# After loading, display shape to see the number of rows and columns.

## No explicit encoding needs to be specified for the newly created csv.
df = pd.read_csv("/content/drive/My Drive/DataCollection/KIOM_KDC/data/KDC_feature.csv")
df.shape

(500, 82)

In [6]:
# Load the data in which the missing values of "저밀도콜레스테롤" have been filled in.
# NOTE(review): the file name refers to diabetes/insulin rather than this
# column -- confirm that this is the intended file.
df_ldl = pd.read_csv("/content/drive/My Drive/DataCollection/KIOM_KDC/data/diabetes_fill_insulin.csv")
# Put the imputed column into df (KDC_feature.csv, loaded above).
# NOTE(review): assumes both files have the same rows in the same order -- TODO confirm.
df["저밀도콜레스테롤"] = df_ldl["저밀도콜레스테롤"]

In [8]:
# Preview the data with sample, head, or tail.
df.head()

Unnamed: 0,대상자식별코드,성별,만나이,직업분류,학력,결혼유무,성격_대범_섬세,성격_행동빠름_행동느림,성격_적극_소극,성격_직접_간접,성격_포기빠름_포기느림,성격_외향_내성,성격_동적_정적,성격_쉽게_어렵게,성격_남성적_여성적,성격_참을성부족_잘찾음,성격_큰편_적은편,성격_잘드러냄_안드러냄,성격_잘표현_표현안하는편,성격_가끔흥분_이성적,성격_덜렁_꼼꼼,식사량,식사속도,소화여부,소화입맛,땀정도,땀기분,대변습관,대변굳기,배변시긴박감,배변후잔변감,소변야간뇨,한열민감도,한열손부위,한열발부위,한열배부위,음수정도,음수온다,한열점수,한열그룹,...,고지혈증_진단,일반건강,수면시간(시간),수면시간(분),피로정도,피로_기상시,피로_오전,피로_오후,피로_밤,피로_하루종일,한의사체질진단,키,몸무게,BMI,이마둘레,목둘레,겨드랑이둘레,가슴둘레,늑골둘레,허리둘레,장골둘레,곡골둘레,수축기혈합,이완기혈압,혈당,총콜레스테롤,중성지방,고밀도콜레스테롤,저밀도콜레스테롤,수면시간(시간)_부족,늑골둘레_low,늑골둘레_middle,늑골둘레_high,혈당_nan,총콜레스테롤_nan,중성지방_nan,고밀도콜레스테롤_nan,저밀도콜레스테롤_nan,혈당_log,중성지방_log
0,KDCT00001,2,0.036746,2,5,2,2,2,1,2,1,1,1.0,2.0,2,3,2,2,2,1,1,2.0,2.0,1,3,3,1.0,1,2,2.490637,2.314607,1.0,1,2,2,2,2.0,2,12.0,2.0,...,1,3.0,5,0,2,0,0,0,1,0,2,-0.372588,-0.551481,-0.421862,-1.411128,-0.532671,0.266128,0.606435,89.0,-0.140773,0.16726,0.414628,0.017181,0.342702,0.220807,0.964152,-0.652711,0.941597,128.0,1.542199,-0.376235,-1.040833,1.344987,0.219656,0.963293,-0.652738,0.937537,0.671852,4.65396,4.290459
1,KDCT00002,2,2.013834,14,3,2,1,1,1,1,1,1,1.0,2.0,1,3,1,1,1,1,3,2.0,2.0,1,3,1,2.0,1,2,2.490637,2.314607,1.0,2,2,2,2,2.0,3,13.0,3.0,...,1,3.0,5,0,4,0,0,1,0,0,3,-0.74894,0.221659,0.869741,1.713833,0.07763,0.774588,1.211515,93.0,0.9093,0.814851,0.871402,0.267996,-1.005102,0.453146,0.376917,-0.343301,-0.19663,114.0,1.542199,-0.376235,-1.040833,1.344987,0.451957,0.376095,-0.343379,-0.199521,0.209862,4.70953,4.584967
2,KDCT00003,1,0.119384,3,3,2,3,1,1,1,1,1,1.0,1.0,1,2,1,1,1,1,3,3.0,2.0,3,2,3,1.0,1,2,2.490637,2.314607,1.0,3,2,3,2,2.0,2,12.0,2.0,...,1,3.0,7,0,2,0,0,1,0,0,2,-0.121687,-0.164911,-0.114338,0.672179,0.687931,0.647473,-0.119661,83.0,-0.350787,-1.127923,-1.869239,0.079885,-0.106566,-0.747269,0.876066,-0.454689,0.047276,132.0,-0.648425,-0.376235,0.960769,-0.743502,-0.748265,0.875213,-0.454748,0.044134,0.803849,4.382027,4.488636
3,KDCT00004,2,0.152821,14,4,2,1,1,1,1,1,1,1.0,2.0,1,1,1,2,1,1,2,2.078385,1.0,3,2,2,3.0,2,2,2.490637,2.314607,1.0,2,2,2,2,1.0,2,15.0,3.0,...,1,4.0,7,0,4,0,0,1,0,0,2,0.505566,1.478013,1.392533,1.713833,0.687931,1.283048,1.574564,90.0,0.804292,0.555815,0.871402,-0.421745,0.432555,0.065915,0.93479,0.139377,-0.359233,136.0,-0.648425,-0.376235,-1.040833,1.344987,0.064789,0.933933,0.139222,-0.361958,0.935846,4.615121,4.919981
4,KDCT00005,1,0.09309,8,2,2,3,2,3,2,2,3,2.0,2.0,2,3,3,2,2,2,1,2.0,1.0,1,3,1,2.0,1,2,2.490637,2.314607,0.0,1,3,3,2,1.0,2,11.0,2.0,...,1,2.0,5,0,4,0,0,1,0,0,1,0.505566,0.898158,0.715979,2.23466,1.603383,2.045738,1.332531,99.0,1.539343,1.073888,1.02366,0.581515,-0.376127,-0.24387,-0.004785,-0.788851,-1.253554,118.0,1.542199,-0.376235,-1.040833,1.344987,-0.244946,-0.005584,-0.788856,-1.255361,0.341859,4.532599,4.127134


# 학습과 예측에 사용할 데이터셋 만들기

In [9]:
# Display every column name in df so the training features can be picked from it.
df.columns

Index(['대상자식별코드', '성별', '만나이', '직업분류', '학력', '결혼유무', '성격_대범_섬세',
       '성격_행동빠름_행동느림', '성격_적극_소극', '성격_직접_간접', '성격_포기빠름_포기느림', '성격_외향_내성',
       '성격_동적_정적', '성격_쉽게_어렵게', '성격_남성적_여성적', '성격_참을성부족_잘찾음', '성격_큰편_적은편',
       '성격_잘드러냄_안드러냄', '성격_잘표현_표현안하는편', '성격_가끔흥분_이성적', '성격_덜렁_꼼꼼', '식사량',
       '식사속도', '소화여부', '소화입맛', '땀정도', '땀기분', '대변습관', '대변굳기', '배변시긴박감',
       '배변후잔변감', '소변야간뇨', '한열민감도', '한열손부위', '한열발부위', '한열배부위', '음수정도', '음수온다',
       '한열점수', '한열그룹', '고혈압_진단', '당뇨_진단', '고지혈증_진단', '일반건강', '수면시간(시간)',
       '수면시간(분)', '피로정도', '피로_기상시', '피로_오전', '피로_오후', '피로_밤', '피로_하루종일',
       '한의사체질진단', '키', '몸무게', 'BMI', '이마둘레', '목둘레', '겨드랑이둘레', '가슴둘레', '늑골둘레',
       '허리둘레', '장골둘레', '곡골둘레', '수축기혈합', '이완기혈압', '혈당', '총콜레스테롤', '중성지방',
       '고밀도콜레스테롤', '저밀도콜레스테롤', '수면시간(시간)_부족', '늑골둘레_low', '늑골둘레_middle',
       '늑골둘레_high', '혈당_nan', '총콜레스테롤_nan', '중성지방_nan', '고밀도콜레스테롤_nan',
       '저밀도콜레스테롤_nan', '혈당_log', '중성지방_log'],
      dtype='object')

In [10]:
# Keep only the columns that scored well during feature engineering,
# i.e. the column names that will be used for training.
feature_columns = [
    # demographics
    '성별', '만나이', '직업분류', '학력', '결혼유무',
    # personality items
    '성격_대범_섬세', '성격_행동빠름_행동느림', '성격_적극_소극', '성격_직접_간접',
    '성격_포기빠름_포기느림', '성격_외향_내성', '성격_동적_정적', '성격_쉽게_어렵게',
    '성격_남성적_여성적', '성격_참을성부족_잘찾음', '성격_큰편_적은편',
    '성격_잘드러냄_안드러냄', '성격_잘표현_표현안하는편', '성격_가끔흥분_이성적',
    '성격_덜렁_꼼꼼',
    # eating, digestion, sweating, bowel and urination items
    '식사량', '식사속도', '소화여부', '소화입맛', '땀정도', '땀기분',
    '대변습관', '대변굳기', '배변시긴박감', '배변후잔변감', '소변야간뇨',
    # cold/heat and water-intake items
    '한열민감도', '한열손부위', '한열발부위', '한열배부위', '음수정도', '음수온다',
    '한열점수', '한열그룹',
    # diagnoses, general health and fatigue
    '고혈압_진단', '당뇨_진단', '고지혈증_진단', '일반건강',
    '피로정도', '피로_기상시', '피로_오전', '피로_오후', '피로_밤', '피로_하루종일',
    # body measurements and blood pressure
    '키', '몸무게', 'BMI', '이마둘레', '목둘레', '겨드랑이둘레', '가슴둘레',
    '허리둘레', '장골둘레', '곡골둘레', '수축기혈합', '이완기혈압',
    # engineered columns
    '수면시간(시간)_부족', '늑골둘레_low', '늑골둘레_middle', '늑골둘레_high',
    '총콜레스테롤_nan', '중성지방_nan', '고밀도콜레스테롤_nan', '저밀도콜레스테롤_nan',
    '혈당_log', '중성지방_log',
]
X = df[feature_columns]
X.shape

(500, 71)

In [11]:
# Target for the classifiers: the '한의사체질진단' column of df.
y = df['한의사체질진단']
y.shape

(500,)

In [12]:
# Build the split with train_test_split from scikit-learn's model_selection.

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state=42 makes the split repeatable.
# NOTE(review): the split is not stratified on y -- consider stratify=y if the
# class balance of the target matters; confirm before changing, since it alters
# every score reported further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [13]:
# Check the number of rows in the training features and the training labels.

X_train.shape, y_train.shape

((400, 71), (400,))

In [14]:
# Check the number of rows in the test features and the test labels.

X_test.shape, y_test.shape

((100, 71), (100,))

# 여러 개의 알고리즘을 사용해서 비교하기

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate classifiers to compare; each one is created with the same fixed seed.
estimator_classes = (
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
)
estimators = [estimator_class(random_state=42) for estimator_class in estimator_classes]
estimators

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=42, splitter='best'),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 GradientBoost

In [None]:
# max_depth = np.random.randint(2, 20, 10)
# max_depth

array([ 9, 14, 10, 15, 12, 13, 19, 14, 14, 10])

In [None]:
# max_features = np.random.uniform(0.3, 1.0, 10)
# max_features

array([0.48973513, 0.5069293 , 0.64662576, 0.49227339, 0.59475933,
       0.67377037, 0.32271838, 0.68039516, 0.89152585, 0.44009743])

In [None]:
# results = []
# for estimator in estimators:
#     result = []
#     result.append(estimator.__class__.__name__)
#     results.append(result)
# results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [None]:
# param_distributions["n_estimators"] = np.random.randint(100, 1000, 10)
# param_distributions

In [16]:
from sklearn.model_selection import RandomizedSearchCV

## The sampling code from the scratch cells above is repeated here so that
## every run of this cell draws a fresh set of random candidate values.
max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

# Search space shared by every estimator. It is never modified inside the loop,
# so the outcome does not depend on the order of `estimators`.
param_distributions = {"max_depth": max_depth,
                       "max_features": max_features}

results = []
for estimator in estimators:
    # Work on a per-estimator copy: adding "n_estimators" to the shared dict
    # would leak it into later iterations and make the search fail for any
    # estimator without that parameter (e.g. a single decision tree).
    search_space = dict(param_distributions)
    if "n_estimators" in estimator.get_params():
        # n_estimators is the number of trees the ensemble builds.
        search_space["n_estimators"] = np.random.randint(100, 200, 10)

    # cv=5 folds for each of n_iter=100 sampled candidates: 500 fits per estimator.
    # A larger n_iter explores more candidates at the cost of run time;
    # n_jobs=-1 runs the fits in parallel.
    clf = RandomizedSearchCV(estimator,
                             search_space,
                             n_iter=100,
                             scoring="accuracy",
                             n_jobs=-1,
                             cv=5,
                             verbose=2)
    clf.fit(X_train, y_train)

    # One row per estimator, in this order:
    #   class name, best parameters, best cross-validated score,
    #   score on the held-out test set (available here because y_test is known),
    #   full cross-validation results.
    result = [
        type(estimator).__name__,
        clf.best_params_,
        clf.best_score_,
        clf.score(X_test, y_test),
        clf.cv_results_,
    ]
    # Collect the per-model rows into the `results` list.
    results.append(result)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    4.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.5min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.3min finished


In [17]:
# Turn the per-estimator result rows into a table.
# NOTE: this rebinds `df`; the dataset loaded earlier is no longer reachable under that name.
# NOTE: the "train_score" column holds clf.best_score_ (the best cross-validated
# score from the search), not a score computed on the whole training set.
df = pd.DataFrame(results, 
             columns=["estimator", "best_params", "train_score", "test_score", "cv_result"])
df
# Looking at best_params below and adjusting the np.random ranges above accordingly
# narrows the search to a region where better parameters can be found.

Unnamed: 0,estimator,best_params,train_score,test_score,cv_result
0,DecisionTreeClassifier,"{'max_features': 0.6490240694416163, 'max_dept...",0.525,0.46,"{'mean_fit_time': [0.009935426712036132, 0.008..."
1,RandomForestClassifier,"{'n_estimators': 143, 'max_features': 0.589983...",0.5825,0.51,"{'mean_fit_time': [0.9676416397094727, 0.74272..."
2,GradientBoostingClassifier,"{'n_estimators': 197, 'max_features': 0.489039...",0.585,0.52,"{'mean_fit_time': [1.7797107696533203, 1.71578..."


In [18]:
# Each model's search results can be opened individually to list the candidates
# from best to worst and to read the exact parameter values.
# Row 2 is GradientBoostingClassifier, which had the highest test_score.
gb_cv_result = df.loc[2, "cv_result"]
pd.DataFrame(gb_cv_result).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,1.945760,0.028460,0.004417,0.000083,197,0.489039,10,"{'n_estimators': 197, 'max_features': 0.489039...",0.5750,0.5375,0.5875,0.6000,0.6250,0.5850,0.028940,1
17,1.093967,0.014158,0.004887,0.002688,180,0.529272,2,"{'n_estimators': 180, 'max_features': 0.529271...",0.6250,0.5375,0.5625,0.5375,0.6000,0.5725,0.034821,2
55,0.902158,0.011623,0.003270,0.000097,148,0.529272,2,"{'n_estimators': 148, 'max_features': 0.529271...",0.6125,0.5500,0.5625,0.5375,0.6000,0.5725,0.028940,2
68,1.631314,0.041834,0.004131,0.000091,104,0.589984,12,"{'n_estimators': 104, 'max_features': 0.589983...",0.5875,0.5250,0.5750,0.5375,0.6250,0.5700,0.035882,4
98,1.764201,0.042436,0.004137,0.000145,124,0.649024,12,"{'n_estimators': 124, 'max_features': 0.649024...",0.5500,0.5250,0.6000,0.5250,0.6500,0.5700,0.048477,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2.250046,0.077805,0.004156,0.000062,155,0.885827,10,"{'n_estimators': 155, 'max_features': 0.885826...",0.5000,0.4750,0.5875,0.4375,0.5625,0.5125,0.055340,96
80,2.294817,0.081488,0.004162,0.000070,189,0.885827,10,"{'n_estimators': 189, 'max_features': 0.885826...",0.5000,0.4750,0.5875,0.4375,0.5625,0.5125,0.055340,96
70,2.078104,0.045519,0.004113,0.000098,155,0.885827,12,"{'n_estimators': 155, 'max_features': 0.885826...",0.4750,0.4625,0.5625,0.4375,0.6000,0.5075,0.062550,98
38,2.009741,0.077792,0.004080,0.000196,124,0.885827,12,"{'n_estimators': 124, 'max_features': 0.885826...",0.4750,0.4625,0.5625,0.4375,0.6000,0.5075,0.062550,98


In [None]:
# 좋은 성능이 나오는 구간으로 계속 iteration을 돌릴 필요가 있습니다.
# 하이퍼 파라미터 튜닝을 여러 번 할수록 좋은 성능을 얻을 수 있습니다