## 의사결정트리 귀납법에 대한 Alternatives
# Random Forest
- 기법에 의한 속성중요도(feature importance) 측정

In [2]:
# ch4-2.py
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)

In [3]:
# 데이터로드 (NHIS_OPEN_GJ_2017.csv : 데이터 원본 파일)
# encoding : 윈도우즈 환경에서의 한글 처리
# engine : python 3.6에서 한글이 포함된 파일이름 사용
rawData_org = pd.read_csv(r'C:\Users\Lee seohyun\Desktop\데이터청년캠퍼스\전처리\Python-SQL-Preprocessing-master\datasets\국민건강정보\NHIS_OPEN_GJ_2017\NHIS_OPEN_GJ_2017.CSV', encoding='CP949', engine='python')

In [4]:
# 원본데이터의 튜플 수와 컬럼수 출력
rawData_org.shape

(1000000, 34)

## 전처리 

In [7]:
# 의사결정트리에 사용할 속성리스트
feature_columns_to_use = ['성별코드', '연령대코드(5세단위)', '신장(5Cm단위)', '체중(5Kg 단위)', '허리둘레', '시력(좌)', '시력(우)',
                    '청력(좌)', '청력(우)', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '트리글리세라이드', 'HDL콜레스테롤',
                    'LDL콜레스테롤', '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피', '흡연상태', '음주여부']

rawData = rawData_org[feature_columns_to_use]

In [12]:
# 속성값이 NULL인 튜플 제외
rawData = rawData.dropna()

# 인덱스 재설정
rawData.reset_index(inplace=True, drop=True)

# 원본데이터의 튜플 수와 컬럼수 출력
rawData.shape

(990910, 22)

50000개 데이터 샘플링 

In [14]:
import random as rd  # 샘플링을 위한 random 패키지 임포트

# 1) 샘플링을 통한 수량 축소
# 원본데이터 중 50,000개를 튜플만 샘플링
# len() : 데이터프레임의 크기를 구하는 함수
sample_idx = rd.sample(range(0, len(rawData)), 50000)

# 인덱스 값 정렬(오름차순)
sample_idx.sort()

sample_idx

[0,
 46,
 47,
 58,
 78,
 98,
 123,
 142,
 145,
 160,
 168,
 176,
 212,
 255,
 258,
 281,
 312,
 315,
 335,
 412,
 440,
 467,
 477,
 494,
 505,
 522,
 526,
 554,
 558,
 622,
 627,
 628,
 641,
 642,
 653,
 654,
 669,
 671,
 678,
 797,
 804,
 849,
 876,
 894,
 902,
 909,
 932,
 948,
 960,
 982,
 1001,
 1002,
 1009,
 1029,
 1046,
 1064,
 1078,
 1142,
 1165,
 1171,
 1172,
 1191,
 1228,
 1231,
 1245,
 1267,
 1270,
 1271,
 1278,
 1284,
 1314,
 1352,
 1366,
 1382,
 1383,
 1390,
 1423,
 1429,
 1468,
 1471,
 1481,
 1529,
 1539,
 1552,
 1602,
 1604,
 1613,
 1650,
 1656,
 1703,
 1742,
 1747,
 1758,
 1782,
 1788,
 1815,
 1830,
 1838,
 1846,
 1854,
 1871,
 1872,
 1879,
 1922,
 1926,
 1964,
 1988,
 2003,
 2034,
 2037,
 2061,
 2096,
 2134,
 2147,
 2150,
 2156,
 2185,
 2193,
 2205,
 2225,
 2230,
 2246,
 2251,
 2255,
 2309,
 2311,
 2337,
 2348,
 2352,
 2367,
 2400,
 2437,
 2439,
 2445,
 2468,
 2515,
 2560,
 2581,
 2622,
 2688,
 2690,
 2712,
 2764,
 2794,
 2804,
 2807,
 2822,
 2911,
 2932,
 2956,
 2962,


In [15]:
# 샘플링 된 인덱스로 구성된 샘플 데이터프레임 생성
rawData_sample = rawData.loc[sample_idx]

rawData_sample.reset_index(inplace=True, drop=True) #  인덱스 재설정

# 첫 10개의 행만 출력
rawData_sample.head(10)

Unnamed: 0,성별코드,연령대코드(5세단위),신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),청력(좌),청력(우),수축기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,(혈청지오티)AST,(혈청지오티)ALT,감마지티피,흡연상태,음주여부
0,1,13,170.0,65.0,91.0,1.0,1.2,1.0,1.0,158.0,...,161.0,43.0,102.0,1.0,1.0,19.0,41.0,25.0,3.0,0.0
1,2,10,155.0,50.0,70.0,1.2,1.2,1.0,1.0,115.0,...,69.0,50.0,144.0,1.0,0.9,18.0,13.0,15.0,1.0,0.0
2,2,13,155.0,75.0,90.0,0.9,1.0,1.0,1.0,150.0,...,148.0,37.0,49.0,1.0,0.8,23.0,22.0,21.0,1.0,0.0
3,1,9,175.0,80.0,84.0,1.2,1.0,1.0,1.0,130.0,...,97.0,52.0,127.0,1.0,1.1,29.0,22.0,25.0,2.0,1.0
4,2,9,160.0,70.0,82.0,1.5,1.2,1.0,1.0,120.0,...,89.0,74.0,137.0,1.0,0.8,18.0,10.0,14.0,1.0,0.0
5,2,6,165.0,55.0,68.5,0.9,0.9,1.0,1.0,108.0,...,55.0,58.0,70.0,1.0,0.6,24.0,14.0,5.0,1.0,0.0
6,2,15,150.0,55.0,85.0,0.4,0.6,1.0,1.0,140.0,...,134.0,55.0,120.0,1.0,0.4,44.0,37.0,42.0,1.0,0.0
7,1,10,160.0,60.0,81.0,1.2,1.0,1.0,1.0,124.0,...,111.0,64.0,143.0,1.0,0.9,36.0,38.0,46.0,2.0,0.0
8,1,9,170.0,70.0,84.0,1.0,0.8,1.0,1.0,117.0,...,110.0,40.0,127.0,1.0,0.9,19.0,22.0,19.0,2.0,0.0
9,1,16,165.0,55.0,78.0,0.8,0.9,1.0,1.0,124.0,...,176.0,63.0,94.0,1.0,1.2,22.0,12.0,37.0,3.0,1.0


In [16]:
rawData_sample.shape

(50000, 22)

인코딩 - 값이 숫자가 아닌 데이터에 대하여 숫자로 변환

In [17]:
# nonnumeric 속성을 categrical 데이터로 변환
nonnumeric_columns = ['성별코드','음주여부']

#인코딩
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for feature in nonnumeric_columns:
    rawData_sample[feature] = le.fit_transform(rawData_sample[feature])   

In [18]:
rawData_sample.head()

Unnamed: 0,성별코드,연령대코드(5세단위),신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),청력(좌),청력(우),수축기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,(혈청지오티)AST,(혈청지오티)ALT,감마지티피,흡연상태,음주여부
0,0,13,170.0,65.0,91.0,1.0,1.2,1.0,1.0,158.0,...,161.0,43.0,102.0,1.0,1.0,19.0,41.0,25.0,3.0,0
1,1,10,155.0,50.0,70.0,1.2,1.2,1.0,1.0,115.0,...,69.0,50.0,144.0,1.0,0.9,18.0,13.0,15.0,1.0,0
2,1,13,155.0,75.0,90.0,0.9,1.0,1.0,1.0,150.0,...,148.0,37.0,49.0,1.0,0.8,23.0,22.0,21.0,1.0,0
3,0,9,175.0,80.0,84.0,1.2,1.0,1.0,1.0,130.0,...,97.0,52.0,127.0,1.0,1.1,29.0,22.0,25.0,2.0,1
4,1,9,160.0,70.0,82.0,1.5,1.2,1.0,1.0,120.0,...,89.0,74.0,137.0,1.0,0.8,18.0,10.0,14.0,1.0,0


## 학습수행

In [19]:
# 소스 데이터프레임에서 분류(classification)을 위한 속성 집합
X = rawData_sample.loc[:, feature_columns_to_use[:-1]]  
y = rawData_sample.loc[:, '음주여부']  # 분류 클래스(class)

from sklearn.model_selection import train_test_split  # 분석모형 선택에 관련된 모듈
    
# 자동으로 데이터셋을 트레이닝셋과 테스트셋으로 분리해주는 함수로
# 트레이닝셋과 데이터셋의 비율을 7:3으로 세팅함
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [20]:
# RMSLE 계산하는 사용자지정의 함수
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values) :
    # 넘파이로 배열 형태로 변환
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    
    # 예측값과 실제값에 1을 더하고 로그를 씌움
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)
    
    # 위에서 계산한 예측값에서 실제값을 빼주고 제곱을 함
    difference = log_predict - log_actual
    # difference = (log_predict - log_actual) ** 2
    difference = np.square(difference)
    
    # 평균값 구함
    mean_difference = difference.mean()
    
    # 다시 루트를 씌움
    score = np.sqrt(mean_difference)
    
    return score

rmsle_scorer = make_scorer(rmsle)
rmsle_scorer

make_scorer(rmsle)

In [21]:
# Scikit-Learn 패키지 : 머신 러닝 교육 및 실무를 위한 패키지로 샘플 데이터셋,
# 다양한 기계학습 기법에 대한 함수 등을 포함하고 있음
from sklearn.ensemble import RandomForestClassifier  # 랜덤포리스트 기법에 관련된 모듈


random_forest = RandomForestClassifier(n_estimators=20, random_state=0)
random_forest

#n_estimators= 결정트리 개수

RandomForestClassifier(n_estimators=20, random_state=0)

In [22]:
# KFold 교차검증
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# 트레이닝셋에 대한 KFold 교차검증 수
%time score = cross_val_score(random_forest, X_train, y_train, cv=k_fold, scoring=rmsle_scorer)
score = score.mean()

# 0에 근접할수록 좋은 데이터
print("Score= {0:.5f}".format(score))

Wall time: 28.3 s
Score= 0.37262


## 의사결정트리 학습 

In [25]:
# fit() : 트레이닝 데이터셋을 대상으로 의사결정트리 학습 진행
random_forest.fit(X_train, y_train)

# tree.predict() 함수를 활용하여 의사결정트리를 대상으로 테스트셋을 예측
y_pred_tr = random_forest.predict(X_test)

#predict_proba() : 각 input 개체에 대한 예측클래스 확률 구함 (클래스 예측확률)
y_pred_tr_prob = random_forest.predict_proba(X_test)

In [26]:
# 점수 출력
print("Train Set Score1 : {:.2f}".format(random_forest.score(X_train, y_train)))
print("Test Set Score1 : {:.2f}".format(random_forest.score(X_test, y_test)))

Train Set Score1 : 1.00
Test Set Score1 : 0.72


정확도 

In [27]:
from sklearn.metrics import accuracy_score  # 분류 정확도(classification accuracy)를 계산하는 모듈

# accuracy_score() 함수를 활용하여 테스트셋의 실제 클래스와 예측된 클래스 간 정확도 측정
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_tr))

#위와 같은 것 두 방법 다 가능 

Accuracy: 0.72


중요도

In [28]:
#random_forest.feature_importances_

array([0.04808159, 0.07295811, 0.05989165, 0.03959783, 0.0564314 ,
       0.03480446, 0.03396292, 0.00200659, 0.00219808, 0.05216828,
       0.04994487, 0.05693784, 0.06409995, 0.06904181, 0.06333295,
       0.0058072 , 0.03882942, 0.04908633, 0.05705596, 0.08934957,
       0.0544132 ])

In [29]:
# 속성(feature) 별 중요도를 저장하는 데이터프레임 생성 
sel_feature = pd.DataFrame({'중요도' : random_forest.feature_importances_}, index = feature_columns_to_use[:-1])

# 중요도의 내림차순으로 정렬
sel_feature.sort_values(by='중요도', ascending=False)

Unnamed: 0,중요도
감마지티피,0.08935
연령대코드(5세단위),0.072958
HDL콜레스테롤,0.069042
트리글리세라이드,0.0641
LDL콜레스테롤,0.063333
신장(5Cm단위),0.059892
(혈청지오티)ALT,0.057056
식전혈당(공복혈당),0.056938
허리둘레,0.056431
흡연상태,0.054413


In [30]:
#예측결과와 실제결과 비교 
appData_anal_comp = pd.DataFrame(data={'actual': y_test, 'pred':y_pred_tr}, columns ={'actual','pred'})
appData_anal_comp

Unnamed: 0,pred,actual
11841,0,0
19602,0,0
45519,0,0
25747,1,1
42642,1,1
...,...,...
38344,1,1
49984,1,1
32624,1,0
46437,0,1


In [31]:
#예측 클래스와 클래스 예측확률을 X_test 데이터 프레임에 붙임
X_test['음주여부'] = y_test
X_test['음주여부예측'] = y_pred_tr

X_test['음주확률'] = y_pred_tr_prob[:,1]
X_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['음주여부'] = y_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['음주여부예측'] = y_pred_tr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['음주확률'] = y_pred_tr_prob[:,1]


Unnamed: 0,성별코드,연령대코드(5세단위),신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),청력(좌),청력(우),수축기혈압,...,LDL콜레스테롤,요단백,혈청크레아티닌,(혈청지오티)AST,(혈청지오티)ALT,감마지티피,흡연상태,음주여부,음주여부예측,음주확률
11841,1,14,150.0,55.0,75.4,0.9,1.0,1.0,1.0,142.0,...,217.0,1.0,0.8,19.0,13.0,14.0,1.0,0,0,0.25
19602,1,9,150.0,45.0,68.0,1.0,1.2,1.0,1.0,115.0,...,118.0,1.0,0.9,21.0,13.0,12.0,1.0,0,0,0.35
45519,0,8,170.0,65.0,85.0,1.0,1.2,1.0,1.0,122.0,...,146.0,2.0,1.1,33.0,48.0,22.0,1.0,0,0,0.30
25747,0,11,170.0,70.0,83.0,1.2,1.0,1.0,1.0,103.0,...,158.0,1.0,1.0,29.0,39.0,22.0,2.0,1,1,0.60
42642,1,8,155.0,55.0,72.0,0.7,0.7,1.0,1.0,112.0,...,105.0,1.0,0.8,17.0,13.0,22.0,1.0,1,1,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38344,0,8,175.0,115.0,105.0,0.5,0.5,1.0,1.0,123.0,...,88.0,1.0,1.3,31.0,47.0,34.0,1.0,1,1,0.65
49984,0,6,175.0,80.0,88.0,1.2,1.5,1.0,1.0,119.0,...,81.0,1.0,1.1,22.0,19.0,28.0,3.0,1,1,0.95
32624,0,6,180.0,75.0,86.0,0.5,0.5,1.0,1.0,140.0,...,126.0,1.0,0.9,20.0,35.0,29.0,2.0,0,1,0.55
46437,1,10,170.0,70.0,77.0,1.0,1.0,1.0,1.0,113.0,...,100.0,1.0,0.7,16.0,11.0,13.0,1.0,1,0,0.45


In [None]:
#0.5가 넘으면 음주를 한다고 판단 - 음주확률