In [None]:
# Mount Google Drive inside the Colab runtime so files under it can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. failed fits during a
# grid search), so problems can go unnoticed — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the CP949-encoded CSV from the mounted Drive folder and display it.
DATA_PATH = '/content/drive/MyDrive/슬기로운 건강생활/data/국가건강검진_혈압혈당데이터.csv'
df = pd.read_csv(DATA_PATH, encoding='cp949')
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI
0,1,1,116,78,94,4,16.6
1,1,1,100,60,79,4,22.3
2,1,1,100,60,87,4,21.9
3,1,1,111,70,72,4,20.2
4,1,1,120,80,98,4,20.0
...,...,...,...,...,...,...,...
999995,2,27,120,70,81,2,23.1
999996,2,27,110,70,104,2,27.2
999997,2,27,115,53,110,1,25.2
999998,2,27,120,70,90,2,19.7


In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

SEX      0
BTH_G    0
SBP      0
DBP      0
FBS      0
DIS      0
BMI      0
dtype: int64

In [None]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   SEX     1000000 non-null  int64  
 1   BTH_G   1000000 non-null  int64  
 2   SBP     1000000 non-null  int64  
 3   DBP     1000000 non-null  int64  
 4   FBS     1000000 non-null  int64  
 5   DIS     1000000 non-null  int64  
 6   BMI     1000000 non-null  float64
dtypes: float64(1), int64(6)
memory usage: 53.4 MB


In [None]:
# Round every column to the nearest whole number, then cast the frame to int.
# The original cell labelled these two steps the wrong way round and repeated
# the int cast a second time; one chained statement is equivalent.
# NOTE(review): this discards the fractional part of BMI — confirm that is intended.
df = df.round(0).astype(int)

In [None]:
# Use the whole frame (every DIS label) as the modelling data.
# NOTE(review): the following cell reassigns `train` to the rows with DIS != 4,
# so when cells run top to bottom this assignment is overwritten — confirm
# which variant is meant to be active.
train=df

In [None]:
# Drop every row whose label (DIS) is 4.
keep_mask = df['DIS'] != 4
train = df.loc[keep_mask]

In [None]:
# Separate the target column (DIS) from the feature columns.
yt = np.array(train['DIS'])
xt = np.array(train.drop(['DIS'], axis=1))  # raw (unscaled) feature matrix

# Split BEFORE scaling: 80% train / 20% test, fixed seed for reproducibility.
xtrain, xtest, ytrain, ytest = train_test_split(xt, yt, test_size=0.2, random_state=42)

# Min-max normalisation. The scaler is fitted on the training rows only and
# then applied to the test rows, so no test-set statistics leak into training.
# (Test values may therefore fall slightly outside [0, 1].)
scaler = MinMaxScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [None]:
# Baseline random forest with default hyperparameters (no tuning).
rf = RandomForestClassifier(random_state=17)
rf.fit(xtrain, ytrain)

# Predict once; later cells reuse `forest_predictions` for their reports.
forest_predictions = rf.predict(xtest)

# Keep the score as the last expression so the notebook actually displays it
# (in the original it sat mid-cell and its value was discarded).
accuracy_score(ytest, forest_predictions)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Uses 1,000,000 rows of data.

# Confusion matrix of the baseline predictions against the test labels.
conf_mat = confusion_matrix(ytest, forest_predictions)
print(conf_mat)

[[  3271   2793   1301   3471]
 [  1530  10915    506  19409]
 [  1743   1124   1758   3965]
 [  1382   9550   1285 135997]]


In [None]:
# Per-class precision / recall / F1 of the baseline predictions.
report_text = classification_report(ytest, forest_predictions)
print(report_text)

              precision    recall  f1-score   support

           1       0.41      0.30      0.35     10836
           2       0.45      0.34      0.38     32360
           3       0.36      0.20      0.26      8590
           4       0.84      0.92      0.87    148214

    accuracy                           0.76    200000
   macro avg       0.51      0.44      0.47    200000
weighted avg       0.73      0.76      0.74    200000



# 하이퍼파라미터 튜닝

In [None]:
# RandomForest with its hyperparameters left in their default state;
# the bare `model` expression displays the estimator's repr.
model = RandomForestClassifier()
model

RandomForestClassifier()

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: forest size and depth are pinned to one value each; only the
# leaf / split minimums vary in this round.
params = dict(
    n_estimators=[10],  # 100
    max_depth=[6],  # 8, 10, 12
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Create the RandomForestClassifier, then run a 3-fold GridSearchCV over `params`.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=2022)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(xtrain, ytrain)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 6, 'min_samples_leaf': 18, 'min_samples_split': 8, 'n_estimators': 10}
최고 예측 정확도: 0.7699


In [None]:
from sklearn.model_selection import GridSearchCV

# Second round: larger leaf minimums, smaller split minimums.
params = dict(
    n_estimators=[10],  # 100
    max_depth=[6],  # 8, 10, 12
    min_samples_leaf=[18, 24, 32],
    min_samples_split=[2, 5, 8],
)

# Create the RandomForestClassifier, then run a 3-fold GridSearchCV over `params`.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=2022)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(xtrain, ytrain)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 6, 'min_samples_leaf': 32, 'min_samples_split': 2, 'n_estimators': 10}
최고 예측 정확도: 0.7700


In [None]:
from sklearn.model_selection import GridSearchCV

# Third round: much larger leaf minimums.
# min_samples_split accepts an int >= 2 or a float fraction in (0, 1]. The
# original grid also listed the integer 1, which is outside that range: those
# candidates cannot be fitted and only waste search time (their failure is
# hidden by the global warnings filter). It is dropped here.
# 0.5 is a fraction: a node needs at least half of the samples to be split.
params = {'n_estimators' : [10],  # 100
          'max_depth' : [6],   # 8, 10, 12
          'min_samples_leaf' : [32, 64, 128],
          'min_samples_split' : [0.5, 2]
          }

# Create the RandomForestClassifier, then run a 3-fold GridSearchCV over `params`.
rf_clf = RandomForestClassifier(random_state = 2022, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf,
                       param_grid = params,
                       cv = 3,
                       n_jobs = -1)
grid_cv.fit(xtrain, ytrain)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 6, 'min_samples_leaf': 32, 'min_samples_split': 2, 'n_estimators': 10}
최고 예측 정확도: 0.7700


In [None]:
from sklearn.model_selection import GridSearchCV

# Fourth round: start varying forest size and depth; split minimum fixed at 2.
params = dict(
    n_estimators=[10, 20],  # 100
    max_depth=[6, 8],  # 8, 10, 12
    min_samples_leaf=[32, 35],
    min_samples_split=[2],
)

# Create the RandomForestClassifier, then run a 3-fold GridSearchCV over `params`.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=2022)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(xtrain, ytrain)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 8, 'min_samples_leaf': 35, 'min_samples_split': 2, 'n_estimators': 20}
최고 예측 정확도: 0.7758


In [None]:
from sklearn.model_selection import GridSearchCV

# Fifth round: more trees, depth 7 vs 8, leaf minimum 35 vs 40.
params = dict(
    n_estimators=[20, 30],  # 100
    max_depth=[7, 8],  # 8, 10, 12
    min_samples_leaf=[35, 40],
    min_samples_split=[2],
)

# Create the RandomForestClassifier, then run a 3-fold GridSearchCV over `params`.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=2022)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(xtrain, ytrain)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 8, 'min_samples_leaf': 35, 'min_samples_split': 2, 'n_estimators': 30}
최고 예측 정확도: 0.7761


In [None]:
from sklearn.model_selection import GridSearchCV

# Sixth round: leaf/split minimums fixed; vary forest size and depth 8 vs 9.
params = dict(
    n_estimators=[30, 40, 50],  # 100
    max_depth=[8, 9],  # 8, 10, 12
    min_samples_leaf=[35],
    min_samples_split=[2],
)

# Create the RandomForestClassifier, then run a 3-fold GridSearchCV over `params`.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=2022)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(xtrain, ytrain)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 9, 'min_samples_leaf': 35, 'min_samples_split': 2, 'n_estimators': 50}
최고 예측 정확도: 0.7769


In [None]:
from sklearn.model_selection import GridSearchCV

# Same grid as the preceding search cell (forest size 30/40/50, depth 8/9,
# leaf minimum 35, split minimum 2), run once more.
params = dict(
    n_estimators=[30, 40, 50],  # 100
    max_depth=[8, 9],  # 8, 10, 12
    min_samples_leaf=[35],
    min_samples_split=[2],
)

# Create the RandomForestClassifier, then run a 3-fold GridSearchCV over `params`.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=2022)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(xtrain, ytrain)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

In [None]:
# Retrain a model with the best hyperparameters found by the grid searches
# above and measure its prediction performance on the test set.
rf_clf1 = RandomForestClassifier(n_estimators = 50,
                                max_depth = 9,
                                min_samples_leaf = 35,
                                min_samples_split = 2,
                                random_state = 0,
                                n_jobs = -1)
rf_clf1.fit(xtrain, ytrain)
pred = rf_clf1.predict(xtest)

# Score the tuned model's own predictions (`pred`). The original scored
# `forest_predictions`, i.e. the untuned baseline, so the printed accuracy
# did not reflect this model at all.
print('예측 정확도: {:.4f}'.format(accuracy_score(ytest, pred)))

예측 정확도: 0.7509


In [None]:
# Per-class report for the tuned model: use its predictions (`pred`), not the
# baseline's `forest_predictions` that the original passed here.
print(classification_report(ytest, pred))

              precision    recall  f1-score   support

           1       0.39      0.30      0.34     10847
           2       0.43      0.34      0.38     32647
           3       0.33      0.20      0.25      8514
           4       0.83      0.91      0.87    147992

    accuracy                           0.75    200000
   macro avg       0.50      0.43      0.46    200000
weighted avg       0.72      0.75      0.73    200000



In [None]:
from sklearn.metrics import f1_score

# Best mean cross-validated score of the most recent grid search.
# (The original line had a stray quote that split the format string and made
# the cell a syntax error.)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

# `f1` was never defined in the notebook; compute it for the tuned model's
# test-set predictions.
# NOTE(review): macro averaging is an assumption (the target is multi-class,
# so some averaging mode is required) — confirm the intended one.
f1 = f1_score(ytest, pred, average='macro')
'{0:.4f}'.format(f1)

In [None]:
# 각 피처 중요도 시각화
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index = X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

In [None]:
from sklearn.tree import export_graphviz
import os

import graphviz

# The original referenced `tree` and `cancer`, which are not defined anywhere
# in this notebook, and used two class labels ("malignant"/"benign") that do
# not match the DIS target. Export the first fitted tree of the tuned forest
# with this dataset's own feature and class names instead.
tree = rf_clf1.estimators_[0]
feature_names = list(train.drop(['DIS'], axis=1).columns)
class_names = [str(label) for label in rf_clf1.classes_]  # ascending label order

export_graphviz(tree, out_file ="tree.dot", class_names = class_names, feature_names = feature_names,
                impurity=False, filled=True)

# Path of a local Windows Graphviz install; only extend PATH when it exists,
# so the cell is a no-op on other environments.
graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
if os.path.isdir(graphviz_bin):
    os.environ["PATH"] += os.pathsep + graphviz_bin

# Read the exported DOT source back and render it inline.
with open("tree.dot" ,encoding="UTF-8") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))

# Save as an image (writes tree.png).
graphviz.Source(dot_graph).render('tree', format="png")

# 정답 4 제외

In [None]:
# Uses 1,000,000 rows of data.

# Confusion matrix of the baseline predictions against the test labels.
conf_mat = confusion_matrix(ytest, forest_predictions)
print(conf_mat)

[[ 3668  5281  1815]
 [ 2194 28997  1261]
 [ 2112  3726  2814]]


In [None]:
# Per-class precision / recall / F1 of the baseline predictions.
report_text = classification_report(ytest, forest_predictions)
print(report_text)

              precision    recall  f1-score   support

           1       0.46      0.34      0.39     10764
           2       0.76      0.89      0.82     32452
           3       0.48      0.33      0.39      8652

    accuracy                           0.68     51868
   macro avg       0.57      0.52      0.53     51868
weighted avg       0.65      0.68      0.66     51868

