# __[ SK Hynix Data Analytics ] <br><br>Predictive Data Analytics 7: Ensemble__

# <br>__1. Data: Pima Indians Diabetes Database__

- 데이터 description <br>
    - This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
<br><br>
- 변수 설명
    - 독립 변수 (8개)
        - Pregnancies: Number of times pregnant
        - Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
        - BloodPressure: Diastolic blood pressure ($\text{mm Hg}$)
        - SkinThickness: Triceps skin fold thickness ($\text{mm}$)
        - Insulin: 2-Hour serum insulin ($\text{mu U/ml}$)
        - BMI: Body mass index ($\text{weight in kg}$/$(\text{height in m})^2$)
        - DiabetesPedigreeFunction: Diabetes pedigree function
        - Age: Age ($\text{years}$)
    - 종속 변수 (1개)
        - Class = 1 (tested positive for diabetes) or 0 (tested negative for diabetes)
<br><br>
- 출처: https://www.kaggle.com/uciml/pima-indians-diabetes-database

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # data split

### Step 1. 데이터 불러오기

In [2]:
# Load the data; header=None makes pandas read the first row as data and
# label the columns with integers (0..8)
data = pd.read_csv('./data/pima-indians-diabetes.data.csv', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Rename the columns: eight independent variables followed by the target 'Class'
data.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', \
                'DiabetesPedigreeFunction', 'Age', 'Class']
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data_arr = data.values # convert to a NumPy array for easier positional indexing
data_arr

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

### Step 2. 데이터 Split

In [5]:
X = data_arr[:,:-1].astype(float)  # columns 0-7 are the independent variables
y = data_arr[:, -1].astype(int) # the last column is the dependent variable (Class)

print('X:', X[:3])
print('y:', y[:3])

X: [[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]
 [  1.     85.     66.     29.      0.     26.6     0.351  31.   ]
 [  8.    183.     64.      0.      0.     23.3     0.672  32.   ]]
y: [1 0 1]


In [6]:
# Split the data train:test = 7:3 (random_state is fixed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print('Number of train set:', len(X_train))
print('Number of test set:', len(X_test))

Number of train set: 537
Number of test set: 231


---

# __2. Base Model: Decision Tree__

In [7]:
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.metrics import accuracy_score

- DecisionTreeClassifier 설명
    - criterion: split quality 평가 함수 ('gini': 지니계수, 'entropy': information gain). 디폴트 'gini'
    - max_depth: 의사결정나무의 최대 깊이. 디폴트 None
    - min_samples_split: split하기 위해 필요한 최소 샘플 개수. 디폴트 2

### Step 1. 모델 선언 및 학습

In [8]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

### Step 2. 예측 및 모델 평가

In [9]:
dt_y_pred = dt_model.predict(X_test)
dt_y_pred

array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [10]:
# Evaluate accuracy on the test data (=0.7012)
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_y_pred)
dt_acc

0.7012987012987013

---

# __3. Ensemble Model: Bagging__

## 앙상블의 목적: 다수의 모델을 학습하여 오류의 감소를 추구
>**분산의 감소에 의한 오류 감소: 배깅(Bagging), 랜덤포레스트(Random Forest)** <br>
>편향의 감소에 의한 오류 감소: 부스팅(Boosting)

# __3-1. 비복원 추출을 통한 앙상블__

### K-fold Data Split
>전체 데이터를 k개의 block 으로 나누고 개별 모델을 서로 다른 (k-1)개의 subset에 대해 학습한 뒤 최종 결과를 결합

<img src="https://imgur.com/RBJMT7n.jpg" width="500">

In [11]:
from collections import Counter

### Step 1. k-fold Data Split

In [12]:
def split_data(n_data, k, random_state=42):
    """Build the training-index sets for a k-fold (leave-one-block-out) ensemble.

    The indices ``0 .. n_data - 1`` are shuffled once and cut into ``k``
    contiguous blocks of near-equal size. Fold ``i`` receives every shuffled
    index except those in block ``i``, so each fold trains on roughly
    ``(k - 1) / k`` of the data.

    Args:
        n_data: Number of samples to split.
        k: Number of blocks (and therefore folds).
        random_state: Seed for the shuffle.

    Returns:
        dict mapping fold number ``0 .. k - 1`` to a list of plain ``int``
        sample indices to train on.
    """
    # A private RandomState yields exactly the same permutation as
    # np.random.seed(random_state) + np.random.permutation(n_data), but does
    # not reseed the process-wide NumPy generator as a hidden side effect.
    rng = np.random.RandomState(random_state)
    permutation = rng.permutation(n_data).tolist()

    # k + 1 block boundaries; rounding keeps block sizes within one of each
    # other when n_data is not divisible by k.
    bounds = [int(round(b)) for b in np.linspace(0, n_data, k + 1)]

    # Fold i = everything before block i + everything after block i.
    return {
        i: permutation[:bounds[i]] + permutation[bounds[i + 1]:]
        for i in range(k)
    }

In [13]:
# 10-fold data split
k_fold_data_idx = split_data(X_train.shape[0], k=10, random_state=42)
k_fold_data_idx

{0: [525,
  176,
  104,
  211,
  140,
  148,
  196,
  312,
  76,
  532,
  70,
  481,
  469,
  364,
  369,
  255,
  78,
  218,
  448,
  489,
  68,
  497,
  63,
  452,
  433,
  82,
  341,
  33,
  84,
  90,
  0,
  11,
  407,
  22,
  352,
  365,
  470,
  172,
  153,
  367,
  18,
  195,
  411,
  15,
  293,
  39,
  493,
  524,
  46,
  93,
  323,
  321,
  305,
  536,
  506,
  192,
  348,
  180,
  301,
  137,
  441,
  325,
  268,
  533,
  244,
  426,
  69,
  377,
  318,
  185,
  131,
  154,
  228,
  464,
  124,
  375,
  447,
  424,
  182,
  355,
  290,
  472,
  227,
  245,
  421,
  19,
  494,
  56,
  396,
  483,
  362,
  461,
  316,
  25,
  275,
  42,
  333,
  225,
  184,
  302,
  126,
  272,
  357,
  31,
  113,
  210,
  382,
  167,
  57,
  229,
  24,
  17,
  437,
  66,
  304,
  261,
  248,
  376,
  94,
  408,
  509,
  23,
  442,
  284,
  237,
  5,
  116,
  45,
  204,
  422,
  16,
  507,
  390,
  3,
  257,
  307,
  60,
  516,
  110,
  349,
  286,
  29,
  220,
  277,
  26,
  7,
  181,
  495,
  

### Step 2. 모델 선언 및 학습

In [14]:
def train_k_fold_ensemble(k_fold_idx, X_train, y_train):
    """Fit one decision tree per fold on that fold's subset of the training data.

    Args:
        k_fold_idx: Mapping from fold key to the sample indices of that fold.
        X_train: Training features, indexed by the fold indices.
        y_train: Training labels, indexed by the fold indices.

    Returns:
        dict mapping each fold key to its fitted DecisionTreeClassifier.
    """
    fitted = {}
    for fold, rows in k_fold_idx.items():
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_train[rows], y_train[rows])
        fitted[fold] = tree
    return fitted

In [15]:
k_fold_models = train_k_fold_ensemble(k_fold_data_idx, X_train, y_train)
k_fold_models

{0: DecisionTreeClassifier(random_state=42),
 1: DecisionTreeClassifier(random_state=42),
 2: DecisionTreeClassifier(random_state=42),
 3: DecisionTreeClassifier(random_state=42),
 4: DecisionTreeClassifier(random_state=42),
 5: DecisionTreeClassifier(random_state=42),
 6: DecisionTreeClassifier(random_state=42),
 7: DecisionTreeClassifier(random_state=42),
 8: DecisionTreeClassifier(random_state=42),
 9: DecisionTreeClassifier(random_state=42)}

### Step 3. 예측 및 모델 평가

In [16]:
def majority_voting(numbers):
    """Return the most frequent value in ``numbers``.

    Ties are broken in favour of the tied value whose first occurrence comes
    latest in ``numbers`` (Counter keeps first-seen order and the last tied
    entry is returned).

    Args:
        numbers: Iterable of hashable votes.

    Returns:
        The winning vote.

    Raises:
        IndexError: If ``numbers`` is empty.
    """
    counts = Counter(numbers)
    if not counts:
        # Same exception type as indexing the empty winner list, but explicit.
        raise IndexError("majority_voting() needs at least one vote")

    # Compute the top count once instead of once per distinct value.
    top = max(counts.values())
    winners = [value for value, n in counts.items() if n == top]
    return winners[-1]

In [17]:
def test_k_fold_ensemble(k_fold_models, X_test, y_test):
    """Predict with every fold model and combine the predictions by majority vote.

    Args:
        k_fold_models: Mapping from fold key to a fitted model with ``predict``.
        X_test: Samples to predict.
        y_test: Not used by this function; kept in the signature for callers.

    Returns:
        list with one voted prediction per row of ``X_test``.
    """
    # One prediction vector per model -> rows are models, columns are samples.
    per_model = [k_fold_models[key].predict(X_test) for key in k_fold_models]

    # Transpose so each row holds all models' votes for a single sample.
    votes_by_sample = np.array(per_model).T

    return [majority_voting(votes) for votes in votes_by_sample]

In [18]:
k_fold_y_pred = test_k_fold_ensemble(k_fold_models, X_test, y_test)
k_fold_y_pred

[0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0]

In [19]:
# Evaluate accuracy on the test data (=0.7142)
k_fold_acc = accuracy_score(y_true=y_test, y_pred=k_fold_y_pred)
k_fold_acc

0.7142857142857143

---

# __3-2. Bagging__

### Bagging: Bootstrap Aggregating
> 앙상블의 각 멤버(모델)은 서로 다른 학습 데이터셋을 이용 <br>
> 각 데이터셋은 복원 추출 (sampling with replacement)을 통해 원래 데이터의 수만큼의 크기를 갖도록 샘플링 <br>
> 개별 데이터셋을 붓스트랩(bootstrap)이라 부름

<img src="https://imgur.com/ZyhJKLm.jpg" width="450">

In [20]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
from sklearn.ensemble import BaggingClassifier # bagging
from sklearn.model_selection import GridSearchCV # cross-validation
from sklearn.metrics import accuracy_score

- BaggingClassifier 설명
    - base_estimator: 기본 모형 (scikit-learn 1.2부터 estimator로 이름 변경)
    - n_estimators: 모형 갯수. 디폴트 10
    - bootstrap: 데이터의 중복 사용 여부. 디폴트 True
    - max_samples: 데이터 샘플 중 선택할 샘플의 수 혹은 비율. 디폴트 1.0
    - bootstrap_features: 특징 차원의 중복 사용 여부. 디폴트 False
    - max_features: 다차원 독립 변수 중 선택할 차원의 수 혹은 비율. 디폴트 1.0

### Step 1. Base 모델 선언

In [21]:
base_model = DecisionTreeClassifier(random_state=42)
base_model

DecisionTreeClassifier(random_state=42)

### Step 2. 모델 선언 및 학습

In [22]:
# The base model is passed positionally: its keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` afterwards (the old name was
# removed in 1.4), so the positional form runs on every version.
bag_model = BaggingClassifier(base_model, random_state=42)
bag_model.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  random_state=42)

In [23]:
# bagging 모델을 구성하는 10개의 base 모델 확인
bag_model.estimators_

[DecisionTreeClassifier(random_state=1952926171),
 DecisionTreeClassifier(random_state=1761383086),
 DecisionTreeClassifier(random_state=1449071958),
 DecisionTreeClassifier(random_state=1910541088),
 DecisionTreeClassifier(random_state=1341730541),
 DecisionTreeClassifier(random_state=1286572245),
 DecisionTreeClassifier(random_state=1005142668),
 DecisionTreeClassifier(random_state=502852014),
 DecisionTreeClassifier(random_state=186414760),
 DecisionTreeClassifier(random_state=1956263048)]

### Step 3. 예측 및 모델 평가

In [24]:
bag_y_pred = bag_model.predict(X_test)
bag_y_pred

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [25]:
# Evaluate accuracy on the test data (=0.7272)
bag_acc = accuracy_score(y_true=y_test, y_pred=bag_y_pred)
bag_acc

0.7272727272727273

==> Bagging 모델이 base 모델보다 향상된 성능을 도출함
<br><br>

### Step 4. Hyperparameter 탐색
- GridSearchCV에 대한 설명
    - estimator: 탐색 모형
    - param_grid: 탐색할 hyperparameter (dictionary 형태)
    - cv: cross-validation split 방법
        - None: to use the default 5-fold cross validation,
        - integer: to specify the number of folds in a (Stratified)KFold

In [26]:
# Hyperparameters 후보 설정
bag_param_grid = {'n_estimators': [5, 10, 15]}

In [27]:
# Set up the estimator for the hyperparameter search.
# The base model is passed positionally: its keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` afterwards (the old name was
# removed in 1.4), so the positional form runs on every version.
base_model = DecisionTreeClassifier(random_state=42)
bag_model = BaggingClassifier(base_model, random_state=42)

In [28]:
bag_grid_search = GridSearchCV(bag_model, param_grid=bag_param_grid, cv=5, scoring='accuracy')
bag_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                                         random_state=42),
             param_grid={'n_estimators': [5, 10, 15]}, scoring='accuracy')

In [29]:
# 최적 hyperparameter 확인 및 최적 모델 구축
bag_grid_search.best_params_

{'n_estimators': 15}

In [30]:
bag_opt_model = bag_grid_search.best_estimator_
bag_opt_model

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  n_estimators=15, random_state=42)

### Step 5. 예측 및 모델 평가

In [31]:
bag_opt_y_pred = bag_opt_model.predict(X_test)
bag_opt_y_pred

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [32]:
# Evaluate accuracy on the test data (=0.7316)
bag_opt_acc = accuracy_score(y_true=y_test, y_pred=bag_opt_y_pred)
bag_opt_acc

0.7316017316017316

### Step 6. 변수 중요도 확인

In [33]:
def get_variable_importance(model):
    """Average the per-estimator feature importances of an ensemble.

    Args:
        model: Fitted ensemble exposing ``estimators_``, each member of which
            exposes ``feature_importances_``.

    Returns:
        Array holding the mean importance of each feature across estimators.
    """
    # Rows are estimators, columns are features.
    per_estimator = np.asarray(
        [member.feature_importances_ for member in model.estimators_]
    )
    return per_estimator.mean(axis=0)

In [34]:
bag_opt_var_imp = pd.Series(get_variable_importance(bag_opt_model), index=data.columns[:-1])
bag_opt_var_imp.sort_values(ascending=False)

Glucose                     0.359248
BMI                         0.136523
Age                         0.135626
DiabetesPedigreeFunction    0.121842
BloodPressure               0.078994
SkinThickness               0.063885
Insulin                     0.052865
Pregnancies                 0.051016
dtype: float64

---

# __3-3. Random Forest__

### Random forest
> Random forest: 의사결정나무 기반 앙상블의 특수한 형태 <br>
> Random forest 모델: subsample들의 **랜덤하게 선택된 변수**를 사용해 모델 구성 <br>
> Bagging 모델: subsample들의 **모든 변수**를 사용해 모델 구성 

In [35]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

- RandomForestClassifier 설명
    - n_estimators: 트리 개수. 디폴트 100
    - criterion: split quality 평가 함수 ('gini': 지니계수, 'entropy': information gain). 디폴트 'gini'
    - max_depth: 의사결정나무의 최대 깊이. 디폴트 None
    - min_samples_split: split하기 위해 필요한 최소 샘플 개수. 디폴트 2
    - max_features: split하기 위해 고려하는 feature 개수 ('auto'&'sqrt': max_features=sqrt(n_features), 'log2': max_features=log2(n_features), None: max_features=n_features). 디폴트 'auto' (scikit-learn 1.1부터 디폴트 'sqrt')

### Step 1. 모델 선언 및 학습

In [36]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

### Step 2. 예측 및 모델 평가

In [37]:
rf_y_pred = rf_model.predict(X_test)
rf_y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [38]:
# Evaluate accuracy on the test data (=0.7532)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_y_pred)
rf_acc

0.7532467532467533

==> Random forest 모델이 base 모델보다 향상된 성능을 도출함
<br><br>

### Step 3. Hyperparameter 탐색

In [39]:
# Hyperparameters 후보 설정
rf_param_grid = {'n_estimators': [50, 100, 150]}

In [40]:
rf_model = RandomForestClassifier(random_state=42)

In [41]:
# Hyperparameter 탐색 진행
rf_grid_search = GridSearchCV(rf_model, param_grid=rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'n_estimators': [50, 100, 150]}, scoring='accuracy')

In [42]:
# 최적 hyperparameter 확인 및 최적 모델 구축
rf_grid_search.best_params_

{'n_estimators': 100}

In [43]:
rf_opt_model = rf_grid_search.best_estimator_
rf_opt_model

RandomForestClassifier(random_state=42)

### Step 4. 예측 및 모델 평가

In [44]:
rf_opt_y_pred = rf_opt_model.predict(X_test)
rf_opt_y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [45]:
# Evaluate accuracy on the test data (=0.7532)
rf_opt_model_acc = accuracy_score(y_true=y_test, y_pred=rf_opt_y_pred)
rf_opt_model_acc

0.7532467532467533

### Step 5. 변수 중요도 확인

In [46]:
rf_opt_model.feature_importances_

array([0.08055242, 0.28208895, 0.08405168, 0.07055914, 0.0693849 ,
       0.15811963, 0.11312693, 0.14211636])

In [47]:
rf_opt_var_imp = pd.Series(rf_opt_model.feature_importances_, index=data.columns[:-1])
rf_opt_var_imp.sort_values(ascending=False)

Glucose                     0.282089
BMI                         0.158120
Age                         0.142116
DiabetesPedigreeFunction    0.113127
BloodPressure               0.084052
Pregnancies                 0.080552
SkinThickness               0.070559
Insulin                     0.069385
dtype: float64

---