### Greedy Forward Selection를 사용한 특징 선택

### 학습 내용
 * 데이터 준비
 * evaluate 함수 정의
 * Greedy Forward Selection
 * 단순화한 Greedy Forward Selection

### 데이터 준비

In [7]:
# ---------------------------------
# 데이터 등의 사전 준비
# ----------------------------------
import numpy as np
import pandas as pd

# train_x holds the training features, train_y the target variable, and
# test_x the test features. They are kept as pandas DataFrame / Series
# objects (holding them as numpy arrays is also common).

train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv')

# Separate the target column from the feature columns.
train_y = train['target']
train_x = train.drop(columns=['target'])

In [8]:
# Display the shapes of the raw training frame, its features/target, and the test features.
tuple(frame.shape for frame in (train, train_x, train_y, test_x))

((10000, 29), (10000, 28), (10000,), (10000, 28))

### 데이터 나누기

In [9]:
# 학습 데이터를 학습 데이터와 검증 데이터로 나누기
from sklearn.model_selection import KFold

# Use only the first of the four shuffled folds as a single hold-out split.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = next(iter(kf.split(train_x)))

# Positional indexing: the fold indices are row positions, not index labels.
tr_x = train_x.iloc[tr_idx]
tr_y = train_y.iloc[tr_idx]
va_x = train_x.iloc[va_idx]
va_y = train_y.iloc[va_idx]

# Display the shapes of the resulting training / validation pieces.
tuple(part.shape for part in (tr_x, va_x, tr_y, va_y))

((7500, 28), (2500, 28), (7500,), (2500,))

### evaluate 함수 정의

In [10]:
# 특징의 리스트에 대해 정밀도를 평가하는 evaluate 함수 정의
import xgboost as xgb
from sklearn.metrics import log_loss


def evaluate(features, *, num_round=10, early_stopping_rounds=3):
    """Train XGBoost on a subset of features and return the validation log loss.

    The model is fit on the module-level ``tr_x`` / ``tr_y`` split and scored
    on ``va_x`` / ``va_y``. Lower return values are better.

    Args:
        features: Column names to train on. A single name or any iterable of
            names is accepted; the iteration order becomes the column order
            of the training matrix.
        num_round: Maximum number of boosting rounds. The default of 10 keeps
            repeated calls cheap; real use needs more rounds.
        early_stopping_rounds: Forwarded to ``xgb.train`` as the early-stopping
            patience.

    Returns:
        The log loss of the model's predictions on the validation split.
    """
    # Normalize to a list so that column selection always yields a DataFrame.
    columns = [features] if isinstance(features, str) else list(features)

    dtrain = xgb.DMatrix(tr_x[columns], label=tr_y)
    dvalid = xgb.DMatrix(va_x[columns], label=va_y)

    # 'verbosity' is used here in place of the older 'silent' option.
    params = {'objective': 'binary:logistic', 'verbosity': 0, 'random_state': 71}

    # NOTE(review): with several entries in `evals`, xgboost is documented to
    # monitor the last one ('eval') for early stopping - confirm for the
    # installed version.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    model = xgb.train(params, dtrain, num_round,
                      evals=watchlist, early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=0)

    # NOTE(review): depending on the xgboost version, predict() may use every
    # trained round rather than only the best early-stopped iteration - confirm.
    va_pred = model.predict(dvalid)
    score = log_loss(va_y, va_pred)

    return score

### Greedy Forward Selection
 * 각 특징에 대해 evaluate 함수를 호출하여 점수를 계산하고, 가장 좋은 점수를 가지는 특징을 선택하여 추가
 * 선택된 특징과 해당 특징의 점수를 출력하고, 점수가 개선되지 않을 경우 종료.
 * 최종적으로 선택된 특징들을 출력.

In [11]:
# ---------------------------------
# Greedy Forward Selection
# ----------------------------------
best_score = 9999.0
selected = set()
# Selection-ordered mirror of `selected`. Iterating a set of strings follows
# hash order, which can differ from one interpreter session to the next, so
# the feature list handed to evaluate() is built from this list instead to
# keep the column order (and therefore the run) reproducible.
selected_order = []

print('start greedy forward selection')

# Each pass adds at most one feature; stop once every column has been selected.
while len(selected) < len(train_x.columns):
    scores = []
    for feature in train_x.columns:
        if feature in selected:
            continue
        # Score the current subset extended by this one candidate feature.
        fs = selected_order + [feature]
        score = evaluate(fs)
        scores.append((feature, score))

    # Lower scores are better; on ties the earliest column is kept.
    b_feature, b_score = min(scores, key=lambda tpl: tpl[1])
    if b_score < best_score:
        selected.add(b_feature)
        selected_order.append(b_feature)
        best_score = b_score
        print(f'selected:{b_feature}')
        print(f'score:{b_score}')
    else:
        # No remaining feature improves the score, so stop.
        break

print(f'selected features: {selected}')

start greedy forward selection
selected:medical_info_a1
score:0.4638764099634843
selected:weight
score:0.4346644402457856
selected:height
score:0.39207130266144324
selected:medical_keyword_5
score:0.37057836959580936
selected:age
score:0.3486370122790483
selected:medical_info_a2
score:0.34086903247488837
selected:medical_keyword_4
score:0.331715060720503
selected:medical_keyword_2
score:0.32059751101174727
selected:medical_keyword_3
score:0.31387996595027773
selected:product
score:0.30854146987522024
selected:sex
score:0.2988285641023615
selected features: {'age', 'weight', 'medical_keyword_2', 'sex', 'medical_keyword_4', 'medical_keyword_3', 'medical_info_a2', 'medical_info_a1', 'medical_keyword_5', 'product', 'height'}


### 단순화한 Greedy Forward Selection

In [12]:
# ---------------------------------
# Greedy Forward Selection 단순화한 기법
# ----------------------------------

best_score = 9999.0
# Visit the columns once, in a fixed pseudo-random order (seed 71).
candidates = np.random.RandomState(71).permutation(train_x.columns)
selected = set()
# Selection-ordered mirror of `selected`. Iterating a set of strings follows
# hash order, which can differ from one interpreter session to the next, so
# the feature list handed to evaluate() is built from this list instead to
# keep the column order (and therefore the run) reproducible.
selected_order = []

print('start simple selection')
for feature in candidates:
    # Score the current subset extended by this one candidate feature.
    fs = selected_order + [feature]
    score = evaluate(fs)

    # Lower scores are better: keep the feature only if it improves the score.
    if score < best_score:
        selected.add(feature)
        selected_order.append(feature)
        best_score = score
        print(f'selected:{feature}')
        print(f'score:{score}')

print(f'selected features: {selected}')

start simple selection
selected:product
score:0.48586676302888027
selected:weight
score:0.46565555967529015
selected:height
score:0.4196039056537238
selected:medical_keyword_4
score:0.4172736471297666
selected:medical_keyword_1
score:0.4127028714151444
selected:medical_keyword_2
score:0.4013172811195767
selected:age
score:0.37954424205939585
selected:medical_info_a1
score:0.3471448002356393
selected:medical_keyword_3
score:0.33311759233573063
selected:sex
score:0.33044960464462875
selected:medical_keyword_5
score:0.3093475101700445
selected:medical_keyword_10
score:0.30531590318201407
selected features: {'weight', 'age', 'medical_keyword_2', 'sex', 'medical_keyword_4', 'medical_keyword_10', 'medical_keyword_1', 'medical_keyword_3', 'medical_info_a1', 'medical_keyword_5', 'product', 'height'}
