In [1]:
import numpy as np
import pandas as pd

# Forward slashes are accepted on Windows as well as POSIX systems, so the
# notebook no longer depends on the Windows-only backslash separator.
data_dir = "kaggle/porto seguro"

# The 'id' column becomes the index of each frame instead of a feature column.
train = pd.read_csv(f"{data_dir}/train.csv", index_col='id')
test = pd.read_csv(f"{data_dir}/test.csv", index_col='id')
submission = pd.read_csv(f"{data_dir}/sample_submission.csv", index_col='id')

### 피처 엔지니어링과 하이퍼파라미터 최적화 추가 적용

In [2]:
# Stack train on top of test so every feature transformation is applied to
# both sets at once, then discard the label column.
all_data = (
    pd.concat([train, test], ignore_index=True)
    .drop(columns='target')
)
# Snapshot of the original feature names (taken before derived columns are added).
all_features = all_data.columns

In [3]:
from sklearn.preprocessing import OneHotEncoder
# Nominal features are the columns whose name contains 'cat'.
cat_features = [name for name in all_features if 'cat' in name]

# Fit the encoder on train+test together, then encode the same columns.
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(all_data[cat_features])
encoded_cat_matrix = onehot_encoder.transform(all_data[cat_features])

#### 1. 파생 피처 추가

- 결측값 개수를 파생 피처로 만들기

In [4]:
# Derived feature: number of values in each row that equal -1 (the missing marker).
all_data['num_missing'] = all_data.eq(-1).sum(axis=1)

In [5]:
# Keep every original feature that is neither nominal ('cat') nor 'calc',
# and add the derived missing-value count.
remaining_features = [
    name for name in all_features
    if not ('cat' in name or 'calc' in name)
] + ['num_missing']

- 명목형 피처는 원-핫 인코딩으로 따로 처리하므로 제외하고, calc 분류 피처는 필요 없는 피처라서 제외
- 파생 피처(num_missing)도 추가

#### 2. 모든 ind 피처 값을 연결해서 새로운 피처 만들기 -> mix_ind

In [6]:
# Features that belong to the 'ind' group.
ind_features = [feature for feature in all_features if 'ind' in feature]

# Concatenate the values of all ind features, each followed by '_', into a
# single string column.
for position, ind_feature in enumerate(ind_features):
    piece = all_data[ind_feature].astype(str) + '_'
    if position == 0:
        # The first feature creates the column; the rest are appended to it.
        all_data['mix_ind'] = piece
    else:
        all_data['mix_ind'] += piece

In [7]:
# Preview the concatenated ind-value strings.
all_data['mix_ind']

0          2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1           1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2          5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3           0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4           0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
                           ...                  
1488023     0_1_6_0_0_0_1_0_0_0_0_0_0_0_2_0_0_1_
1488024    5_3_5_1_0_0_0_1_0_0_0_0_0_0_11_1_0_0_
1488025     0_1_5_0_0_1_0_0_0_0_0_0_0_0_5_0_0_1_
1488026    6_1_5_1_0_0_0_0_1_0_0_0_0_0_13_1_0_0_
1488027    7_1_4_1_0_0_0_0_1_0_0_0_0_0_12_1_0_0_
Name: mix_ind, Length: 1488028, dtype: object

#### 3. 명목형 피처의 고윳값별 개수를 새로운 피처로 추가

In [8]:
# Example: number of rows for each distinct value of one nominal feature.
all_data['ps_ind_02_cat'].value_counts()

 1    1079327
 2     309747
 3      70172
 4      28259
-1        523
Name: ps_ind_02_cat, dtype: int64

In [9]:
# The same counts converted to a {value: count} dictionary.
all_data['ps_ind_02_cat'].value_counts().to_dict()

{1: 1079327, 2: 309747, 3: 70172, 4: 28259, -1: 523}

In [10]:
# Derived features: for every nominal feature (plus mix_ind), the number of
# rows in all_data that share the row's value.
cat_count_features = []
for feature in cat_features+['mix_ind']:
    count_name = f'{feature}_count'
    # Series.map with the value_counts Series is a vectorized lookup; it avoids
    # calling a Python lambda once per row, which is slow on a frame this size.
    all_data[count_name] = all_data[feature].map(all_data[feature].value_counts())
    cat_count_features.append(count_name)

In [11]:
# Names of the generated count features.
cat_count_features

['ps_ind_02_cat_count',
 'ps_ind_04_cat_count',
 'ps_ind_05_cat_count',
 'ps_car_01_cat_count',
 'ps_car_02_cat_count',
 'ps_car_03_cat_count',
 'ps_car_04_cat_count',
 'ps_car_05_cat_count',
 'ps_car_06_cat_count',
 'ps_car_07_cat_count',
 'ps_car_08_cat_count',
 'ps_car_09_cat_count',
 'ps_car_10_cat_count',
 'ps_car_11_cat_count',
 'mix_ind_count']

- encoded_cat_matrix : 원-핫 인코딩된 명목형 피처
- remaining_features : 명목형 피처와 calc 분류의 피처를 제외한 피처들(+num_missing)
- cat_count_features : mix_ind를 포함한 명목형 피처의 고윳값별 개수 파생 피처

In [12]:
from scipy import sparse
# Features judged unnecessary and removed from the dense part.
drop_features = [
    'ps_ind_14',
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_car_14',
]

# Dense part: remaining features plus count features, without drop_features.
dense_columns = remaining_features + cat_count_features
all_data_remaining = all_data[dense_columns].drop(columns=drop_features)

# Join the dense part (converted to CSR) with the one-hot matrix column-wise.
dense_part = sparse.csr_matrix(all_data_remaining)
all_data_sprs = sparse.hstack([dense_part, encoded_cat_matrix], format='csr')

In [13]:
# The first num_train rows of the stacked matrix came from train; the rest from test.
num_train = train.shape[0]
x, x_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Labels of the training rows as a NumPy array.
y = train['target'].to_numpy()

#### 데이터셋 준비

In [14]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for scoring.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# LightGBM datasets used by the Bayesian-optimization objective.
bayes_dtrain = lgb.Dataset(data=x_train, label=y_train)
bayes_dvalid = lgb.Dataset(data=x_valid, label=y_valid)

#### 하이퍼파라미터 범위 설정 방법
1. 하이퍼파라미터 범위를 점점 좁히는 방법
2. 다른 상위권 캐글러가 설정한 하이퍼파라미터 참고

In [15]:
# Search ranges (lower, upper) explored by the Bayesian optimizer.
param_bounds = dict(
    num_leaves=(30, 40),
    lambda_l1=(0.7, 0.9),
    lambda_l2=(0.9, 1),
    feature_fraction=(0.6, 0.7),
    bagging_fraction=(0.6, 0.9),
    min_child_samples=(6, 10),
    min_child_weight=(10, 40),
)

# Hyperparameters that stay fixed for every trial.
fixed_params = dict(
    objective='binary',
    learning_rate=0.005,
    bagging_freq=1,
    force_row_wise=True,
    random_state=1991,
)

#### (베이지안 최적화용) 평가지표 계산 함수 작성

In [16]:
def eval_gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted scores, same shape as ``y_true``.

    Returns
    -------
    float
        Gini of the prediction ordering divided by the Gini of the perfect
        ordering. The division yields ``nan`` when the denominator is zero
        (for example when all labels are zero).

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # Coerce to ndarrays so that indexing below is positional. A pandas Series
    # would otherwise be indexed by label, and a plain list has no ``.shape``.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    assert y_true.shape == y_pred.shape, (
        f'shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}'
    )

    n_samples = y_true.shape[0]
    l_mid = np.linspace(1 / n_samples, 1, n_samples)  # diagonal (line of equality)

    def _gini_area(ordering):
        """Sum of (diagonal - Lorenz curve) with y_true arranged by ``ordering``."""
        ordered = y_true[ordering]
        lorenz = np.cumsum(ordered) / np.sum(ordered)
        return np.sum(l_mid - lorenz)

    g_pred = _gini_area(y_pred.argsort())  # y_true sorted by predicted score
    g_true = _gini_area(y_true.argsort())  # y_true sorted by itself (perfect prediction)

    # Normalized Gini coefficient.
    return g_pred / g_true

In [17]:
def gini(preds, dtrain):
    """Custom LightGBM evaluation function reporting the normalized Gini.

    Returns the tuple (metric name, metric value, is_higher_better) built from
    the labels stored in ``dtrain`` and the predictions ``preds``.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return 'gini', score, True

In [18]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction,
                 bagging_fraction, min_child_samples, min_child_weight):
    """Objective for Bayesian optimization: validation Gini of one candidate.

    Trains a LightGBM model on ``bayes_dtrain`` with the given hyperparameters
    (``num_leaves`` and ``min_child_samples`` are rounded to integers), merged
    with ``fixed_params``, and returns the normalized Gini coefficient of its
    predictions on ``x_valid`` / ``y_valid``.
    """
    tuned_params = {
        'num_leaves': int(round(num_leaves)),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': min_child_weight,
        # NOTE(review): presumably disabled so min_child_samples can vary
        # between trials that reuse the same Dataset; confirm against LightGBM docs.
        'feature_pre_filter': False,
    }
    # Fixed hyperparameters are merged last, so they win on any key clash.
    params = {**tuned_params, **fixed_params}

    print('하이퍼파라미터:', params)

    # Train the LightGBM model for this candidate.
    lgb_model = lgb.train(
        params=params,
        train_set=bayes_dtrain,
        num_boost_round=2500,
        valid_sets=bayes_dvalid,
        feval=gini,
        early_stopping_rounds=300,
        verbose_eval=False,
    )

    gini_score = eval_gini(y_valid, lgb_model.predict(x_valid))
    print(f'지니계수 : {gini_score}\n')

    return gini_score

#### 최적화 수행 : 최적 예측기(최적 하이퍼파라미터 값들로 훈련된 모델)를 제공하지 않음
- 베이지안 최적화로 찾은 하이퍼파라미터를 활용해 LightGBM 모델을 다시 훈련

In [19]:
from bayes_opt import BayesianOptimization

# Bayesian optimizer that searches param_bounds using eval_function as the objective.
optimizer = BayesianOptimization(
    f=eval_function,
    pbounds=param_bounds,
    random_state=0,
)

In [20]:
# Run the search: 3 random exploration points, then 6 Bayesian-optimization iterations.
optimizer.maximize(
    init_points=3,
    n_iter=6,
)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
하이퍼파라미터: {'num_leaves': 34, 'lambda_l1': 0.8205526752143287, 'lambda_l2': 0.9544883182996897, 'feature_fraction': 0.6715189366372419, 'bagging_fraction': 0.7646440511781974, 'min_child_samples': 8, 'min_child_weight': 29.376823391999682, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}




[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2855811556220905

| [0m 1       [0m | [0m 0.2856  [0m | [0m 0.7646  [0m | [0m 0.6715  [0m | [0m 0.8206  [0m | [0m 0.9545  [0m | [0m 7.695   [0m | [0m 29.38   [0m | [0m 34.38   [0m |
하이퍼파라미터: {'num_leaves': 39, 'lambda_l1': 0.7766883037651555, 'lambda_l2': 0.9791725038082665, 'feature_fraction': 0.6963662760501029, 'bagging_fraction': 0.867531900234624, 'min_child_samples': 8, 'min_child_weight': 27.04133683281797, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightG



[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2828993761731121

| [0m 4       [0m | [0m 0.2829  [0m | [0m 0.8978  [0m | [0m 0.6594  [0m | [0m 0.8445  [0m | [0m 0.9234  [0m | [0m 8.619   [0m | [0m 10.55   [0m | [0m 30.09   [0m |
하이퍼파라미터: {'num_leaves': 37, 'lambda_l1': 0.7738449330497988, 'lambda_l2': 0.9032695189818599, 'feature_fraction': 0.6606341064409726, 'bagging_fraction': 0.7666713964943057, 'min_child_samples': 9, 'min_child_weight': 29.306172421380474, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555




[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.28513273331754563

| [0m 5       [0m | [0m 0.2851  [0m | [0m 0.7667  [0m | [0m 0.6606  [0m | [0m 0.7738  [0m | [0m 0.9033  [0m | [0m 8.769   [0m | [0m 29.31   [0m | [0m 36.6    [0m |
하이퍼파라미터: {'num_leaves': 33, 'lambda_l1': 0.8781371569203059, 'lambda_l2': 0.9, 'feature_fraction': 0.6949291823969249, 'bagging_fraction': 0.6580770160451177, 'min_child_samples': 10, 'min_child_weight': 35.85661117604572, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2853170766671574

| [0m 6       [0m | [0m 0.2853  [0m | [0m 0.6581  [0m | [0m 0.6949  [0m | [0m 0.8781  [0m | [0m 0.9     [0m | [0m 9.826   [0m | [0m 35.86   [0m | [0m 32.8    [0m |
하이퍼파라미터: {'num_leaves': 37, 'lambda_l1': 0.8433793375135147, 'lambda_l2': 0.9479651949974717, 'feature_fraction': 0.6859622896374784, 'bagging_fraction': 0.8362539818721497, 'min_child_samples': 6, 'min_child_weight': 39.77484183530247, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2854766974907317

| [0m 7       [0m | [0m 0.2855  [0m | [0m 0.8363  [0m | [0m 0.686   [0m | [0m 0.8434  [0m | [0m 0.948   [0m | [0m 6.002   [0m | [0m 39.77   [0m | [0m 36.8    [0m |
하이퍼파라미터: {'num_leaves': 30, 'lambda_l1': 0.7, 'lambda_l2': 0.9, 'feature_fraction': 0.7, 'bagging_fraction': 0.6, 'min_child_samples': 10, 'min_child_weight': 27.93092783176277, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2853631089269565

| [0m 8       [0m | [0m 0.2854  [0m | [0m 0.6     [0m | [0m 0.7     [0m | [0m 0.7     [0m | [0m 0.9     [0m | [0m 10.0    [0m | [0m 27.93   [0m | [0m 30.0    [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7060727349466539, 'lambda_l2': 0.9122263686184788, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'min_child_samples': 10, 'min_child_weight': 40.0, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.28490600502878993

| [0m 9       [0m | [0m 0.2849  [0m | [0m 0.6     [0m | [0m 0.6     [0m | [0m 0.7061  [0m | [0m 0.9122  [0m | [0m 10.0    [0m | [0m 40.0    [0m | [0m 40.0    [0m |


In [21]:
# Hyperparameters of the trial with the highest objective value.
best_trial = optimizer.max
max_params = best_trial['params']
max_params

{'bagging_fraction': 0.6213108174593661,
 'feature_fraction': 0.608712929970154,
 'lambda_l1': 0.7040436794880651,
 'lambda_l2': 0.9832619845547939,
 'min_child_samples': 9.112627003799401,
 'min_child_weight': 36.10036444740457,
 'num_leaves': 39.78618342232764}

In [22]:
# The optimizer returns floats; these two parameters are rounded to integers.
for int_param in ('num_leaves', 'min_child_samples'):
    max_params[int_param] = int(round(max_params[int_param]))

In [23]:
# Add the fixed hyperparameters to the tuned ones (in place).
max_params.update(fixed_params)

In [24]:
# Final parameter set (tuned + fixed) used for the cross-validated training.
max_params

{'bagging_fraction': 0.6213108174593661,
 'feature_fraction': 0.608712929970154,
 'lambda_l1': 0.7040436794880651,
 'lambda_l2': 0.9832619845547939,
 'min_child_samples': 9,
 'min_child_weight': 36.10036444740457,
 'num_leaves': 40,
 'objective': 'binary',
 'learning_rate': 0.005,
 'bagging_freq': 1,
 'force_row_wise': True,
 'random_state': 1991}

In [28]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold cross-validation with out-of-fold (OOF) predictions.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

# OOF predictions for the training rows, and fold-averaged predictions for the test rows.
oof_val_preds = np.zeros(x.shape[0])
oof_test_preds = np.zeros(x_test.shape[0])

for idx, (train_idx, valid_idx) in enumerate(folds.split(x, y)):
    print('#'*40, f'폴드 {idx+1}/폴드 {folds.n_splits}', '#'*40)

    # Fold-local names: reusing x_train/x_valid/y_train/y_valid here would
    # overwrite the hold-out split that eval_function reads, so re-running the
    # optimizer after this cell would score against the wrong rows.
    x_fold_train, y_fold_train = x[train_idx], y[train_idx]
    x_fold_valid, y_fold_valid = x[valid_idx], y[valid_idx]

    dtrain = lgb.Dataset(x_fold_train, y_fold_train)
    dvalid = lgb.Dataset(x_fold_valid, y_fold_valid)

    # NOTE(review): newer LightGBM releases replace early_stopping_rounds /
    # verbose_eval with callbacks; confirm against the installed version.
    lgb_model = lgb.train(params=max_params, train_set=dtrain, num_boost_round=2500,
                          valid_sets=dvalid, feval=gini, early_stopping_rounds=300, verbose_eval=100)

    # Each fold contributes 1/n_splits of the final test prediction.
    oof_test_preds += lgb_model.predict(x_test) / folds.n_splits
    # Every validation row belongs to exactly one fold, so assign rather than accumulate.
    oof_val_preds[valid_idx] = lgb_model.predict(x_fold_valid)

    gini_score = eval_gini(y_fold_valid, oof_val_preds[valid_idx])
    print(f'폴드 {idx+1} 지니계수 : {gini_score}\n')

######################################## 폴드 1/폴드 5 ########################################




[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.154239	valid_0's gini: 0.270944
[200]	valid_0's binary_logloss: 0.153176	valid_0's gini: 0.275764
[300]	valid_0's binary_logloss: 0.152584	valid_0's gini: 0.279501
[400]	valid_0's binary_logloss: 0.152222	valid_0's gini: 0.282893
[500]	valid_0's binary_logloss: 0.151986	valid_0's gini: 0.286058
[600]	valid_0's binary_logloss: 0.151824	valid_0's gini: 0.288805
[700]	valid_0's binary_logloss: 0.151712	valid_0's gini: 0.290719
[800]	valid_0's binary_logloss: 0.151622	valid_0's gini: 0.292581
[900]	valid_0's binary_logloss: 0.151552	valid_0's gini: 0.294212
[1000]	va

폴드 4 지니계수 : 0.2805136229288192

######################################## 폴드 5/폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458815
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 476170, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274766
[LightGBM] [Info] Start training from score -3.274766
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.15439	valid_0's gini: 0.26681
[200]	valid_0's binary_logloss: 0.15338	valid_0's gini: 0.272186
[300]	valid_0's binary_logloss: 0.152821	valid_0's gini: 0.275897
[400]	valid_0's binary_logloss: 0.1525	valid_0's gini: 0.278734
[500]	valid_0's binary_logloss: 0.152277	valid_0's gini: 0.282151
[600]	valid_0's binary_logloss: 0.15212	valid_0's gini: 0.285039
[700]	valid_0's binary_logloss: 0.152009	valid_0's gini: 0.287435
[800]	valid_0's binary_

In [29]:
# Normalized Gini over all out-of-fold predictions.
oof_gini = eval_gini(y, oof_val_preds)
print('oof 검증 데이터 지니계수 :', oof_gini)

oof 검증 데이터 지니계수 : 0.2889651000887542


In [30]:
# Write the fold-averaged test predictions as the submission file. The path is
# relative, like the input CSVs, instead of an absolute path inside one user's
# profile directory, so the notebook runs on other machines.
submission['target'] = oof_test_preds
submission.to_csv("kaggle/porto seguro/sample_submission2.csv")