# 3(5)(134). Feature Selection

### Imports

In [1]:
# for reading data
import os
import pandas as pd
import warnings; warnings.filterwarnings("ignore")

# feature selection
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier 
from lightgbm import LGBMClassifier 

# validation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, make_scorer

## Exclude Low Importance Features
모든 Feature를 집어넣어 학습시킨 뒤 Feature Importance가 낮은 Feature는 제외하고 Selection을 진행한다.

- **[Catboost]**

In [2]:
# Directory that holds the engineered feature CSVs and the label file.
path = os.path.abspath("./input")

# Every feature table is combined on the customer id with an outer join.
cat_join = dict(on='custid', how='outer')

# Train features: three feature tables merged into one frame.
feature_train1 = pd.read_csv(path +'/feature_train_transformation_1.csv', encoding='cp949')
feature_train3 = pd.read_csv(path +'/feature_train_W2V.csv', encoding='cp949')
feature_train4 = pd.read_csv(path +'/feature_train_BOW.csv', encoding='cp949')
feature_train = pd.merge(
    pd.merge(feature_train1, feature_train3, **cat_join),
    feature_train4,
    **cat_join,
)

# Test features: same three tables, merged the same way.
feature_test1 = pd.read_csv(path +'/feature_test_transformation_1.csv', encoding='cp949')
feature_test3 = pd.read_csv(path +'/feature_test_W2V.csv', encoding='cp949')
feature_test4 = pd.read_csv(path +'/feature_test_BOW.csv', encoding='cp949')
feature_test = pd.merge(
    pd.merge(feature_test1, feature_test3, **cat_join),
    feature_test4,
    **cat_join,
)

# Target labels come from the `group` column of y_train.csv.
# NOTE(review): labels are paired with the feature rows purely by position; this
# assumes y_train.csv is in the same row order as the merged frame — TODO confirm.
y_train = pd.read_csv(path +'/y_train.csv', encoding='cp949')['group']

# The id column is only a join key, so it is removed before modelling.
del feature_train['custid']
del feature_test['custid']

In [21]:
# Hold out 30% of the training rows as a dev set; the seed is fixed so the
# split is repeatable. The target part is named `y__train` (double underscore),
# which leaves the full `y_train` untouched.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the two parts.
X_train, X_dev, y__train, y_dev = train_test_split(
    feature_train,
    y_train,
    test_size=0.3,
    random_state=0,
)

In [22]:
# CatBoost multi-class model for the feature-importance screening: 100
# iterations at learning rate 0.03, Bayesian bootstrap, GPU task type with
# device spec '0:1', and a fixed seed.
catboost_params = dict(
    iterations=100,
    learning_rate=0.03,
    objective='MultiClass',
    bootstrap_type ='Bayesian',
    devices='0:1',
    task_type="GPU",
    random_state=0,
)
model = CatBoostClassifier(**catboost_params)

In [23]:
# Train on the 70% split only; the dev rows are kept aside for the log-loss check.
model.fit(X_train, y=y__train)

0:	learn: 2.0507196	total: 12.5ms	remaining: 1.23s
1:	learn: 2.0248167	total: 23.5ms	remaining: 1.15s
2:	learn: 2.0008893	total: 34.6ms	remaining: 1.12s
3:	learn: 1.9799998	total: 45.5ms	remaining: 1.09s
4:	learn: 1.9601500	total: 56.7ms	remaining: 1.08s
5:	learn: 1.9418501	total: 67.5ms	remaining: 1.06s
6:	learn: 1.9242522	total: 78.6ms	remaining: 1.04s
7:	learn: 1.9077793	total: 89.6ms	remaining: 1.03s
8:	learn: 1.8928060	total: 100ms	remaining: 1.02s
9:	learn: 1.8788363	total: 111ms	remaining: 1s
10:	learn: 1.8655935	total: 123ms	remaining: 991ms
11:	learn: 1.8534541	total: 134ms	remaining: 983ms
12:	learn: 1.8415374	total: 145ms	remaining: 972ms
13:	learn: 1.8300213	total: 157ms	remaining: 962ms
14:	learn: 1.8190566	total: 168ms	remaining: 951ms
15:	learn: 1.8091528	total: 179ms	remaining: 941ms
16:	learn: 1.7995277	total: 190ms	remaining: 929ms
17:	learn: 1.7903311	total: 201ms	remaining: 917ms
18:	learn: 1.7815180	total: 216ms	remaining: 919ms
19:	learn: 1.7736232	total: 227ms	re

<catboost.core.CatBoostClassifier at 0x182588b3070>

In [24]:
# Dev-set log loss recorded after each selection round:
#   all features     : 1.6034232816171814
#   top 211 features : 1.602436056352858   (result of redefine_1)
#   top 149 features : 1.6020110771039615  (result of redefine_2)  STOP!
#   top 127 features : 1.6020957613253453  (result of redefine_3)

# Pass the model's own class order as `labels`: without it log_loss infers the
# classes from y_dev alone, so a class missing from the dev split would leave
# the probability columns unmatched.
dev_proba = model.predict_proba(X_dev)
dev_logloss = log_loss(y_dev, dev_proba, labels=model.classes_)

# The printed count is the current number of columns in feature_train.
print(f'상위 {feature_train.shape[1]}개 사용: ', dev_logloss)

상위 127개 사용:  1.6020957613253453


In [19]:
# Inspect the feature importances: one row per feature, in model column order.
FI = pd.DataFrame({'Feature_Importances': model.feature_importances_})

# How many features share each importance value, ordered by importance.
importance_counts = FI.value_counts().sort_index()
display(importance_counts)

Feature_Importances
0.000000               21
0.090321                1
0.092056                1
0.100453                1
0.108723                1
                       ..
2.612200                1
3.110287                1
3.351606                1
5.278293                1
9.646181                1
Length: 129, dtype: int64

In [20]:
# Redefine the feature set.
# If the log loss goes up at a given importance threshold, re-search the values
# in between before deciding.
# NOTE(review): every statement in this cell is commented out, yet the save
# cell further down reads redefine_1 / redefine_2 — presumably one line was
# uncommented and run per selection round; confirm before a Restart & Run All.
# limit = round(FI.value_counts().sort_index().index[1][0], 6)
# redefine_1 = FI.query('Feature_Importances > @limit').index
# redefine_2 = FI.query('Feature_Importances > @limit').index
# redefine_3 = FI.query('Feature_Importances > @limit').index

# feature_train = feature_train.iloc[:, redefine_3]
# feature_test = feature_test.iloc[:, redefine_3]

- **[LGBM]**

In [43]:
# Re-read the feature tables from disk so the LGBM screening starts from
# freshly merged frames.
lgbm_join = dict(on='custid', how='outer')

# Train features: three feature tables merged on the customer id.
feature_train1 = pd.read_csv(path +'/feature_train_transformation_1.csv', encoding='cp949')
feature_train3 = pd.read_csv(path +'/feature_train_W2V.csv', encoding='cp949')
feature_train4 = pd.read_csv(path +'/feature_train_BOW.csv', encoding='cp949')
feature_train = pd.merge(
    pd.merge(feature_train1, feature_train3, **lgbm_join),
    feature_train4,
    **lgbm_join,
)

# Test features: same three tables, merged the same way.
feature_test1 = pd.read_csv(path +'/feature_test_transformation_1.csv', encoding='cp949')
feature_test3 = pd.read_csv(path +'/feature_test_W2V.csv', encoding='cp949')
feature_test4 = pd.read_csv(path +'/feature_test_BOW.csv', encoding='cp949')
feature_test = pd.merge(
    pd.merge(feature_test1, feature_test3, **lgbm_join),
    feature_test4,
    **lgbm_join,
)

# Target labels come from the `group` column of y_train.csv.
# NOTE(review): labels are paired with the feature rows purely by position; this
# assumes y_train.csv is in the same row order as the merged frame — TODO confirm.
y_train = pd.read_csv(path +'/y_train.csv', encoding='cp949')['group']

# The id column is only a join key, so it is removed before modelling.
del feature_train['custid']
del feature_test['custid']

In [63]:
# Hold out 30% of the training rows as a dev set, using the same seed and test
# size as the CatBoost section of this notebook.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the two parts.
X_train, X_dev, y__train, y_dev = train_test_split(
    feature_train,
    y_train,
    test_size=0.3,
    random_state=0,
)

In [64]:
# LightGBM multi-class model for the feature-importance screening: learning
# rate 0.03, multi-class log loss as the metric, fixed seed; everything else is
# left at the library defaults.
# NOTE(review): no device argument is set alongside `num_gpu=1` — confirm that
# GPU training is actually enabled by this configuration.
lgbm_params = dict(
    learning_rate=0.03,
    objective='multiclass',
    metrics='multi_logloss',
    num_gpu=1,
    random_state=0,
)
model = LGBMClassifier(**lgbm_params)

In [65]:
# Train on the 70% split only; the dev rows are kept aside for the log-loss check.
model.fit(X_train, y=y__train)

LGBMClassifier(learning_rate=0.03, metrics='multi_logloss', num_gpu=1,
               objective='multiclass', random_state=0)

In [66]:
# Dev-set log loss recorded after each selection round:
#   all features      : 1.585266831079759
#   top 4757 features : 1.5835110258811884  (result of redefine_1)
#   top 4135 features : 1.5819675888731328  (result of redefine_2)  STOP!
#   top 3812 features : 1.5848623668352895  (result of redefine_3)

# Pass the model's own class order as `labels`: without it log_loss infers the
# classes from y_dev alone, so a class missing from the dev split would leave
# the probability columns unmatched.
dev_proba = model.predict_proba(X_dev)
dev_logloss = log_loss(y_dev, dev_proba, labels=model.classes_)

# The printed count is the current number of columns in feature_train.
print(f'상위 {feature_train.shape[1]}개 사용: ', dev_logloss)

상위 3812개 사용:  1.5848623668352895


In [61]:
# Inspect the feature importances: one row per feature, in model column order.
FI = pd.DataFrame({'Feature_Importances': model.feature_importances_})

# How many features share each importance value, ordered by importance.
importance_counts = FI.value_counts().sort_index()
display(importance_counts)

Feature_Importances
0                       77
1                      246
2                      534
3                      630
4                      645
5                      521
6                      429
7                      291
8                      206
9                      134
10                      82
11                      57
12                      36
13                      36
14                      24
15                      23
16                      14
17                      16
18                      14
19                      11
20                       6
21                       6
22                       6
23                       9
24                       2
25                       5
26                       5
27                       6
28                       2
29                       6
30                       5
31                       6
33                       2
34                       3
35                       1
36                       4
37      

In [62]:
# Redefine the feature set.
# Stop just before the importance threshold at which the log loss goes up.
# NOTE(review): every statement in this cell is commented out, yet the save
# cell further down reads REDEFINE_1 / REDEFINE_2 — presumably one line was
# uncommented and run per selection round; confirm before a Restart & Run All.
# REDEFINE_1 = FI.query('Feature_Importances > 1').index 
# REDEFINE_2 = FI.query('Feature_Importances > 1').index 
# REDEFINE_3 = FI.query('Feature_Importances > 1').index 

# feature_train = feature_train.iloc[:, REDEFINE_3]
# feature_test = feature_test.iloc[:, REDEFINE_3]

### $~~~~$ Save data

In [67]:
# Re-read the feature tables from disk so the saved files are cut from freshly
# merged frames.
save_join = dict(on='custid', how='outer')

# Train features: three feature tables merged on the customer id.
feature_train1 = pd.read_csv(path +'/feature_train_transformation_1.csv', encoding='cp949')
feature_train3 = pd.read_csv(path +'/feature_train_W2V.csv', encoding='cp949')
feature_train4 = pd.read_csv(path +'/feature_train_BOW.csv', encoding='cp949')
feature_train = pd.merge(
    pd.merge(feature_train1, feature_train3, **save_join),
    feature_train4,
    **save_join,
)

# Test features: same three tables, merged the same way.
feature_test1 = pd.read_csv(path +'/feature_test_transformation_1.csv', encoding='cp949')
feature_test3 = pd.read_csv(path +'/feature_test_W2V.csv', encoding='cp949')
feature_test4 = pd.read_csv(path +'/feature_test_BOW.csv', encoding='cp949')
feature_test = pd.merge(
    pd.merge(feature_test1, feature_test3, **save_join),
    feature_test4,
    **save_join,
)

# Pull the id column out of each frame: the ids are kept for the output files,
# and the frames are left holding feature columns only.
train_ID = feature_train.pop('custid')
test_ID = feature_test.pop('custid')

In [68]:
# CatBoost-selected feature set (dev log loss: 1.6020110771039615).
# NOTE(review): redefine_1 / redefine_2 must already exist in the kernel; in
# this notebook they are only assigned inside commented-out lines — confirm
# before re-running.
# The second selection is applied to the columns kept by the first, hence the
# chained .iloc calls.
cat_train_features = feature_train.iloc[:, redefine_1].iloc[:, redefine_2]
cat_train = pd.concat([train_ID, cat_train_features], axis=1)
cat_train.to_csv(path+'/134.imp_CAT_train.csv', index=False, encoding='cp949')

# Same column positions applied to the test frame.
cat_test_features = feature_test.iloc[:, redefine_1].iloc[:, redefine_2]
cat_test = pd.concat([test_ID, cat_test_features], axis=1)
cat_test.to_csv(path+'/134.imp_CAT_test.csv', index=False, encoding='cp949')

In [69]:
# LightGBM-selected feature set (dev log loss: 1.5819675888731328).
# NOTE(review): REDEFINE_1 / REDEFINE_2 must already exist in the kernel; in
# this notebook they are only assigned inside commented-out lines — confirm
# before re-running.
# The second selection is applied to the columns kept by the first, hence the
# chained .iloc calls.
lgbm_train_features = feature_train.iloc[:, REDEFINE_1].iloc[:, REDEFINE_2]
lgbm_train = pd.concat([train_ID, lgbm_train_features], axis=1)
lgbm_train.to_csv(path+'/134.imp_LGBM_train.csv', index=False, encoding='cp949')

# Same column positions applied to the test frame.
lgbm_test_features = feature_test.iloc[:, REDEFINE_1].iloc[:, REDEFINE_2]
lgbm_test = pd.concat([test_ID, lgbm_test_features], axis=1)
lgbm_test.to_csv(path+'/134.imp_LGBM_test.csv', index=False, encoding='cp949')