## Imports

In [1]:
# for read data
import os
import pandas as pd
import warnings; warnings.filterwarnings("ignore")

# feature selection
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier 
from lightgbm import LGBMClassifier 

# validation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, make_scorer

## Exclude Low Importance Features
모든 Feature를 집어넣어 학습시킨 뒤 Feature Importance가 낮은 Feature는 제외하고 Selection을 진행한다.

- **[Catboost]**

In [2]:
# Directory that holds every input CSV, resolved to an absolute path.
path = os.path.abspath("./input")
join_kwargs = dict(on='custid', how='outer')

# Training features: three per-customer tables outer-joined on 'custid'.
feature_train1 = pd.read_csv(path +'/feature_train_transformation.csv', encoding='cp949')
feature_train2 = pd.read_csv(path +'/feature_train_W2V.csv', encoding='cp949')
feature_train3 = pd.read_csv(path +'/feature_train_BOW.csv', encoding='cp949')
feature_train = (
    feature_train1
    .merge(feature_train2, **join_kwargs)
    .merge(feature_train3, **join_kwargs)
)

# Test features: same three tables, joined the same way.
feature_test1 = pd.read_csv(path +'/feature_test_transformation.csv', encoding='cp949')
feature_test2 = pd.read_csv(path +'/feature_test_W2V.csv', encoding='cp949')
feature_test3 = pd.read_csv(path +'/feature_test_BOW.csv', encoding='cp949')
feature_test = (
    feature_test1
    .merge(feature_test2, **join_kwargs)
    .merge(feature_test3, **join_kwargs)
)

# Target labels come from the 'group' column of y_train.csv.
y_train = pd.read_csv(path +'/y_train.csv', encoding='cp949')['group']

# 'custid' served as the join key above; remove it before the frames are used as model input.
feature_train = feature_train.drop(columns='custid')
feature_test = feature_test.drop(columns='custid')

In [21]:
# Hold out 30% of the rows as a dev set; the seed is fixed so the split repeats.
X_train, X_dev, y__train, y_dev = train_test_split(
    feature_train,
    y_train,
    test_size=0.3,
    random_state=0,
)

In [22]:
# Original note: only learning_rate is specified.
catboost_params = {
    'iterations': 100,
    'learning_rate': 0.03,
    'objective': 'MultiClass',
    'bootstrap_type': 'Bayesian',
    'devices': '0:1',
    'task_type': "GPU",
    'random_state': 0,
}
model = CatBoostClassifier(**catboost_params)

In [23]:
# Train on the 70% split; the dev rows stay unseen for scoring.
model.fit(X=X_train, y=y__train)

0:	learn: 2.0491174	total: 11.9ms	remaining: 1.17s
1:	learn: 2.0225404	total: 22.4ms	remaining: 1.1s
2:	learn: 1.9984641	total: 33.3ms	remaining: 1.08s
3:	learn: 1.9759895	total: 44.2ms	remaining: 1.06s
4:	learn: 1.9553916	total: 55ms	remaining: 1.04s
5:	learn: 1.9366652	total: 65.7ms	remaining: 1.03s
6:	learn: 1.9181629	total: 76.5ms	remaining: 1.01s
7:	learn: 1.9020130	total: 87.8ms	remaining: 1.01s
8:	learn: 1.8865269	total: 98.5ms	remaining: 996ms
9:	learn: 1.8718661	total: 109ms	remaining: 984ms
10:	learn: 1.8579473	total: 120ms	remaining: 970ms
11:	learn: 1.8450638	total: 131ms	remaining: 958ms
12:	learn: 1.8327675	total: 141ms	remaining: 946ms
13:	learn: 1.8214510	total: 152ms	remaining: 935ms
14:	learn: 1.8104039	total: 163ms	remaining: 923ms
15:	learn: 1.8000808	total: 173ms	remaining: 910ms
16:	learn: 1.7895658	total: 184ms	remaining: 899ms
17:	learn: 1.7805525	total: 195ms	remaining: 887ms
18:	learn: 1.7714452	total: 207ms	remaining: 882ms
19:	learn: 1.7626551	total: 218ms	r

<catboost.core.CatBoostClassifier at 0x1cc48a71610>

In [24]:
# Dev-set log loss recorded after each selection round:
#   all features      : 1.5727539589558865
#   top 171 features  : 1.572028648061302   (result of redefine_1)
#   top 120 features  : 1.5705520554742651  (result of redefine_2, STOP!)
#   top 104 features  : 1.5726428439180589  (result of redefine_3)
n_features = feature_train.shape[1]
dev_proba = model.predict_proba(X_dev)
dev_log_loss = log_loss(y_dev, dev_proba)
print(f'상위 {n_features}개 사용: ', dev_log_loss)

상위 104개 사용:  1.5726428439180589


In [19]:
# Inspect feature importances: count how often each importance value occurs,
# listed in ascending order of importance.
FI = pd.Series(model.feature_importances_, name='Feature_Importances').to_frame()
importance_counts = FI.value_counts()
display(importance_counts.sort_index())

Feature_Importances
0.000000               14
0.109186                1
0.109186                1
0.115790                1
0.118929                1
                       ..
4.284226                1
4.487170                1
4.803143                1
4.821632                1
9.188756                1
Length: 107, dtype: int64

In [20]:
# redefine features
# If the log loss rises at a given importance cut-off, re-search the values in
# between before deciding.
# NOTE(review): every statement below is commented out, yet the "Save data"
# section reads redefine_1 and redefine_2 -- presumably one line was enabled per
# selection round and the cell re-run by hand; confirm before relying on
# Restart & Run All.
# limit = round(FI.value_counts().sort_index().index[1][0], 6)
# redefine_1 = FI.query('Feature_Importances > @limit').index
# redefine_2 = FI.query('Feature_Importances > @limit').index
# redefine_3 = FI.query('Feature_Importances > @limit').index

# feature_train = feature_train.iloc[:, redefine_3]
# feature_test = feature_test.iloc[:, redefine_3]

- **[LGBM]**

In [25]:
join_kwargs = dict(on='custid', how='outer')

# Training features for LGBM: the *_transformation_0 table plus W2V and BOW,
# outer-joined on 'custid'.
feature_train0 = pd.read_csv(path +'/feature_train_transformation_0.csv', encoding='cp949')
feature_train3 = pd.read_csv(path +'/feature_train_W2V.csv', encoding='cp949')
feature_train4 = pd.read_csv(path +'/feature_train_BOW.csv', encoding='cp949')
feature_train = (
    feature_train0
    .merge(feature_train3, **join_kwargs)
    .merge(feature_train4, **join_kwargs)
)

# Test features: same three tables, joined the same way.
feature_test0 = pd.read_csv(path +'/feature_test_transformation_0.csv', encoding='cp949')
feature_test3 = pd.read_csv(path +'/feature_test_W2V.csv', encoding='cp949')
feature_test4 = pd.read_csv(path +'/feature_test_BOW.csv', encoding='cp949')
feature_test = (
    feature_test0
    .merge(feature_test3, **join_kwargs)
    .merge(feature_test4, **join_kwargs)
)

# Target labels come from the 'group' column of y_train.csv.
y_train = pd.read_csv(path +'/y_train.csv', encoding='cp949')['group']

# 'custid' served as the join key above; remove it before the frames are used as model input.
feature_train = feature_train.drop(columns='custid')
feature_test = feature_test.drop(columns='custid')

In [38]:
# Hold out 30% of the rows as a dev set; the seed is fixed so the split repeats.
X_train, X_dev, y__train, y_dev = train_test_split(
    feature_train,
    y_train,
    test_size=0.3,
    random_state=0,
)

In [39]:
# Original note: only learning_rate is specified.
lgbm_params = {
    'learning_rate': 0.03,
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_gpu': 1,
    'random_state': 0,
}
model = LGBMClassifier(**lgbm_params)

In [40]:
# Train on the 70% split; the dev rows stay unseen for scoring.
model.fit(X=X_train, y=y__train)

LGBMClassifier(learning_rate=0.03, metrics='multi_logloss', num_gpu=1,
               objective='multiclass', random_state=0)

In [41]:
# Dev-set log loss recorded after each selection round:
#   all features       : 1.5479148067922546
#   top 4656 features  : 1.5472279449561581  (result of redefine_1, STOP!)
#   top 4021 features  : 1.548362171129472   (result of redefine_2)

n_features = feature_train.shape[1]
dev_proba = model.predict_proba(X_dev)
dev_log_loss = log_loss(y_dev, dev_proba)
print(f'상위 {n_features}개 사용: ', dev_log_loss)

상위 4021개 사용:  1.548362171129472


In [42]:
# Inspect feature importances: count how often each importance value occurs,
# listed in ascending order of importance.
FI = pd.Series(model.feature_importances_, name='Feature_Importances').to_frame()
importance_counts = FI.value_counts()
display(importance_counts.sort_index())

Feature_Importances
0                       83
1                      252
2                      510
3                      648
4                      612
                      ... 
106                      1
160                      1
194                      1
209                      1
372                      1
Length: 69, dtype: int64

In [37]:
# redefine features
# Stop just before the importance cut-off at which the log loss starts to rise.
# NOTE(review): every statement below is commented out, yet the "Save data"
# section reads REDEFINE_1 -- presumably one line was enabled per selection
# round and the cell re-run by hand; confirm before relying on
# Restart & Run All.
# REDEFINE_1 = FI.query('Feature_Importances > 1').index 
# REDEFINE_2 = FI.query('Feature_Importances > 1').index 

# feature_train = feature_train.iloc[:, REDEFINE_2]
# feature_test = feature_test.iloc[:, REDEFINE_2]

### Save data

In [45]:
join_kwargs = dict(on='custid', how='outer')

# Rebuild the unfiltered training table (transformation + W2V + BOW) so the
# selected column positions can be applied to it when saving.
feature_train1 = pd.read_csv(path +'/feature_train_transformation.csv', encoding='cp949')
feature_train2 = pd.read_csv(path +'/feature_train_W2V.csv', encoding='cp949')
feature_train3 = pd.read_csv(path +'/feature_train_BOW.csv', encoding='cp949')
feature_train = (
    feature_train1
    .merge(feature_train2, **join_kwargs)
    .merge(feature_train3, **join_kwargs)
)

# Same for the test table.
feature_test1 = pd.read_csv(path +'/feature_test_transformation.csv', encoding='cp949')
feature_test2 = pd.read_csv(path +'/feature_test_W2V.csv', encoding='cp949')
feature_test3 = pd.read_csv(path +'/feature_test_BOW.csv', encoding='cp949')
feature_test = (
    feature_test1
    .merge(feature_test2, **join_kwargs)
    .merge(feature_test3, **join_kwargs)
)

# Keep the ids for the exported files and remove them from the feature frames.
train_ID = feature_train.pop('custid')
test_ID = feature_test.pop('custid')

In [46]:
# Catboost
# logloss: 1.5705520554742651
# The two selections are applied in sequence: redefine_2 holds positions within
# the frame already reduced by redefine_1.
cat_train_features = feature_train.iloc[:, redefine_1].iloc[:, redefine_2]
cat_train = pd.concat(objs=[train_ID, cat_train_features], axis='columns')
cat_train.to_csv(path+'/CAT_train.csv', index=False, encoding='cp949')

cat_test_features = feature_test.iloc[:, redefine_1].iloc[:, redefine_2]
cat_test = pd.concat(objs=[test_ID, cat_test_features], axis='columns')
cat_test.to_csv(path+'/CAT_test.csv', index=False, encoding='cp949')

In [47]:
# lgbmboost
# logloss: 1.5472279449561581
#
# REDEFINE_1 holds column positions chosen in the LGBM section, where the feature
# matrix is feature_*_transformation_0 + W2V + BOW. The feature_train/feature_test
# frames prepared for saving are built from feature_*_transformation instead, so
# slicing them by position would export columns from a different table than the
# one the recorded log loss was measured on. The LGBM matrix is therefore rebuilt
# here before the positional selection.
# NOTE(review): REDEFINE_1 must already exist in the kernel; its assignment is
# commented out in the LGBM section.
def _load_lgbm_features(split):
    """Rebuild the LGBM feature table for `split` ('train' or 'test').

    Reads the transformation_0, W2V and BOW CSVs for the split and outer-joins
    them on 'custid', mirroring the load cell of the LGBM section. The returned
    frame still contains the 'custid' column.
    """
    merged = pd.read_csv(path + f'/feature_{split}_transformation_0.csv', encoding='cp949')
    for source in ('W2V', 'BOW'):
        part = pd.read_csv(path + f'/feature_{split}_{source}.csv', encoding='cp949')
        merged = pd.merge(merged, part, on='custid', how='outer')
    return merged


lgbm_feature_train = _load_lgbm_features('train')
# Drop 'custid' before slicing so positions line up with the matrix the model saw.
lgbm_train = pd.concat([lgbm_feature_train['custid'],
                        lgbm_feature_train.drop(columns='custid').iloc[:, REDEFINE_1]], axis=1)
lgbm_train.to_csv(path+'/LGBM_train.csv', index=False, encoding='cp949')

lgbm_feature_test = _load_lgbm_features('test')
lgbm_test = pd.concat([lgbm_feature_test['custid'],
                       lgbm_feature_test.drop(columns='custid').iloc[:, REDEFINE_1]], axis=1)
lgbm_test.to_csv(path+'/LGBM_test.csv', index=False, encoding='cp949')