## IEEE-CV options
- [IEEE - CV options](https://www.kaggle.com/kyakovlev/ieee-cv-options)
- 모델링은 lightGBM
- 여러 cross validation 모델 모으고, 시계열 데이터 반영(뒷부분)
- 이것도 결국 [data minification](https://www.kaggle.com/kyakovlev/ieee-data-minification) 참고한 커널. 일단 원 데이터로 해 보기
- 여기서 궁금한 점
    - train의 일부를 test라는 이름으로 뗀 것(val set인가?)
    - DT_M의 최대값(17)인 데이터를 test로 뗀 것

In [7]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import lightgbm as lgb

import math
warnings.filterwarnings('ignore')

for dirname, _, filenames in os.walk("../input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input\df_id.pkl
../input\df_pca.pkl
../input\df_test.pkl
../input\df_train.pkl
../input\df_trans.pkl
../input\sample_submission.csv
../input\test.pkl
../input\test_0823.pkl
../input\test_identity.csv
../input\test_transaction.csv
../input\train_0823.pkl
../input\train_identity.csv
../input\train_transaction.csv


In [8]:
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

In [9]:
SEED = 41
seed_everything(SEED)
TARGET = 'isFraud'
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

In [23]:
# These parameters we will keep untouched
# for each lgbm model
# the unique param that we will look at
# is n_estimators
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':2000,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': SEED,
                    'early_stopping_rounds':100, 
                } 

In [11]:
print('Load data')
train_df = pd.read_pickle("../input/df_train.pkl")

# We will prepare simulation here
# Last month will be our test
train_df['DT_M'] = train_df['TransactionDT'].apply(
    lambda x: (START_DATE + datetime.timedelta(seconds = x)))
train_df['DT_M'] = (train_df['DT_M'].dt.year-2017)*12 + train_df['DT_M'].dt.month 

test_df = train_df[
    train_df['DT_M'] == train_df['DT_M'].max()].reset_index(drop=True)
train_df = train_df[
    train_df['DT_M'] < train_df['DT_M'].max()].reset_index(drop=True)
    
print('Shape control:', train_df.shape, test_df.shape)

Load data


In [13]:
test_df['DT_M'].head() # 원래 train에서 DT_M이 17(max, last month)인 것만 모음

0    17
1    17
2    17
3    17
4    17
Name: DT_M, dtype: int64

In [14]:
train_df['DT_M'].head()

0    12
1    12
2    12
3    12
4    12
Name: DT_M, dtype: int64

In [15]:
# object data 타입 변경
for col in list(train_df):
    if train_df[col].dtype=='O':
        print(col)
        train_df[col] = train_df[col].fillna('unseen_before_label')
        test_df[col]  = test_df[col].fillna('unseen_before_label')
        
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)
        
        le = LabelEncoder()
        le.fit(list(train_df[col])+list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col]  = le.transform(test_df[col])
        
        train_df[col] = train_df[col].astype('category')
        test_df[col] = test_df[col].astype('category')

ProductCD
card4
card6
P_emaildomain
R_emaildomain
M1
M2
M3
M4
M5
M6
M7
M8
M9
id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30
id_31
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo


In [16]:
# Remove Some Features
rm_cols = [
    'TransactionID','TransactionDT', # These columns are pure noise right now
    TARGET,                          # Not target in features))
    'DT_M'                           # Column that we used to simulate test set
]

# Remove V columns (for faster training)
rm_cols += ['V'+str(i) for i in range(1,340)] #V1 ~ V339 제외시킴
# print(rm_cols)

# Final features
features_columns = [col for col in list(train_df) if col not in rm_cols]

## CV(cross-validation)

In [17]:
RESULTS = test_df[['TransactionID',TARGET]]

# We will always use same number of splits
# for training model
# Number of splits depends on data structure
# and in our case it is better to use 
# something in range 5-10
# 5 - is a common number of splits
# 10+ is too much (we will not have enough diversity in data)
# Here we will use 3 for faster training
# but you can change it by yourself
N_SPLITS = 3

### 1. No validation

In [24]:
# Main Data
# We will take whole train data set
# and will NOT use any early stopping 
X,y = train_df[features_columns], train_df[TARGET]

# Test Data (what we need to predict)
P = test_df[features_columns]

# We don't know where to stop
# so we will try to guess 
# number of boosting rounds
for n_rounds in [500,1000,2500,5000]:
    print('#'*20)
    print('No Validation training...', n_rounds, 'boosting rounds')
    corrected_lgb_params = lgb_params.copy()
    corrected_lgb_params['n_estimators'] = n_rounds
    corrected_lgb_params['early_stopping_rounds'] = None

    train_data = lgb.Dataset(X, label=y)
    
    estimator = lgb.train(
                corrected_lgb_params,
                train_data
            )

    RESULTS['no_validation_'+str(n_rounds)] = estimator.predict(P)
    print('AUC score', metrics.roc_auc_score(RESULTS[TARGET], RESULTS['no_validation_'+str(n_rounds)]))
    print('#'*20)
    
# Be careful. We are printing auc results
# for our simulated test set
# but in real Data set we do not have True labels (obviously)
# and can't be sure that we stopped in right round
# lb probing can give you some idea how good our training is
# but this leads to nowhere -> overfits or completely bad results
# bad practice for real life problems!

####################
No Validation training... 500 boosting rounds
AUC score 0.9277271781014739
####################
####################
No Validation training... 1000 boosting rounds
AUC score 0.9326734191953692
####################
####################
No Validation training... 2500 boosting rounds
AUC score 0.9331337450590427
####################
####################
No Validation training... 5000 boosting rounds
AUC score 0.9306524679300963
####################


### 2. KFold

In [18]:
print('#'*20)
print('KFold training...')

# You can find oof name for this strategy
# oof - Out Of Fold
# as we will use one fold as validation
# and stop training when validation metric
# stops improve
from sklearn.model_selection import KFold
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Main Data
X,y = train_df[features_columns], train_df[TARGET]

# Test Data
P = test_df[features_columns]
RESULTS['kfold'] = 0

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            lgb_params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 1000,
        )

    RESULTS['kfold'] = estimator.predict(P)

print('AUC score', metrics.roc_auc_score(RESULTS[TARGET], RESULTS['kfold']))
print('#'*20)

## We have two "problems" here
## 1st: Training score goes upto 1 and it's not normal situation
## It's nomally means that model did perfect or
## almost perfect match between "data fingerprint" and target
## we definitely should stop before to generalize better
## 2nd: Our LB probing gave 0.936 and it is too far away from validation score
## some difference is normal, but such gap is too big

####################
KFold training...
Fold: 1
Training until validation scores don't improve for 100 rounds.
[1000]	training's auc: 0.998253	valid_1's auc: 0.959708
[2000]	training's auc: 0.99992	valid_1's auc: 0.962512
[3000]	training's auc: 0.999998	valid_1's auc: 0.963803
[4000]	training's auc: 1	valid_1's auc: 0.96462
Early stopping, best iteration is:
[4004]	training's auc: 1	valid_1's auc: 0.964628
Fold: 2
Training until validation scores don't improve for 100 rounds.
[1000]	training's auc: 0.998309	valid_1's auc: 0.957031
[2000]	training's auc: 0.999921	valid_1's auc: 0.960994
[3000]	training's auc: 0.999997	valid_1's auc: 0.962324
[4000]	training's auc: 1	valid_1's auc: 0.96288
Early stopping, best iteration is:
[4062]	training's auc: 1	valid_1's auc: 0.962895
Fold: 3
Training until validation scores don't improve for 100 rounds.
[1000]	training's auc: 0.998342	valid_1's auc: 0.958528
[2000]	training's auc: 0.999929	valid_1's auc: 0.96149
[3000]	training's auc: 0.999997	valid_

### 3. Stratified KFold

In [None]:
print('#'*20)
print('StratifiedKFold training...')

# Same as normal kfold but we can be sure
# that our target is perfectly distribuited
# over folds
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Main Data
X,y = train_df[features_columns], train_df[TARGET]

# Test Data and expport DF
P = test_df[features_columns]
RESULTS['stratifiedkfold'] = 0

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=y)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            lgb_params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 1000,
        )

    # we are not sure what fold is best for us
    # so we will average prediction results 
    # over folds
    RESULTS['stratifiedkfold'] += estimator.predict(P)/N_SPLITS

print('AUC score', metrics.roc_auc_score(RESULTS[TARGET], RESULTS['stratifiedkfold']))
print('#'*20)

## We have same "problems" here as in normal kfold
## 1st: Training score goes upto 1 and it's not normal situation
## we definitely should stop before 
## 2nd: Our LB probing gave 0.936 and it is too far away from validation score
## some difference is normal, but such gap is too big

### 4. LBO (last block out)
- 시계열의 경우 last time block을 validation subset으로 사용
- track은 early stopping round 의미

In [19]:
print('#'*20)
print('LBO training...') 

## We need Divide Train Set by Time blocks
## Convert TransactionDT to Months
## And use last month as Validation
train_df['DT_M'] = train_df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
train_df['DT_M'] = (train_df['DT_M'].dt.year-2017)*12 + train_df['DT_M'].dt.month 

# 아래부터 주석 전체 제거!!
# main_train_set = train_df[train_df['DT_M']<(train_df['DT_M'].max())].reset_index(drop=True)
# validation_set = train_df[train_df['DT_M']==train_df['DT_M'].max()].reset_index(drop=True)

# ## We will use oof kfold to find "best round"
# folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# # Main Data
# X,y = main_train_set[features_columns], main_train_set[TARGET]

# # Validation Data
# v_X, v_y = validation_set[features_columns], validation_set[TARGET]

# estimators_bestround = []
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
#     print('Fold:',fold_+1)
#     tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
#     train_data = lgb.Dataset(tr_x, label=tr_y)
#     valid_data = lgb.Dataset(v_X, label=v_y)  

#     estimator = lgb.train(
#             lgb_params,
#             train_data,
#             valid_sets = [train_data, valid_data],
#             verbose_eval = 1000,
#         )
#     estimators_bestround.append(estimator.current_iteration())

# ## Now we have "mean Best round" and we can train model on full set
# corrected_lgb_params = lgb_params.copy()
# corrected_lgb_params['n_estimators'] = int(np.mean(estimators_bestround))
# corrected_lgb_params['early_stopping_rounds'] = None
# print('#'*10)
# print('Mean Best round:', corrected_lgb_params['n_estimators'])

# # Main Data
# X,y = train_df[features_columns], train_df[TARGET]

# # Test Data
# P = test_df[features_columns]
# RESULTS['lbo'] = 0

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
#     print('Fold:',fold_+1)
#     tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]
#     train_data = lgb.Dataset(tr_x, label=tr_y)

#     estimator = lgb.train(
#             corrected_lgb_params,
#             train_data
#         )
    
#     RESULTS['lbo'] += estimator.predict(P)/N_SPLITS

# print('AUC score', metrics.roc_auc_score(RESULTS[TARGET], RESULTS['lbo']))
# print('#'*20)

####################
LBO training...


### deeper analysis
- n_estimators 2만 개는 너무 오래 걸려 2000개로 조정
- 현재 no validation 2천, KFold 2만개 돌린 상태

In [25]:
# 일어나서 여기부터 실행!
print('#'*30)
print('Intermediate results...')
final_df = []
for current_strategy in list(RESULTS.iloc[:,2:]):
    auc_score = metrics.roc_auc_score(RESULTS[TARGET], RESULTS[current_strategy])
    final_df.append([current_strategy, auc_score])
    
final_df = pd.DataFrame(final_df, columns=['Stategy', 'Result'])
final_df.sort_values(by=['Result'], ascending=False, inplace=True)
print(final_df)

##############################
Intermediate results...
              Stategy    Result
4  no_validation_2500  0.933134
3  no_validation_1000  0.932673
5  no_validation_5000  0.930652
2   no_validation_500  0.927727
0               kfold  0.926708
1                DT_W  0.530529


In [26]:
test_df['DT_W'] = test_df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
RESULTS['DT_W'] = (test_df['DT_W'].dt.year-2017)*52 + test_df['DT_W'].dt.weekofyear 

for curent_time_block in range(RESULTS['DT_W'].min(), RESULTS['DT_W'].max()+1):
    print('#'*20)
    print('Time Block:', curent_time_block)
    final_df = []
    temp_df = RESULTS[RESULTS['DT_W']==curent_time_block]
    for current_strategy in list(temp_df.iloc[:,2:]):
        auc_score = metrics.roc_auc_score(temp_df[TARGET], temp_df[current_strategy])
        final_df.append([current_strategy, auc_score])
    
    final_df = pd.DataFrame(final_df, columns=['Stategy', 'Result'])
    final_df.sort_values(by=['Result'], ascending=False, inplace=True)
    print(final_df)
    print('#'*30)
    
# Naive analize.
# But we can see temporal auc degradation
# Probably for test set with larger monthly gap
# from training set we need to use less boosting rounds (or more).

####################
Time Block: 70
              Stategy    Result
5  no_validation_5000  0.950372
4  no_validation_2500  0.950023
3  no_validation_1000  0.945684
0               kfold  0.941175
2   no_validation_500  0.936770
1                DT_W  0.500000
##############################
####################
Time Block: 71
              Stategy    Result
4  no_validation_2500  0.928660
3  no_validation_1000  0.928077
5  no_validation_5000  0.924845
2   no_validation_500  0.924023
0               kfold  0.922592
1                DT_W  0.500000
##############################
####################
Time Block: 72
              Stategy    Result
3  no_validation_1000  0.925413
4  no_validation_2500  0.925350
5  no_validation_5000  0.922881
2   no_validation_500  0.921449
0               kfold  0.919267
1                DT_W  0.500000
##############################
####################
Time Block: 73
              Stategy    Result
3  no_validation_1000  0.935728
4  no_validation_2500  0.93

In [28]:
# 그래서 결과는 어느 변수에 저장되는 건지???
final_df

Unnamed: 0,Stategy,Result
3,no_validation_1000,0.926698
4,no_validation_2500,0.92478
2,no_validation_500,0.923474
5,no_validation_5000,0.921158
0,kfold,0.919203
1,DT_W,0.5
