이 노트북의 single model lgbm 예측 결과는 private 0.935347 public 0.962972 을 기록. 나머지 점수 향상은 그 전 모델들과의 blend를 통해 달성

In [1]:
import pandas as pd
import numpy as np
import gc

from lightgbm import LGBMClassifier
from tqdm import tqdm
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')
np.random.seed(0)

## 1. FE

### 1. categorical columns 정의
https://www.kaggle.com/c/ieee-fraud-detection/data 참고

In [2]:
CAT_COLS = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 
            'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
            'DeviceType', 'DeviceInfo', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23',
            'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']

### 2. 데이터 로드

In [3]:
def merge():
    train_t = pd.read_csv('data/train_transaction.csv', index_col='TransactionID')
    train_i = pd.read_csv('data/train_identity.csv', index_col='TransactionID')
    train = train_t.merge(train_i, how='left', left_index=True, right_index=True)
    test_t = pd.read_csv('data/test_transaction.csv', index_col='TransactionID')
    test_i = pd.read_csv('data/test_identity.csv', index_col='TransactionID')
    test = test_t.merge(test_i, how='left', left_index=True, right_index=True)
    return pd.concat([train, test], sort=False)

df = merge()
len_train = (~df['isFraud'].isna()).sum()

### 3. TransactionDT로부터 datetime features 추출

https://www.kaggle.com/akasyanama13/eda-what-s-behind-d-feature 에서 D1이 해당 거래가 사용자의 첫 번째 거래로부터 며칠 떨어져있는지를 나타내는 지표임을 알아냄. 이를 `first_dt`라는 피쳐로 만듦

In [4]:
df['datetime'] = pd.to_datetime('2017-11-30') + pd.to_timedelta(df['TransactionDT'], unit='s')
df['dayofweek'] = df['datetime'].dt.dayofweek.astype('uint8')
df['date'] = df['datetime'].dt.day.astype('uint8')
df['second'] = df['datetime'].dt.second.astype('uint8')
df['minute'] = df['datetime'].dt.minute.astype('uint8')
df['minuteofday'] = df['datetime'].apply(lambda x: x.hour*60+x.minute).astype('uint16')
df['first_dt'] = (df['datetime'] - pd.to_timedelta(df['D1'], unit='d')).dt.strftime('%Y-%m-%d')
df['FREQ_TransactionDT'] = (df['TransactionDT'].map(df['TransactionDT'].value_counts(dropna=False))/len(df)).astype(np.float32)

### 4. uid 조합

uid를 판별할 수 있는 feature interaction을 만듦. 특정 uid별 카드 사용자별 거래들을 보면 대부분 `isFraud`가 0 또는 1인 것을 확인.

In [5]:
combs = [
    ('card1', 'addr1', 'P_emaildomain', 'first_dt', 'DeviceInfo'),
    ('card1', 'addr1', 'P_emaildomain', 'first_dt'),
    ('card1', 'addr1', 'first_dt'),
]
for i, comb in enumerate(combs):
    tmp = pd.Series('', index=df.index)
    for c in comb:
        tmp += '__' + df[c].astype(str)
    df[f'uid_{i}'] = tmp
uid_cols = [c for c in df.columns if c.startswith('uid_')]

In [8]:
for uid in uid_cols:
    print(uid)
    only_test_uid = list(set(df.iloc[len_train:][uid])-set(df.iloc[:len_train][uid]))
    print('test에만 있는 uid의 거래횟수 비율:', len(df.set_index(uid).loc[only_test_uid])/len(df))
    utm = df.groupby(uid)['isFraud'].transform('mean')
    print('isFraud가 0또는 1이 아닌 uid의 거래횟수 비율', ((utm<1)&(utm>0)).mean())

uid_0
test에만 있는 uid의 거래횟수 비율: 0.3715899386728957
isFraud가 0또는 1이 아닌 uid의 거래횟수 비율 0.006410682891752056
uid_1
test에만 있는 uid의 거래횟수 비율: 0.3680291570325665
isFraud가 0또는 1이 아닌 uid의 거래횟수 비율 0.015351370859919196
uid_2
test에만 있는 uid의 거래횟수 비율: 0.35762478457134367
isFraud가 0또는 1이 아닌 uid의 거래횟수 비율 0.03229219735862366


### 5. uid 정렬

lgbm 등 tree method의 bin 개수에 비해 uid의 cardinality가 매우 높아서 tree가 binning하기 힘들 것이라 예상하여 uid를 target mean에 기초하여 정렬함. 여기에서 target leak가 발생하나, 추후(11)에 train 또는 test 중 하나에 등장하지 않는 uid의 값을 전부 -2로 치환하여 leak를 최소화함

In [9]:
ordinal_cols = []
ordinal_cols += uid_cols
for col in ordinal_cols:
    mean = df.groupby(col)['isFraud'].mean().sort_values()
    mapper = pd.Series(np.arange(len(mean)), index=mean.index)
    df[col] = df[col].map(mapper)

### 6. TransactionAmt

kaggle forum에 나와있던 정보를 바탕으로 TransactionAmt에 기초한 feature들 추가

In [11]:
df['amt_decimal'] = (df['TransactionAmt'].apply(lambda x: len(str(x).split('.')[1]))).astype(np.float32)
df['TransactionAmt_mod1'] = (df['TransactionAmt'].mod(1) * 100).astype(np.float32)
df['FREQ_TransactionAmt'] = (df['TransactionAmt'].map(df['TransactionAmt'].value_counts(dropna=False))/len(df)).astype(np.float32)

### 7. 소프트웨어 최신도

id_30과 id_31 (소프트웨어 버전)의 최신도를 TransactionDT를 기반으로 추출함

In [12]:
df['id_30_min_dt'] = (df.groupby('id_30')['TransactionDT'].transform('min')).astype(np.float32)
df['id_30_recency'] = (df['TransactionDT'] - df['id_30_min_dt']).astype(np.float32)
df['id_31_min_dt'] = (df.groupby('id_31')['TransactionDT'].transform('min')).astype(np.float32)
df['id_31_recency'] = (df['TransactionDT'] - df['id_31_min_dt']).astype(np.float32)

### 8. `CAT_COL`, `NUM_COL` 재정의

추가된 feature들을 기반으로 재정의

In [13]:
CAT_COLS = list((set(df.select_dtypes('object').columns)|set(CAT_COLS)|set(ordinal_cols))&set(df.columns))
NUM_COLS = [c for c in df.columns if c not in CAT_COLS+['isFraud','datetime']]
print(CAT_COLS)

['id_20', 'M1', 'addr2', 'id_30', 'card3', 'id_14', 'addr1', 'M2', 'M5', 'card2', 'id_24', 'id_22', 'M8', 'id_17', 'card5', 'id_37', 'id_13', 'id_35', 'id_27', 'id_34', 'id_33', 'M3', 'uid_0', 'DeviceType', 'id_18', 'id_25', 'card4', 'DeviceInfo', 'id_29', 'id_32', 'id_23', 'id_38', 'first_dt', 'card1', 'id_26', 'id_31', 'M9', 'P_emaildomain', 'ProductCD', 'M4', 'id_15', 'id_28', 'uid_1', 'id_19', 'id_16', 'uid_2', 'id_12', 'card6', 'M6', 'id_21', 'M7', 'R_emaildomain', 'id_36']


### 9. label / frequency encoding

In [14]:
for col in CAT_COLS:
    if col not in ordinal_cols:
        df[col] = (pd.factorize(df[col])[0]).astype(np.float32)
    df['FREQ_'+col] = (df[col].map(df[col].value_counts(dropna=False))/len(df)).astype(np.float32)

### 10. 중요한 cat_col을 key로 하여 aggregate

NUM_COL에 대해서는 groupby mean과의 gap을 피쳐로 넣었고, CAT_COL에 대해서는 해당 카테고리가 그 그룹에서 몇프로를 차지하고 있는지를 피쳐로 넣음

In [15]:
agg_keys = ['card1', 'addr1', 'ProductCD', 'first_dt', 'P_emaildomain'] + uid_cols
transforms = ['mean']
for key in tqdm(agg_keys):
    group = df.groupby(key)
    for target in NUM_COLS:
        group_col = group[target]
        for transform in transforms:
            mean = group_col.transform(transform).astype(np.float32)
            df[f'GROUPBY_{key}_GAP{transform}_{target}'] = (df[target]-mean).astype(np.float32)
    for target in CAT_COLS:
        if \
        ((key=='uid_0')&(target in combs[0])) | \
        ((key=='uid_1')&(target in combs[1])) | \
        ((key=='uid_2')&(target in combs[2])) | \
        (key==target) | \
        (target in key):
            pass
        else:
            tmp = df[key].astype(str) + '__' + df[target].astype(str)
            df[f'{key}-{target}_FREQRATIO'] = (tmp.map(tmp.value_counts())/group[target].transform('size')).astype(np.float32)

100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [16:01<00:00, 147.70s/it]


### 11. train과 test의 분포 차이가 큰 CAT_COLS 처리

train과 test에서 나오는 횟수가 100배 이상 차이나는 값들을 모두 -2로 치환하여 test에도 generalize 잘 되도록 함

In [16]:
ratio_thresh = 0.01
for col in tqdm(CAT_COLS):
    # replace values that only show in one of train/test
    val_counts = df[col].value_counts()
    train_val_counts = pd.Series(0, index=val_counts.index)
    train_val_counts.update(df[col].iloc[:len_train].value_counts())
    test_val_counts = pd.Series(0, index=val_counts.index)
    test_val_counts.update(df[col].iloc[len_train:].value_counts())
    ratio = train_val_counts/test_val_counts
    replace_vals = ratio[(ratio<ratio_thresh)|(ratio>1/ratio_thresh)].index
    df.loc[df[col].apply(lambda x: x in replace_vals), col] = -2

100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [02:11<00:00,  2.25s/it]


In [17]:
df = df.drop(['datetime'], axis=1)

In [18]:
print(df.shape)
df.head(5)

(1097231, 4075)


Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,uid_2-uid_1_FREQRATIO,uid_2-id_19_FREQRATIO,uid_2-id_16_FREQRATIO,uid_2-id_12_FREQRATIO,uid_2-card6_FREQRATIO,uid_2-M6_FREQRATIO,uid_2-id_21_FREQRATIO,uid_2-M7_FREQRATIO,uid_2-R_emaildomain_FREQRATIO,uid_2-id_36_FREQRATIO
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0.0,86400,68.5,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2987001,0.0,86401,29.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2987002,0.0,86469,59.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2987003,0.0,86499,50.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2987004,0.0,86506,50.0,1.0,4.0,3.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
df.iloc[:len_train].to_pickle('data/train_fe.pkl')
df.iloc[len_train:].drop('isFraud', axis=1).to_pickle('data/test_fe.pkl')

In [20]:
del df; gc.collect();

## 2. Drop Time Dependent Columns

adversarial validation을 통해 train과 test간의 분포 차이가 큰 column들은 drop

In [2]:
def load_df():
    train = pd.read_pickle('data/train_fe.pkl')
    test = pd.read_pickle('data/test_fe.pkl')
    train = train.drop('isFraud', axis=1)
    train['is_test'] = 0
    test['is_test'] = 1
    return pd.concat([train, test])
df = load_df()

In [9]:
dependency = pd.Series()
feats = [c for c in df.columns if c!='is_test']
len_feats = len(feats)
x_train, x_val, y_train, y_val = train_test_split(df, df['is_test'], shuffle=True, random_state=0, test_size=0.2)
for i, col in enumerate(feats):
    print(f'{i+1}/{len_feats}', '|', col, end=' | ')
    x_train_c = x_train[[col]]
    x_val_c = x_val[[col]]
    model = LGBMClassifier(objective='binary', num_threads=8, max_bin=63, num_iterations=10000)
    model.fit(x_train_c, y_train, eval_set=[(x_val_c, y_val)], early_stopping_rounds=3, verbose=0)
    pred = model.predict_proba(x_val_c)[:, 1]
    score = roc_auc_score(y_val, pred)
    print(score)
    dependency.loc[col] = score

1/4074 | TransactionDT | 0.9998751551580981
2/4074 | TransactionAmt | 0.5263802917731566
3/4074 | ProductCD | 0.5165478149765013
4/4074 | card1 | 0.5310808946614476
5/4074 | card2 | 0.5327619985784898
6/4074 | card3 | 0.5131495059006156
7/4074 | card4 | 0.5091135472213745
8/4074 | card5 | 0.5307823371320113
9/4074 | card6 | 0.5108410365784201
10/4074 | addr1 | 0.5207009771431567
11/4074 | addr2 | 0.5108706735267212
12/4074 | dist1 | 0.5203261156027068
13/4074 | dist2 | 0.5106030734905045
14/4074 | P_emaildomain | 0.5219114098371714
15/4074 | R_emaildomain | 0.5189745173543039
16/4074 | C1 | 0.5159708183607871
17/4074 | C2 | 0.5172487594089964
18/4074 | C3 | 0.5103961333321642
19/4074 | C4 | 0.5187495149660606
20/4074 | C5 | 0.5165915445857404
21/4074 | C6 | 0.5131101785671197
22/4074 | C7 | 0.5127243341586064
23/4074 | C8 | 0.5203052866301303
24/4074 | C9 | 0.5156715407808807
25/4074 | C10 | 0.5224417651137185
26/4074 | C11 | 0.5211539511934761
27/4074 | C12 | 0.5547115572266044
28/407

229/4074 | V176 | 0.5165514432073847
230/4074 | V177 | 0.5176217397899272
231/4074 | V178 | 0.5185475139371927
232/4074 | V179 | 0.5181153921192139
233/4074 | V180 | 0.5182425577176484
234/4074 | V181 | 0.5180835482911783
235/4074 | V182 | 0.5188370202864769
236/4074 | V183 | 0.5181637738065457
237/4074 | V184 | 0.5157401689862301
238/4074 | V185 | 0.5174422309111647
239/4074 | V186 | 0.5164971540410846
240/4074 | V187 | 0.5170290642140686
241/4074 | V188 | 0.5162662873109222
242/4074 | V189 | 0.5163651329982177
243/4074 | V190 | 0.5165827095501824
244/4074 | V191 | 0.5165154026414132
245/4074 | V192 | 0.5171651944680031
246/4074 | V193 | 0.5168261476608422
247/4074 | V194 | 0.5163009454033753
248/4074 | V195 | 0.5165533848339388
249/4074 | V196 | 0.5166887395814483
250/4074 | V197 | 0.5163546211299369
251/4074 | V198 | 0.5164099812476006
252/4074 | V199 | 0.5163212550192448
253/4074 | V200 | 0.516186673857246
254/4074 | V201 | 0.5161235634776177
255/4074 | V202 | 0.5192695693825634
25

447/4074 | id_30_recency | 0.5911600185726735
448/4074 | id_31_min_dt | 0.6785433704236103
449/4074 | id_31_recency | 0.6325602088334682
450/4074 | FREQ_id_20 | 0.5337940817071541
451/4074 | FREQ_M1 | 0.5544758631558181
452/4074 | FREQ_addr2 | 0.510832393216681
453/4074 | FREQ_id_30 | 0.5441573089457916
454/4074 | FREQ_card3 | 0.5131225989470661
455/4074 | FREQ_id_14 | 0.5024065285622075
456/4074 | FREQ_addr1 | 0.5189710572869213
457/4074 | FREQ_M2 | 0.5590260872011235
458/4074 | FREQ_M5 | 0.5101145441973343
459/4074 | FREQ_card2 | 0.5319925227797707
460/4074 | FREQ_id_24 | 0.5007232705488094
461/4074 | FREQ_id_22 | 0.5007164085008159
462/4074 | FREQ_M8 | 0.5625025332158947
463/4074 | FREQ_id_17 | 0.5170963048513015
464/4074 | FREQ_card5 | 0.5329781490605383
465/4074 | FREQ_id_37 | 0.5157803020179091
466/4074 | FREQ_id_13 | 0.6123814979558698
467/4074 | FREQ_id_35 | 0.5161196976797723
468/4074 | FREQ_id_27 | 0.5004801895193751
469/4074 | FREQ_id_34 | 0.5324526868115256
470/4074 | FREQ_

602/4074 | GROUPBY_card1_GAPmean_V67 | 0.5920698219896481
603/4074 | GROUPBY_card1_GAPmean_V68 | 0.5893652815628825
604/4074 | GROUPBY_card1_GAPmean_V69 | 0.5884302845642081
605/4074 | GROUPBY_card1_GAPmean_V70 | 0.5880438782883237
606/4074 | GROUPBY_card1_GAPmean_V71 | 0.6153142937553704
607/4074 | GROUPBY_card1_GAPmean_V72 | 0.6134253206674718
608/4074 | GROUPBY_card1_GAPmean_V73 | 0.5855143167655493
609/4074 | GROUPBY_card1_GAPmean_V74 | 0.5904845974509576
610/4074 | GROUPBY_card1_GAPmean_V75 | 0.6028791287003488
611/4074 | GROUPBY_card1_GAPmean_V76 | 0.6016749372650045
612/4074 | GROUPBY_card1_GAPmean_V77 | 0.5937106558637257
613/4074 | GROUPBY_card1_GAPmean_V78 | 0.589855414902835
614/4074 | GROUPBY_card1_GAPmean_V79 | 0.5893307837414787
615/4074 | GROUPBY_card1_GAPmean_V80 | 0.625575823375675
616/4074 | GROUPBY_card1_GAPmean_V81 | 0.6239160978961567
617/4074 | GROUPBY_card1_GAPmean_V82 | 0.5930598588562004
618/4074 | GROUPBY_card1_GAPmean_V83 | 0.5903053082245454
619/4074 | GROUP

741/4074 | GROUPBY_card1_GAPmean_V206 | 0.5230055449635602
742/4074 | GROUPBY_card1_GAPmean_V207 | 0.5220372491279455
743/4074 | GROUPBY_card1_GAPmean_V208 | 0.5225830107201134
744/4074 | GROUPBY_card1_GAPmean_V209 | 0.5214880475962342
745/4074 | GROUPBY_card1_GAPmean_V210 | 0.5226668989175408
746/4074 | GROUPBY_card1_GAPmean_V211 | 0.5226920072273588
747/4074 | GROUPBY_card1_GAPmean_V212 | 0.523901861809751
748/4074 | GROUPBY_card1_GAPmean_V213 | 0.5236093264295018
749/4074 | GROUPBY_card1_GAPmean_V214 | 0.5234531309333774
750/4074 | GROUPBY_card1_GAPmean_V215 | 0.5234383509610266
751/4074 | GROUPBY_card1_GAPmean_V216 | 0.5238737653510309
752/4074 | GROUPBY_card1_GAPmean_V217 | 0.520116205669426
753/4074 | GROUPBY_card1_GAPmean_V218 | 0.5201083790720057
754/4074 | GROUPBY_card1_GAPmean_V219 | 0.5205719764036231
755/4074 | GROUPBY_card1_GAPmean_V220 | 0.5216587264087601
756/4074 | GROUPBY_card1_GAPmean_V221 | 0.5190909871360542
757/4074 | GROUPBY_card1_GAPmean_V222 | 0.521632693972229


879/4074 | GROUPBY_card1_GAPmean_id_05 | 0.5213795441792837
880/4074 | GROUPBY_card1_GAPmean_id_06 | 0.5219997501288315
881/4074 | GROUPBY_card1_GAPmean_id_07 | 0.5005888435610457
882/4074 | GROUPBY_card1_GAPmean_id_08 | 0.5007738936923292
883/4074 | GROUPBY_card1_GAPmean_id_09 | 0.5140852354357018
884/4074 | GROUPBY_card1_GAPmean_id_10 | 0.5123697553829876
885/4074 | GROUPBY_card1_GAPmean_id_11 | 0.5211753872166636
886/4074 | GROUPBY_card1_GAPmean_dayofweek | 0.5208607087080473
887/4074 | GROUPBY_card1_GAPmean_date | 0.5181606530728129
888/4074 | GROUPBY_card1_GAPmean_second | 0.49920556847885583
889/4074 | GROUPBY_card1_GAPmean_minute | 0.5020023058499054
890/4074 | GROUPBY_card1_GAPmean_minuteofday | 0.5119682512234706
891/4074 | GROUPBY_card1_GAPmean_FREQ_TransactionDT | 0.5289975980207052
892/4074 | GROUPBY_card1_GAPmean_amt_decimal | 0.5233990812549535
893/4074 | GROUPBY_card1_GAPmean_TransactionAmt_mod1 | 0.5197379433593763
894/4074 | GROUPBY_card1_GAPmean_FREQ_TransactionAmt | 

1021/4074 | GROUPBY_addr1_GAPmean_V38 | 0.5826402387997813
1022/4074 | GROUPBY_addr1_GAPmean_V39 | 0.6145816169913748
1023/4074 | GROUPBY_addr1_GAPmean_V40 | 0.6151311899276493
1024/4074 | GROUPBY_addr1_GAPmean_V41 | 0.5806301389244775
1025/4074 | GROUPBY_addr1_GAPmean_V42 | 0.61232343816049
1026/4074 | GROUPBY_addr1_GAPmean_V43 | 0.6134457699672029
1027/4074 | GROUPBY_addr1_GAPmean_V44 | 0.5801418917968835
1028/4074 | GROUPBY_addr1_GAPmean_V45 | 0.5805959708739119
1029/4074 | GROUPBY_addr1_GAPmean_V46 | 0.581135641205744
1030/4074 | GROUPBY_addr1_GAPmean_V47 | 0.5817530278797942
1031/4074 | GROUPBY_addr1_GAPmean_V48 | 0.5905844644381635
1032/4074 | GROUPBY_addr1_GAPmean_V49 | 0.5907828571551921
1033/4074 | GROUPBY_addr1_GAPmean_V50 | 0.6094581769398368
1034/4074 | GROUPBY_addr1_GAPmean_V51 | 0.5823992958312464
1035/4074 | GROUPBY_addr1_GAPmean_V52 | 0.5849365674855844
1036/4074 | GROUPBY_addr1_GAPmean_V53 | 0.5798464848588663
1037/4074 | GROUPBY_addr1_GAPmean_V54 | 0.576682190557263
1

1158/4074 | GROUPBY_addr1_GAPmean_V175 | 0.5196593933822026
1159/4074 | GROUPBY_addr1_GAPmean_V176 | 0.5179698863012003
1160/4074 | GROUPBY_addr1_GAPmean_V177 | 0.5198662321079269
1161/4074 | GROUPBY_addr1_GAPmean_V178 | 0.5199854620344283
1162/4074 | GROUPBY_addr1_GAPmean_V179 | 0.5201661569523637
1163/4074 | GROUPBY_addr1_GAPmean_V180 | 0.5199799456285132
1164/4074 | GROUPBY_addr1_GAPmean_V181 | 0.5196517744940579
1165/4074 | GROUPBY_addr1_GAPmean_V182 | 0.519846730027637
1166/4074 | GROUPBY_addr1_GAPmean_V183 | 0.5203298738711222
1167/4074 | GROUPBY_addr1_GAPmean_V184 | 0.5180556622560692
1168/4074 | GROUPBY_addr1_GAPmean_V185 | 0.5187680689082116
1169/4074 | GROUPBY_addr1_GAPmean_V186 | 0.5186051538458082
1170/4074 | GROUPBY_addr1_GAPmean_V187 | 0.5183361191790797
1171/4074 | GROUPBY_addr1_GAPmean_V188 | 0.5188016858652285
1172/4074 | GROUPBY_addr1_GAPmean_V189 | 0.5182778333422222
1173/4074 | GROUPBY_addr1_GAPmean_V190 | 0.5183652613249669
1174/4074 | GROUPBY_addr1_GAPmean_V191 | 

1294/4074 | GROUPBY_addr1_GAPmean_V311 | 0.5200821238511995
1295/4074 | GROUPBY_addr1_GAPmean_V312 | 0.5156808069363746
1296/4074 | GROUPBY_addr1_GAPmean_V313 | 0.5230081528132311
1297/4074 | GROUPBY_addr1_GAPmean_V314 | 0.5204437408977997
1298/4074 | GROUPBY_addr1_GAPmean_V315 | 0.5222113031953207
1299/4074 | GROUPBY_addr1_GAPmean_V316 | 0.5180755713195923
1300/4074 | GROUPBY_addr1_GAPmean_V317 | 0.5209816030648479
1301/4074 | GROUPBY_addr1_GAPmean_V318 | 0.5191751122322996
1302/4074 | GROUPBY_addr1_GAPmean_V319 | 0.5240145217924773
1303/4074 | GROUPBY_addr1_GAPmean_V320 | 0.5243518985736327
1304/4074 | GROUPBY_addr1_GAPmean_V321 | 0.5237070176891889
1305/4074 | GROUPBY_addr1_GAPmean_V322 | 0.5072242616926234
1306/4074 | GROUPBY_addr1_GAPmean_V323 | 0.5075707988119619
1307/4074 | GROUPBY_addr1_GAPmean_V324 | 0.5078292771370726
1308/4074 | GROUPBY_addr1_GAPmean_V325 | 0.5068988830243866
1309/4074 | GROUPBY_addr1_GAPmean_V326 | 0.5080283809681451
1310/4074 | GROUPBY_addr1_GAPmean_V327 |

1431/4074 | GROUPBY_ProductCD_GAPmean_D15 | 0.6844056406368744
1432/4074 | GROUPBY_ProductCD_GAPmean_V1 | 0.5615003514117729
1433/4074 | GROUPBY_ProductCD_GAPmean_V2 | 0.5614273781405128
1434/4074 | GROUPBY_ProductCD_GAPmean_V3 | 0.561915801741649
1435/4074 | GROUPBY_ProductCD_GAPmean_V4 | 0.5636117438848498
1436/4074 | GROUPBY_ProductCD_GAPmean_V5 | 0.5628468579015516
1437/4074 | GROUPBY_ProductCD_GAPmean_V6 | 0.5613315815285028
1438/4074 | GROUPBY_ProductCD_GAPmean_V7 | 0.5619574145870799
1439/4074 | GROUPBY_ProductCD_GAPmean_V8 | 0.5623213110905599
1440/4074 | GROUPBY_ProductCD_GAPmean_V9 | 0.5618890926004918
1441/4074 | GROUPBY_ProductCD_GAPmean_V10 | 0.5628070865863473
1442/4074 | GROUPBY_ProductCD_GAPmean_V11 | 0.5629118728257404
1443/4074 | GROUPBY_ProductCD_GAPmean_V12 | 0.6366287577829043
1444/4074 | GROUPBY_ProductCD_GAPmean_V13 | 0.6341875978122459
1445/4074 | GROUPBY_ProductCD_GAPmean_V14 | 0.6103391726170116
1446/4074 | GROUPBY_ProductCD_GAPmean_V15 | 0.5611194338138261
14

1561/4074 | GROUPBY_ProductCD_GAPmean_V130 | 0.5294017665715818
1562/4074 | GROUPBY_ProductCD_GAPmean_V131 | 0.52479566956631
1563/4074 | GROUPBY_ProductCD_GAPmean_V132 | 0.5188649185987324
1564/4074 | GROUPBY_ProductCD_GAPmean_V133 | 0.5204589464778983
1565/4074 | GROUPBY_ProductCD_GAPmean_V134 | 0.5198265178772781
1566/4074 | GROUPBY_ProductCD_GAPmean_V135 | 0.5228139766279052
1567/4074 | GROUPBY_ProductCD_GAPmean_V136 | 0.5277085926743786
1568/4074 | GROUPBY_ProductCD_GAPmean_V137 | 0.5238527515948711
1569/4074 | GROUPBY_ProductCD_GAPmean_V138 | 0.5053504500455582
1570/4074 | GROUPBY_ProductCD_GAPmean_V139 | 0.5071539171922944
1571/4074 | GROUPBY_ProductCD_GAPmean_V140 | 0.5071591926905271
1572/4074 | GROUPBY_ProductCD_GAPmean_V141 | 0.5054599682062559
1573/4074 | GROUPBY_ProductCD_GAPmean_V142 | 0.5055067409610022
1574/4074 | GROUPBY_ProductCD_GAPmean_V143 | 0.5139060321493159
1575/4074 | GROUPBY_ProductCD_GAPmean_V144 | 0.5105064141375144
1576/4074 | GROUPBY_ProductCD_GAPmean_V145

1689/4074 | GROUPBY_ProductCD_GAPmean_V258 | 0.5163803444245766
1690/4074 | GROUPBY_ProductCD_GAPmean_V259 | 0.5181896549919717
1691/4074 | GROUPBY_ProductCD_GAPmean_V260 | 0.5158715990139424
1692/4074 | GROUPBY_ProductCD_GAPmean_V261 | 0.517090320619288
1693/4074 | GROUPBY_ProductCD_GAPmean_V262 | 0.516450547487166
1694/4074 | GROUPBY_ProductCD_GAPmean_V263 | 0.516469084015813
1695/4074 | GROUPBY_ProductCD_GAPmean_V264 | 0.5168504334018399
1696/4074 | GROUPBY_ProductCD_GAPmean_V265 | 0.5163829678503552
1697/4074 | GROUPBY_ProductCD_GAPmean_V266 | 0.5150746162265749
1698/4074 | GROUPBY_ProductCD_GAPmean_V267 | 0.5151420987319469
1699/4074 | GROUPBY_ProductCD_GAPmean_V268 | 0.5150175560581833
1700/4074 | GROUPBY_ProductCD_GAPmean_V269 | 0.5156674675243136
1701/4074 | GROUPBY_ProductCD_GAPmean_V270 | 0.5169795248836849
1702/4074 | GROUPBY_ProductCD_GAPmean_V271 | 0.5170256359252865
1703/4074 | GROUPBY_ProductCD_GAPmean_V272 | 0.5174377880039126
1704/4074 | GROUPBY_ProductCD_GAPmean_V273 

1817/4074 | ProductCD-uid_0_FREQRATIO | 0.5262280103900426
1818/4074 | ProductCD-DeviceType_FREQRATIO | 0.5208444399855184
1819/4074 | ProductCD-id_18_FREQRATIO | 0.5193600209258433
1820/4074 | ProductCD-id_25_FREQRATIO | 0.5168138951103244
1821/4074 | ProductCD-card4_FREQRATIO | 0.5241225843178855
1822/4074 | ProductCD-DeviceInfo_FREQRATIO | 0.5230351827052823
1823/4074 | ProductCD-id_29_FREQRATIO | 0.5196086158974114
1824/4074 | ProductCD-id_32_FREQRATIO | 0.5077071750264177
1825/4074 | ProductCD-id_23_FREQRATIO | 0.5167308409236838
1826/4074 | ProductCD-id_38_FREQRATIO | 0.5518107975971023
1827/4074 | ProductCD-first_dt_FREQRATIO | 0.7100865098426022
1828/4074 | ProductCD-card1_FREQRATIO | 0.5234681223733139
1829/4074 | ProductCD-id_26_FREQRATIO | 0.5179635614410079
1830/4074 | ProductCD-id_31_FREQRATIO | 0.6263358865700803
1831/4074 | ProductCD-M9_FREQRATIO | 0.5894158761850044
1832/4074 | ProductCD-P_emaildomain_FREQRATIO | 0.53044694716065
1833/4074 | ProductCD-M4_FREQRATIO | 0.5

1950/4074 | GROUPBY_first_dt_GAPmean_V71 | 0.7766695725403124
1951/4074 | GROUPBY_first_dt_GAPmean_V72 | 0.7756044749117521
1952/4074 | GROUPBY_first_dt_GAPmean_V73 | 0.7261221114903944
1953/4074 | GROUPBY_first_dt_GAPmean_V74 | 0.7452572821977977
1954/4074 | GROUPBY_first_dt_GAPmean_V75 | 0.6445843024493156
1955/4074 | GROUPBY_first_dt_GAPmean_V76 | 0.6566697177397028
1956/4074 | GROUPBY_first_dt_GAPmean_V77 | 0.7281610845000952
1957/4074 | GROUPBY_first_dt_GAPmean_V78 | 0.7410839433341762
1958/4074 | GROUPBY_first_dt_GAPmean_V79 | 0.7294116825890159
1959/4074 | GROUPBY_first_dt_GAPmean_V80 | 0.7934543313510101
1960/4074 | GROUPBY_first_dt_GAPmean_V81 | 0.7920182857939025
1961/4074 | GROUPBY_first_dt_GAPmean_V82 | 0.7538938885831119
1962/4074 | GROUPBY_first_dt_GAPmean_V83 | 0.7507709114732057
1963/4074 | GROUPBY_first_dt_GAPmean_V84 | 0.7791290712689847
1964/4074 | GROUPBY_first_dt_GAPmean_V85 | 0.7776175071876901
1965/4074 | GROUPBY_first_dt_GAPmean_V86 | 0.7330981007162221
1966/407

2080/4074 | GROUPBY_first_dt_GAPmean_V201 | 0.597355911975686
2081/4074 | GROUPBY_first_dt_GAPmean_V202 | 0.5673967229864764
2082/4074 | GROUPBY_first_dt_GAPmean_V203 | 0.5675304682585333
2083/4074 | GROUPBY_first_dt_GAPmean_V204 | 0.5651587888362128
2084/4074 | GROUPBY_first_dt_GAPmean_V205 | 0.583095480405665
2085/4074 | GROUPBY_first_dt_GAPmean_V206 | 0.5751097375917251
2086/4074 | GROUPBY_first_dt_GAPmean_V207 | 0.5855716086964018
2087/4074 | GROUPBY_first_dt_GAPmean_V208 | 0.5733443203706156
2088/4074 | GROUPBY_first_dt_GAPmean_V209 | 0.5798211654572935
2089/4074 | GROUPBY_first_dt_GAPmean_V210 | 0.5807052492961795
2090/4074 | GROUPBY_first_dt_GAPmean_V211 | 0.5889150968050978
2091/4074 | GROUPBY_first_dt_GAPmean_V212 | 0.5854548825561388
2092/4074 | GROUPBY_first_dt_GAPmean_V213 | 0.5763847516679705
2093/4074 | GROUPBY_first_dt_GAPmean_V214 | 0.5802146142218189
2094/4074 | GROUPBY_first_dt_GAPmean_V215 | 0.5888642899637282
2095/4074 | GROUPBY_first_dt_GAPmean_V216 | 0.57727811473

2210/4074 | GROUPBY_first_dt_GAPmean_V331 | 0.5489461144463131
2211/4074 | GROUPBY_first_dt_GAPmean_V332 | 0.5486388024344337
2212/4074 | GROUPBY_first_dt_GAPmean_V333 | 0.5534675638667074
2213/4074 | GROUPBY_first_dt_GAPmean_V334 | 0.5698629557718011
2214/4074 | GROUPBY_first_dt_GAPmean_V335 | 0.5609107674636229
2215/4074 | GROUPBY_first_dt_GAPmean_V336 | 0.5661811169367474
2216/4074 | GROUPBY_first_dt_GAPmean_V337 | 0.5695173880041745
2217/4074 | GROUPBY_first_dt_GAPmean_V338 | 0.5693399066502334
2218/4074 | GROUPBY_first_dt_GAPmean_V339 | 0.5658101186468439
2219/4074 | GROUPBY_first_dt_GAPmean_id_01 | 0.5663982398606741
2220/4074 | GROUPBY_first_dt_GAPmean_id_02 | 0.5316885772115912
2221/4074 | GROUPBY_first_dt_GAPmean_id_03 | 0.5423233040673234
2222/4074 | GROUPBY_first_dt_GAPmean_id_04 | 0.5428447006181485
2223/4074 | GROUPBY_first_dt_GAPmean_id_05 | 0.5580126607835137
2224/4074 | GROUPBY_first_dt_GAPmean_id_06 | 0.5567587552981218
2225/4074 | GROUPBY_first_dt_GAPmean_id_07 | 0.50

2339/4074 | GROUPBY_P_emaildomain_GAPmean_V12 | 0.5818738138719847
2340/4074 | GROUPBY_P_emaildomain_GAPmean_V13 | 0.5810310556183551
2341/4074 | GROUPBY_P_emaildomain_GAPmean_V14 | 0.5787123189268353
2342/4074 | GROUPBY_P_emaildomain_GAPmean_V15 | 0.5831805432002989
2343/4074 | GROUPBY_P_emaildomain_GAPmean_V16 | 0.5833850977503762
2344/4074 | GROUPBY_P_emaildomain_GAPmean_V17 | 0.6179210485471045
2345/4074 | GROUPBY_P_emaildomain_GAPmean_V18 | 0.6179475355721307
2346/4074 | GROUPBY_P_emaildomain_GAPmean_V19 | 0.5910846644119181
2347/4074 | GROUPBY_P_emaildomain_GAPmean_V20 | 0.5896543690276186
2348/4074 | GROUPBY_P_emaildomain_GAPmean_V21 | 0.6165139799790317
2349/4074 | GROUPBY_P_emaildomain_GAPmean_V22 | 0.6164699888749225
2350/4074 | GROUPBY_P_emaildomain_GAPmean_V23 | 0.5804674446001583
2351/4074 | GROUPBY_P_emaildomain_GAPmean_V24 | 0.5805891603147679
2352/4074 | GROUPBY_P_emaildomain_GAPmean_V25 | 0.5867907095570029
2353/4074 | GROUPBY_P_emaildomain_GAPmean_V26 | 0.587599438232

2460/4074 | GROUPBY_P_emaildomain_GAPmean_V133 | 0.5243011883208347
2461/4074 | GROUPBY_P_emaildomain_GAPmean_V134 | 0.5233605505161786
2462/4074 | GROUPBY_P_emaildomain_GAPmean_V135 | 0.5253423260121303
2463/4074 | GROUPBY_P_emaildomain_GAPmean_V136 | 0.5285750274469265
2464/4074 | GROUPBY_P_emaildomain_GAPmean_V137 | 0.5257556226632405
2465/4074 | GROUPBY_P_emaildomain_GAPmean_V138 | 0.5059512347184851
2466/4074 | GROUPBY_P_emaildomain_GAPmean_V139 | 0.5065701278485302
2467/4074 | GROUPBY_P_emaildomain_GAPmean_V140 | 0.5064813154295941
2468/4074 | GROUPBY_P_emaildomain_GAPmean_V141 | 0.5060443283120588
2469/4074 | GROUPBY_P_emaildomain_GAPmean_V142 | 0.5060239214394737
2470/4074 | GROUPBY_P_emaildomain_GAPmean_V143 | 0.5128855442844785
2471/4074 | GROUPBY_P_emaildomain_GAPmean_V144 | 0.5098581240899112
2472/4074 | GROUPBY_P_emaildomain_GAPmean_V145 | 0.5136851562186081
2473/4074 | GROUPBY_P_emaildomain_GAPmean_V146 | 0.5058776807047852
2474/4074 | GROUPBY_P_emaildomain_GAPmean_V147 |

2580/4074 | GROUPBY_P_emaildomain_GAPmean_V253 | 0.5158979684456191
2581/4074 | GROUPBY_P_emaildomain_GAPmean_V254 | 0.5159163655409585
2582/4074 | GROUPBY_P_emaildomain_GAPmean_V255 | 0.5190717134775047
2583/4074 | GROUPBY_P_emaildomain_GAPmean_V256 | 0.5188972435322292
2584/4074 | GROUPBY_P_emaildomain_GAPmean_V257 | 0.515394442075047
2585/4074 | GROUPBY_P_emaildomain_GAPmean_V258 | 0.5151261116321274
2586/4074 | GROUPBY_P_emaildomain_GAPmean_V259 | 0.5180437978554746
2587/4074 | GROUPBY_P_emaildomain_GAPmean_V260 | 0.5152684996395384
2588/4074 | GROUPBY_P_emaildomain_GAPmean_V261 | 0.5156480209834964
2589/4074 | GROUPBY_P_emaildomain_GAPmean_V262 | 0.5154744502765685
2590/4074 | GROUPBY_P_emaildomain_GAPmean_V263 | 0.5152137485769177
2591/4074 | GROUPBY_P_emaildomain_GAPmean_V264 | 0.5157547494340696
2592/4074 | GROUPBY_P_emaildomain_GAPmean_V265 | 0.5153578653234909
2593/4074 | GROUPBY_P_emaildomain_GAPmean_V266 | 0.5151901455538425
2594/4074 | GROUPBY_P_emaildomain_GAPmean_V267 | 

2700/4074 | P_emaildomain-card2_FREQRATIO | 0.5233640694637538
2701/4074 | P_emaildomain-id_24_FREQRATIO | 0.521381125717964
2702/4074 | P_emaildomain-id_22_FREQRATIO | 0.5215539211272617
2703/4074 | P_emaildomain-M8_FREQRATIO | 0.5775742443522229
2704/4074 | P_emaildomain-id_17_FREQRATIO | 0.5309803423672608
2705/4074 | P_emaildomain-card5_FREQRATIO | 0.5377364636411693
2706/4074 | P_emaildomain-id_37_FREQRATIO | 0.5293698193482576
2707/4074 | P_emaildomain-id_13_FREQRATIO | 0.6077616683357749
2708/4074 | P_emaildomain-id_35_FREQRATIO | 0.5311426431547457
2709/4074 | P_emaildomain-id_27_FREQRATIO | 0.521597718219048
2710/4074 | P_emaildomain-id_34_FREQRATIO | 0.5495566383624669
2711/4074 | P_emaildomain-id_33_FREQRATIO | 0.5299652788714982
2712/4074 | P_emaildomain-M3_FREQRATIO | 0.5711727468283783
2713/4074 | P_emaildomain-uid_0_FREQRATIO | 0.5191551076241803
2714/4074 | P_emaildomain-DeviceType_FREQRATIO | 0.5321711818210846
2715/4074 | P_emaildomain-id_18_FREQRATIO | 0.531637561324

2836/4074 | GROUPBY_uid_0_GAPmean_V61 | 0.5571632295666515
2837/4074 | GROUPBY_uid_0_GAPmean_V62 | 0.5583415814920539
2838/4074 | GROUPBY_uid_0_GAPmean_V63 | 0.5542501431391266
2839/4074 | GROUPBY_uid_0_GAPmean_V64 | 0.55691009635203
2840/4074 | GROUPBY_uid_0_GAPmean_V65 | 0.5523776433928664
2841/4074 | GROUPBY_uid_0_GAPmean_V66 | 0.5560371812922842
2842/4074 | GROUPBY_uid_0_GAPmean_V67 | 0.5569467278078788
2843/4074 | GROUPBY_uid_0_GAPmean_V68 | 0.5529445167078126
2844/4074 | GROUPBY_uid_0_GAPmean_V69 | 0.5557966151987419
2845/4074 | GROUPBY_uid_0_GAPmean_V70 | 0.5571441146340688
2846/4074 | GROUPBY_uid_0_GAPmean_V71 | 0.5547183168815252
2847/4074 | GROUPBY_uid_0_GAPmean_V72 | 0.5552561674813297
2848/4074 | GROUPBY_uid_0_GAPmean_V73 | 0.5547861999810029
2849/4074 | GROUPBY_uid_0_GAPmean_V74 | 0.5595299414633786
2850/4074 | GROUPBY_uid_0_GAPmean_V75 | 0.5779135803404722
2851/4074 | GROUPBY_uid_0_GAPmean_V76 | 0.5788595504773629
2852/4074 | GROUPBY_uid_0_GAPmean_V77 | 0.5672707278093481

2973/4074 | GROUPBY_uid_0_GAPmean_V198 | 0.520102398180712
2974/4074 | GROUPBY_uid_0_GAPmean_V199 | 0.5184971321189449
2975/4074 | GROUPBY_uid_0_GAPmean_V200 | 0.519267344629976
2976/4074 | GROUPBY_uid_0_GAPmean_V201 | 0.5191855227932417
2977/4074 | GROUPBY_uid_0_GAPmean_V202 | 0.5175797536190189
2978/4074 | GROUPBY_uid_0_GAPmean_V203 | 0.5187951930503301
2979/4074 | GROUPBY_uid_0_GAPmean_V204 | 0.5177695762195977
2980/4074 | GROUPBY_uid_0_GAPmean_V205 | 0.5182118413826653
2981/4074 | GROUPBY_uid_0_GAPmean_V206 | 0.5182311989768082
2982/4074 | GROUPBY_uid_0_GAPmean_V207 | 0.5185730591296889
2983/4074 | GROUPBY_uid_0_GAPmean_V208 | 0.5181850205779367
2984/4074 | GROUPBY_uid_0_GAPmean_V209 | 0.5181927100805536
2985/4074 | GROUPBY_uid_0_GAPmean_V210 | 0.5182559731728519
2986/4074 | GROUPBY_uid_0_GAPmean_V211 | 0.5190823311630965
2987/4074 | GROUPBY_uid_0_GAPmean_V212 | 0.5192907832478607
2988/4074 | GROUPBY_uid_0_GAPmean_V213 | 0.5189175925268374
2989/4074 | GROUPBY_uid_0_GAPmean_V214 | 0

3110/4074 | GROUPBY_uid_0_GAPmean_V335 | 0.5076149739479927
3111/4074 | GROUPBY_uid_0_GAPmean_V336 | 0.5076148899706397
3112/4074 | GROUPBY_uid_0_GAPmean_V337 | 0.5074889901293129
3113/4074 | GROUPBY_uid_0_GAPmean_V338 | 0.50774896480474
3114/4074 | GROUPBY_uid_0_GAPmean_V339 | 0.50718764005014
3115/4074 | GROUPBY_uid_0_GAPmean_id_01 | 0.5192184181547049
3116/4074 | GROUPBY_uid_0_GAPmean_id_02 | 0.5162746183988448
3117/4074 | GROUPBY_uid_0_GAPmean_id_03 | 0.5093748755566083
3118/4074 | GROUPBY_uid_0_GAPmean_id_04 | 0.5091427714237005
3119/4074 | GROUPBY_uid_0_GAPmean_id_05 | 0.5185283171569658
3120/4074 | GROUPBY_uid_0_GAPmean_id_06 | 0.5178073158340809
3121/4074 | GROUPBY_uid_0_GAPmean_id_07 | 0.500769861693656
3122/4074 | GROUPBY_uid_0_GAPmean_id_08 | 0.5007725674681868
3123/4074 | GROUPBY_uid_0_GAPmean_id_09 | 0.510002147212802
3124/4074 | GROUPBY_uid_0_GAPmean_id_10 | 0.5096034302578692
3125/4074 | GROUPBY_uid_0_GAPmean_id_11 | 0.5169928500559262
3126/4074 | GROUPBY_uid_0_GAPmean_d

3250/4074 | GROUPBY_uid_1_GAPmean_V32 | 0.5589810492405947
3251/4074 | GROUPBY_uid_1_GAPmean_V33 | 0.5547352253937227
3252/4074 | GROUPBY_uid_1_GAPmean_V34 | 0.5563987296870855
3253/4074 | GROUPBY_uid_1_GAPmean_V35 | 0.5782003330176508
3254/4074 | GROUPBY_uid_1_GAPmean_V36 | 0.579355556845107
3255/4074 | GROUPBY_uid_1_GAPmean_V37 | 0.5720870750002487
3256/4074 | GROUPBY_uid_1_GAPmean_V38 | 0.5724568899235553
3257/4074 | GROUPBY_uid_1_GAPmean_V39 | 0.5747146628151825
3258/4074 | GROUPBY_uid_1_GAPmean_V40 | 0.5767381228985303
3259/4074 | GROUPBY_uid_1_GAPmean_V41 | 0.5673777397617854
3260/4074 | GROUPBY_uid_1_GAPmean_V42 | 0.5748985887941337
3261/4074 | GROUPBY_uid_1_GAPmean_V43 | 0.5776808859023133
3262/4074 | GROUPBY_uid_1_GAPmean_V44 | 0.5712043197657682
3263/4074 | GROUPBY_uid_1_GAPmean_V45 | 0.5709712288258857
3264/4074 | GROUPBY_uid_1_GAPmean_V46 | 0.5700119921463925
3265/4074 | GROUPBY_uid_1_GAPmean_V47 | 0.5706003181046093
3266/4074 | GROUPBY_uid_1_GAPmean_V48 | 0.569797441034118

3388/4074 | GROUPBY_uid_1_GAPmean_V170 | 0.5193641501394731
3389/4074 | GROUPBY_uid_1_GAPmean_V171 | 0.5193345355322379
3390/4074 | GROUPBY_uid_1_GAPmean_V172 | 0.5193147033052786
3391/4074 | GROUPBY_uid_1_GAPmean_V173 | 0.5200545443689238
3392/4074 | GROUPBY_uid_1_GAPmean_V174 | 0.5202841987935007
3393/4074 | GROUPBY_uid_1_GAPmean_V175 | 0.5199259150339532
3394/4074 | GROUPBY_uid_1_GAPmean_V176 | 0.5197176071847818
3395/4074 | GROUPBY_uid_1_GAPmean_V177 | 0.5189855335042921
3396/4074 | GROUPBY_uid_1_GAPmean_V178 | 0.5189544140279041
3397/4074 | GROUPBY_uid_1_GAPmean_V179 | 0.5188306385087648
3398/4074 | GROUPBY_uid_1_GAPmean_V180 | 0.5192831382603399
3399/4074 | GROUPBY_uid_1_GAPmean_V181 | 0.5196857022215673
3400/4074 | GROUPBY_uid_1_GAPmean_V182 | 0.5198807304993277
3401/4074 | GROUPBY_uid_1_GAPmean_V183 | 0.5188163820690211
3402/4074 | GROUPBY_uid_1_GAPmean_V184 | 0.5188707197592816
3403/4074 | GROUPBY_uid_1_GAPmean_V185 | 0.5193509835679211
3404/4074 | GROUPBY_uid_1_GAPmean_V186 |

3524/4074 | GROUPBY_uid_1_GAPmean_V306 | 0.5149796479043683
3525/4074 | GROUPBY_uid_1_GAPmean_V307 | 0.5108189276924343
3526/4074 | GROUPBY_uid_1_GAPmean_V308 | 0.5119971335030886
3527/4074 | GROUPBY_uid_1_GAPmean_V309 | 0.5103736366303273
3528/4074 | GROUPBY_uid_1_GAPmean_V310 | 0.5120305134777696
3529/4074 | GROUPBY_uid_1_GAPmean_V311 | 0.5096971925801
3530/4074 | GROUPBY_uid_1_GAPmean_V312 | 0.5084319806361383
3531/4074 | GROUPBY_uid_1_GAPmean_V313 | 0.5528838359504351
3532/4074 | GROUPBY_uid_1_GAPmean_V314 | 0.5530580817751473
3533/4074 | GROUPBY_uid_1_GAPmean_V315 | 0.5530361801390881
3534/4074 | GROUPBY_uid_1_GAPmean_V316 | 0.5105731788474023
3535/4074 | GROUPBY_uid_1_GAPmean_V317 | 0.5127761514007866
3536/4074 | GROUPBY_uid_1_GAPmean_V318 | 0.5102253450275357
3537/4074 | GROUPBY_uid_1_GAPmean_V319 | 0.5161410427518539
3538/4074 | GROUPBY_uid_1_GAPmean_V320 | 0.5148298442081164
3539/4074 | GROUPBY_uid_1_GAPmean_V321 | 0.5151505147159205
3540/4074 | GROUPBY_uid_1_GAPmean_V322 | 0.

3663/4074 | GROUPBY_uid_2_GAPmean_V1 | 0.5615003514117729
3664/4074 | GROUPBY_uid_2_GAPmean_V2 | 0.5671083211627933
3665/4074 | GROUPBY_uid_2_GAPmean_V3 | 0.5671618311061172
3666/4074 | GROUPBY_uid_2_GAPmean_V4 | 0.5683790125141943
3667/4074 | GROUPBY_uid_2_GAPmean_V5 | 0.5677399288649534
3668/4074 | GROUPBY_uid_2_GAPmean_V6 | 0.5684207479222957
3669/4074 | GROUPBY_uid_2_GAPmean_V7 | 0.5686105851385254
3670/4074 | GROUPBY_uid_2_GAPmean_V8 | 0.5691393929517374
3671/4074 | GROUPBY_uid_2_GAPmean_V9 | 0.5689215123524795
3672/4074 | GROUPBY_uid_2_GAPmean_V10 | 0.5665229515331187
3673/4074 | GROUPBY_uid_2_GAPmean_V11 | 0.5654253996851917
3674/4074 | GROUPBY_uid_2_GAPmean_V12 | 0.5790682455995185
3675/4074 | GROUPBY_uid_2_GAPmean_V13 | 0.5794195931470852
3676/4074 | GROUPBY_uid_2_GAPmean_V14 | 0.5519167496225525
3677/4074 | GROUPBY_uid_2_GAPmean_V15 | 0.558766537005951
3678/4074 | GROUPBY_uid_2_GAPmean_V16 | 0.5586172432706308
3679/4074 | GROUPBY_uid_2_GAPmean_V17 | 0.5745783763408218
3680/40

3801/4074 | GROUPBY_uid_2_GAPmean_V139 | 0.5067416983827832
3802/4074 | GROUPBY_uid_2_GAPmean_V140 | 0.506699841915335
3803/4074 | GROUPBY_uid_2_GAPmean_V141 | 0.5064145066639593
3804/4074 | GROUPBY_uid_2_GAPmean_V142 | 0.5061217178483275
3805/4074 | GROUPBY_uid_2_GAPmean_V143 | 0.5119130492889061
3806/4074 | GROUPBY_uid_2_GAPmean_V144 | 0.5100363012323428
3807/4074 | GROUPBY_uid_2_GAPmean_V145 | 0.5102089834329864
3808/4074 | GROUPBY_uid_2_GAPmean_V146 | 0.5058961374319796
3809/4074 | GROUPBY_uid_2_GAPmean_V147 | 0.5058565242581156
3810/4074 | GROUPBY_uid_2_GAPmean_V148 | 0.5080156729104344
3811/4074 | GROUPBY_uid_2_GAPmean_V149 | 0.5082132384651524
3812/4074 | GROUPBY_uid_2_GAPmean_V150 | 0.5114217447759853
3813/4074 | GROUPBY_uid_2_GAPmean_V151 | 0.5101821519797125
3814/4074 | GROUPBY_uid_2_GAPmean_V152 | 0.5120263581647638
3815/4074 | GROUPBY_uid_2_GAPmean_V153 | 0.5081969766746179
3816/4074 | GROUPBY_uid_2_GAPmean_V154 | 0.5085615244514924
3817/4074 | GROUPBY_uid_2_GAPmean_V155 | 

3937/4074 | GROUPBY_uid_2_GAPmean_V275 | 0.5163437310086166
3938/4074 | GROUPBY_uid_2_GAPmean_V276 | 0.5155102943664357
3939/4074 | GROUPBY_uid_2_GAPmean_V277 | 0.516164906100547
3940/4074 | GROUPBY_uid_2_GAPmean_V278 | 0.5160766032667118
3941/4074 | GROUPBY_uid_2_GAPmean_V279 | 0.5154825047528623
3942/4074 | GROUPBY_uid_2_GAPmean_V280 | 0.5212677785491017
3943/4074 | GROUPBY_uid_2_GAPmean_V281 | 0.5184557420577893
3944/4074 | GROUPBY_uid_2_GAPmean_V282 | 0.5905280290679589
3945/4074 | GROUPBY_uid_2_GAPmean_V283 | 0.5916117910912302
3946/4074 | GROUPBY_uid_2_GAPmean_V284 | 0.5137067610734405
3947/4074 | GROUPBY_uid_2_GAPmean_V285 | 0.5156199242742225
3948/4074 | GROUPBY_uid_2_GAPmean_V286 | 0.5106519650134911
3949/4074 | GROUPBY_uid_2_GAPmean_V287 | 0.5141603114815047
3950/4074 | GROUPBY_uid_2_GAPmean_V288 | 0.5626059243287922
3951/4074 | GROUPBY_uid_2_GAPmean_V289 | 0.5613729071137594
3952/4074 | GROUPBY_uid_2_GAPmean_V290 | 0.5213772613816836
3953/4074 | GROUPBY_uid_2_GAPmean_V291 | 

In [10]:
dependency.sort_values(ascending=False).head(10)

GROUPBY_ProductCD_GAPmean_TransactionDT        1.000000
GROUPBY_addr1_GAPmean_TransactionDT            0.999955
GROUPBY_P_emaildomain_GAPmean_TransactionDT    0.999950
TransactionDT                                  0.999875
GROUPBY_card1_GAPmean_TransactionDT            0.996122
GROUPBY_first_dt_GAPmean_D1                    0.929342
GROUPBY_first_dt_GAPmean_TransactionDT         0.927207
first_dt                                       0.877196
GROUPBY_first_dt_GAPmean_V282                  0.842723
GROUPBY_first_dt_GAPmean_V283                  0.838532
dtype: float64

In [11]:
dependency.to_pickle('time_dependency.pkl')

auc가 0.8 이상인 feature을 drop (0.8은 public leaderboard 점수 기반으로 설정)

In [12]:
train = pd.read_pickle('data/train_fe.pkl')
test = pd.read_pickle('data/test_fe.pkl')
dependency = pd.read_pickle('time_dependency.pkl')
drop_cols = dependency[dependency>0.8].index.tolist()
drop_cols = [c for c in drop_cols if c in test.columns]
print(len(drop_cols))

12


In [13]:
train.drop(drop_cols, axis=1).to_pickle('data/train_reduced.pkl')
test.drop(drop_cols, axis=1).to_pickle('data/test_reduced.pkl')

In [14]:
del train, test; gc.collect();

## 3. Train & Predict

lightgbm params는 대회 초반 optuna를 이용한 hyperparameter search를 통해 정의

8:2의 time-split validation을 통해 검증하였고, early stopping을 통한 최적 rounds가 1500회 정도로 나옴. 데이터 전체에 대해 같은 파라미터로 1500회 학습 진행 후 test 데이터에 대해 예측

In [2]:
params = {
    'num_leaves': 494,
    'min_child_samples': 30,
    'min_child_weight': 0.5438016354148545,
    'learning_rate': 0.010550826137820763,
    'reg_alpha': 0.41156461949242196,
    'reg_lambda': 0.13494658845908017,
    'min_split_gain': 1.6994838530422652e-06,
    'colsample_bytree': 0.6764757839258583,
    'subsample_for_bin': 29271,
    'boost_from_average': True,
    'seed': 32018,
    'device_type': 'gpu',
    'n_jobs': 8,
    'objective': 'binary',
    'num_iterations': 100000,
    'metric': 'None',
    'boosting_type': 'gbdt',
    'subsample': 1,
    'subsample_freq': 1
}

In [3]:
data = pd.read_pickle('data/train_reduced.pkl')
test = pd.read_pickle('data/test_reduced.pkl')

In [4]:
params['num_iterations'] = 1500
print(params)

{'num_leaves': 494, 'min_child_samples': 30, 'min_child_weight': 0.5438016354148545, 'learning_rate': 0.010550826137820763, 'reg_alpha': 0.41156461949242196, 'reg_lambda': 0.13494658845908017, 'min_split_gain': 1.6994838530422652e-06, 'colsample_bytree': 0.6764757839258583, 'subsample_for_bin': 29271, 'boost_from_average': True, 'seed': 32018, 'device_type': 'gpu', 'n_jobs': 8, 'objective': 'binary', 'num_iterations': 1500, 'metric': 'None', 'boosting_type': 'gbdt', 'subsample': 1, 'subsample_freq': 1}


In [5]:
x_data = data.drop('isFraud', axis=1)
y_data = data['isFraud']
model = LGBMClassifier(**params)
model = model.fit(x_data, y_data)

In [9]:
test_pred = model.predict_proba(test)[:, 1]
dt = datetime.now().strftime('%m%d%H%M')
sub = pd.read_csv('data/sample_submission.csv')
sub['isFraud'] = test_pred
sub.to_csv(f'submission/{dt}.csv', index=False)
sub.head(10)

Unnamed: 0,TransactionID,isFraud
0,3663549,6e-06
1,3663550,3e-06
2,3663551,9e-06
3,3663552,2e-06
4,3663553,0.000122
5,3663554,5e-06
6,3663555,1e-05
7,3663556,0.001529
8,3663557,7e-06
9,3663558,5e-06


In [12]:
feat_imp = pd.Series(model.feature_importances_, index=x_data.columns).sort_values(ascending=False)
feat_imp.to_csv('feat_imp.csv')
feat_imp.head(50)

uid_2                                           3081
GROUPBY_ProductCD_GAPmean_TransactionAmt        2231
first_dt-card2_FREQRATIO                        2052
card1-first_dt_FREQRATIO                        1864
first_dt-card4_FREQRATIO                        1830
addr1-first_dt_FREQRATIO                        1787
first_dt-card5_FREQRATIO                        1759
addr1-card2_FREQRATIO                           1749
first_dt-card1_FREQRATIO                        1711
GROUPBY_P_emaildomain_GAPmean_TransactionAmt    1706
GROUPBY_addr1_GAPmean_TransactionAmt            1706
first_dt-uid_2_FREQRATIO                        1702
first_dt-M4_FREQRATIO                           1681
first_dt-card6_FREQRATIO                        1630
GROUPBY_first_dt_GAPmean_V26                    1626
GROUPBY_first_dt_GAPmean_V87                    1591
C13                                             1560
GROUPBY_first_dt_GAPmean_V23                    1553
GROUPBY_first_dt_GAPmean_TransactionAmt_mod1  

In [11]:
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)