In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import KFold, TimeSeriesSplit, StratifiedKFold, GroupKFold
import matplotlib.pyplot as plt
import seaborn as sns
import gc, re, sys, os, datetime, pickle
from sklearn.metrics import roc_auc_score
sys.path.append('..')
from utils import *

In [2]:
train_df = pickle.load(open('../input/df_train_baseline.pkl', 'rb'))
test_df = pickle.load(open('../input/df_test_baseline.pkl', 'rb'))
print(train_df.shape)
print(test_df.shape)

(590540, 806)
(506691, 806)


In [5]:
train_df = reduce_mem_usage(train_df)
test_df  = reduce_mem_usage(test_df)

Mem. usage decreased to 1295.31 Mb (43.0% reduction)
Mem. usage decreased to 1119.21 Mb (42.8% reduction)


In [12]:
features = [x for x in train_df.columns if x not in remove_features and train_df[x].dtype!='object']
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':80000,
                    'max_bin':255,
                    'verbose':100,
                    'seed': 42,
                    'early_stopping_rounds':100, 
                } 
_, model = local_valid(train_df, 'isFraud', features, params, return_model=True, sample=False)



Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.951499	valid_1's auc: 0.89813
[200]	training's auc: 0.978144	valid_1's auc: 0.91533
[300]	training's auc: 0.990859	valid_1's auc: 0.923991
[400]	training's auc: 0.996123	valid_1's auc: 0.929344
[500]	training's auc: 0.998393	valid_1's auc: 0.93242
[600]	training's auc: 0.999303	valid_1's auc: 0.93403
[700]	training's auc: 0.999684	valid_1's auc: 0.934962
[800]	training's auc: 0.999853	valid_1's auc: 0.93574
[900]	training's auc: 0.999933	valid_1's auc: 0.936052
[1000]	training's auc: 0.999972	valid_1's auc: 0.936219
[1100]	training's auc: 0.999989	valid_1's auc: 0.936282
[1200]	training's auc: 0.999996	valid_1's auc: 0.936413
[1300]	training's auc: 0.999999	valid_1's auc: 0.9362
Early stopping, best iteration is:
[1208]	training's auc: 0.999996	valid_1's auc: 0.936452


In [33]:
df = train_df
n = df.shape[0]
valid_idx = list(df.index[int(n*0.8):])
X_val, y_val = df.loc[valid_idx, features], df.loc[valid_idx, 'isFraud']
columns = [x for x in features if not re.match('^V[0-9]+$', x) and 'emaildomain' not in x]

# results = {}

# y_pred = model.predict_proba(X_val)[:,1]

# results['base_score'] = roc_auc_score(y_val, y_pred)
# print(f'Base score {results["base_score"]:.5}')

for col in columns:
    if col not in results:
        res_lst = []
        freezed_col = X_val[col].copy()
        for _ in range(4):
            X_val[col] = np.random.permutation(X_val[col])
            preds = model.predict_proba(X_val)[:,1]
            res_lst.append(roc_auc_score(y_val, preds))
        X_val[col] = freezed_col
        results[col] = np.mean(res_lst)
        print(f'column: {col} - {results[col]:.5}')
        gc.collect()


column: C1 - 0.93444
column: C2 - 0.93588
column: C3 - 0.93639
column: C4 - 0.93651
column: C5 - 0.93629
column: C6 - 0.93587
column: C7 - 0.93648
column: C8 - 0.93631
column: C9 - 0.93622
column: C10 - 0.93644
column: C11 - 0.9348
column: C12 - 0.93648
column: C13 - 0.93157
column: C14 - 0.93436
column: D1 - 0.9363
column: D2 - 0.93622
column: D3 - 0.93645
column: D4 - 0.93635
column: D5 - 0.93644
column: D6 - 0.93651
column: D7 - 0.93645
column: D8 - 0.93643
column: D9 - 0.9365
column: D10 - 0.93639
column: D11 - 0.9364
column: D12 - 0.93644
column: D13 - 0.93646
column: D14 - 0.93645
column: D15 - 0.93637
column: M1 - 0.93644
column: M2 - 0.93644
column: M3 - 0.93609
column: M4 - 0.93458
column: M5 - 0.93445
column: M6 - 0.93523
column: M7 - 0.93639
column: M8 - 0.93644
column: M9 - 0.93636
column: is_december - 0.93645
column: is_holiday - 0.93645
column: card1_fq_enc - 0.93646
column: card2_fq_enc - 0.93643
column: card3_fq_enc - 0.93655
column: card5_fq_enc - 0.93645
column: uid_

column: D8_D9_decimal_dist - 0.93643
column: D3_DT_D_min_max - 0.93642
column: D3_DT_D_std_score - 0.93628
column: D4_DT_D_min_max - 0.93629
column: D4_DT_D_std_score - 0.9363
column: D5_DT_D_min_max - 0.9364
column: D5_DT_D_std_score - 0.93644
column: D6_DT_D_min_max - 0.93644
column: D6_DT_D_std_score - 0.93641
column: D7_DT_D_min_max - 0.93644
column: D7_DT_D_std_score - 0.93645
column: D8_DT_D_min_max - 0.93639
column: D8_DT_D_std_score - 0.93644
column: D10_DT_D_min_max - 0.93636
column: D10_DT_D_std_score - 0.93625
column: D11_DT_D_min_max - 0.93639
column: D11_DT_D_std_score - 0.93637
column: D12_DT_D_min_max - 0.93643
column: D12_DT_D_std_score - 0.93641
column: D13_DT_D_min_max - 0.93644
column: D13_DT_D_std_score - 0.93666
column: D14_DT_D_min_max - 0.93646
column: D14_DT_D_std_score - 0.93635
column: D15_DT_D_min_max - 0.93607
column: D15_DT_D_std_score - 0.93633
column: D3_DT_W_min_max - 0.93642
column: D3_DT_W_std_score - 0.93648
column: D4_DT_W_min_max - 0.93635
column: D

ValueError: train and valid dataset categorical_feature do not match.

In [34]:
results

{'base_score': 0.9364484865559555,
 'TransactionAmt': 0.9360876007122682,
 'ProductCD': 0.936033096132671,
 'card1': 0.9362017147874546,
 'card2': 0.9363244485327117,
 'card3': 0.9364657679695806,
 'card4': 0.9362822660897288,
 'card5': 0.936394987424732,
 'card6': 0.9359844014696151,
 'addr1': 0.9360700005110958,
 'addr2': 0.9364371207819844,
 'dist1': 0.936077010061319,
 'dist2': 0.9365781370740108,
 'id_31_device': 0.936513038067639,
 'C1': 0.9344426531904594,
 'C2': 0.9358845260321546,
 'C3': 0.9363910880758622,
 'C4': 0.9365103939110253,
 'C5': 0.9362886866111837,
 'C6': 0.9358745767321262,
 'C7': 0.9364755435816388,
 'C8': 0.9363132084397872,
 'C9': 0.9362231096931921,
 'C10': 0.9364409440749419,
 'C11': 0.9348015836959737,
 'C12': 0.9364815617079827,
 'C13': 0.9315731892970859,
 'C14': 0.9343574398225771,
 'D1': 0.9363048865205224,
 'D2': 0.9362245256277311,
 'D3': 0.9364536076540564,
 'D4': 0.9363545699104392,
 'D5': 0.9364404505206169,
 'D6': 0.9365091171426237,
 'D7': 0.93644

In [39]:
perm_imp = pd.DataFrame.from_dict(results, orient='index', columns=['score'])
perm_imp.sort_values('score')

Unnamed: 0,score
C13,0.931573
C1_fq_enc,0.933517
D2_scaled,0.933594
C14,0.934357
C1,0.934443
...,...
C4_fq_enc,0.936556
uid5_D2_mean,0.936558
uid_D4_std,0.936562
dist2,0.936578


In [66]:
perm_imp.to_csv('../result/perm_imp.csv')

In [48]:
(perm_imp['score'] > 0.9363).sum()

371

In [49]:
remove_features = pickle.load(open('../input/remove_features.pkl', 'rb'))
remove_features = list(remove_features['features_to_remove'])
remove_features += [x for x in results if x!='base_score' and results[x]>0.9364]

In [46]:
features = [x for x in train_df.columns if x not in remove_features and train_df[x].dtype!='object']
print(len(features))
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':80000,
                    'max_bin':255,
                    'verbose':100,
                    'seed': 42,
                    'early_stopping_rounds':100, 
                } 
_, model = local_valid(train_df, 'isFraud', features, params, return_model=True, sample=True)

512




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.953555	valid_1's auc: 0.898551
[200]	training's auc: 0.974493	valid_1's auc: 0.913637
[300]	training's auc: 0.986777	valid_1's auc: 0.923242
[400]	training's auc: 0.993331	valid_1's auc: 0.929279
[500]	training's auc: 0.996806	valid_1's auc: 0.932651
[600]	training's auc: 0.998576	valid_1's auc: 0.934334
[700]	training's auc: 0.999406	valid_1's auc: 0.935317
[800]	training's auc: 0.999767	valid_1's auc: 0.936123
[900]	training's auc: 0.999915	valid_1's auc: 0.93669
[1000]	training's auc: 0.999973	valid_1's auc: 0.936946
[1100]	training's auc: 0.999993	valid_1's auc: 0.937491
[1200]	training's auc: 0.999999	valid_1's auc: 0.937688
[1300]	training's auc: 1	valid_1's auc: 0.937833
[1400]	training's auc: 1	valid_1's auc: 0.937913
[1500]	training's auc: 1	valid_1's auc: 0.938082
Early stopping, best iteration is:
[1439]	training's auc: 1	valid_1's auc: 0.938031


In [51]:
remove_features = pickle.load(open('../input/remove_features.pkl', 'rb'))
remove_features = list(remove_features['features_to_remove'])
remove_features += [x for x in results if x!='base_score' and results[x]>0.93644]
features = [x for x in train_df.columns if x not in remove_features and train_df[x].dtype!='object']
print(len(features))
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':80000,
                    'max_bin':255,
                    'verbose':100,
                    'seed': 42,
                    'early_stopping_rounds':100, 
                } 
_, model = local_valid(train_df, 'isFraud', features, params, return_model=True, sample=True)

604




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.955633	valid_1's auc: 0.899024
[200]	training's auc: 0.975971	valid_1's auc: 0.915457
[300]	training's auc: 0.987887	valid_1's auc: 0.924647
[400]	training's auc: 0.994276	valid_1's auc: 0.930303
[500]	training's auc: 0.997423	valid_1's auc: 0.933321
[600]	training's auc: 0.998912	valid_1's auc: 0.935142
[700]	training's auc: 0.999584	valid_1's auc: 0.936235
[800]	training's auc: 0.999849	valid_1's auc: 0.936794
[900]	training's auc: 0.999951	valid_1's auc: 0.937362
[1000]	training's auc: 0.999987	valid_1's auc: 0.937724
[1100]	training's auc: 0.999998	valid_1's auc: 0.937779
[1200]	training's auc: 1	valid_1's auc: 0.938074
[1300]	training's auc: 1	valid_1's auc: 0.93828
[1400]	training's auc: 1	valid_1's auc: 0.938562
[1500]	training's auc: 1	valid_1's auc: 0.938611
Early stopping, best iteration is:
[1452]	training's auc: 1	valid_1's auc: 0.938659


In [56]:
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators': 1900,
                    'max_bin':255,
                    'verbose':100,
                    'seed': 42,
                }
X_train = train_df[features]
y_train = train_df['isFraud']
model = lgb.LGBMClassifier(**params)
model.fit(X_train, y_train, 
        eval_set=[(X_train, y_train)], eval_metric='auc', verbose=100)

[100]	training's auc: 0.946078
[200]	training's auc: 0.971687
[300]	training's auc: 0.985589
[400]	training's auc: 0.992441
[500]	training's auc: 0.996068
[600]	training's auc: 0.997974
[700]	training's auc: 0.998945
[800]	training's auc: 0.999433
[900]	training's auc: 0.999693
[1000]	training's auc: 0.999829
[1100]	training's auc: 0.999907
[1200]	training's auc: 0.999951
[1300]	training's auc: 0.999975
[1400]	training's auc: 0.999988
[1500]	training's auc: 0.999995
[1600]	training's auc: 0.999998
[1700]	training's auc: 0.999999
[1800]	training's auc: 1
[1900]	training's auc: 1


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
        importance_type='split', learning_rate=0.01, max_bin=255,
        max_depth=-1, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=1900,
        n_jobs=-1, num_leaves=256, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, seed=42, silent=True, subsample=0.7,
        subsample_for_bin=200000, subsample_freq=1, tree_learner='serial',
        verbose=100)

In [57]:
test_df['isFraud'] = model.predict_proba(test_df[features])[:, 1]





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [58]:
test_df[['TransactionID', 'isFraud']].to_csv('../result/baseline_perm_imp_0915.csv', index=False)

In [15]:
remove_features = pickle.load(open('../input/remove_features.pkl', 'rb'))
remove_features = list(remove_features['features_to_remove'])
perm_imp = pd.read_csv('../result/perm_imp.csv', index_col=0)
remove_features += [x for x in perm_imp.index if x!='base_score' and perm_imp.loc[x, 'score']>0.9364]
features = [x for x in train_df.columns if x not in remove_features and train_df[x].dtype!='object']
print(len(features))
gc.collect()
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators': 1900,
                    'max_bin':255,
                    'verbose':100,
                    'seed': 42,
                }
NFOLDS = 6
folds = GroupKFold(n_splits=NFOLDS)

predictions = np.zeros(len(test_df))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df, train_df['isFraud'], groups=train_df['DT_M'])):
    print('Fold:',fold_)
    tr_x, tr_y = train_df.loc[trn_idx, features], train_df.loc[trn_idx, 'isFraud']
    vl_x, vl_y = train_df.loc[val_idx,features], train_df.loc[val_idx, 'isFraud']

    print(len(tr_x),len(vl_x))
    
    estimator = lgb.LGBMRegressor(**params)
    estimator.fit(tr_x, tr_y,
        eval_set=[(tr_x, tr_y), (vl_x, vl_y)], eval_metric='auc',
        verbose=100, early_stopping_rounds=100)

    pp_p = estimator.predict(test_df[features])
    predictions += pp_p/NFOLDS

    
    del tr_x, tr_y, vl_x, vl_y
    gc.collect()

test_df['isFraud'] = predictions

512
Fold: 0
453219 137321
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.949966	valid_1's auc: 0.877888
[200]	training's auc: 0.975443	valid_1's auc: 0.891396
[300]	training's auc: 0.988073	valid_1's auc: 0.899044
[400]	training's auc: 0.994528	valid_1's auc: 0.903837
[500]	training's auc: 0.997401	valid_1's auc: 0.907659
[600]	training's auc: 0.998751	valid_1's auc: 0.910078
[700]	training's auc: 0.999411	valid_1's auc: 0.911976
[800]	training's auc: 0.999715	valid_1's auc: 0.913608
[900]	training's auc: 0.999858	valid_1's auc: 0.914784
[1000]	training's auc: 0.999934	valid_1's auc: 0.916046
[1100]	training's auc: 0.999969	valid_1's auc: 0.916374
[1200]	training's auc: 0.999986	valid_1's auc: 0.916733
[1300]	training's auc: 0.999995	valid_1's auc: 0.917473
[1400]	training's auc: 0.999998	valid_1's auc: 0.91822
[1500]	training's auc: 0.999999	valid_1's auc: 0.918354
[1600]	training's auc: 1	valid_1's auc: 0.918468
[1700]	training's auc: 1	valid_1

In [18]:
remove_features = pickle.load(open('../input/remove_features.pkl', 'rb'))
remove_features = list(remove_features['features_to_remove'])
perm_imp = pd.read_csv('../result/perm_imp.csv', index_col=0)
remove_features += [x for x in perm_imp.index if x!='base_score' and perm_imp.loc[x, 'score']>0.9363]
features = [x for x in train_df.columns if x not in remove_features and train_df[x].dtype!='object']
print(len(features))
gc.collect()
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators': 1900,
                    'max_bin':255,
                    'verbose':100,
                    'seed': 42,
                }
NFOLDS = 6
folds = GroupKFold(n_splits=NFOLDS)

predictions = np.zeros(len(test_df))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df, train_df['isFraud'], groups=train_df['DT_M'])):
    print('Fold:',fold_)
    tr_x, tr_y = train_df.loc[trn_idx, features], train_df.loc[trn_idx, 'isFraud']
    vl_x, vl_y = train_df.loc[val_idx,features], train_df.loc[val_idx, 'isFraud']

    print(len(tr_x),len(vl_x))
    
    estimator = lgb.LGBMRegressor(**params)
    estimator.fit(tr_x, tr_y,
        eval_set=[(tr_x, tr_y), (vl_x, vl_y)], eval_metric='auc',
        verbose=100, early_stopping_rounds=100)

    pp_p = estimator.predict(test_df[features])
    predictions += pp_p/NFOLDS

    
    del tr_x, tr_y, vl_x, vl_y
    gc.collect()

test_df['isFraud'] = predictions
test_df[['TransactionID', 'isFraud']].to_csv('../result/baseline_perm_imp_gkfold0915_93644.csv', index=False)

417
Fold: 0
453219 137321
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.946083	valid_1's auc: 0.87746
[200]	training's auc: 0.970506	valid_1's auc: 0.891434
[300]	training's auc: 0.984613	valid_1's auc: 0.898873
[400]	training's auc: 0.991886	valid_1's auc: 0.904953
[500]	training's auc: 0.995749	valid_1's auc: 0.908946
[600]	training's auc: 0.997721	valid_1's auc: 0.911423
[700]	training's auc: 0.998821	valid_1's auc: 0.913085
[800]	training's auc: 0.999377	valid_1's auc: 0.914525
[900]	training's auc: 0.999663	valid_1's auc: 0.91544
[1000]	training's auc: 0.999813	valid_1's auc: 0.916247
[1100]	training's auc: 0.999897	valid_1's auc: 0.916764
[1200]	training's auc: 0.999946	valid_1's auc: 0.917566
[1300]	training's auc: 0.999972	valid_1's auc: 0.918362
[1400]	training's auc: 0.999986	valid_1's auc: 0.918718
[1500]	training's auc: 0.999993	valid_1's auc: 0.918964
[1600]	training's auc: 0.999997	valid_1's auc: 0.919044
[1700]	training's auc: 0.9