In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import sys
sys.path.append("..")
from utils import *
from merge_utils import *
import re
import gc

In [2]:
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn import preprocessing
import pickle

In [3]:
df_train = pickle.load(open('./data/df_train_raw.pkl', 'rb'))
df_test = pickle.load(open('./data/df_test_raw.pkl', 'rb'))
df_train = df_train.sort_values('TransactionDT')

In [11]:
cat_cols = ['id_%i'%x for x in range(12, 39)] + ['M%i'%x for x in range(1, 10)] + \
            [ 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'Card_ID']
for col in cat_cols:
    if col in df_train.columns:
        le = preprocessing.LabelEncoder()
        le.fit(list(df_train[col].astype(str).values) + list(df_test[col].astype(str).values))
        df_train[col] = le.transform(list(df_train[col].astype(str).values))
        df_test[col] = le.transform(list(df_test[col].astype(str).values))  

In [4]:
Vcols = ['V%i'%i for i in range(1, 340)]
# 0.8428824

In [5]:
df = df_train.append(df_test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [6]:
Vgroup = {}
cnt_num = df[Vcols].apply(lambda x: x.count())
cnt_num = cnt_num.to_frame('cnt')
for k,v in cnt_num.groupby('cnt'):
    Vgroup[k] = list(v.index)
Vgroup = {v[0]: v for k,v in Vgroup.items()}

In [19]:
corr_dict = {k: df_train[v].corr() for k,v in Vgroup.items()}

In [7]:
svd_dict = {}
for k,v in Vgroup.items():
    svd_dict[k] = np.linalg.svd(df.loc[df[k].notnull(), v].values.astype(np.float32), full_matrices=0)
tol = 0.9
for k, (U, s, VT) in svd_dict.items():
    select = np.where(np.cumsum(s)/np.sum(s) > tol)[0][0]+1
    print('group: %s to %s, principal components num %i/%i'%(k, Vgroup[k][-1], select, len(Vgroup[k])))
    train_idx = df_train[k].notnull()
    train_num = train_idx.sum()
    test_idx = df_test[k].notnull()
    test_num = test_idx.sum()
    new_cols = ['%s_pca_%i'%(k, i) for i in range(select)]
    for col in new_cols:
        df_train[col] = np.nan
        df_test[col] = np.nan
    df_train.loc[train_idx, new_cols] = np.dot(np.dot(U[:train_num, :select], np.diag(s[:select])), VT[:select, :select])
    df_test.loc[test_idx, new_cols] = np.dot(np.dot(U[train_num:, :select], np.diag(s[:select])), VT[:select, :select])

group: V217 to V278, principal components num 5/46
(130430, 5)
group: V322 to V339, principal components num 4/18
(82351, 4)
group: V220 to V272, principal components num 4/16
(141416, 4)
group: V279 to V321, principal components num 5/32
(590528, 5)
group: V169 to V210, principal components num 3/19
(139819, 3)
group: V35 to V52, principal components num 10/18
(421571, 10)
group: V167 to V216, principal components num 5/31
(139631, 5)
group: V143 to V166, principal components num 1/11
(81951, 1)
group: V1 to V11, principal components num 6/11
(311253, 6)
group: V95 to V137, principal components num 5/43
(590226, 5)
group: V138 to V163, principal components num 3/18
(81945, 3)
group: V12 to V34, principal components num 12/23
(514467, 12)
group: V281 to V315, principal components num 3/11
(589271, 3)
group: V53 to V74, principal components num 12/22
(513444, 12)
group: V75 to V94, principal components num 12/20
(501376, 12)


In [13]:
def xgb_model_v3(params, df_train, df_val, df_test, features, label='label', num_boost_round=200):
    Dtrain = xgb.DMatrix(df_train[features], df_train[label])
    Dval = xgb.DMatrix(df_val[features], df_val[label])
    watchlist = [(Dtrain, 'train'), (Dval, 'val')]
    model = xgb.train(params, dtrain=Dtrain, early_stopping_rounds=100, 
                      evals=watchlist, num_boost_round=num_boost_round, verbose_eval=20)
    auc = model.best_score
    print('model best auc: %.4f at ntree_limit-%i'%(auc, model.best_ntree_limit))
    del Dtrain, Dval
    gc.collect()
#     Dtest = xgb.DMatrix(df_test[features])
#     pred = model.predict(Dtest, model.best_ntree_limit)
    return model, auc

In [None]:
from multiprocessing import Pool
pool = Pool(12)
iv_dict = {}
def worker(x):
    iv_dict[x] = count(df_train, x)[::-1]
pool.map(worker, Vcols)
pool.close()

In [51]:
gc.collect()
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.05,   
    'gamma': 0.5,                  
    'max_depth': 10,  
    'num_leaves': 12,
    'subsample': 0.7,  
    'colsample_bytree': 0.8,       
    'min_child_weight': 100,
    'reg_alpha': 0.,
    'reg_lambda': 0.,
    'scale_pos_weight': 27.6,               
    'eval_metric': ['auc'],
    'tree_method': 'gpu_hist',
}
folds = TimeSeriesSplit(n_splits=5)
f = [x for x in df_train.columns if 'pca' in x]
print(len(f))
#f = [x for x in f if 'test2' not in x  and 'test4' not in x]
# to_drop = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
#                     'ProductCD', 'DeviceType', 'id_19', 'id_20', 'id_32', 'P_emaildomain']
# f = list(set(f)-set(to_drop))
auc_lst = []
for fold_n, (train_index, valid_index) in enumerate(folds.split(df_train)):
    _, auc = xgb_model_v3(params, df_train.iloc[train_index, :], df_train.iloc[valid_index, :], df_test, f, 'label', num_boost_round=600)
    auc_lst.append(auc)
    gc.collect()
print(auc_lst)
print(np.mean(auc_lst))

90
[0]	train-auc:0.863812	val-auc:0.788622
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[20]	train-auc:0.89837	val-auc:0.815317
[40]	train-auc:0.909807	val-auc:0.816665
[60]	train-auc:0.916527	val-auc:0.814617
[80]	train-auc:0.921656	val-auc:0.812634
[100]	train-auc:0.926501	val-auc:0.810046
[120]	train-auc:0.930526	val-auc:0.806275
Stopping. Best iteration:
[35]	train-auc:0.906527	val-auc:0.817032

model best auc: 0.8170 at ntree_limit-36
[0]	train-auc:0.858351	val-auc:0.793631
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[20]	train-auc:0.894207	val-auc:0.799841
[40]	train-auc:0.905236	val-auc:0.794281
[60]	train-auc:0.911698	val-auc:0.79258
[80]	train-auc:0.916053	val-auc:0.789695
[100]	train-auc:0.919292	val-auc:0.786407
Stopping. Best iteration:
[3]	train-auc:0.881663	val-auc:0.80346

model best

In [52]:
iv_dict = {}
for x in f:
    iv_dict[x] = count(df_train, x)[::-1]
    print(iv_dict[x])

(0.4452334985372508,                          total       1      rate
V35_pca_0                                       
(-0.761, -0.00555]     66991.0  3266.0  0.048753
(-0.00555, -0.00199]   58031.0   288.0  0.004963
(-0.00199, -0.00107]    4293.0    78.0  0.018169
(-0.00107, 0.00414]    47077.0  2968.0  0.063046
(0.00414, 0.972]      142044.0  6481.0  0.045627
(0.972, 0.976]         81806.0   568.0  0.006943
(0.976, 2.959]         21329.0   918.0  0.043040
NaN                   168969.0  6096.0  0.036078)
(0.4112237297829544,                         total       1      rate
V35_pca_1                                      
(-0.273, -0.00135]    53045.0  5857.0  0.110416
(-0.00135, 0.00505]   72938.0   975.0  0.013368
(0.00505, 0.00517]      685.0    45.0  0.065693
(0.00517, 0.0107]     54140.0  1552.0  0.028666
(0.0107, 1.023]      104875.0  2056.0  0.019604
(1.023, 1.028]       113118.0  2789.0  0.024656
(1.028, 3.115]        22770.0  1293.0  0.056785
NaN                  168969.0  6096

(0.567030193788112,                                     total       1      rate
V75_pca_0                                                  
(-1.8619999999999999, -0.00956]   59324.0  2078.0  0.035028
(-0.00956, -0.00667]              62367.0   531.0  0.008514
(-0.00667, 0.00151]               39066.0   874.0  0.022372
(0.00151, 0.00694]                43322.0  3056.0  0.070542
(0.00694, 0.97]                   46609.0  4716.0  0.101182
(0.97, 0.971]                    103540.0  2458.0  0.023740
(0.971, 0.974]                    76748.0   610.0  0.007948
(0.974, 0.979]                    29062.0   767.0  0.026392
(0.979, 3.882]                    41338.0  1297.0  0.031375
NaN                               89164.0  4276.0  0.047957)
(0.6096046092456957,                         total       1      rate
V75_pca_1                                      
(-0.329, -0.00667]    52950.0  6287.0  0.118735
(-0.00667, 0.00612]  103350.0  1490.0  0.014417
(0.00612, 0.01]       49633.0  1149.0  0.02315

(0.46647050856552924,                          total       1      rate
V53_pca_1                                       
(-0.523, -0.00743]     61807.0  1278.0  0.020677
(-0.00743, -0.00448]   44892.0  1059.0  0.023590
(-0.00448, 0.00363]    47996.0  5045.0  0.105113
(0.00363, 0.209]       50788.0  1405.0  0.027664
(0.209, 0.998]        112612.0  1770.0  0.015718
(0.998, 1.001]        114715.0  2655.0  0.023144
(1.001, 1.017]         49383.0   879.0  0.017800
(1.017, 5.044]         31251.0  2063.0  0.066014
NaN                    77096.0  4509.0  0.058486)
(0.6703676803974816,                     total       1      rate
V53_pca_2                                  
(-0.596, 0.991]   52665.0  2033.0  0.038602
(0.991, 0.994]    81184.0  1379.0  0.016986
(0.994, 0.995]   114526.0  2656.0  0.023191
(0.995, 0.998]    50500.0   303.0  0.006000
(0.998, 0.999]    86460.0   622.0  0.007194
(0.999, 1.013]    26175.0  1076.0  0.041108
(1.013, 1.024]    50815.0  4871.0  0.095858
(1.024, 16.862]   511

(0.7046433792837671,                          total       1      rate
V279_pca_1                                      
(-338.623, -0.381]     59053.0  1809.0  0.030633
(-0.381, -0.165]       59269.0  1018.0  0.017176
(-0.165, -0.0625]      58887.0   777.0  0.013195
(-0.0625, 2.09e-06]   245040.0  3416.0  0.013941
(2.09e-06, 2.73e-06]  101546.0  7310.0  0.071987
(2.73e-06, 0.123]       7680.0   668.0  0.086979
(0.123, 964.74]        59053.0  5663.0  0.095897
NaN                       12.0     2.0  0.166667)
(0.30575017508062025,                            total       1      rate
V279_pca_2                                        
(-1.184, -1.31e-07]     247380.0  5469.0  0.022108
(-1.31e-07, -1.27e-07]  101550.0  7310.0  0.071984
(-1.27e-07, 0.0017]       5388.0   602.0  0.111730
(0.0017, 0.00903]        59051.0  1258.0  0.021304
(0.00903, 0.0185]        59062.0  1254.0  0.021232
(0.0185, 0.0397]         59085.0  1793.0  0.030346
(0.0397, 12.475]         59012.0  2975.0  0.050413
NaN    

(0.12779142257101614,                                total        1      rate
V138_pca_2                                             
(-0.0010000000000343, 0.0]   21683.0    627.0  0.028917
(0.0, 9.02e-05]              45857.0   1063.0  0.023181
(9.02e-05, 0.000216]          6388.0    606.0  0.094865
(0.000216, 28.626]            8017.0   1354.0  0.168891
NaN                         508595.0  17013.0  0.033451)
(0.6181307335045202,                         total       1      rate
V220_pca_0                                     
(-7.503, 0.0]         23211.0  2278.0  0.098143
(0.0, 0.000556]       80968.0  2946.0  0.036385
(0.000556, 0.00117]   10172.0  1251.0  0.122985
(0.00117, 0.0261]     13298.0  2965.0  0.222966
(0.0261, 20.681]      13767.0  1542.0  0.112007
NaN                  449124.0  9681.0  0.021555)
(0.5764105742320466,                         total       1      rate
V220_pca_1                                     
(-0.001, 0.000567]   101159.0  4747.0  0.046926
(0.000567, 0.0

(0.6684727728616585,                       total       1      rate
V12_pca_11                                   
(-0.8557, 0.9975]   58723.0  2154.0  0.036681
(0.9975, 1.0005]    57017.0  1049.0  0.018398
(1.0005, 1.0032]    54241.0   402.0  0.007411
(1.0032, 1.0037]    82419.0   527.0  0.006394
(1.0037, 1.0057]    47030.0   958.0  0.020370
(1.0057, 1.0062]   112453.0  2913.0  0.025904
(1.0062, 1.0144]    57383.0  5572.0  0.097102
(1.0144, 12.5445]   45201.0  3201.0  0.070817
NaN                 76073.0  3887.0  0.051096)
(0.14349591794664304,                               total        1      rate
V143_pca_0                                            
(-0.0009999592, 4.09e-08]   51397.0   1215.0  0.023640
(4.09e-08, 8.92e-05]         5992.0    679.0  0.113318
(8.92e-05, 0.00181]          8172.0   1193.0  0.145986
(0.00181, 4.176]             8195.0    518.0  0.063209
(4.176, 12.76]               8195.0     46.0  0.005613
NaN                        508589.0  17012.0  0.033449)


In [56]:
gc.collect()
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.05,   
    'gamma': 0.1,                  
    'max_depth': 4,  
    #'num_leaves': 12,
    'subsample': 0.7,  
    'colsample_bytree': 0.8,       
    'min_child_weight': 100,
    'reg_alpha': 0.,
    'reg_lambda': 0.,
    'scale_pos_weight': 27.6,               
    'eval_metric': ['auc'],
    'tree_method': 'gpu_hist',
}
folds = TimeSeriesSplit(n_splits=5)
f = [x for x in df_train.columns if 'pca_0' in x]
print(len(f))
#f = [x for x in f if 'test2' not in x  and 'test4' not in x]
# to_drop = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
#                     'ProductCD', 'DeviceType', 'id_19', 'id_20', 'id_32', 'P_emaildomain']
# f = list(set(f)-set(to_drop))
auc_lst = []
for fold_n, (train_index, valid_index) in enumerate(folds.split(df_train)):
    _, auc = xgb_model_v3(params, df_train.iloc[train_index, :], df_train.iloc[valid_index, :], df_test, f, 'label', num_boost_round=600)
    auc_lst.append(auc)
    gc.collect()
print(auc_lst)
print(np.mean(auc_lst))

15
[0]	train-auc:0.790814	val-auc:0.778521
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[20]	train-auc:0.831762	val-auc:0.802746
[40]	train-auc:0.841123	val-auc:0.805848
[60]	train-auc:0.848387	val-auc:0.809959
[80]	train-auc:0.852973	val-auc:0.81225
[100]	train-auc:0.857527	val-auc:0.813812
[120]	train-auc:0.861319	val-auc:0.815423
[140]	train-auc:0.86423	val-auc:0.815551
[160]	train-auc:0.867158	val-auc:0.815313
[180]	train-auc:0.869956	val-auc:0.815199
[200]	train-auc:0.873231	val-auc:0.81594
[220]	train-auc:0.876184	val-auc:0.816707
[240]	train-auc:0.878982	val-auc:0.816606
[260]	train-auc:0.881186	val-auc:0.815372
[280]	train-auc:0.883676	val-auc:0.814917
[300]	train-auc:0.885606	val-auc:0.814272
[320]	train-auc:0.887608	val-auc:0.814321
Stopping. Best iteration:
[224]	train-auc:0.87666	val-auc:0.816993

model best auc: 0.8170 at ntree_limit-225
[0]	train-auc:0.796847	val-auc:0.774522
Mu

In [62]:
gc.collect()
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.05,   
    'gamma': 0.1,                  
    'max_depth': 20,  
    #'num_leaves': 12,
    'subsample': 0.7,  
    'colsample_bytree': 0.6,       
    'min_child_weight': 100,
    'reg_alpha': 0.,
    'reg_lambda': 0.,
    'scale_pos_weight': 27.6,               
    'eval_metric': ['auc'],
    'tree_method': 'gpu_hist',
}
folds = TimeSeriesSplit(n_splits=5)
f = [x for x in df_train.columns if 'pca' in x] + [x for x in df_train.columns.drop(['label', 'TransactionDT', 'Date', 'Card_ID'])
                                                    if x[0]!='V']
print(len(f))
#f = [x for x in f if 'test2' not in x  and 'test4' not in x]
# to_drop = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
#                     'ProductCD', 'DeviceType', 'id_19', 'id_20', 'id_32', 'P_emaildomain']
# f = list(set(f)-set(to_drop))
auc_lst = []
for fold_n, (train_index, valid_index) in enumerate(folds.split(df_train)):
    _, auc = xgb_model_v3(params, df_train.iloc[train_index, :], df_train.iloc[valid_index, :], df_test, f, 'label', num_boost_round=600)
    auc_lst.append(auc)
    gc.collect()
print(auc_lst)
print(np.mean(auc_lst))

185
[0]	train-auc:0.903081	val-auc:0.785941
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[20]	train-auc:0.96602	val-auc:0.849941
[40]	train-auc:0.976742	val-auc:0.847244
[60]	train-auc:0.984645	val-auc:0.842746
[80]	train-auc:0.98927	val-auc:0.84379
[100]	train-auc:0.992344	val-auc:0.840023
[120]	train-auc:0.994289	val-auc:0.837707
Stopping. Best iteration:
[25]	train-auc:0.969025	val-auc:0.853354

model best auc: 0.8534 at ntree_limit-26
[0]	train-auc:0.914236	val-auc:0.784869
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[20]	train-auc:0.967678	val-auc:0.848307
[40]	train-auc:0.977838	val-auc:0.848795
[60]	train-auc:0.985555	val-auc:0.850619
[80]	train-auc:0.989841	val-auc:0.850727
[100]	train-auc:0.992673	val-auc:0.84816
[120]	train-auc:0.994437	val-auc:0.847047
[140]	train-auc:0.995592	val-auc:0.

KeyboardInterrupt: 

In [75]:
df_train.drop( [x for x in df_train.columns if 'pca' in x], axis=1, inplace=True)

In [76]:
# svd_dict_train = {}
# for k,v in Vgroup.items():
#     svd_dict_train[k] = np.linalg.svd(df_train.loc[df_train[k].notnull(), v].values.astype(np.float64), full_matrices=0)
tol = 0.9
for k, (U, s, VT) in svd_dict_train.items():
    select = np.where(np.cumsum(s)/np.sum(s) > tol)[0][0]+1
    print('group: %s to %s, principal components num %i/%i'%(k, Vgroup[k][-1], select, len(Vgroup[k])))
    train_idx = df_train[k].notnull()
    train_num = train_idx.sum()
    new_cols = ['%s_pca_%i'%(k, i) for i in range(select)]
    for col in new_cols:
        df_train[col] = np.nan
        df_test[col] = np.nan
    tmp = np.dot(np.dot(U[:train_num, :select], np.diag(s[:select])), VT[:select, :select])
    print(tmp.shape)
    df_train.loc[train_idx, new_cols] = np.dot(np.dot(U[:, :select], np.diag(s[:select])), VT[:select, :select])

group: V35 to V52, principal components num 16/18
(421571, 16)
group: V281 to V315, principal components num 4/11
(589271, 4)
group: V217 to V278, principal components num 9/46
(130430, 9)
group: V75 to V94, principal components num 17/20
(501376, 17)
group: V322 to V339, principal components num 5/18
(82351, 5)
group: V53 to V74, principal components num 18/22
(513444, 18)
group: V167 to V216, principal components num 7/31
(139631, 7)
group: V279 to V321, principal components num 9/32
(590528, 9)
group: V95 to V137, principal components num 8/43
(590226, 8)
group: V1 to V11, principal components num 10/11
(311253, 10)
group: V169 to V210, principal components num 5/19
(139819, 5)
group: V138 to V163, principal components num 5/18
(81945, 5)
group: V220 to V272, principal components num 7/16
(141416, 7)
group: V12 to V34, principal components num 18/23
(514467, 18)
group: V143 to V166, principal components num 3/11
(81951, 3)


In [16]:
gc.collect()
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.05,   
    'gamma': 0.1,                  
    'max_depth': 20,  
    #'num_leaves': 12,
    'subsample': 0.7,  
    'colsample_bytree': 0.7,       
    'min_child_weight': 100,
    'reg_alpha': 5.,
    'reg_lambda': 5.,
    'scale_pos_weight': 27.6,               
    'eval_metric': ['auc'],
    'tree_method': 'gpu_hist',
}
folds = TimeSeriesSplit(n_splits=5)
f = [x for x in df_train.columns if 'pca_0' in x] + [x for x in df_train.columns.drop(['label', 'TransactionDT', 'Date', 'Card_ID'])
                                                    if x[0]!='V']
#f = [x for x in df_train.columns.drop(['label', 'TransactionDT', 'Date', 'Card_ID']) if 'pca' not in x]
print(len(f))
#f = [x for x in f if 'test2' not in x  and 'test4' not in x]
# to_drop = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
#                     'ProductCD', 'DeviceType', 'id_19', 'id_20', 'id_32', 'P_emaildomain']
# f = list(set(f)-set(to_drop))
auc_lst = []
for fold_n, (train_index, valid_index) in enumerate(folds.split(df_train)):
    _, auc = xgb_model_v3(params, df_train.iloc[train_index, :], df_train.iloc[valid_index, :], df_test, f, 'label', num_boost_round=600)
    auc_lst.append(auc)
    gc.collect()
print(auc_lst)
print(np.mean(auc_lst))

110
[0]	train-auc:0.909443	val-auc:0.80936
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[20]	train-auc:0.961868	val-auc:0.873057
[40]	train-auc:0.973945	val-auc:0.879883
[60]	train-auc:0.982611	val-auc:0.88595
[80]	train-auc:0.988055	val-auc:0.888896
[100]	train-auc:0.991685	val-auc:0.891514
[120]	train-auc:0.993927	val-auc:0.892785
[140]	train-auc:0.995512	val-auc:0.893748
[160]	train-auc:0.99648	val-auc:0.894533
[180]	train-auc:0.99721	val-auc:0.895342
[200]	train-auc:0.997737	val-auc:0.895483
[220]	train-auc:0.998094	val-auc:0.895504
[240]	train-auc:0.998383	val-auc:0.895715
[260]	train-auc:0.998609	val-auc:0.895328
[280]	train-auc:0.998812	val-auc:0.895281
[300]	train-auc:0.998962	val-auc:0.895436
[320]	train-auc:0.999083	val-auc:0.895146
[340]	train-auc:0.999173	val-auc:0.894993
Stopping. Best iteration:
[251]	train-auc:0.998513	val-auc:0.895921

model best auc: 0.8959 at ntree_limit-252

In [15]:
gc.collect()
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.05,   
    'gamma': 0.2,                  
    'max_depth': 20,  
    #'num_leaves': 12,
    'subsample': 0.7,  
    'colsample_bytree': 0.7,       
    'min_child_weight': 100,
    'reg_alpha': 1.,
    'reg_lambda': 10.,
    'scale_pos_weight': 27.6,               
    'eval_metric': ['auc'],
    'tree_method': 'gpu_hist',
}
folds = TimeSeriesSplit(n_splits=5)
f = [x for x in df_train.columns if 'pca' in x] + [x for x in df_train.columns.drop(['label', 'TransactionDT', 'Date', 'Card_ID'])
                                                    if x[0]!='V']
#f = [x for x in df_train.columns.drop(['label', 'TransactionDT', 'Date', 'Card_ID']) if 'pca' not in x]
print(len(f))
#f = [x for x in f if 'test2' not in x  and 'test4' not in x]
# to_drop = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
#                     'ProductCD', 'DeviceType', 'id_19', 'id_20', 'id_32', 'P_emaildomain']
# f = list(set(f)-set(to_drop))
auc_lst = []
for fold_n, (train_index, valid_index) in enumerate(folds.split(df_train)):
    _, auc = xgb_model_v3(params, df_train.iloc[train_index, :], df_train.iloc[valid_index, :], df_test, f, 'label', num_boost_round=600)
    auc_lst.append(auc)
    gc.collect()
print(auc_lst)
print(np.mean(auc_lst))

185
[0]	train-auc:0.880378	val-auc:0.826314
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[20]	train-auc:0.939088	val-auc:0.875193
[40]	train-auc:0.957542	val-auc:0.882004
[60]	train-auc:0.970108	val-auc:0.886686
[80]	train-auc:0.977126	val-auc:0.889707
[100]	train-auc:0.982472	val-auc:0.892359
[120]	train-auc:0.986521	val-auc:0.893499
[140]	train-auc:0.989164	val-auc:0.894597
[160]	train-auc:0.990974	val-auc:0.895219
[180]	train-auc:0.992495	val-auc:0.894773
[200]	train-auc:0.993738	val-auc:0.895122
[220]	train-auc:0.99472	val-auc:0.894649
[240]	train-auc:0.995594	val-auc:0.894408
[260]	train-auc:0.99619	val-auc:0.894843
[280]	train-auc:0.99663	val-auc:0.894475
Stopping. Best iteration:
[193]	train-auc:0.99335	val-auc:0.895253

model best auc: 0.8953 at ntree_limit-194
[0]	train-auc:0.886601	val-auc:0.821529
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

W

In [17]:
df_train[[x for x in df_train.columns if 'pca_0' in x]+['label']].corr()

Unnamed: 0,V75_pca_0,V138_pca_0,V217_pca_0,V167_pca_0,V279_pca_0,V281_pca_0,V1_pca_0,V220_pca_0,V143_pca_0,V53_pca_0,V95_pca_0,V12_pca_0,V322_pca_0,V169_pca_0,V35_pca_0,label
V75_pca_0,1.0,-0.076515,-0.001412,-0.002792,-0.022784,0.08884,0.015582,-0.013546,0.059894,0.656417,-0.012105,0.70447,0.023464,-0.043665,0.750221,-0.050901
V138_pca_0,-0.076515,1.0,0.053682,-0.001077,-0.004227,0.395652,,0.492979,-0.027667,0.031593,-0.006048,0.082253,-0.001135,0.522625,-0.088575,0.041535
V217_pca_0,-0.001412,0.053682,1.0,0.802405,0.528353,0.104118,,0.105776,-0.022179,-0.03349,0.510328,-0.005484,0.584162,0.188504,-0.014407,0.008333
V167_pca_0,-0.002792,-0.001077,0.802405,1.0,0.97679,0.136056,,-0.001641,-0.035992,-0.058798,0.970363,-0.026837,0.976135,0.277338,-0.018549,-0.020723
V279_pca_0,-0.022784,-0.004227,0.528353,0.97679,1.0,0.021171,0.037152,-0.002289,-0.03326,-0.049651,0.966801,-0.05306,0.984035,0.26382,-0.021329,0.001651
V281_pca_0,0.08884,0.395652,0.104118,0.136056,0.021171,1.0,0.01613,0.371846,-0.034676,0.093538,0.019415,0.069114,0.15282,0.506642,0.086696,0.042221
V1_pca_0,0.015582,,,,0.037152,0.01613,1.0,,,0.010256,0.019466,0.004451,,,0.006148,0.001393
V220_pca_0,-0.013546,0.492979,0.105776,-0.001641,-0.002289,0.371846,,1.0,-0.019396,-0.010313,-0.000617,-0.028832,0.002667,0.570872,-0.034852,0.015329
V143_pca_0,0.059894,-0.027667,-0.022179,-0.035992,-0.03326,-0.034676,,-0.019396,1.0,0.131084,-0.031601,0.075859,-0.033208,-0.042401,0.36011,-0.062437
V53_pca_0,0.656417,0.031593,-0.03349,-0.058798,-0.049651,0.093538,0.010256,-0.010313,0.131084,1.0,-0.047739,0.638468,-0.152196,-0.026363,0.632672,-0.05007


In [8]:
cols = [x for x in df_train.columns if x[0]!='V'] + [x for x in df_train.columns if 'pca' in x] 
print(len(cols))
print(cols)
with open('./data/df_train_pca.pkl', 'wb') as f:
    pickle.dump(df_train[cols], f)
cols.remove('label')
with open('./data/df_test_pca.pkl', 'wb') as f:
    pickle.dump(df_test[cols], f)

189
['label', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'Date', '_Weekdays', '_Hours', '_Days', 'Card_ID', 'V217_pca_0', 'V217_pca_1', 'V217_pca_2', 'V217_pca_3', 'V217_pca_4', 'V322_pca_0', 'V322_pca_1', 'V322_pca_2', 'V322_pca_3', 'V220_pca_0', 'V220_pca_1', 