In [19]:
# Standard library
import datetime
import decimal
import gc
import os
import random
from random import shuffle
from time import sleep

# Third-party
import dask.dataframe as dd
import lightgbm as lgb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm, tqdm_notebook

# Notebook-wide setup (runs once, after every import is available)
np.set_printoptions(suppress=True)
plt.style.use('ggplot')
tqdm.pandas()

In [2]:
# Make every source of randomness used by this notebook repeatable.
def seed_everything(seed=51):
    """Seed the random number generators this notebook relies on.

    Seeds NumPy's global generator and Python's ``random`` module with
    ``seed`` and stores the same value in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE: ``PYTHONHASHSEED`` is consulted when an interpreter starts, so the
    assignment here can only influence processes launched afterwards.

    Args:
        seed: Seed value handed to both generators; defaults to 51.
    """
    np.random.seed(seed)
    random.seed(seed)
    # Environment variables hold strings only, hence the conversion.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
# Fix the seeds up front, before any data is loaded or any model is trained below.
seed_everything(seed=51)

In [4]:
# Raise pandas' display limits to 1000 rows and 1000 columns so wide frames
# are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [5]:
%%time
# Load the training table; the %%time magic on this cell reports how long the read takes.
train = pd.read_csv('../input/train4.csv')
print("train finished")

train finished
CPU times: user 37.5 s, sys: 3.08 s, total: 40.5 s
Wall time: 40.5 s


In [6]:
%%time
# Load the test table; the %%time magic on this cell reports how long the read takes.
test = pd.read_csv('../input/test4.csv')
print("test finished")

test finished
CPU times: user 32.8 s, sys: 2.68 s, total: 35.5 s
Wall time: 35.5 s


In [7]:
# Load the submission template, then print the (rows, columns) shape of the
# train, test and template frames side by side.
sample_submission = pd.read_csv('../input/sample_submission.csv')
print(train.shape,test.shape,sample_submission.shape)

(590540, 805) (506691, 804) (506691, 2)


In [8]:
# Detach the label column so that `train` no longer contains it; `y` keeps
# the labels for cross-validation.
y = train.pop('isFraud')
gc.collect()
print(train.shape, test.shape)

(590540, 804) (506691, 804)


In [10]:
# Read a feature-importance table from disk.
# NOTE(review): presumably written by an earlier run of a notebook like this one — confirm its origin.
importances = pd.read_csv('../importances/importances1.csv')

In [11]:
# Show the (rows, columns) shape of the importance table.
print(importances.shape)

(804, 7)


In [12]:
# Keep only the first 700 rows of the importance table.
# NOTE(review): this selects the 700 strongest features only if the file is
# already sorted by importance, best first — confirm.
importances = importances.head(700)

In [13]:
# Feature names come from the 'index' column of the importance table; the two
# transaction bookkeeping columns are left out of the model inputs.
features = importances.loc[
    ~importances['index'].isin(['TransactionID', 'TransactionDT']),
    'index',
].tolist()

In [14]:
# Restrict both frames to the selected feature columns, in the same order.
# Done one frame at a time so the old full-width train frame can be released
# before the test frame is copied.
train = train.loc[:, features]
test = test.loc[:, features]

In [15]:
# Run the garbage collector now that train and test were rebound to narrower frames above.
gc.collect()

14

In [16]:
# Confirm both frames now have the same number of feature columns.
print(train.shape,test.shape)

(590540, 698) (506691, 698)


In [21]:
########################### Model params
# LightGBM settings shared by every cross-validation fold.
# The opening brace must sit on the assignment line: a bare `lgb_params =`
# followed by a newline is a SyntaxError.
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',            # same metric as roc_auc_score used for fold scoring
    'n_jobs': -1,
    'learning_rate': 0.02,
    'num_leaves': 2**8,         # 256
    'max_depth': 12,
    'tree_learner': 'serial',
    'colsample_bytree': 0.5,
    'subsample_freq': 1,
    'subsample': 0.7,
    'max_bin': 255,
    'verbose': -1,
    'seed': 51,                 # same value that is passed to seed_everything
}

In [22]:
%%time
# Stratified N-fold cross-validation. Each fold trains one LightGBM booster,
# fills its slice of the out-of-fold vector, stores that booster's feature
# importances, and adds 1/N of its test predictions to the running average.
N = 5  # number of folds
kf = StratifiedKFold(n_splits=N)

# One row per feature, one column per fold.
importance = pd.DataFrame(
    np.zeros((train.shape[1], N)),
    columns=['Fold_{}'.format(i) for i in range(1, N + 1)],
    index=train.columns,
)
scores = []                        # validation AUC of each fold
y_pred = np.zeros(test.shape[0])   # test predictions averaged over the folds
oof = np.zeros(train.shape[0])     # out-of-fold predictions for the training rows

# The boosters are trained on plain arrays (.values), so predict on the same
# representation; converting once avoids repeating the copy in every fold.
test_values = test.values

for fold, (trn_idx, val_idx) in enumerate(kf.split(train, y), 1):
    print('Fold {}'.format(fold))

    # Materialise the validation arrays once; they are needed for the Dataset,
    # for prediction and for scoring.
    val_features = train.iloc[val_idx, :].values
    val_labels = y.iloc[val_idx].values
    trn_data = lgb.Dataset(train.iloc[trn_idx, :].values, label=y.iloc[trn_idx].values)
    val_data = lgb.Dataset(val_features, label=val_labels)

    # The training set has to be the second positional argument (a positional
    # argument after a keyword one is a SyntaxError), and lgb.train takes its
    # round budget as num_boost_round, not n_estimators.
    # NOTE: verbose_eval / early_stopping_rounds match the LightGBM version
    # that produced the recorded output; LightGBM >= 4.0 expects
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)] instead.
    clf = lgb.train(
        lgb_params,
        trn_data,
        num_boost_round=2000,
        valid_sets=[trn_data, val_data],
        verbose_eval=100,
        early_stopping_rounds=100,
    )

    predictions = clf.predict(val_features)
    importance.iloc[:, fold - 1] = clf.feature_importance()
    oof[val_idx] = predictions

    score = roc_auc_score(val_labels, predictions)
    scores.append(score)
    print('Fold {} ROC AUC Score {}\n'.format(fold, score))

    y_pred += clf.predict(test_values) / N

    # Free the per-fold arrays before the next fold allocates its own.
    del trn_data, val_data, val_features, val_labels, predictions
    gc.collect()

print('Average ROC AUC Score {} [STD:{}]'.format(np.mean(scores), np.std(scores)))

Fold 1




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.955145	valid_1's auc: 0.885654
[200]	training's auc: 0.977057	valid_1's auc: 0.893244
[300]	training's auc: 0.986092	valid_1's auc: 0.888981
Early stopping, best iteration is:
[212]	training's auc: 0.978671	valid_1's auc: 0.893486
Fold 1 ROC AUC Score 0.8934857451110849

Fold 2




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.953068	valid_1's auc: 0.845592
Early stopping, best iteration is:
[28]	training's auc: 0.914302	valid_1's auc: 0.88252
Fold 2 ROC AUC Score 0.8825199600559421

Fold 3




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.952636	valid_1's auc: 0.89035
Early stopping, best iteration is:
[99]	training's auc: 0.952135	valid_1's auc: 0.896533
Fold 3 ROC AUC Score 0.8965327949856777

Fold 4




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.949034	valid_1's auc: 0.92221
[200]	training's auc: 0.975452	valid_1's auc: 0.934106
[300]	training's auc: 0.985673	valid_1's auc: 0.934421
Early stopping, best iteration is:
[288]	training's auc: 0.984889	valid_1's auc: 0.934811
Fold 4 ROC AUC Score 0.9348112697732877

Fold 5




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.951095	valid_1's auc: 0.899685
[200]	training's auc: 0.976033	valid_1's auc: 0.914248
[300]	training's auc: 0.985651	valid_1's auc: 0.919492
[400]	training's auc: 0.990648	valid_1's auc: 0.922088
[500]	training's auc: 0.993747	valid_1's auc: 0.923321
[600]	training's auc: 0.995545	valid_1's auc: 0.923962
[700]	training's auc: 0.996967	valid_1's auc: 0.924364
[800]	training's auc: 0.997894	valid_1's auc: 0.925006
[900]	training's auc: 0.998622	valid_1's auc: 0.925082
Early stopping, best iteration is:
[894]	training's auc: 0.998582	valid_1's auc: 0.925196
Fold 5 ROC AUC Score 0.9251960622977602

Average ROC AUC Score 0.9065091664447505 [STD:0.01997412648066917]
CPU times: user 1h 39min 30s, sys: 41.3 s, total: 1h 40min 11s
Wall time: 14min 12s


In [23]:
# AUC computed once over the pooled out-of-fold predictions of all folds.
# NOTE(review): this is not the mean of the per-fold AUCs printed by the training cell;
# the gap may come from folds stopping at very different iterations — confirm before comparing.
print(roc_auc_score(y,oof))

0.8283765519177129


In [24]:
# Append an 'avg' column holding each feature's mean importance across the fold columns.
importance = importance.assign(avg=importance.mean(axis=1))

In [25]:
# Rank the features from highest to lowest average importance.
importance = importance.sort_values('avg', ascending=False)

In [26]:
# Move the feature names out of the row index into a regular 'index' column
# (reset_index keeps the old index as a column by default).
importance = importance.reset_index()

In [27]:
# Preview the top of the ranked table (sorted by 'avg', descending, in an earlier cell).
importance.head()

Unnamed: 0,index,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,avg
0,card1,1836,106,709,2474,8736,2772.2
1,dayofyear,2195,297,1072,2770,7415,2749.8
2,TransactionAmt,1490,204,747,1982,7034,2291.4
3,card2,1341,63,470,1844,6168,1977.2
4,addr1,1043,47,313,1581,5967,1790.2


In [28]:
# Save the ranked importances; index=False leaves the row numbers out of the file.
importance.to_csv('../importances/importances4.csv',index=False)

In [29]:
# Look at the template before its isFraud column is overwritten with predictions.
sample_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [30]:
# Write the fold-averaged test predictions into the template, row by row.
# NOTE(review): assumes the rows of test4.csv are in the same order as sample_submission.csv — confirm.
sample_submission['isFraud'] = y_pred

In [31]:
# Check that isFraud now holds the model's predictions instead of the template values.
sample_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.008508
1,3663550,0.009097
2,3663551,0.012775
3,3663552,0.007722
4,3663553,0.008183


In [32]:
# Save the submission file; index=False leaves the row numbers out of the file.
sample_submission.to_csv('../predictions/pred4.csv',index=False)

In [33]:
!kaggle competitions submit -c ieee-fraud-detection -f ./predictions/pred4.csv -m "pred4"

100%|███████████████████████████████████████| 13.9M/13.9M [00:33<00:00, 435kB/s]
Successfully submitted to IEEE-CIS Fraud Detection