In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm_notebook
np.set_printoptions(suppress=True)
from sklearn import preprocessing
from tqdm import tqdm
import decimal
from random import shuffle
from time import sleep
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
import gc
import datetime
from sklearn.model_selection import train_test_split, KFold, GroupKFold
import os
from sklearn.metrics import roc_auc_score
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
import random
import lightgbm as lgb

In [2]:
# Seed every source of randomness this notebook controls directly.
def seed_everything(seed=51):
    """Seed Python's and NumPy's global random number generators.

    The seed is also exported, as a string, through the ``PYTHONHASHSEED``
    environment variable.

    Parameters
    ----------
    seed : int, default 51
        Value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # presumably only affects processes launched afterwards - confirm intent.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

In [3]:
# Fix the global RNG state before any data is loaded or any model is trained.
seed_everything(51)

In [4]:
# Let pandas show up to 1000 rows and 1000 columns before truncating a display.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [5]:
%%time
# Read the training table from ../input/train4.csv; the cell magic reports how long it took.
train = pd.read_csv('../input/train4.csv')
print("train finished")

train finished
CPU times: user 39.9 s, sys: 3.87 s, total: 43.8 s
Wall time: 43.8 s


In [6]:
%%time
# Read the test table from ../input/test4.csv; the cell magic reports how long it took.
test = pd.read_csv('../input/test4.csv')
print("test finished")

test finished
CPU times: user 34.6 s, sys: 2.05 s, total: 36.6 s
Wall time: 36.7 s


In [7]:
# Load the sample submission, then print the shapes of all three tables as a sanity check.
sample_submission = pd.read_csv('../input/sample_submission.csv')
print(*(frame.shape for frame in (train, test, sample_submission)))

(590540, 805) (506691, 804) (506691, 2)


In [8]:
# Split the target off the features: pop() returns the 'isFraud' column and
# removes it from `train` in the same step.
y = train.pop('isFraud')
gc.collect()
print(train.shape, test.shape)

(590540, 804) (506691, 804)


In [9]:
# Load a saved feature-importance table; its 'index' column is read further down as the feature names.
# NOTE(review): presumably written by an earlier run of a similar notebook - confirm provenance.
importances = pd.read_csv('../importances/importances1.csv')

In [10]:
# Shape check of the loaded importance table.
print(importances.shape)

(804, 7)


In [11]:
# Keep only the first 700 rows of the importance table.
# NOTE(review): assumes the CSV is already ordered best-feature-first - confirm.
importances = importances.iloc[:700]

In [12]:
# Feature names come from the 'index' column of the importance table, in table order,
# with the two transaction bookkeeping columns left out.
features = importances.loc[
    ~importances['index'].isin(['TransactionID', 'TransactionDT']),
    'index',
].tolist()

In [13]:
# Restrict both tables to the selected features, in the same column order.
# Done one frame at a time so the wide `train` is released before `test` is copied.
train = train.loc[:, features]
test = test.loc[:, features]

In [14]:
# Force a garbage-collection pass after replacing the wide frames; the displayed
# value is gc.collect()'s return value.
gc.collect()

14

In [15]:
# Both frames should now have the same number of columns.
print(*[frame.shape for frame in (train, test)])

(590540, 698) (506691, 698)


In [16]:
########################### Model params
# LightGBM settings shared by every cross-validation fold.
lgb_params = dict(
    # Binary target, boosted trees, scored with AUC.
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.02,
    # Tree size limits.
    num_leaves=2 ** 8,
    max_depth=12,
    tree_learner='serial',
    # Column and row subsampling.
    colsample_bytree=0.5,
    subsample_freq=1,
    subsample=0.7,
    # Upper bound on boosting rounds; early stopping usually ends training sooner.
    n_estimators=2000,
    max_bin=255,
    verbose=-1,
    seed=51,
    early_stopping_rounds=100,
)

In [17]:
%%time
# N-fold cross-validated LightGBM training. Each fold fits on the training split while
# monitoring the held-out split, stores its out-of-fold predictions in `oof`, records its
# feature importances, and adds 1/N of its test-set prediction to `y_pred`.
N = 5
# KFold is created without `shuffle`, so scikit-learn's default (no shuffling) applies and
# each validation fold is a contiguous block of rows in the frame's current order.
# NOTE(review): presumably deliberate because the rows are time-ordered - confirm.
kf = KFold(n_splits=N)

# One row per feature, one column per fold (Fold_1 .. Fold_N); filled inside the loop.
importance = pd.DataFrame(np.zeros((train.shape[1], N)), columns=['Fold_{}'.format(i) for i in range(1, N + 1)], index=train.columns)
# Validation ROC AUC of each fold.
scores = []
# Running average of the per-fold test predictions.
y_pred = np.zeros(test.shape[0])
# Out-of-fold predictions, one slot per training row.
oof = np.zeros(train.shape[0])

# Folds are numbered from 1, hence `fold - 1` when indexing `importance` below.
for fold, (trn_idx, val_idx) in enumerate(kf.split(train, y), 1):
    print('Fold {}'.format(fold))
          
    # Both splits are handed to LightGBM as plain arrays (.values), so features are
    # identified by position rather than by column name.
    trn_data = lgb.Dataset(train.iloc[trn_idx, :].values, label=y.iloc[trn_idx].values)
    val_data = lgb.Dataset(train.iloc[val_idx, :].values, label=y.iloc[val_idx].values)   
    
    # The training set is listed first in valid_sets so its AUC is logged next to the
    # validation AUC ("training" / "valid_1" in the log); progress is printed every 100 rounds.
    # NOTE(review): `verbose_eval` and the `early_stopping_rounds` entry in lgb_params depend on
    # the installed LightGBM version - verify before upgrading the library.
    clf = lgb.train(lgb_params, trn_data ,valid_sets=[trn_data, val_data], verbose_eval=100)

    # Score the held-out rows. In the recorded output each fold's score equals the
    # best-iteration validation AUC reported by early stopping.
    predictions = clf.predict(train.iloc[val_idx, :].values) 
    # feature_importance() is called with its default arguments.
    importance.iloc[:, fold - 1] = clf.feature_importance()
    oof[val_idx] = predictions

    score = roc_auc_score(y.iloc[val_idx].values, predictions)
    scores.append(score)
    print('Fold {} ROC AUC Score {}\n'.format(fold, score))

    # `test` is passed as a DataFrame here, unlike the arrays used for training; this relies on
    # `test` having the same column order as `train` (both were selected with `features`).
    y_pred += clf.predict(test) / N
    
    # Drop the per-fold objects before the next fold builds its own datasets.
    del trn_data, val_data, predictions
    gc.collect()
    
print('Average ROC AUC Score {} [STD:{}]'.format(np.mean(scores), np.std(scores)))

Fold 1




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.951125	valid_1's auc: 0.889565
[200]	training's auc: 0.975449	valid_1's auc: 0.905715
[300]	training's auc: 0.984881	valid_1's auc: 0.914016
[400]	training's auc: 0.989866	valid_1's auc: 0.918706
[500]	training's auc: 0.992821	valid_1's auc: 0.921735
[600]	training's auc: 0.994763	valid_1's auc: 0.923974
[700]	training's auc: 0.996091	valid_1's auc: 0.925195
[800]	training's auc: 0.997298	valid_1's auc: 0.926101
[900]	training's auc: 0.998112	valid_1's auc: 0.926927
Early stopping, best iteration is:
[893]	training's auc: 0.998041	valid_1's auc: 0.926976
Fold 1 ROC AUC Score 0.926975665187209

Fold 2




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.949134	valid_1's auc: 0.907435
[200]	training's auc: 0.975674	valid_1's auc: 0.92323
[300]	training's auc: 0.985634	valid_1's auc: 0.929969
[400]	training's auc: 0.990578	valid_1's auc: 0.932835
[500]	training's auc: 0.993509	valid_1's auc: 0.935005
[600]	training's auc: 0.995403	valid_1's auc: 0.936698
[700]	training's auc: 0.996759	valid_1's auc: 0.93778
[800]	training's auc: 0.997895	valid_1's auc: 0.938286
[900]	training's auc: 0.998556	valid_1's auc: 0.938807
[1000]	training's auc: 0.999042	valid_1's auc: 0.938966
[1100]	training's auc: 0.999404	valid_1's auc: 0.939011
[1200]	training's auc: 0.999636	valid_1's auc: 0.939176
Early stopping, best iteration is:
[1186]	training's auc: 0.999609	valid_1's auc: 0.939244
Fold 2 ROC AUC Score 0.9392441925746695

Fold 3




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.952703	valid_1's auc: 0.90734
[200]	training's auc: 0.976796	valid_1's auc: 0.922368
[300]	training's auc: 0.986724	valid_1's auc: 0.92744
[400]	training's auc: 0.991069	valid_1's auc: 0.92935
[500]	training's auc: 0.99392	valid_1's auc: 0.93059
[600]	training's auc: 0.995665	valid_1's auc: 0.931317
[700]	training's auc: 0.996847	valid_1's auc: 0.931339
Early stopping, best iteration is:
[668]	training's auc: 0.996526	valid_1's auc: 0.931527
Fold 3 ROC AUC Score 0.9315269731220143

Fold 4




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.948449	valid_1's auc: 0.922012
[200]	training's auc: 0.974974	valid_1's auc: 0.93783
[300]	training's auc: 0.985446	valid_1's auc: 0.944344
[400]	training's auc: 0.990419	valid_1's auc: 0.946977
[500]	training's auc: 0.993527	valid_1's auc: 0.948346
[600]	training's auc: 0.995398	valid_1's auc: 0.94915
[700]	training's auc: 0.996837	valid_1's auc: 0.950035
[800]	training's auc: 0.997884	valid_1's auc: 0.950135
Early stopping, best iteration is:
[763]	training's auc: 0.997564	valid_1's auc: 0.950224
Fold 4 ROC AUC Score 0.9502243504155249

Fold 5




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.950928	valid_1's auc: 0.89653
[200]	training's auc: 0.975745	valid_1's auc: 0.913532
[300]	training's auc: 0.985932	valid_1's auc: 0.919734
[400]	training's auc: 0.990697	valid_1's auc: 0.922583
[500]	training's auc: 0.993589	valid_1's auc: 0.923455
[600]	training's auc: 0.995489	valid_1's auc: 0.923679
[700]	training's auc: 0.996968	valid_1's auc: 0.9246
[800]	training's auc: 0.998043	valid_1's auc: 0.924646
Early stopping, best iteration is:
[747]	training's auc: 0.997553	valid_1's auc: 0.925058
Fold 5 ROC AUC Score 0.9250580467353807

Average ROC AUC Score 0.9346058456069597 [STD:0.00921207389632392]
CPU times: user 3h 29min 51s, sys: 43.6 s, total: 3h 30min 35s
Wall time: 28min 4s


In [18]:
# ROC AUC over all out-of-fold predictions at once; this pools every fold's rows into one
# ranking, so it need not equal the mean of the per-fold scores printed above.
print(roc_auc_score(y_true=y, y_score=oof))

0.9339866831144474


In [19]:
# Mean importance of each feature across the folds. Only the Fold_* columns are averaged,
# so re-running this cell gives the same result even when 'avg' - or the string 'index'
# column added by a later reset_index - is already present in the frame.
importance['avg'] = importance.filter(like='Fold_').mean(axis=1)

In [20]:
# Rank features from highest to lowest average importance.
importance = importance.sort_values('avg', ascending=False)

In [21]:
# Move the feature names out of the index into a regular 'index' column
# (reset_index keeps the old index as a column by default).
importance = importance.reset_index()

In [22]:
# Preview the highest-ranked features with their per-fold and average importances.
importance.head()

Unnamed: 0,index,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,avg
0,card1,8188,11283,6165,7115,6970,7944.2
1,dayofyear,7744,11025,5922,6774,6488,7590.6
2,TransactionAmt,7068,10039,4714,5567,5476,6572.8
3,card2,6343,8988,4776,5551,5084,6148.4
4,addr1,5011,7298,3574,4272,4828,4996.6


In [23]:
# Left disabled: uncomment to save this run's per-fold feature importances.
#importance.to_csv('../importances/importances3.csv',index=False)

In [24]:
# Preview the submission template before its 'isFraud' column is overwritten.
sample_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [25]:
# Overwrite the 'isFraud' column with the fold-averaged test predictions.
# NOTE(review): the assignment is positional, so it assumes the rows of `test` are in the same
# order as `sample_submission` (both have 506691 rows per the recorded shapes) - confirm.
sample_submission['isFraud'] = y_pred

In [26]:
# Preview the submission again to check that the predictions were written.
sample_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.001467
1,3663550,0.001109
2,3663551,0.002221
3,3663552,0.00182
4,3663553,0.001593


In [27]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv('../predictions/pred3.csv',index=False)

In [30]:
!kaggle competitions submit -c ieee-fraud-detection -f ./predictions/pred3.csv -m "pred3"

100%|███████████████████████████████████████| 14.1M/14.1M [00:33<00:00, 437kB/s]
Successfully submitted to IEEE-CIS Fraud Detection