In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm_notebook
np.set_printoptions(suppress=True)
from sklearn import preprocessing
from tqdm import tqdm
import decimal
from itertools import combinations
from random import shuffle
from time import sleep
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
import gc
import datetime
from sklearn.model_selection import train_test_split, KFold, GroupKFold
import os
from sklearn.metrics import roc_auc_score
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
import random
import lightgbm as lgb

In [2]:
def seed_everything(seed=51):
    """Seed the sources of randomness used by this notebook.

    Seeds NumPy's global generator and Python's ``random`` module, and stores
    the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int, default 51
        Value applied to every generator handled here.
    """
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
# Fix all seeds once, before any data is loaded or any model is trained.
seed_everything(51)

In [4]:
# Let pandas show up to 1000 rows and 1000 columns before truncating output.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 1000)

In [5]:
%%time
# Load the training table from CSV (the recorded run took about 56 s wall time).
train = pd.read_csv('../input/train5.csv')
print("train finished")

train finished
CPU times: user 52 s, sys: 3.99 s, total: 56 s
Wall time: 56.1 s


In [6]:
%%time
# Load the test table from CSV (the recorded run took about 47 s wall time).
test = pd.read_csv('../input/test5.csv')
print("test finished")

test finished
CPU times: user 44.8 s, sys: 2.27 s, total: 47.1 s
Wall time: 47.1 s


In [7]:
# Load the submission template, then print the shapes of all three frames.
sample_submission = pd.read_csv('../input/sample_submission.csv')
print(*(frame.shape for frame in (train, test, sample_submission)))

(590540, 915) (506691, 914) (506691, 2)


In [8]:
# `train` still carries the 'isFraud' label column at this point; it is split
# off into `y` in the Training section. Collect garbage and re-check shapes.
gc.collect()
print(*(frame.shape for frame in (train, test)))

(590540, 915) (506691, 914)


In [None]:
# Disabled experiment, kept as a bare string so that nothing in it runs.
# The code inside would read a saved importance table, keep its first 700 rows,
# use their 'index' values (minus TransactionID and TransactionDT) as the
# feature list, and subset both train and test to those columns.
"""importances = pd.read_csv('./importances/importances1.csv')
print(importances.shape)
importances = importances[:700]
features = [x for x in list(importances['index']) if x not in ['TransactionID','TransactionDT']]
train = train[features]
test = test[features]
"""

In [None]:
# Disabled experiment, kept as a bare string so that nothing in it runs.
# The code inside would drop the last 55 columns of `train` from both frames.
"""
dropped_columns = list(train.columns[-55:])
train.drop(dropped_columns,axis=1,inplace=1)
test.drop(dropped_columns,axis=1,inplace=1)
"""

In [9]:
# Run a garbage-collection pass before the training section starts.
gc.collect()

0

# Training

In [10]:
# Split the binary target off the feature matrix.
# Guarded so the cell can be re-run: once 'isFraud' has been removed from
# `train`, a second execution keeps the previously extracted `y` instead of
# raising KeyError on the missing column.
if 'isFraud' in train.columns:
    y = train.pop('isFraud')
elif 'y' not in globals():
    raise KeyError("'isFraud' is not in train and no target `y` was extracted earlier")

In [13]:
# LightGBM settings shared by every cross-validation fold.
lgb_params = dict(
    # Task, booster type and evaluation metric.
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    # Learning rate and tree shape.
    learning_rate=0.04,
    num_leaves=2 ** 8,
    max_depth=12,
    tree_learner='serial',
    # Column and row subsampling.
    colsample_bytree=0.5,
    subsample_freq=1,
    subsample=0.7,
    # Boosting budget, binning, logging and reproducibility.
    n_estimators=1000,
    max_bin=255,
    verbose=-1,
    seed=51,
    # Stop a fold once the validation score has not improved for 100 rounds.
    early_stopping_rounds=100,
)

In [14]:
# 5-fold cross-validated LightGBM training.
# Produces:
#   importance - per-fold feature importances, one row per feature
#   scores     - validation ROC AUC of each fold
#   oof        - out-of-fold predictions for every training row
#   y_pred     - test predictions averaged over the N fold models
N = 5
# KFold's default is shuffle=False, so each validation fold is a contiguous
# block of rows in the order they appear in `train`.
kf = KFold(n_splits=N)

importance = pd.DataFrame(np.zeros((train.shape[1], N)), columns=['Fold_{}'.format(i) for i in range(1, N + 1)], index=train.columns)
scores = []
y_pred = np.zeros(test.shape[0])
oof = np.zeros(train.shape[0])

# The boosters are fitted on bare arrays (`.values`), so features are matched
# purely by position. Make sure `test` exposes the same columns in the same
# order as `train`: a missing column raises KeyError here instead of yielding
# silently wrong predictions. No copy is made when the columns already match.
test_features = test if train.columns.equals(test.columns) else test[train.columns]

for fold, (trn_idx, val_idx) in enumerate(kf.split(train, y), 1):
    print('Fold {}'.format(fold))

    trn_data = lgb.Dataset(train.iloc[trn_idx, :].values, label=y.iloc[trn_idx].values)
    val_data = lgb.Dataset(train.iloc[val_idx, :].values, label=y.iloc[val_idx].values)

    # Both sets are passed so the log shows training and validation AUC;
    # `early_stopping_rounds` in lgb_params ends the fold early.
    clf = lgb.train(lgb_params, trn_data, valid_sets=[trn_data, val_data], verbose_eval=100)

    # With early stopping, Booster.predict defaults to the best iteration.
    predictions = clf.predict(train.iloc[val_idx, :].values)
    importance.iloc[:, fold - 1] = clf.feature_importance()
    oof[val_idx] = predictions

    score = roc_auc_score(y.iloc[val_idx].values, predictions)
    scores.append(score)
    print('Fold {} ROC AUC Score {}\n'.format(fold, score))

    # Each fold model contributes 1/N of the final test prediction.
    y_pred += clf.predict(test_features) / N

    # Release the per-fold datasets before the next fold is built.
    del trn_data, val_data, predictions
    gc.collect()

print('Average ROC AUC Score {} [STD:{}]'.format(np.mean(scores), np.std(scores)))

Fold 1




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.978028	valid_1's auc: 0.905205
[200]	training's auc: 0.991743	valid_1's auc: 0.917651
[300]	training's auc: 0.996291	valid_1's auc: 0.920665
[400]	training's auc: 0.998373	valid_1's auc: 0.922337
[500]	training's auc: 0.999393	valid_1's auc: 0.921905
Early stopping, best iteration is:
[402]	training's auc: 0.998388	valid_1's auc: 0.922398
Fold 1 ROC AUC Score 0.9223978173562837

Fold 2




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.979101	valid_1's auc: 0.921386
[200]	training's auc: 0.993183	valid_1's auc: 0.932895
[300]	training's auc: 0.99697	valid_1's auc: 0.935446
[400]	training's auc: 0.998709	valid_1's auc: 0.9372
[500]	training's auc: 0.999541	valid_1's auc: 0.938021
[600]	training's auc: 0.99984	valid_1's auc: 0.938061
[700]	training's auc: 0.999951	valid_1's auc: 0.938453
[800]	training's auc: 0.99999	valid_1's auc: 0.938168
Early stopping, best iteration is:
[711]	training's auc: 0.999959	valid_1's auc: 0.9386
Fold 2 ROC AUC Score 0.9385998457484893

Fold 3




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.981092	valid_1's auc: 0.924922
[200]	training's auc: 0.99348	valid_1's auc: 0.930685
[300]	training's auc: 0.997044	valid_1's auc: 0.931693
[400]	training's auc: 0.998905	valid_1's auc: 0.93155
Early stopping, best iteration is:
[312]	training's auc: 0.997414	valid_1's auc: 0.931887
Fold 3 ROC AUC Score 0.9318867612956274

Fold 4




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.979135	valid_1's auc: 0.938725
[200]	training's auc: 0.992897	valid_1's auc: 0.947208
[300]	training's auc: 0.996812	valid_1's auc: 0.949115
[400]	training's auc: 0.998664	valid_1's auc: 0.949769
[500]	training's auc: 0.999611	valid_1's auc: 0.949287
Early stopping, best iteration is:
[407]	training's auc: 0.998849	valid_1's auc: 0.949916
Fold 4 ROC AUC Score 0.9499160505642558

Fold 5




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.979626	valid_1's auc: 0.916243
[200]	training's auc: 0.992791	valid_1's auc: 0.922963
[300]	training's auc: 0.997125	valid_1's auc: 0.923903
[400]	training's auc: 0.998882	valid_1's auc: 0.924202
Early stopping, best iteration is:
[333]	training's auc: 0.997891	valid_1's auc: 0.924834
Fold 5 ROC AUC Score 0.9248338252752012

Average ROC AUC Score 0.9335268600479715 [STD:0.009964488129434092]


In [15]:
# ROC AUC of the out-of-fold predictions over the whole training set.
print(roc_auc_score(y_true=y, y_score=oof))

0.9306344376060152


In [16]:
# Mean importance of each feature across the folds.
# Only the Fold_* columns are averaged, so re-running this cell after 'avg' or
# the string 'index' column (added by a later reset_index) exists still works
# and gives the same result.
fold_columns = [col for col in importance.columns if str(col).startswith('Fold_')]
importance['avg'] = importance[fold_columns].mean(axis=1)

In [17]:
# Rank features from highest to lowest average importance.
importance = importance.sort_values('avg', ascending=False)

In [18]:
# Move the feature names out of the index into a regular column.
importance = importance.reset_index()

In [19]:
# Show the five features with the highest average importance.
importance.head(n=5)

Unnamed: 0,index,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,avg
0,TransactionID,3029,5068,1951,3140,2003,3038.2
1,TransactionAmt,2587,4916,2124,2517,2182,2865.2
2,addr1,1789,3724,1662,1784,1798,2151.4
3,card1,1760,3662,1452,1652,1533,2011.8
4,day,1510,3040,1073,1588,1119,1666.0


In [22]:
# Persist the ranked importance table without the positional row index.
importance.to_csv(path_or_buf='../importances/importances9.csv', index=False)

In [23]:
# Submission template before the predictions are filled in.
sample_submission.head(n=5)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [24]:
# Overwrite the placeholder scores with the fold-averaged test predictions.
# NOTE(review): this is a positional assignment; it assumes the rows of
# sample_submission are in the same order as the rows of `test` — confirm.
sample_submission['isFraud'] = y_pred

In [25]:
# Spot-check the first predictions written into the submission frame.
sample_submission.head(n=5)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000867
1,3663550,0.001239
2,3663551,0.00157
3,3663552,0.001923
4,3663553,0.001357


In [26]:
# Write the submission file without the positional row index.
sample_submission.to_csv(path_or_buf='../predictions/pred9.csv', index=False)

In [27]:
# Upload the prediction file to the ieee-fraud-detection competition with the Kaggle CLI.
# Note: this shell command runs whenever the cell is executed, so "Run All" submits again.
!kaggle competitions submit -c ieee-fraud-detection -f ../predictions/pred9.csv -m "pred9"

100%|███████████████████████████████████████| 14.1M/14.1M [00:31<00:00, 465kB/s]
Successfully submitted to IEEE-CIS Fraud Detection