In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm_notebook
np.set_printoptions(suppress=True)
from sklearn import preprocessing
from tqdm import tqdm
import decimal
from random import shuffle
from time import sleep
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
import gc
import datetime
from sklearn.model_selection import train_test_split, KFold, GroupKFold
import os
from sklearn.metrics import roc_auc_score
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
import random
import lightgbm as lgb

In [2]:
# Always seed the randomness of this universe
def seed_everything(seed=51):
    """Seed the stdlib and NumPy global random number generators.

    The seed is also exported as the ``PYTHONHASHSEED`` environment
    variable. NOTE(review): the interpreter reads that variable at startup,
    so setting it here presumably only influences processes launched
    afterwards, not string hashing in the running kernel.

    Parameters
    ----------
    seed : default 51
        Value handed to ``random.seed`` and ``np.random.seed`` and written,
        as a string, to ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)

In [3]:
# Fix the global RNG state once, before any data is loaded or split.
seed_everything(51)

In [4]:
# Let pandas render up to 1000 rows and 1000 columns before truncating.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [5]:
%%time
train = pd.read_csv('../input/train4.csv')
print("train finished")

train finished
CPU times: user 42.2 s, sys: 3.98 s, total: 46.2 s
Wall time: 46.9 s


In [6]:
%%time
test = pd.read_csv('../input/test4.csv')
print("test finished")

test finished
CPU times: user 36 s, sys: 2.17 s, total: 38.2 s
Wall time: 38.8 s


In [7]:
# Load the submission template, then print the shapes of all three frames
# as a quick sanity check on row and column counts.
sample_submission = pd.read_csv(filepath_or_buffer='../input/sample_submission.csv')
print(*(frame.shape for frame in (train, test, sample_submission)))

(590540, 805) (506691, 804) (506691, 2)


In [8]:
# Split the target off the feature frame: pop() returns the 'isFraud'
# column and removes it from `train` in one step.
y = train.pop('isFraud')
gc.collect()
# After the split both frames should report the same number of columns.
print(*(frame.shape for frame in (train, test)))

(590540, 804) (506691, 804)


In [9]:
########################### Model params
# Keyword settings passed unchanged to lgb.train for every CV fold.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.02,
    num_leaves=256,  # 2**8
    max_depth=12,
    tree_learner='serial',
    colsample_bytree=0.5,
    subsample_freq=1,
    subsample=0.7,
    n_estimators=2000,
    max_bin=255,
    verbose=-1,
    seed=51,
    early_stopping_rounds=100,
)

In [10]:
%%time
N = 5
kf = KFold(n_splits=N)

importance = pd.DataFrame(np.zeros((train.shape[1], N)), columns=['Fold_{}'.format(i) for i in range(1, N + 1)], index=train.columns)
scores = []
y_pred = np.zeros(test.shape[0])
oof = np.zeros(train.shape[0])

for fold, (trn_idx, val_idx) in enumerate(kf.split(train, y), 1):
    print('Fold {}'.format(fold))
          
    trn_data = lgb.Dataset(train.iloc[trn_idx, :].values, label=y.iloc[trn_idx].values)
    val_data = lgb.Dataset(train.iloc[val_idx, :].values, label=y.iloc[val_idx].values)   
    
    clf = lgb.train(lgb_params, trn_data ,valid_sets=[trn_data, val_data], verbose_eval=100)

    predictions = clf.predict(train.iloc[val_idx, :].values) 
    importance.iloc[:, fold - 1] = clf.feature_importance()
    oof[val_idx] = predictions

    score = roc_auc_score(y.iloc[val_idx].values, predictions)
    scores.append(score)
    print('Fold {} ROC AUC Score {}\n'.format(fold, score))

    y_pred += clf.predict(test) / N
    
    del trn_data, val_data, predictions
    gc.collect()
    
print('Average ROC AUC Score {} [STD:{}]'.format(np.mean(scores), np.std(scores)))

Fold 1




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.950458	valid_1's auc: 0.887526
[200]	training's auc: 0.975745	valid_1's auc: 0.904535
[300]	training's auc: 0.985176	valid_1's auc: 0.913192
[400]	training's auc: 0.989999	valid_1's auc: 0.918221
[500]	training's auc: 0.992988	valid_1's auc: 0.920958
[600]	training's auc: 0.99485	valid_1's auc: 0.922593
[700]	training's auc: 0.996325	valid_1's auc: 0.924179
[800]	training's auc: 0.997386	valid_1's auc: 0.92473
[900]	training's auc: 0.998212	valid_1's auc: 0.925457
[1000]	training's auc: 0.998739	valid_1's auc: 0.925598
[1100]	training's auc: 0.999183	valid_1's auc: 0.925718
[1200]	training's auc: 0.999481	valid_1's auc: 0.925892
[1300]	training's auc: 0.999688	valid_1's auc: 0.926334
[1400]	training's auc: 0.999803	valid_1's auc: 0.926297
Early stopping, best iteration is:
[1318]	training's auc: 0.999712	valid_1's auc: 0.926385
Fold 1 ROC AUC Score 0.9263848864263222

Fold 2




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.948801	valid_1's auc: 0.906116
[200]	training's auc: 0.975917	valid_1's auc: 0.922448
[300]	training's auc: 0.985725	valid_1's auc: 0.92844
[400]	training's auc: 0.990523	valid_1's auc: 0.931632
[500]	training's auc: 0.993323	valid_1's auc: 0.933242
[600]	training's auc: 0.995322	valid_1's auc: 0.93484
[700]	training's auc: 0.996734	valid_1's auc: 0.935548
[800]	training's auc: 0.99785	valid_1's auc: 0.936334
[900]	training's auc: 0.998668	valid_1's auc: 0.937109
[1000]	training's auc: 0.999194	valid_1's auc: 0.937316
[1100]	training's auc: 0.999492	valid_1's auc: 0.938371
Early stopping, best iteration is:
[1034]	training's auc: 0.999319	valid_1's auc: 0.938565
Fold 2 ROC AUC Score 0.9385645183143027

Fold 3




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.952674	valid_1's auc: 0.908239
[200]	training's auc: 0.977618	valid_1's auc: 0.923043
[300]	training's auc: 0.986504	valid_1's auc: 0.927743
[400]	training's auc: 0.990942	valid_1's auc: 0.929739
[500]	training's auc: 0.993874	valid_1's auc: 0.930869
[600]	training's auc: 0.995605	valid_1's auc: 0.931257
[700]	training's auc: 0.996908	valid_1's auc: 0.931687
[800]	training's auc: 0.998001	valid_1's auc: 0.932079
[900]	training's auc: 0.998639	valid_1's auc: 0.931989
Early stopping, best iteration is:
[802]	training's auc: 0.998038	valid_1's auc: 0.932129
Fold 3 ROC AUC Score 0.9321290837050848

Fold 4




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.948737	valid_1's auc: 0.922385
[200]	training's auc: 0.976077	valid_1's auc: 0.938569
[300]	training's auc: 0.986002	valid_1's auc: 0.944167
[400]	training's auc: 0.990576	valid_1's auc: 0.94683
[500]	training's auc: 0.993794	valid_1's auc: 0.948428
[600]	training's auc: 0.99569	valid_1's auc: 0.949585
[700]	training's auc: 0.997129	valid_1's auc: 0.950172
[800]	training's auc: 0.998085	valid_1's auc: 0.950393
[900]	training's auc: 0.998672	valid_1's auc: 0.950279
Early stopping, best iteration is:
[853]	training's auc: 0.998431	valid_1's auc: 0.950496
Fold 4 ROC AUC Score 0.9504963121428377

Fold 5




Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.952064	valid_1's auc: 0.897683
[200]	training's auc: 0.976663	valid_1's auc: 0.913653
[300]	training's auc: 0.986645	valid_1's auc: 0.919317
[400]	training's auc: 0.991359	valid_1's auc: 0.921773
[500]	training's auc: 0.994085	valid_1's auc: 0.922814
[600]	training's auc: 0.995915	valid_1's auc: 0.92382
[700]	training's auc: 0.997115	valid_1's auc: 0.923988
[800]	training's auc: 0.998112	valid_1's auc: 0.924821
[900]	training's auc: 0.998786	valid_1's auc: 0.924902
Early stopping, best iteration is:
[882]	training's auc: 0.998675	valid_1's auc: 0.925337
Fold 5 ROC AUC Score 0.925336847212859

Average ROC AUC Score 0.9345823295602813 [STD:0.009249952452932834]
CPU times: user 4h 15min 45s, sys: 50.5 s, total: 4h 16min 35s
Wall time: 34min 10s


In [13]:
# Mean importance over the per-fold columns only. Selecting 'Fold_*'
# explicitly keeps this cell re-runnable once the frame has gained other
# columns (the 'avg' column itself, or the string-valued 'index' column that
# reset_index() produces), which a blanket row-wise mean would pull in.
importance['avg'] = importance.filter(like='Fold_').mean(axis=1)

In [14]:
# Rank features from highest to lowest average importance.
importance = importance.sort_values('avg', ascending=False)

In [15]:
# Move the feature names out of the index into a regular column
# (drop=False is the default) so they are written out with the table.
importance = importance.reset_index()

In [16]:
# Preview the five highest-ranked features.
importance.head(n=5)

Unnamed: 0,index,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,avg
0,card1,12171,9377,7007,7353,8683,8918.2
1,TransactionID,11609,9044,6608,6961,7552,8354.8
2,TransactionAmt,11495,8869,6180,7030,7031,8121.0
3,card2,9266,7502,5429,5474,6163,6766.8
4,addr1,8497,6253,4836,5380,5647,6122.6


In [17]:
# Persist the ranked importances. Create the target folder first so that a
# missing directory cannot make the write fail after the long training run.
os.makedirs('../importances', exist_ok=True)
importance.to_csv('../importances/importances1.csv', index=False)

In [21]:
# Preview the submission template before it is filled in.
sample_submission.head(n=5)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [22]:
# Replace the placeholder scores with the fold-averaged test predictions.
# NOTE(review): values are assigned by position, so this assumes the rows of
# `sample_submission` are in the same order as the rows of `test` -- confirm.
sample_submission = sample_submission.assign(isFraud=y_pred)

In [23]:
# Preview the submission now that the predicted scores are in place.
sample_submission.head(n=5)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.00124
1,3663550,0.000725
2,3663551,0.001453
3,3663552,0.001341
4,3663553,0.001067


In [26]:
# Write the submission file. Create the target folder first so that a missing
# directory cannot make the write fail and lose the computed predictions.
os.makedirs('../predictions', exist_ok=True)
sample_submission.to_csv('../predictions/pred1.csv', index=False)