In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm_notebook
np.set_printoptions(suppress=True)
from sklearn import preprocessing
from tqdm import tqdm
import decimal
from itertools import combinations
from random import shuffle
from time import sleep
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
import gc
import datetime
from sklearn.model_selection import train_test_split, KFold, GroupKFold
import os
from sklearn.metrics import roc_auc_score
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
import random
import lightgbm as lgb

In [2]:
def seed_everything(seed=51):
    """Seed the random number generators this notebook relies on.

    Exports ``PYTHONHASHSEED`` to the process environment and seeds both
    Python's ``random`` module and NumPy's global generator with the same
    value.

    Args:
        seed: Integer seed applied everywhere. Defaults to 51.

    NOTE(review): ``PYTHONHASHSEED`` is presumably only honoured at
    interpreter start-up, so setting it here would affect child processes
    rather than the running kernel - confirm if hash ordering matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [3]:
# Fix every seeded generator to 51 before any data is loaded or any model is trained.
seed_everything(51)

In [4]:
# Let pandas render up to 2000 rows and 2000 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 2000)

In [5]:
def _read_dtype_series(csv_path):
    """Read a dtype-description CSV and return its first data column as a Series.

    The file's ``TransactionID`` column becomes the index; the first of the
    remaining columns supplies the values.
    """
    return pd.read_csv(csv_path).set_index('TransactionID').iloc[:, 0]


# Load both dtype descriptions as Series and report their types ...
train_dtypes = _read_dtype_series('../input/train9_dtypes.csv')
test_dtypes = _read_dtype_series('../input/test9_dtypes.csv')
print(type(train_dtypes), type(test_dtypes))

# ... then turn them into plain {index label: value} dicts, which the loading
# cells pass to pd.read_csv as `dtype=`.
train_dtypes, test_dtypes = train_dtypes.to_dict(), test_dtypes.to_dict()
gc.collect()

<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


21

In [6]:
%%time
# Read the training table, forcing column dtypes from the `train_dtypes` mapping.
train_csv_path = '../input/train9.csv'
train = pd.read_csv(train_csv_path, dtype=train_dtypes)
print("train finished")

train finished
CPU times: user 40.4 s, sys: 1.42 s, total: 41.8 s
Wall time: 42.7 s


In [7]:
%%time
# Read the test table, forcing column dtypes from the `test_dtypes` mapping.
test_csv_path = '../input/test9.csv'
test = pd.read_csv(test_csv_path, dtype=test_dtypes)
print("test finished")

test finished
CPU times: user 35.7 s, sys: 1 s, total: 36.7 s
Wall time: 37.4 s


In [8]:
# Load the sample submission file, then report the shape of every loaded frame.
sample_submission = pd.read_csv('../input/sample_submission.csv')
print(*(frame.shape for frame in (train, test, sample_submission)))

(590540, 556) (506691, 555) (506691, 2)


In [13]:
# Columns requested for removal from both train and test.
# The V-columns are listed by numeric id (original order preserved) and
# expanded to their 'V<id>' names; three non-V columns are appended at the end.
_dropped_v_ids = (
    300, 309, 111, 124, 106, 125, 315, 134, 102, 123, 316, 113, 136, 305, 110, 299,
    289, 286, 318, 304, 116, 284, 293,
    137, 295, 301, 104, 311, 115, 109, 119, 321, 114, 133, 122, 319,
    105, 112, 118, 117, 121, 108, 135, 320, 303, 297, 120,
    1, 14, 41, 65, 88, 89, 107, 68, 28, 27, 29, 241, 269,
    240, 325, 138, 154, 153, 330, 142, 195, 302, 328, 327,
    198, 196, 155,
)
dropped_columns = ['V{}'.format(v_id) for v_id in _dropped_v_ids] + ['C13', 'C14', 'D3']

In [14]:
# Narrow the request to columns that train actually contains, then remove
# those columns from both frames in place.
dropped_columns = [col for col in dropped_columns if col in train.columns]
for frame in (train, test):
    frame.drop(columns=dropped_columns, inplace=True)
gc.collect()

1613

In [16]:
# Show the filtered list, i.e. the requested columns that were present in train.
print(dropped_columns)

['V309', 'V315', 'V289', 'V304', 'V295', 'V303', 'V1', 'V14', 'V41', 'V65', 'V88', 'V89', 'V68', 'V28', 'V27', 'V29', 'V241', 'V269', 'V240', 'V325', 'V138', 'V154', 'V153', 'V330', 'V142', 'V195', 'V302', 'V328', 'V327', 'V198', 'V196', 'V155']


In [17]:
# Report the (rows, columns) shape of train and test after the column drop.
print(*(frame.shape for frame in (train, test)))

(590540, 524) (506691, 523)


# Training

In [18]:
# Remove the TransactionID column from both frames in place before training.
for frame in (train, test):
    frame.drop(columns=['TransactionID'], inplace=True)
gc.collect()

14

In [19]:
# Detach the target: pop() hands back the 'isFraud' column and removes it
# from train, leaving train with feature columns only.
y = train.pop('isFraud')

In [20]:
# ---- LightGBM parameters passed to lgb.train for every fold ----
# NOTE(review): LightGBM's documentation says bagging only takes effect when
# `bagging_freq` is non-zero; it is not set here, so `bagging_fraction` and
# `bagging_seed` are presumably inactive - confirm this is intended.
lgb_params = dict(
    num_leaves=546,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.1797454081646243,
    bagging_fraction=0.2181193142567742,
    min_data_in_leaf=106,
    objective='binary',
    max_depth=-1,
    learning_rate=0.01,
    boosting_type='gbdt',
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=0.3299927210061127,
    reg_lambda=0.3885237330340494,
    random_state=42,
)

In [22]:
# Remove C13, C14 and D3 from both frames in place.
#
# These three names are also listed in `dropped_columns`, so on a fresh
# Restart & Run All the earlier drop cell has already removed them and a plain
# drop() here would raise KeyError. errors='ignore' makes the cell idempotent:
# columns still present are dropped, columns already gone are skipped.
extra_dropped_columns = ['C13', 'C14', 'D3']
train.drop(extra_dropped_columns, axis=1, inplace=True, errors='ignore')
test.drop(extra_dropped_columns, axis=1, inplace=True, errors='ignore')

In [23]:
# N-fold cross-validated LightGBM training.
# KFold is built with n_splits only, so its default splitting behaviour applies.
N = 5
kf = KFold(n_splits=N)

# Bookkeeping filled in fold by fold:
#   importance - one column of feature importances per fold (rows = train columns)
#   scores     - validation ROC AUC of each fold
#   y_pred     - test predictions accumulated as the mean over the N fold models
#   oof        - out-of-fold prediction for every training row
fold_names = ['Fold_{}'.format(k) for k in range(1, N + 1)]
importance = pd.DataFrame(
    np.zeros((train.shape[1], N)),
    columns=fold_names,
    index=train.columns,
)
scores = []
y_pred = np.zeros(test.shape[0])
oof = np.zeros(train.shape[0])

for fold, (fit_rows, holdout_rows) in enumerate(kf.split(train, y), 1):
    print('Fold {}'.format(fold))

    holdout_labels = y.iloc[holdout_rows].values
    fit_set = lgb.Dataset(train.iloc[fit_rows, :].values, label=y.iloc[fit_rows].values)
    holdout_set = lgb.Dataset(train.iloc[holdout_rows, :].values, label=holdout_labels)

    # At most 2000 boosting rounds, logging every 200 rounds, with early
    # stopping after 500 rounds without improvement.
    clf = lgb.train(
        lgb_params,
        fit_set,
        num_boost_round=2000,
        valid_sets=[fit_set, holdout_set],
        verbose_eval=200,
        early_stopping_rounds=500,
    )

    holdout_pred = clf.predict(train.iloc[holdout_rows, :].values)
    importance.iloc[:, fold - 1] = clf.feature_importance()
    oof[holdout_rows] = holdout_pred

    score = roc_auc_score(holdout_labels, holdout_pred)
    scores.append(score)
    print('Fold {} ROC AUC Score {}\n'.format(fold, score))

    # Each fold model contributes 1/N of the final test prediction.
    y_pred += clf.predict(test) / N

    # Release the per-fold datasets and predictions before the next fold.
    del fit_set, holdout_set, holdout_pred
    gc.collect()

print('Average ROC AUC Score {} [STD:{}]'.format(np.mean(scores), np.std(scores)))

Fold 1
Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.972335	valid_1's auc: 0.900045
[400]	training's auc: 0.990936	valid_1's auc: 0.915043
[600]	training's auc: 0.997326	valid_1's auc: 0.922471
[800]	training's auc: 0.999288	valid_1's auc: 0.926261
[1000]	training's auc: 0.999826	valid_1's auc: 0.928545
[1200]	training's auc: 0.999964	valid_1's auc: 0.929522
[1400]	training's auc: 0.999994	valid_1's auc: 0.930168
[1600]	training's auc: 0.999999	valid_1's auc: 0.930654
[1800]	training's auc: 1	valid_1's auc: 0.930858
[2000]	training's auc: 1	valid_1's auc: 0.930972
Did not meet early stopping. Best iteration is:
[2000]	training's auc: 1	valid_1's auc: 0.930972
Fold 1 ROC AUC Score 0.9309723294855793

Fold 2
Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.97238	valid_1's auc: 0.926924
[400]	training's auc: 0.991332	valid_1's auc: 0.939133
[600]	training's auc: 0.997686	valid_1's auc: 0.943867
[800]	training's

In [24]:
# ROC AUC of the out-of-fold predictions over all training rows at once.
oof_auc = roc_auc_score(y, oof)
print(oof_auc)

0.9416269280353917


In [25]:
# Add an 'avg' column holding each feature's row-wise mean importance.
importance = importance.assign(avg=importance.mean(axis=1))

In [26]:
# Order features from highest to lowest average importance.
importance = importance.sort_values('avg', ascending=False)

In [27]:
# Move the feature names out of the index into a regular column
# (reset_index keeps the old index as a column by default).
importance = importance.reset_index()

In [28]:
# Display the importance table, already sorted by 'avg' in descending order.
# display.max_rows was raised to 2000 earlier, so frames up to that size render in full.
importance

Unnamed: 0,index,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,avg
0,uid3_fq_enc,20990,13729,22790,15196,23663,19273.6
1,DT_D,22727,13015,22703,14652,22847,19188.8
2,uid_DT_W,22403,13694,21310,15231,21908,18909.2
3,TransactionAmt,19809,12839,21475,14501,21237,17972.2
4,uid_DT_D,20753,12155,21217,13710,21601,17887.2
5,DT_D_total,20663,12772,20606,14293,20478,17762.4
6,uid3_TransactionAmt_mean,21236,14224,18704,14910,19325,17679.8
7,uid_DT_M,17730,11188,20151,12316,20249,16326.8
8,uid,17774,11732,17271,12740,17705,15444.4
9,card1_addr1,17474,11204,17674,12350,17819,15304.2


In [29]:
# Persist the per-fold and average feature importances (row index not written).
importance_csv_path = '../importances/importances25.csv'
importance.to_csv(importance_csv_path, index=False)

In [30]:
# Preview the submission template before its isFraud column is overwritten.
sample_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [31]:
# Replace the isFraud column with the fold-averaged test predictions.
# NOTE(review): this assumes the rows of `test` are in the same order as
# `sample_submission`; TransactionID was dropped from `test`, so it cannot be
# checked at this point - confirm upstream.
sample_submission = sample_submission.assign(isFraud=y_pred)

In [32]:
# Preview the submission again to check that isFraud now holds model predictions.
sample_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000229
1,3663550,0.0004
2,3663551,0.000457
3,3663552,0.000409
4,3663553,0.000331


In [33]:
# Write the submission file (row index not written).
submission_csv_path = '../predictions/pred25.csv'
sample_submission.to_csv(submission_csv_path, index=False)

In [34]:
# Shell escape: submit ../predictions/pred25.csv to the ieee-fraud-detection
# competition through the Kaggle CLI, using "pred25" as the submission message.
!kaggle competitions submit -c ieee-fraud-detection -f ../predictions/pred25.csv -m "pred25"

100%|███████████████████████████████████████| 14.3M/14.3M [00:34<00:00, 442kB/s]
Successfully submitted to IEEE-CIS Fraud Detection