In [1]:
import datetime
import gc
import math
import os
import pickle
import random

from functools import partial
from time import time

import lightgbm as lgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold, KFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
INPUT_PATH = '/kaggle/input/ieee-fraud-detection'

COL_ID = 'TransactionID'
COL_DT = 'TransactionDT'
COL_AMOUNT = 'TransactionAmt'
COL_TARGET = 'isFraud'

START_DATE = '2017-11-30'

In [3]:
EMAIL_MAP = {
    'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 
    'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 
    'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 
    'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 
    'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
    'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 
    'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 
    'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 
    'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 
    'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 
    'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
    'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 
    'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 
    'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'
}

In [4]:
def seed_everything(seed=13):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


def save_to_disk(df, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
def load_df(path, filenames):
    assert len(filenames) == 2
    dfs = [pd.read_csv(os.path.join(path, f), index_col=COL_ID) for f in filenames]
    return dfs[0].merge(dfs[1], how='left', left_index=True, right_index=True)


def split_train_val(x, y, val_start, val_end):
    assert val_start < val_end
    val = (x[COL_DT] >= val_start) & (x[COL_DT] < val_end)
    x_train, x_val = x[~val], x[val]
    indices = lambda df: df.index.values.astype(int)
    y_train, y_val = y[indices(x_train)], y[indices(x_val)]
    return x_train, y_train, x_val, y_val


def split_label_inputs(df):
    return df[COL_TARGET], df.drop(COL_TARGET, axis=1)

In [6]:
INT_TYPES = [np.int8, np.int16, np.int32, np.int64]
UINT_TYPES = [np.uint8, np.uint16, np.uint32, np.uint64]
IGNORE_COLUMNS = [
    'TransactionAmt_log'
]


def downcast_int(df, col, min_val, max_val):    
    if min_val < 0:
        for i, dtype in enumerate(INT_TYPES):
            if (i == 3) or (np.iinfo(dtype).min < min_val and max_val < np.iinfo(dtype).max):
                df[col] = df[col].astype(dtype); break
    else:
        for i, dtype in enumerate(UINT_TYPES):
            if (i == 3) or (max_val < np.iinfo(dtype).max):
                df[col] = df[col].astype(dtype); break


def mem_usg_mb(df):
    return df.memory_usage().sum() / 1024**2


# original: https://www.kaggle.com/mjbahmani/reducing-memory-size-for-ieee
def reduce_mem_usage(df):
    init_mem_usg = mem_usg_mb(df)
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in df.columns:
        if col in IGNORE_COLUMNS or df[col].dtype == object:
            continue
        dtype_before = df[col].dtype
        min_val, max_val = df[col].min(), df[col].max()
        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(df[col]).all(): 
            NAlist.append(col)
            df[col].fillna(-32222, inplace=True) # changed here mn-1 -> -32222
        # test if column can be converted to an integer
        # https://stackoverflow.com/questions/21583758/how-to-check-if-a-float-value-is-a-whole-number       
        if (df[col] % 1 == 0).all():
            downcast_int(df, col, min_val, max_val)  
        else:
            df[col] = df[col].astype(np.float32)
        print("{}, range: ({}, {}), dtype: {} -> {}".format(
            col, min_val, max_val, dtype_before, df[col].dtype))
    print("\n *** DONE ***")
    mem_usg = mem_usg_mb(df) 
    print("Memory usage: %.02f -> %.02f MB" % (init_mem_usg, mem_usg))
    print("This is {:.02f}% of the initial size".format(100 * mem_usg / init_mem_usg))
    return df, NAlist

In [7]:
def parse_email_suffix(email):
    if email is np.nan:
        return 0, np.nan, np.nan
    parts = email.split('.') 
    middle = parts[-2] if len(parts) > 2 else parts[-1]
    last = parts[-1]
    return len(parts), middle, last

In [8]:
def feature_cents(df):
    return (np.modf(df[COL_AMOUNT])[0] * 1000).astype(np.uint16)


def feature_days(df):
    return (df[COL_DT] / (24 * 60 * 60)).astype(np.uint16)


def parse_datetime(df):
    start_date = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    return df[COL_DT].apply(lambda x: (start_date + datetime.timedelta(seconds = x)))


def frequency(col, dfs):
    counts = pd.concat(map(lambda df: df[col], dfs), ignore_index=True).value_counts(dropna=False)
    return map(lambda df: df[col].map(counts), dfs)

# Data preparation

In [9]:
%%time
df_train = load_df(INPUT_PATH, ['train_transaction.csv', 'train_identity.csv'])
df_test = load_df(INPUT_PATH, ['test_transaction.csv', 'test_identity.csv'])
df_train.shape, df_test.shape

CPU times: user 59.1 s, sys: 14.5 s, total: 1min 13s
Wall time: 1min 10s


((590540, 433), (506691, 432))

In [10]:
all_dfs = [df_train, df_test]
encode_labels_cols = []

for i, df in enumerate(all_dfs):
    df['Cents'] = feature_cents(df)
    df[COL_AMOUNT+'_log'] = np.log(df[COL_AMOUNT])
    
    dt_ = parse_datetime(df)
    df['Hour'] = dt_.dt.hour
    df['DayOfWeek'] = dt_.dt.weekday
    df['DayOfMonth'] = dt_.dt.day
    df['MonthFraction'] = (dt_.dt.day / dt_.dt.days_in_month * 100).astype(np.uint8)
    df['isDecember'] = dt_.dt.month == 12

In [11]:
dt_m = parse_datetime(df_train).dt.date.astype('str').str[:7]
dt_m.value_counts()

2017-12    137321
2018-03    101632
2018-01     92585
2018-05     89326
2018-02     86021
2018-04     83655
Name: TransactionDT, dtype: int64

In [12]:
interactions = [
    'card1__addr1', 'card1__card5', 'card2__dist1', 'card2__id_20', 'card5__P_emaildomain',
    'card1__card2', 
    'card1__card2__card3__card5', 
    'card1__card2__card3__card5__addr1__addr2',
]
rename = {
    'card1__card2': 'uuid',
    'card1__card2__card3__card5': 'uuid2',
    'card1__card2__card3__card5__addr1__addr2': 'uuid3'
}
encode_freq_cols = ['card1__addr1', 'card1__card5', 'uuid3']
groups = [
    (['uuid2', 'ProductCD'], COL_AMOUNT, 'mean'),
    (['uuid2', 'addr1'], 'dist2', 'mean'),
    ('V307', 'V313', 'mean'),
    ('uuid3', COL_AMOUNT, ['mean', 'std']),
    ('uuid2', COL_AMOUNT, 'mean'),
    ('uuid', COL_AMOUNT, 'mean'),
    ('card1', COL_AMOUNT, 'mean'),
]
drop = ['uuid', 'uuid2', 'uuid3']

for inter in interactions:
    cols = inter.split('__')
    col = rename[inter] if inter in rename else inter
    for df in all_dfs:
        df[col] = df[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)    
    if col in encode_freq_cols:
        for df, freq in zip(all_dfs, frequency(col, all_dfs)):
            df[col+'_count'] = freq
    if col not in drop:
        encode_labels_cols.append(col)
    gc.collect()

for df in all_dfs:
    for group, target, aggs in groups:
        if not isinstance(group, list):
            group = [group]
        if not isinstance(aggs, list):
            aggs = [aggs]
        for agg in aggs:
            col = f'{target}_{agg}_by_{"_".join(group)}'
            df[col] = df.groupby(group)[target].transform(agg)
    for col in drop:
        df.drop([col], axis=1, inplace=True)
gc.collect()

0

In [13]:
encode_freq_cols = [
    'card1', 'card2', 'card5', 
    'addr1', 'addr2', 'dist1',
    'C1', 'C2', 'C5', 'C6', 'C7', 'C9', 'C11', 'C13', 'C14', 
    'P_emaildomain', 'DeviceInfo'
]
    
for col in encode_freq_cols:
    for df, freq in zip(all_dfs, frequency(col, all_dfs)):
        df[col+'_count'] = freq 

In [14]:
na_cols = [
    'D2', 'D3', 'D5', 'D6', 'D7', 'D13', 'D14',
    'M4', 'M5', 'M6', 'M7', 'M8',
]

for df in all_dfs:
    for col in na_cols:
        df[col+'_na'] = df[col].isna() 

In [15]:
for df in all_dfs:
    for col in ['P_emaildomain', 'R_emaildomain']:
        temp = df[col].apply(lambda x: parse_email_suffix(x))
        df[col+'_parts'] = temp.map(lambda x: x[0])
        df[col+'_sfx2']  = temp.map(lambda x: x[1])
        df[col+'_sfx1']  = temp.map(lambda x: x[2])
        
        #https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
        df[col+'_bin'] = df[col].map(EMAIL_MAP)
        
        encode_labels_cols.extend([
            col+'_sfx2', 
            col+'_sfx1', 
            col+'_bin'
        ])

In [16]:
def str_contains(df, col, val):
    return df[col].str.contains(val, case=False).fillna(False)


for df in all_dfs:
    col = 'Browser'
    df[col] = -1
    search = partial(str_contains, df, 'id_31')
    for i, browser in enumerate(['chrome', 'safari', 'ie ', 'firefox', 'edge', 'samsung', 'opera']):
        df.loc[search(browser), [col]] = i
    df.loc[(df['id_31'] == 'ie'), [col]] = 2
    
for df in all_dfs:
    col = 'OS'
    df[col] = -1
    search = partial(str_contains, df, 'id_30')
    for i, os_name in enumerate(['windows', 'ios', 'mac', 'android', 'linux']):
        df.loc[search(os_name), [col]] = i

In [17]:
%%time
df_train, na_train = reduce_mem_usage(df_train)

isFraud, range: (0, 1), dtype: int64 -> uint8
TransactionDT, range: (86400, 15811131), dtype: int64 -> uint32
TransactionAmt, range: (0.251, 31937.391), dtype: float64 -> float32
card1, range: (1000, 18396), dtype: int64 -> uint16
card2, range: (100.0, 600.0), dtype: float64 -> uint16
card3, range: (100.0, 231.0), dtype: float64 -> uint8
card5, range: (100.0, 237.0), dtype: float64 -> uint8
addr1, range: (100.0, 540.0), dtype: float64 -> uint16
addr2, range: (10.0, 102.0), dtype: float64 -> uint8
dist1, range: (0.0, 10286.0), dtype: float64 -> uint16
dist2, range: (0.0, 11623.0), dtype: float64 -> uint16
C1, range: (0.0, 4685.0), dtype: float64 -> uint16
C2, range: (0.0, 5691.0), dtype: float64 -> uint16
C3, range: (0.0, 26.0), dtype: float64 -> uint8
C4, range: (0.0, 2253.0), dtype: float64 -> uint16
C5, range: (0.0, 349.0), dtype: float64 -> uint16
C6, range: (0.0, 2253.0), dtype: float64 -> uint16
C7, range: (0.0, 2255.0), dtype: float64 -> uint16
C8, range: (0.0, 3331.0), dtype: fl

In [18]:
%%time
df_test, na_test = reduce_mem_usage(df_test)

TransactionDT, range: (18403224, 34214345), dtype: int64 -> uint32
TransactionAmt, range: (0.018000000000000002, 10270.0), dtype: float64 -> float32
card1, range: (1001, 18397), dtype: int64 -> uint16
card2, range: (100.0, 600.0), dtype: float64 -> uint16
card3, range: (100.0, 232.0), dtype: float64 -> uint8
card5, range: (100.0, 237.0), dtype: float64 -> uint8
addr1, range: (100.0, 540.0), dtype: float64 -> uint16
addr2, range: (10.0, 102.0), dtype: float64 -> uint8
dist1, range: (0.0, 8081.0), dtype: float64 -> uint16
dist2, range: (0.0, 9213.0), dtype: float64 -> uint16
C1, range: (0.0, 2950.0), dtype: float64 -> uint16
C2, range: (0.0, 3275.0), dtype: float64 -> uint16
C3, range: (0.0, 31.0), dtype: float64 -> uint8
C4, range: (0.0, 1601.0), dtype: float64 -> uint16
C5, range: (0.0, 376.0), dtype: float64 -> uint16
C6, range: (0.0, 1601.0), dtype: float64 -> uint16
C7, range: (0.0, 1621.0), dtype: float64 -> uint16
C8, range: (0.0, 1005.0), dtype: float64 -> uint16
C9, range: (0.0,

In [19]:
y_train, df_train = split_label_inputs(df_train)
df_train.shape, y_train.shape

((590540, 494), (590540,))

In [20]:
all_dfs = [df_train, df_test]

for col in ['card4', 'card6', 'ProductCD', 'M4']:
    print('Encoding', col)
    temp_df = pd.concat([df[col] for df in all_dfs])
    col_encoded = temp_df.value_counts().to_dict()
    del temp_df
    for df in all_dfs:
        df[col] = df[col].map(col_encoded)
    print(col_encoded)

Encoding card4
{'visa': 719649, 'mastercard': 347386, 'american express': 16009, 'discover': 9524}
Encoding card6
{'debit': 824959, 'credit': 267648, 'debit or credit': 30, 'charge card': 16}
Encoding ProductCD
{'W': 800657, 'C': 137785, 'R': 73346, 'H': 62397, 'S': 23046}
Encoding M4
{'M0': 357789, 'M2': 122947, 'M1': 97306}


In [21]:
for col in ['M1','M2','M3','M5','M6','M7','M8','M9']:
    for df in all_dfs:
        df[col] = df[col].map({ 'T': 1, 'F': 0 })

In [22]:
def minify_identity_df(df):
    df['id_23'] = df['id_23'].map({
        'TRANSPARENT': 4, 
        'IP_PROXY': 3, 
        'IP_PROXY:ANONYMOUS': 2, 
        'IP_PROXY:HIDDEN': 1
    })
    
    for col in ['id_16']:
        df[col+'_na'] = df[col].isna()
    
    for col in ['id_12', 'id_15', 'id_16', 'id_27', 'id_28', 'id_29']:
        df[col] = df[col].map({
            'New': 2, 
            'Found': 1, 
            'NotFound': 0, 
            'Unknown': -1
        })

    for col in ['id_35', 'id_36', 'id_37', 'id_38']:
        df[col] = df[col].map({ 'T': 1, 'F': 0 })

    df['id_34'] = df['id_34'].fillna(':-2')
    df['id_34'] = df['id_34'].apply(lambda x: x.split(':')[1]).astype(np.int8)
    df['id_34'] = np.where(df['id_34'] == -2, np.nan, df['id_34'])
    
    df['id_33'] = df['id_33'].fillna('0x0')
    df['id_33_0'] = df['id_33'].apply(lambda x: x.split('x')[0]).astype(int)
    df['id_33_1'] = df['id_33'].apply(lambda x: x.split('x')[1]).astype(int)
    df['id_33'] = np.where(df['id_33'] == '0x0', np.nan, df['id_33'])

    df['DeviceType'] = df['DeviceType'].map({ 'desktop': 1, 'mobile': 0 })
    return df

In [23]:
import itertools


for df in all_dfs:
    minify_identity_df(df)

for col in ['id_33', 'P_emaildomain', 'R_emaildomain', 'id_30', 'id_31', 'DeviceInfo'] + encode_labels_cols:
    for df in all_dfs:
        df[col] = df[col].fillna('unknown_value')
    
    le = LabelEncoder()
    values = [list(df[col]) for df in all_dfs]
    values = list(itertools.chain(*values))
    le.fit(values)
    
    for df in all_dfs:
        df[col] = le.transform(df[col])

In [24]:
drop = [COL_DT]

for df in all_dfs:  
    for col in df.columns:
        if col in drop:
            df.drop(col, axis=1, inplace=True)

In [25]:
df_train, na_train = reduce_mem_usage(df_train)
for df, filename in zip([df_train, na_train, y_train], 
                        ['df_train.pkl', 'na_train.pkl', 'y_train.pkl']):
    save_to_disk(df, filename)

TransactionAmt, range: (0.25099998712539673, 31937.390625), dtype: float32 -> float32
ProductCD, range: (23046, 800657), dtype: int64 -> uint32
card1, range: (1000, 18396), dtype: uint16 -> uint16
card2, range: (100, 33314), dtype: uint16 -> uint16
card3, range: (34, 231), dtype: uint8 -> uint8
card4, range: (9524.0, 719649.0), dtype: float64 -> uint32
card5, range: (34, 237), dtype: uint8 -> uint8
card6, range: (16.0, 824959.0), dtype: float64 -> uint32
addr1, range: (100, 33314), dtype: uint16 -> uint16
addr2, range: (10, 102), dtype: uint8 -> uint8
dist1, range: (0, 33314), dtype: uint16 -> uint16
dist2, range: (0, 33314), dtype: uint16 -> uint16
P_emaildomain, range: (0, 60), dtype: int64 -> uint8
R_emaildomain, range: (0, 60), dtype: int64 -> uint8
C1, range: (0, 4685), dtype: uint16 -> uint16
C2, range: (0, 5691), dtype: uint16 -> uint16
C3, range: (0, 26), dtype: uint8 -> uint8
C4, range: (0, 2253), dtype: uint16 -> uint16
C5, range: (0, 349), dtype: uint16 -> uint16
C6, range: 

In [26]:
df_test, na_test = reduce_mem_usage(df_test)
for df, filename in zip([df_test, na_test], ['df_test.pkl', 'na_test.pkl']):
    save_to_disk(df, filename)

TransactionAmt, range: (0.017999999225139618, 10270.0), dtype: float32 -> float32
ProductCD, range: (23046, 800657), dtype: int64 -> uint32
card1, range: (1001, 18397), dtype: uint16 -> uint16
card2, range: (100, 33314), dtype: uint16 -> uint16
card3, range: (34, 232), dtype: uint8 -> uint8
card4, range: (9524.0, 719649.0), dtype: float64 -> uint32
card5, range: (34, 237), dtype: uint8 -> uint8
card6, range: (16.0, 824959.0), dtype: float64 -> uint32
addr1, range: (100, 33314), dtype: uint16 -> uint16
addr2, range: (10, 102), dtype: uint8 -> uint8
dist1, range: (0, 33314), dtype: uint16 -> uint16
dist2, range: (0, 33314), dtype: uint16 -> uint16
P_emaildomain, range: (0, 60), dtype: int64 -> uint8
R_emaildomain, range: (0, 60), dtype: int64 -> uint8
C1, range: (0, 33314), dtype: uint16 -> uint16
C2, range: (0, 33314), dtype: uint16 -> uint16
C3, range: (0, 34), dtype: uint8 -> uint8
C4, range: (0, 33314), dtype: uint16 -> uint16
C5, range: (0, 33314), dtype: uint16 -> uint16
C6, range:

# Modelling

In [27]:
params = {
    'num_leaves': 512,
    'min_child_weight': 0.035,
    'feature_fraction': 0.25,
    'bagging_fraction': 0.22,
    'min_data_in_leaf': 100,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.006,
    "boosting_type": "gbdt",
    "bagging_seed": 57,
    "metric": 'auc',
    "verbosity": -1,
    'reg_alpha': 0.33,
    'reg_lambda': 0.39,
    'random_state': 42,
}

In [28]:
seeds = [11, 19, 23, 27, 31, 37]
scores = []
test_preds = []

training_start_time = time()

for fold, month in enumerate(np.unique(dt_m)):
    seed = seeds[fold]
    seed_everything(seed)
    params['seed'] = seed
    
    start_time = time()
    print('Training on fold {}'.format(fold))
    
    is_val = dt_m.isin([month])
    x_train, y_train0 = df_train[~is_val], y_train[~is_val]
    x_val, y_val = df_train[is_val], y_train[is_val]
    print('Train: {}, Validation: {}'.format(len(x_train), len(x_val)))
    
    train_data = lgb.Dataset(x_train, label=y_train0)
    valid_data = lgb.Dataset(x_val, label=y_val)
    
    model = lgb.train(params, train_data, 
                      num_boost_round=3_000, 
                      valid_sets=valid_data, 
                      verbose_eval=100, 
                      early_stopping_rounds=100)
    
    print('Best iteration: ', model.best_iteration)
    
    y_pred_val = model.predict(x_val)
    auc = roc_auc_score(y_val, y_pred_val)
    scores.append(auc)
    print('OOF score for {} fold: {}'.format(fold, auc))
    save_to_disk(y_pred_val, 'y_pred_valid_fold{}.pkl'.format(fold))
    
    y_pred_test = model.predict(df_test)
    test_preds.append(y_pred_test)
    save_to_disk(y_pred_test, 'y_pred_test_fold{}.pkl'.format(fold))
    
    end_time = str(datetime.timedelta(seconds=time() - start_time))
    print('Fold {} finished in {}'.format(fold, end_time))
    
print('\nDONE')
print('Total time: {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))

Training on fold 0
Train: 453219, Validation: 137321
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.884033
[200]	valid_0's auc: 0.889811
[300]	valid_0's auc: 0.896413
[400]	valid_0's auc: 0.901741
[500]	valid_0's auc: 0.905965
[600]	valid_0's auc: 0.909074
[700]	valid_0's auc: 0.911325
[800]	valid_0's auc: 0.912805
[900]	valid_0's auc: 0.913974
[1000]	valid_0's auc: 0.915014
[1100]	valid_0's auc: 0.916052
[1200]	valid_0's auc: 0.916756
[1300]	valid_0's auc: 0.917191
[1400]	valid_0's auc: 0.917649
[1500]	valid_0's auc: 0.917991
[1600]	valid_0's auc: 0.918338
[1700]	valid_0's auc: 0.918409
[1800]	valid_0's auc: 0.918451
[1900]	valid_0's auc: 0.918606
Early stopping, best iteration is:
[1888]	valid_0's auc: 0.918635
Best iteration:  1888
OOF score for 0 fold: 0.918634742724612
Fold 0 finished in 0:23:38.241702
Training on fold 1
Train: 497955, Validation: 92585
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.9073

In [29]:
print('OOF score: avg %f, std %f, min %f' % (np.mean(scores), np.std(scores), np.min(scores)))

OOF score: avg 0.943304, std 0.012141, min 0.918635


# Submission

In [30]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
sub['isFraud'] = np.average(test_preds, axis=0)
sub.to_csv('submission.csv', index=False)