In [1]:
import datetime
import math
import os
import pickle
import random

from functools import partial
from time import time

import lightgbm as lgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold, KFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from catboost import CatBoostClassifier, Pool, cv

In [2]:
INPUT_PATH = '/kaggle/input/fraud-just-data'

COL_ID = 'TransactionID'
COL_DT = 'TransactionDT'
COL_AMOUNT = 'TransactionAmt'
COL_TARGET = 'isFraud'

START_DATE = '2017-11-30'

In [3]:
def seed_everything(seed=13):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


def save_to_disk(df, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
        
def read_from_disk(path, filename):
    with open(os.path.join(path, filename), 'rb') as handle:
        return pickle.load(handle)

In [4]:
def parse_datetime(df):
    start_date = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    return df[COL_DT].apply(lambda x: (start_date + datetime.timedelta(seconds = x)))

In [5]:
df_train, y_train, df_test, dt_m = map(partial(read_from_disk, INPUT_PATH), 
                                 ['df_train.pkl', 'y_train.pkl', 'df_test.pkl', 'dt_m.pkl'])

df_train.shape, y_train.shape, dt_m.shape, df_test.shape

((590540, 986), (590540,), (590540,), (506691, 986))

In [6]:
dt_m.unique()

array(['2017-12', '2018-01', '2018-02', '2018-03', '2018-04', '2018-05'],
      dtype=object)

In [7]:
new_categorical_cols = [
    'ProductCD__Cents_magic', 'DayOfWeek', 'card1__addr1', 'card1__card5', 'card2__dist1', 
    'card5__P_emaildomain', 'uuid2', 'uuid3', 'card2__id_19', 'card2__id_20', 'DeviceInfo__id_30', 
    'D2_na', 'D3_na', 'D5_na', 'D6_na', 'D7_na', 'D13_na', 'D14_na', 'M4_na', 'M5_na', 'id_16_na', 
    'D2_na', 'D3_na', 'D5_na', 'D6_na', 'D7_na', 'D13_na', 'D14_na', 'M4_na', 'M5_na', 'id_16_na', 
    'P_emaildomain_sfx2', 'P_emaildomain_sfx1', 'P_emaildomain_bin', 'R_emaildomain_sfx2', 
    'R_emaildomain_sfx1', 'R_emaildomain_bin', 'Browser', 'OS', 'C1_eq_1', 'C10_eq_0', 'C11_eq_1', 
    'C11_count_eq_731792', 'C12_eq_0', 'C13_eq_1', 'C13_count_eq_367617', 'C14_eq_1', 
    'C14_count_eq_598520', 'C1_count_eq_586742', 'C2_eq_1', 'C2_count_eq_581696', 'C3_eq_0', 'C4_eq_0', 
    'C4_eq_1', 'C5_eq_0', 'C5_eq_1', 'C5_count_eq_707008', 'C5_count_eq_219471', 'C6_eq_1', 
    'C6_count_eq_631237', 'C7_eq_0', 'C7_count_eq_961237', 'C8_eq_0', 'C9_eq_1', 'C9_eq_0', 
    'C9_count_eq_420354', 'C9_count_eq_341561', 'Cents_eq_0', 'Cents_eq_950', 'D1_eq_0', 'D10_eq_0', 
    'D12_eq_33314', 'D13_eq_33314', 'D14_eq_33314', 'D15_eq_0', 'D2_eq_33314', 'D3_eq_33314', 
    'D4_eq_0', 'D5_eq_33314', 'D6_eq_33314', 'D7_eq_33314', 'DeviceInfo__id_30_count_eq_859447', 
    'DeviceInfo_count_eq_863508', 'P_emaildomain_count_eq_435803', 'P_emaildomain_parts_eq_2', 
    'R_emaildomain_parts_eq_0', 'R_emaildomain_parts_eq_2', 'V1_eq_1', 'V1_eq_34', 'V10_eq_34', 
    'V10_eq_0', 'V10_eq_1', 'V100_eq_0', 'V101_eq_0', 'V102_eq_0', 'V103_eq_0', 'V104_eq_0', 
    'V105_eq_0', 'V106_eq_0', 'V107_eq_1', 'V108_eq_1', 'V109_eq_1', 'V11_eq_34', 'V11_eq_0', 
    'V11_eq_1', 'V110_eq_1', 'V111_eq_1', 'V112_eq_1', 'V113_eq_1', 'V114_eq_1', 'V115_eq_1', 
    'V116_eq_1', 'V117_eq_1', 'V118_eq_1', 'V119_eq_1', 'V12_eq_1', 'V12_eq_0', 'V120_eq_1', 
    'V121_eq_1', 'V122_eq_1', 'V123_eq_1', 'V124_eq_1', 'V125_eq_1', 'V126_eq_0.0', 'V127_eq_0.0', 
    'V128_eq_0.0', 'V129_eq_0.0', 'V13_eq_1', 'V13_eq_0', 'V130_eq_0.0', 'V131_eq_0.0', 'V132_eq_0.0', 
    'V133_eq_0.0', 'V134_eq_0.0', 'V135_eq_0.0', 'V136_eq_0.0', 'V137_eq_0.0', 'V139_eq_34', 'V14_eq_1', 
    'V140_eq_34', 'V141_eq_34', 'V142_eq_34', 'V143_eq_33314', 'V143_eq_34', 'V144_eq_34', 'V145_eq_33314', 
    'V146_eq_34', 'V148_eq_34', 'V149_eq_34', 'V150_eq_33314', 'V151_eq_34', 'V152_eq_34', 'V154_eq_34', 
    'V155_eq_34', 'V156_eq_34', 'V158_eq_34', 'V16_eq_0', 'V167_eq_33314', 'V167_eq_34', 'V168_eq_33314', 
    'V169_eq_34', 'V169_eq_0', 'V17_eq_0', 'V170_eq_34', 'V171_eq_34', 'V172_eq_34', 'V172_eq_0', 
    'V173_eq_34', 'V173_eq_0', 'V174_eq_34', 'V174_eq_0', 'V175_eq_34', 'V175_eq_0', 'V176_eq_34', 
    'V176_eq_1', 'V177_eq_33314', 'V177_eq_34', 'V177_eq_0', 'V178_eq_33314', 'V178_eq_0', 'V179_eq_33314', 
    'V179_eq_0', 'V18_eq_0', 'V180_eq_34', 'V180_eq_0', 'V181_eq_34', 'V181_eq_0', 'V182_eq_34', 'V182_eq_0', 
    'V183_eq_34', 'V183_eq_0', 'V184_eq_34', 'V184_eq_0', 'V186_eq_34', 'V186_eq_1', 'V187_eq_34', 
    'V187_eq_1', 'V188_eq_34', 'V188_eq_1', 'V189_eq_34', 'V189_eq_1', 'V19_eq_1', 'V190_eq_34', 'V190_eq_1', 
    'V191_eq_34', 'V191_eq_1', 'V192_eq_34', 'V192_eq_1', 'V193_eq_34', 'V193_eq_1', 'V194_eq_34', 
    'V194_eq_1', 'V195_eq_34', 'V195_eq_1', 'V196_eq_34', 'V196_eq_1', 'V197_eq_34', 'V197_eq_1', 
    'V198_eq_34', 'V198_eq_1', 'V199_eq_34', 'V199_eq_1', 'V2_eq_1', 'V2_eq_34', 'V20_eq_1', 'V200_eq_34', 
    'V200_eq_1', 'V201_eq_34', 'V201_eq_1', 'V205_eq_0.0', 'V206_eq_0.0', 'V207_eq_0.0', 'V208_eq_0.0', 
    'V209_eq_0.0', 'V21_eq_0', 'V210_eq_0.0', 'V211_eq_0.0', 'V212_eq_0.0', 'V213_eq_0.0', 'V214_eq_0.0', 
    'V215_eq_0.0', 'V216_eq_0.0', 'V217_eq_33314', 'V217_eq_34', 'V218_eq_33314', 'V219_eq_33314', 
    'V220_eq_34', 'V220_eq_0', 'V221_eq_33314', 'V221_eq_34', 'V222_eq_33314', 'V223_eq_34', 'V223_eq_0', 
    'V224_eq_34', 'V224_eq_0', 'V225_eq_34', 'V225_eq_0', 'V226_eq_34', 'V226_eq_33314', 'V226_eq_0', 
    'V227_eq_33314', 'V227_eq_34', 'V227_eq_0', 'V228_eq_34', 'V229_eq_34', 'V229_eq_33314', 'V23_eq_1', 
    'V230_eq_34', 'V230_eq_33314', 'V231_eq_33314', 'V231_eq_34', 'V231_eq_0', 'V232_eq_33314', 
    'V233_eq_33314', 'V233_eq_0', 'V234_eq_34', 'V234_eq_33314', 'V234_eq_0', 'V235_eq_34', 'V235_eq_0', 
    'V236_eq_34', 'V236_eq_0', 'V237_eq_34', 'V237_eq_0', 'V238_eq_34', 'V238_eq_0', 'V239_eq_34', 
    'V239_eq_0', 'V24_eq_1', 'V240_eq_34', 'V240_eq_1', 'V241_eq_34', 'V241_eq_1', 'V242_eq_34', 'V242_eq_1', 
    'V243_eq_34', 'V243_eq_1', 'V244_eq_34', 'V244_eq_1', 'V245_eq_33314', 'V245_eq_34', 'V246_eq_34', 
    'V246_eq_1', 'V247_eq_34', 'V247_eq_1', 'V248_eq_34', 'V248_eq_1', 'V249_eq_34', 'V249_eq_1', 'V25_eq_1', 
    'V250_eq_34', 'V251_eq_34', 'V253_eq_34', 'V253_eq_1', 'V254_eq_34', 'V254_eq_1', 'V255_eq_34', 
    'V256_eq_34', 'V257_eq_34', 'V257_eq_1', 'V258_eq_34', 'V258_eq_33314', 'V258_eq_1', 'V259_eq_33314', 
    'V259_eq_34', 'V26_eq_1', 'V260_eq_34', 'V260_eq_1', 'V261_eq_34', 'V262_eq_34', 'V262_eq_1', 
    'V266_eq_0.0', 'V267_eq_0.0', 'V268_eq_0.0', 'V269_eq_0.0', 'V270_eq_0.0', 'V271_eq_0.0', 'V272_eq_0.0', 
    'V273_eq_0.0', 'V275_eq_0.0', 'V276_eq_0.0', 'V277_eq_0.0', 'V278_eq_0.0', 'V279_eq_0', 'V28_eq_0', 
    'V280_eq_0', 'V281_eq_0', 'V282_eq_1', 'V282_eq_0', 'V283_eq_1', 'V283_eq_0', 'V283_count_eq_513989', 
    'V283_count_eq_399663', 'V284_eq_0', 'V285_eq_0', 'V285_count_eq_658790', 'V286_eq_0', 'V287_eq_0', 
    'V288_eq_0', 'V289_eq_0', 'V29_eq_0', 'V29_eq_1', 'V290_eq_1', 'V291_eq_1', 'V292_eq_1', 'V293_eq_0', 
    'V294_eq_0', 'V295_eq_0', 'V296_eq_0', 'V297_eq_0', 'V298_eq_0', 'V299_eq_0', 'V3_eq_1', 'V3_eq_34', 
    'V30_eq_0', 'V30_eq_1', 'V300_eq_0', 'V301_eq_0', 'V302_eq_0', 'V302_eq_1', 'V303_eq_0', 'V303_eq_1', 
    'V304_eq_0', 'V304_eq_1', 'V306_eq_0.0', 'V307_eq_0.0', 'V307_count_eq_583910', 'V308_eq_0.0', 
    'V308_count_eq_778776', 'V309_eq_0.0', 'V31_eq_0', 'V310_eq_0.0', 'V311_eq_0.0', 'V312_eq_0.0', 
    'V313_eq_0.0', 'V313_mean_by_V307_eq_2.22330379486084', 'V313_mean_by_V307_eq_2.315218925476074', 
    'V314_eq_0.0', 'V315_eq_0.0', 'V316_eq_0.0', 'V317_eq_0.0', 'V318_eq_0.0', 'V319_eq_0.0', 'V32_eq_0', 
    'V320_eq_0.0', 'V321_eq_0.0', 'V322_eq_33314', 'V322_eq_34', 'V323_eq_33314', 'V323_eq_34', 
    'V324_eq_33314', 'V324_eq_34', 'V326_eq_34', 'V327_eq_34', 'V329_eq_34', 'V33_eq_0', 'V330_eq_34', 
    'V34_eq_0', 'V35_eq_1', 'V35_eq_0', 'V35_eq_34', 'V36_eq_1', 'V36_eq_0', 'V36_eq_34', 'V37_eq_1', 
    'V37_eq_34', 'V38_eq_1', 'V38_eq_34', 'V39_eq_0', 'V39_eq_34', 'V4_eq_1', 'V4_eq_34', 'V40_eq_0', 
    'V40_eq_34', 'V42_eq_0', 'V42_eq_34', 'V43_eq_0', 'V43_eq_34', 'V44_eq_1', 'V44_eq_34', 'V45_eq_1', 
    'V45_eq_34', 'V46_eq_1', 'V46_eq_34', 'V47_eq_1', 'V47_eq_34', 'V48_eq_0', 'V48_eq_1', 'V48_eq_34', 
    'V49_eq_0', 'V49_eq_1', 'V49_eq_34', 'V5_eq_1', 'V5_eq_34', 'V50_eq_0', 'V50_eq_34', 'V51_eq_0', 
    'V51_eq_34', 'V52_eq_0', 'V52_eq_34', 'V53_eq_1', 'V53_eq_0', 'V54_eq_1', 'V54_eq_0', 'V55_eq_1', 
    'V56_eq_1', 'V57_eq_0', 'V58_eq_0', 'V59_eq_0', 'V60_eq_0', 'V61_eq_1', 'V62_eq_1', 'V63_eq_0', 
    'V64_eq_0', 'V65_eq_1', 'V66_eq_1', 'V67_eq_1', 'V69_eq_0', 'V69_eq_1', 'V7_eq_1', 'V7_eq_34', 'V70_eq_0', 
    'V70_eq_1', 'V71_eq_0', 'V72_eq_0', 'V73_eq_0', 'V74_eq_0', 'V75_eq_1', 'V75_eq_0', 'V76_eq_1', 'V76_eq_0',
    'V77_eq_1', 'V78_eq_1', 'V79_eq_0', 'V8_eq_1', 'V8_eq_34', 'V80_eq_0', 'V81_eq_0', 'V82_eq_1', 'V83_eq_1', 
    'V84_eq_0', 'V85_eq_0', 'V86_eq_1', 'V87_eq_1', 'V88_eq_1', 'V89_eq_0', 'V9_eq_1', 'V9_eq_34', 'V90_eq_0', 
    'V90_eq_1', 'V91_eq_0', 'V91_eq_1', 'V92_eq_0', 'V93_eq_0', 'V94_eq_0', 'V95_eq_0', 'V96_eq_0', 'V97_eq_0', 
    'V98_eq_0', 'V99_eq_0', 'addr2_count_eq_956415', 'card3_count_eq_956845', 'card5_count_eq_553537', 
    'dist1_eq_33314', 'dist2_eq_33314', 'id_01_eq_34', 'id_02_eq_4294935074', 'id_03_eq_34', 'id_04_eq_34', 
    'id_05_eq_34', 'id_06_eq_34', 'id_07_eq_34', 'id_08_eq_34', 'id_09_eq_34', 'id_10_eq_34', 'id_11_eq_100.0', 
    'id_33_0_eq_0', 'id_33_1_eq_0'
]

In [8]:
def col_names(prefix, interval):
    assert len(interval) == 2
    return map(lambda i: prefix + str(i), range(interval[0], interval[1] + 1))


categorical = [
    'ProductCD', 
    'DeviceType', 'DeviceInfo', 
    'P_emaildomain', 'R_emaildomain', 
    'addr1', 'addr2',
    *col_names('card', (1, 6)),
    *col_names('M', (1, 9)),
    *col_names('id_', (12, 38)),
    *new_categorical_cols,
]
print(categorical)

['ProductCD', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain', 'addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'ProductCD__Cents_magic', 'DayOfWeek', 'card1__addr1', 'card1__card5', 'card2__dist1', 'card5__P_emaildomain', 'uuid2', 'uuid3', 'card2__id_19', 'card2__id_20', 'DeviceInfo__id_30', 'D2_na', 'D3_na', 'D5_na', 'D6_na', 'D7_na', 'D13_na', 'D14_na', 'M4_na', 'M5_na', 'id_16_na', 'D2_na', 'D3_na', 'D5_na', 'D6_na', 'D7_na', 'D13_na', 'D14_na', 'M4_na', 'M5_na', 'id_16_na', 'P_emaildomain_sfx2', 'P_emaildomain_sfx1', 'P_emaildomain_bin', 'R_emaildomain_sfx2', 'R_emaildomain_sfx1', 'R_emaildomain_bin', 'Browser', 'OS', 'C1_eq_1', 'C10_eq_0', 'C11_eq_1', '

In [9]:
numerical = sorted(list(set(df_train.columns) - set(categorical)))
print(numerical)

['C1', 'C10', 'C11', 'C11_count', 'C12', 'C13', 'C13_count', 'C14', 'C14_count', 'C1_count', 'C2', 'C2_count', 'C3', 'C4', 'C5', 'C5_count', 'C6', 'C6_count', 'C7', 'C7_count', 'C8', 'C9', 'C9_count', 'Cents', 'D1', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'DayOfMonth', 'DeviceInfo__id_30_count', 'DeviceInfo_count', 'Hour', 'MonthFraction', 'P_emaildomain_count', 'P_emaildomain_parts', 'R_emaildomain_parts', 'TransactionAmt', 'TransactionAmt_log', 'TransactionAmt_mean_by_card1', 'TransactionAmt_mean_by_uuid2_ProductCD', 'TransactionAmt_mean_by_uuid3', 'TransactionAmt_std_by_uuid3', 'V1', 'V10', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V11', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V12', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V13', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V139', 'V14', 'V140', '

In [10]:
params = {
    'loss_function': 'Logloss',
    'custom_loss': ['AUC'],
    'logging_level': 'Silent',
    'use_best_model': True,
    'task_type': 'GPU',
    'early_stopping_rounds': 100,
    'iterations': 2000,
    'learning_rate': 0.075,
    'depth': 8,
    'l2_leaf_reg': 4.0,
    'bagging_temperature': 0.6,
}

In [11]:
seeds = [11, 19, 23, 27, 31, 37]
scores = []
test_preds = []
feat_imps = []

training_start_time = time()

for fold, month in enumerate(np.unique(dt_m)):
    seed = seeds[fold]
    seed_everything(seed)
    params['random_seed'] = seed
    
    start_time = time()
    print('Training on fold {}'.format(fold))
    
    is_val = dt_m.isin([month])
    x_train, y_train0 = df_train[~is_val], y_train[~is_val]
    x_val, y_val = df_train[is_val], y_train[is_val]
    print('Train: {}, Validation: {}'.format(len(x_train), len(x_val)))
    
    train_pool = Pool(x_train, y_train0, cat_features=categorical)
    validate_pool = Pool(x_val, y_val, cat_features=categorical)
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=validate_pool)
    
    y_pred_val = model.predict_proba(x_val)[:,1]
    auc = roc_auc_score(y_val, y_pred_val)
    scores.append(auc)
    print('OOF score for {} fold: {}'.format(fold, auc))
    save_to_disk(y_pred_val, 'y_pred_valid_fold{}.pkl'.format(fold))
    
    y_pred_test = model.predict_proba(df_test)[:,1]
    test_preds.append(y_pred_test)
    save_to_disk(y_pred_test, 'y_pred_test_fold{}.pkl'.format(fold))
    
    end_time = str(datetime.timedelta(seconds=time() - start_time))
    print('Fold {} finished in {}'.format(fold, end_time))
    
    feature_importances = model.get_feature_importance(train_pool)
    feat_imps.append(feature_importances)
    
print('\nDONE')
print('Total time: {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))

Training on fold 0
Train: 453219, Validation: 137321
OOF score for 0 fold: 0.922756312264148
Fold 0 finished in 0:33:34.155914
Training on fold 1
Train: 497955, Validation: 92585
OOF score for 1 fold: 0.9456013050697782
Fold 1 finished in 0:31:57.894690
Training on fold 2
Train: 504519, Validation: 86021
OOF score for 2 fold: 0.9499251241718365
Fold 2 finished in 0:29:51.067781
Training on fold 3
Train: 488908, Validation: 101632
OOF score for 3 fold: 0.9449566425521206
Fold 3 finished in 0:30:35.462032
Training on fold 4
Train: 506885, Validation: 83655
OOF score for 4 fold: 0.9618361559374311
Fold 4 finished in 0:34:40.116021
Training on fold 5
Train: 501214, Validation: 89326
OOF score for 5 fold: 0.9359028538959432
Fold 5 finished in 0:32:10.157459

DONE
Total time: 3:12:49.618109


In [12]:
print('OOF score: avg %f, std %f, min %f' % (np.mean(scores), np.std(scores), np.min(scores)))

OOF score: avg 0.943496, std 0.012056, min 0.922756


In [13]:
for score, name in sorted(zip(np.average(feat_imps, axis=0), df_train.columns), reverse=True):
    print('{:50s} {}'.format(name, score))

C13                                                4.791186818321852
uuid3                                              4.373661390273861
card1__addr1                                       3.2511431124873593
C14                                                2.6919969545520424
M5                                                 2.5199386897523492
C1_count                                           2.3248551966183943
TransactionAmt                                     2.263980200335744
card2__dist1                                       2.0631580433761694
C13_count                                          2.0448054182342807
TransactionAmt_log                                 1.8968093790692988
DeviceInfo__id_30                                  1.8595586971059914
C1                                                 1.7088327599891249
card5__P_emaildomain                               1.6213270294504583
P_emaildomain                                      1.4930996294636767
D2                     

# Submission

In [14]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
sub['isFraud'] = np.average(test_preds, axis=0)
sub.to_csv('submission.csv', index=False)