In [1]:
import datetime
import math
import os
import pickle
import random

from functools import partial
from time import time

import lightgbm as lgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold, KFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from catboost import CatBoostClassifier, Pool, cv

In [2]:
INPUT_PATH = '/kaggle/input/fraud-just-data'

COL_ID = 'TransactionID'
COL_DT = 'TransactionDT'
COL_AMOUNT = 'TransactionAmt'
COL_TARGET = 'isFraud'

START_DATE = '2017-11-30'

In [3]:
def seed_everything(seed=13):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


def save_to_disk(df, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
        
def read_from_disk(path, filename):
    with open(os.path.join(path, filename), 'rb') as handle:
        return pickle.load(handle)

In [4]:
def parse_datetime(df):
    start_date = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    return df[COL_DT].apply(lambda x: (start_date + datetime.timedelta(seconds = x)))

In [5]:
df_train, y_train, df_test, dt_m = map(partial(read_from_disk, INPUT_PATH), 
                                 ['df_train.pkl', 'y_train.pkl', 'df_test.pkl', 'dt_m.pkl'])

df_train.shape, y_train.shape, dt_m.shape, df_test.shape

((590540, 497), (590540,), (590540,), (506691, 497))

In [6]:
dt_m.unique()

array(['2017-12', '2018-01', '2018-02', '2018-03', '2018-04', '2018-05'],
      dtype=object)

In [7]:
new_categorical_cols = [
    'ProductCD', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain', 
    'addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_13', 
    'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 
    'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 
    'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 
    'id_38', 'isDecember', 'card1__addr1', 'card1__card5', 'card2__dist1', 
    'card2__id_20', 'card5__P_emaildomain', 'D2_na', 'D3_na', 'D5_na', 'D6_na', 
    'D7_na', 'D13_na', 'D14_na', 'M4_na', 'M5_na', 'M6_na', 'M7_na', 'M8_na', 
    'D2_na', 'D3_na', 'D5_na', 'D6_na', 'D7_na', 'D13_na', 'D14_na', 'M4_na', 
    'M5_na', 'M6_na', 'M7_na', 'M8_na', 'P_emaildomain_sfx2', 'P_emaildomain_sfx1', 
    'P_emaildomain_bin', 'R_emaildomain_sfx2', 'R_emaildomain_sfx1', 
    'R_emaildomain_bin', 'Browser', 'OS', 'id_16_na', 'ProductCD__Cents_magic',
    'DayOfWeek', 'P_emaildomain_parts', 'R_emaildomain_parts'
]

In [8]:
def col_names(prefix, interval):
    assert len(interval) == 2
    return map(lambda i: prefix + str(i), range(interval[0], interval[1] + 1))


categorical = [
    'ProductCD', 
    'DeviceType', 'DeviceInfo', 
    'P_emaildomain', 'R_emaildomain', 
    'addr1', 'addr2',
    *col_names('card', (1, 6)),
    *col_names('M', (1, 9)),
    *col_names('id_', (12, 38)),
    *new_categorical_cols,
]
print(categorical)

['ProductCD', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain', 'addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'ProductCD', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain', 'addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'isDecember', 'card1__addr1', 'card1__card5', 'card2__dist1', 'card2__id_20', 'card5__P_emaildomain', 'D2_na', 

In [9]:
numerical = sorted(list(set(df_train.columns) - set(categorical)))
print(numerical)

['C1', 'C10', 'C11', 'C11_count', 'C12', 'C13', 'C13_count', 'C14', 'C14_count', 'C1_count', 'C2', 'C2_count', 'C3', 'C4', 'C5', 'C5_count', 'C6', 'C6_count', 'C7', 'C7_count', 'C8', 'C9', 'C9_count', 'Cents', 'D1', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'DayOfMonth', 'DeviceInfo_count', 'Hour', 'MonthFraction', 'P_emaildomain_count', 'TransactionAmt', 'TransactionAmt_log', 'TransactionAmt_mean_by_card1', 'TransactionAmt_mean_by_uuid', 'TransactionAmt_mean_by_uuid2', 'TransactionAmt_mean_by_uuid2_ProductCD', 'TransactionAmt_mean_by_uuid3', 'TransactionAmt_std_by_uuid3', 'V1', 'V10', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V11', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V12', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V13', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V14', 'V140', 'V1

In [10]:
params = {
    'loss_function': 'Logloss',
    'custom_loss': ['AUC'],
    'logging_level': 'Silent',
    'use_best_model': True,
    'task_type': 'GPU',
    'early_stopping_rounds': 100,
    'iterations': 2000,
    'learning_rate': 0.075,
    'depth': 7,
    'l2_leaf_reg': 4.0,
    'bagging_temperature': 0.6,
}

In [11]:
seeds = [11, 19, 23, 27, 31, 37]
scores = []
test_preds = []
feat_imps = []

training_start_time = time()

for fold, month in enumerate(np.unique(dt_m)):
    seed = seeds[fold]
    seed_everything(seed)
    params['random_seed'] = seed
    
    start_time = time()
    print('Training on fold {}'.format(fold))
    
    is_val = dt_m.isin([month])
    x_train, y_train0 = df_train[~is_val], y_train[~is_val]
    x_val, y_val = df_train[is_val], y_train[is_val]
    print('Train: {}, Validation: {}'.format(len(x_train), len(x_val)))
    
    train_pool = Pool(x_train, y_train0, cat_features=categorical)
    validate_pool = Pool(x_val, y_val, cat_features=categorical)
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=validate_pool)
    
    y_pred_val = model.predict_proba(x_val)[:,1]
    auc = roc_auc_score(y_val, y_pred_val)
    scores.append(auc)
    print('OOF score for {} fold: {}'.format(fold, auc))
    save_to_disk(y_pred_val, 'y_pred_valid_fold{}.pkl'.format(fold))
    
    y_pred_test = model.predict_proba(df_test)[:,1]
    test_preds.append(y_pred_test)
    save_to_disk(y_pred_test, 'y_pred_test_fold{}.pkl'.format(fold))
    
    end_time = str(datetime.timedelta(seconds=time() - start_time))
    print('Fold {} finished in {}'.format(fold, end_time))
    
    feature_importances = model.get_feature_importance(train_pool)
    feat_imps.append(feature_importances)
    
print('\nDONE')
print('Total time: {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))

Training on fold 0
Train: 453219, Validation: 137321
OOF score for 0 fold: 0.9225550496691793
Fold 0 finished in 0:09:55.764692
Training on fold 1
Train: 497955, Validation: 92585
OOF score for 1 fold: 0.9460520637083951
Fold 1 finished in 0:12:30.419322
Training on fold 2
Train: 504519, Validation: 86021
OOF score for 2 fold: 0.9497965373450681
Fold 2 finished in 0:10:11.330654
Training on fold 3
Train: 488908, Validation: 101632
OOF score for 3 fold: 0.9465843539479972
Fold 3 finished in 0:08:01.182991
Training on fold 4
Train: 506885, Validation: 83655
OOF score for 4 fold: 0.9622734756680539
Fold 4 finished in 0:12:44.411469
Training on fold 5
Train: 501214, Validation: 89326
OOF score for 5 fold: 0.9370562517676475
Fold 5 finished in 0:09:30.164062

DONE
Total time: 1:02:53.838288


In [12]:
print('OOF score: avg %f, std %f, min %f' % (np.mean(scores), np.std(scores), np.min(scores)))

OOF score: avg 0.944053, std 0.012155, min 0.922555


In [13]:
for score, name in sorted(zip(np.average(feat_imps, axis=0), df_train.columns), reverse=True):
    print('{:50s} {}'.format(name, score))

card1__addr1                                       7.77548097851252
C13                                                5.475803800749492
C14                                                2.9081617979958465
C1_count                                           2.6471459618459128
M5                                                 2.5546406270472914
TransactionAmt                                     2.5194760490596146
C1                                                 2.389786397985875
C13_count                                          2.2192511824813854
card2__dist1                                       2.181063908625142
card5__P_emaildomain                               1.8888152578647983
DeviceInfo                                         1.8838583246078417
TransactionAmt_log                                 1.4782607797643907
P_emaildomain                                      1.4547087491709652
id_31                                              1.4364031323697575
M6                       

# Submission

In [14]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
sub['isFraud'] = np.average(test_preds, axis=0)
sub.to_csv('submission.csv', index=False)