In [1]:
from functools import partial

import datetime
import gc
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import random
import seaborn as sns

from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

sns.set()

%matplotlib inline

In [2]:
def seed_everything(seed=13):
    """Seed the global random number generators used by this notebook.

    Writes ``str(seed)`` to the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 13
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the seed as their only argument.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
    
def read_from_disk(path, filename):
    """Unpickle and return the object stored in ``filename`` under ``path``.

    Parameters
    ----------
    path : str
        Directory that contains the pickle file.
    filename : str
        Name of the pickle file inside ``path``.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at files
    from a trusted source.
    """
    full_path = os.path.join(path, filename)
    with open(full_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded
    
    
def save_to_disk(obj, filename):
    """Pickle ``obj`` into ``filename`` using the highest available protocol.

    Parameters
    ----------
    obj : object
        Any picklable object.
    filename : str
        Destination path; the file is opened in binary write mode, so an
        existing file at that path is overwritten.
    """
    with open(filename, 'wb') as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)

# Data preparation

In [3]:
# All three pickles live in the same input directory, so it is named once
# instead of being repeated in every call.
INPUT_DIR = '../input/fraud-fs-fe9477-no-day-data-preparation'

df_train = read_from_disk(INPUT_DIR, 'df_train.pkl')
df_test = read_from_disk(INPUT_DIR, 'df_test.pkl')
y = read_from_disk(INPUT_DIR, 'y.pkl')

# The target is later indexed with boolean masks derived from df_train,
# so the two must have the same number of rows.
assert len(y) == len(df_train), 'y and df_train must have the same number of rows'

In [4]:
# Show every column name of the training frame as a plain Python list.
print(df_train.columns.tolist())

['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V66', 'V67', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V90', 'V91', 'V92', 'V93', 'V

In [5]:
# Columns to treat as categorical. Each name appears exactly once: the
# original literal listed the six *_emaildomain_sfx*/_bin names twice.
categorical = [
    'ProductCD', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain',
    'addr1', 'addr2',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19',
    'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_28',
    'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36',
    'id_37', 'id_38',
    'card1__addr1', 'card1__card5', 'card2__dist1', 'card2__id_20',
    'card5__P_emaildomain',
    'P_emaildomain_sfx2', 'P_emaildomain_sfx1', 'P_emaildomain_bin',
    'R_emaildomain_sfx2', 'R_emaildomain_sfx1', 'R_emaildomain_bin',
]
assert len(categorical) == len(set(categorical)), 'duplicate name in categorical'

# Every remaining column of df_train is treated as numerical. sorted()
# accepts the set directly, so no intermediate list is needed.
numerical = sorted(set(df_train.columns) - set(categorical))
print(numerical)

['Browser', 'C1', 'C10', 'C10_count', 'C11', 'C11_count', 'C12', 'C12_count', 'C13', 'C13_count', 'C14', 'C14_count', 'C1_count', 'C2', 'C2_count', 'C3', 'C3_count', 'C4', 'C4_count', 'C5', 'C5_count', 'C6', 'C6_count', 'C7', 'C7_count', 'C8', 'C8_count', 'C9', 'C9_count', 'Cents', 'Cents_ProductCD_W', 'D1', 'D10', 'D11', 'D12', 'D13', 'D13_na', 'D14', 'D14_na', 'D15', 'D1_count', 'D2', 'D2_count', 'D2_na', 'D3', 'D3_count', 'D3_na', 'D4', 'D4_count', 'D5', 'D5_count', 'D5_na', 'D6', 'D6_count', 'D6_na', 'D7', 'D7_count', 'D7_na', 'D8', 'D8_count', 'D9', 'DayOfWeek', 'DeviceInfo_count', 'DeviceType_count', 'Hour', 'M4_na', 'M5_na', 'M6_na', 'M7_na', 'M8_na', 'OS', 'P_emaildomain_count', 'P_emaildomain_parts', 'R_emaildomain_count', 'R_emaildomain_parts', 'TransactionAmt', 'TransactionDT', 'V1', 'V10', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V108', 'V109', 'V11', 'V110', 'V111', 'V112', 'V114', 'V115', 'V116', 'V12', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128

In [6]:
# Origin that the TransactionDT offsets (in seconds) are added to.
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

# Vectorised conversion of the whole column at once; replaces a row-by-row
# .apply() with a Python lambda and yields the same timestamps.
TransactionDT1 = pd.to_timedelta(df_train['TransactionDT'], unit='s') + START_DATE

# 'YYYY-MM' label for every training row; used to split the data by month.
dt_m = TransactionDT1.dt.strftime('%Y-%m')

In [7]:
# The raw time offset is removed from both frames so it is not used as a
# model feature; the month labels were already derived from it.
df_train = df_train.drop(columns='TransactionDT')
df_test = df_test.drop(columns='TransactionDT')
df_train.shape, df_test.shape

((590540, 504), (506691, 504))

# Modelling

In [8]:
# Hold out one calendar month for validation and train on all the others.
# The mask is built once and reused instead of being recomputed for every
# one of the four selections.
month = '2017-12'
is_holdout = dt_m.isin([month])
X_train, y_train = df_train[~is_holdout], y[~is_holdout]
X_valid, y_valid = df_train[is_holdout], y[is_holdout]

In [9]:
# Keyword arguments passed to CatBoostClassifier for every fold.
params = dict(
    n_estimators=2500,
    early_stopping_rounds=500,
    depth=8,
    eval_metric='AUC',
    loss_function='Logloss',
    random_seed=13,
    thread_count=7,
    border_count=32,
    bootstrap_type='Poisson',
    grow_policy='Lossguide',
    max_leaves=20,
    min_data_in_leaf=3,
    task_type='GPU',
    verbose=False,
)

In [10]:
# Leave-one-month-out cross-validation: each distinct 'YYYY-MM' label in
# dt_m is held out once while a fresh model is trained on the other months.
# Test predictions are averaged over the folds into y_preds.
y_preds = np.zeros(df_test.shape[0])
score = 0

months = np.unique(dt_m)
NFOLDS = len(months)

for fold_n, month in enumerate(months):
    # Build the fold mask once and reuse it for features and target.
    in_valid = dt_m.isin([month])
    X_train, y_train = df_train[~in_valid], y[~in_valid]
    X_valid, y_valid = df_train[in_valid], y[in_valid]

    # NOTE(review): cat_features is not passed to fit(), so the
    # `categorical` list is unused here -- confirm this is intentional.
    # The held-out month is used both as the early-stopping eval_set and
    # for the reported fold AUC, so that AUC is an optimistic estimate.
    clf = CatBoostClassifier(**params)
    clf.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        use_best_model=True,
        verbose=False
    )

    y_pred_valid = clf.predict_proba(X_valid)[:, 1]
    save_to_disk(y_pred_valid, 'y_pred_valid_fold{}.pkl'.format(fold_n))

    # Score the fold once and reuse the value for the log line and the mean.
    fold_auc = roc_auc_score(y_valid, y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {fold_auc}")
    score += fold_auc / NFOLDS

    y_pred_test = clf.predict_proba(df_test)[:, 1]
    save_to_disk(y_pred_test, 'y_pred_test_fold{}.pkl'.format(fold_n))
    y_preds += y_pred_test / NFOLDS

    # Release the per-fold slices before the next iteration allocates new ones.
    del X_train, X_valid, y_train, y_valid
    gc.collect()

print(f"\nMean AUC = {score}")

Fold 1 | AUC: 0.9133559064202742
Fold 2 | AUC: 0.9323482753133613
Fold 3 | AUC: 0.9363597725626474
Fold 4 | AUC: 0.937291416833934
Fold 5 | AUC: 0.9453792853785589
Fold 6 | AUC: 0.9303716725429071

Mean AUC = 0.9325177215086138


# Submission

In [11]:
# Fill the sample submission's isFraud column with the fold-averaged test
# predictions and write the result without the index column.
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
sub = sub.assign(isFraud=y_preds)
sub.to_csv('submission.csv', index=False)