In [1]:
from functools import partial

import datetime
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
import pickle
import random
import time

from copy import deepcopy
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score

In [2]:
def seed_everything(seed=13):
    """Seed the random number generators this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with ``seed``.

    Parameters
    ----------
    seed : int, default 13
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
    
def read_from_disk(path, filename):
    """Load and return the pickled object stored at ``path/filename``.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the pickle file inside ``path``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    full_path = os.path.join(path, filename)
    # NOTE(review): unpickling can execute arbitrary code; only use on files
    # produced by a trusted source (e.g. this notebook's own save_to_disk).
    with open(full_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded
    
    
def save_to_disk(obj, filename):
    """Pickle ``obj`` into the file ``filename``, overwriting any existing file.

    The highest pickle protocol supported by the running interpreter is used.

    Parameters
    ----------
    obj : object
        Any picklable object.
    filename : str
        Destination path, opened in binary write mode.
    """
    protocol = pickle.HIGHEST_PROTOCOL
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink, protocol=protocol)

def timedelta(seconds):
    """Render a duration given in seconds as text, without the fractional part.

    The value is converted with ``str(datetime.timedelta(seconds=seconds))``
    and everything from the first ``'.'`` onwards is discarded.

    Parameters
    ----------
    seconds : int or float
        Duration in seconds.

    Returns
    -------
    str
        For example ``'1:01:01'`` or ``'1 day, 1:00:00'``.
    """
    rendered = str(datetime.timedelta(seconds=seconds))
    whole_part, _, _ = rendered.partition('.')
    return whole_part

In [3]:
# Read the training transactions and the identity table, then left-join the
# identity columns onto every transaction by TransactionID (transactions with
# no identity row keep NaN in those columns).
trans_train = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
id_train = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
df_train = pd.merge(trans_train, id_train, how='left', on='TransactionID')

# The two source frames are not needed once merged; drop the references.
del trans_train
del id_train

df_train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [4]:
# Read the test transactions and the identity table, then left-join the
# identity columns onto every transaction by TransactionID (transactions with
# no identity row keep NaN in those columns).
trans_test = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')
id_test = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
df_test = pd.merge(trans_test, id_test, how='left', on='TransactionID')

# The two source frames are not needed once merged; drop the references.
del trans_test
del id_test

df_test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [5]:
def postprocess_df(X):
    """Add grouping-based features to a frame of transactions.

    Works on a copy of ``X``; the caller's frame is left untouched.

    Steps, in order:

    1. ``V313_new`` is ``V313`` with exact zeros replaced by a large sentinel
       (9999999). ``first_trans_amt`` is that value rounded to 3 decimals.
       Rows that still hold the sentinel *and* have every one of
       ``V306`` .. ``V321`` equal to zero get ``TransactionAmt`` instead.
    2. Missing ``D1`` / ``D3`` become 0 and ``day_first_trans`` is
       ``TransactionDT`` converted to days minus ``D1``, cast to int
       (truncation toward zero).
    3. Every remaining NaN in the frame is replaced by -999. pandas drops NaN
       group keys by default, so this keeps such rows in the grouping below.
    4. Rows are grouped by ``card1``..``card6``, ``day_first_trans``,
       ``first_trans_amt``, ``addr1``, ``id_01``, ``DeviceInfo`` and
       ``P_emaildomain``. Per-group statistics are merged back onto each row.

    Columns added to the result (suffix ``_no_D3``):

    * ``TransactionID_count_no_D3`` - number of rows in the group
    * ``num_group`` - running number of the group (sorted key order)
    * ``TransactionDT_amax_amin_no_D3`` - max minus min ``TransactionDT``
    * ``TransactionDT_amax_amed_no_D3`` - max minus median
    * ``TransactionDT_amin_amed_no_D3`` - median minus min
    * ``TransactionDT_amed_diff_no_D3`` - (max-median minus median-min)
      divided by max-min; 0 where that ratio is undefined
    * ``TransactionDT_mean_diff_no_D3`` - max-min divided by the group size
    * ``rank_grp_no_D3`` - rank of ``TransactionID`` within the group
    * ``is_first_grp_no_D3`` / ``is_last_grp_no_D3`` - rank equals 1 / equals
      the group size

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``TransactionID``, ``TransactionDT``, ``TransactionAmt``,
        ``D1``, ``D3``, ``V306`` .. ``V321`` and all grouping columns listed
        above.

    Returns
    -------
    pandas.DataFrame
        The processed copy. Because of the merge it carries a fresh
        0..n-1 index; row order follows ``X``.
    """
    sentinel = 9999999      # placeholder for "V column is exactly zero"
    suff = '_no_D3'

    df = X.copy()

    # --- first_trans_amt -------------------------------------------------
    list_v = []
    for i in [313]:  # alternative left in the original code: [313, 314, 315]
        new_col = f"V{i}_new"
        df[new_col] = df[f"V{i}"]
        df.loc[df[new_col] == 0.0, new_col] = sentinel
        list_v.append(new_col)
    df['first_trans_amt'] = np.round(1000 * df[list_v].min(axis=1)) / 1000

    # Rows where the sentinel survived and V306..V321 are all zero fall back
    # to the row's own TransactionAmt. The mask is computed once and reused.
    zero_cols = [f"V{i}" for i in range(306, 322)]
    use_amt = (df['first_trans_amt'] == sentinel) & (df[zero_cols] == 0).all(axis=1)
    df.loc[use_amt, 'first_trans_amt'] = df.loc[use_amt, 'TransactionAmt']

    # --- day_first_trans and NaN handling --------------------------------
    df['D1'] = df['D1'].fillna(0)
    df['D3'] = df['D3'].fillna(0)
    # Same division order as the original (/60/60/24) so float results, and
    # therefore the truncated integers, are unchanged.
    df['day_first_trans'] = (df['TransactionDT'] / 60 / 60 / 24 - df['D1']).astype('int')
    df = df.fillna(-999)

    # --- per-group statistics --------------------------------------------
    list_cols = [
        'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
        'day_first_trans', 'first_trans_amt',
        'addr1',
        'id_01',
        'DeviceInfo',
        'P_emaildomain',
    ]  # , 'D3'

    # String aggregator names give column labels ('min', 'max', 'median') that
    # do not depend on the installed NumPy version; passing np.min / np.max
    # yields 'amin'/'amax' on older NumPy but 'min'/'max' on NumPy >= 1.25.
    x = df.groupby(list_cols, as_index=False).agg(
        {'TransactionID': 'count', 'TransactionDT': ['min', 'max', 'median']}
    )
    # Flatten the two-level column index: ('TransactionDT', 'min') ->
    # 'TransactionDT_min'; grouping keys have an empty second level.
    x.columns = ['_'.join(col).strip() if col[1] != '' else col[0] for col in x.columns.values]
    x['num_group'] = x.index

    dt_min = x['TransactionDT_min']
    dt_max = x['TransactionDT_max']
    dt_med = x['TransactionDT_median']

    x['TransactionDT_amax_amin' + suff] = dt_max - dt_min
    x['TransactionDT_amax_amed' + suff] = dt_max - dt_med
    x['TransactionDT_amin_amed' + suff] = dt_med - dt_min
    x['TransactionDT_amed_diff' + suff] = (
        (x['TransactionDT_amax_amed' + suff] - x['TransactionDT_amin_amed' + suff])
        / x['TransactionDT_amax_amin' + suff]
    )
    # 0/0 (groups whose max equals min) produces NaN; report 0 instead.
    x['TransactionDT_amed_diff' + suff] = x['TransactionDT_amed_diff' + suff].fillna(0)
    x['TransactionDT_mean_diff' + suff] = x['TransactionDT_amax_amin' + suff] / x['TransactionID_count']

    x = x.drop(columns=['TransactionDT_min', 'TransactionDT_max', 'TransactionDT_median'])
    x = x.rename(columns={'TransactionID_count': 'TransactionID_count' + suff})

    # --- attach group features to every row ------------------------------
    df = df.merge(x, how='left', on=list_cols)
    df['rank_grp' + suff] = df.groupby(list_cols)['TransactionID'].rank(ascending=True)

    df['is_first_grp' + suff] = df['rank_grp' + suff] == 1
    df['is_last_grp' + suff] = df['rank_grp' + suff] == df['TransactionID_count' + suff]

    return df

In [6]:
# Split the training frame into one fold per calendar month and run
# postprocess_df on each fold separately, so the group features of a fold are
# computed only from that fold's rows.
START_DATE = '2017-11-30'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# TransactionDT is treated as seconds elapsed since START_DATE. The conversion
# is vectorised instead of calling a Python lambda once per row.
TransactionDT1 = startdate + pd.to_timedelta(df_train['TransactionDT'], unit='s')
# 'YYYY-MM' key of the month each transaction falls in.
dt_m = TransactionDT1.dt.strftime('%Y-%m')

train_folds_postprocessed = []
for month in np.unique(dt_m):
    X_val = postprocess_df(df_train[dt_m == month])
    train_folds_postprocessed.append(X_val)
    print(X_val.shape)

(137321, 447)
(92585, 447)
(86021, 447)
(101632, 447)
(83655, 447)
(89326, 447)


In [7]:
# Persist the list of per-month post-processed training folds as one pickle
# in the current working directory.
save_to_disk(
    obj=train_folds_postprocessed,
    filename='train_folds_postprocessed.pkl',
)

In [8]:
# Post-process the whole test frame in one pass and persist it.
# NOTE(review): df_test is rebound to the processed frame, so this cell is not
# idempotent -- re-running it without reloading df_test would post-process an
# already processed frame.
df_test = postprocess_df(X=df_test)
save_to_disk(obj=df_test, filename='df_test_postprocessed.pkl')