In [1]:
#hi
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import gc
import hashlib

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, TimeSeriesSplit, KFold, GroupKFold
from sklearn.metrics import roc_auc_score, f1_score, recall_score,precision_score, confusion_matrix
from category_encoders import TargetEncoder

import catboost as cb
import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')
import os

# Expose only the GPU with physical index 1 to CUDA libraries in this process.
# NOTE(review): the CatBoost cell later requests devices="0,1"; with a single
# visible device that may not resolve as intended - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1" 

In [2]:
# Process DeviceInfo, id_30-31 and generate the constant flag 'had_id'
def id_split(dataframe, min_count=100):
    """Derive device / OS / browser features from an identity table, in place.

    Adds these columns to ``dataframe`` and returns the same object:

    * ``device_name`` / ``device_version``: first and second '/'-separated
      parts of ``DeviceInfo`` (missing when the part does not exist).
    * ``OS_id_30`` / ``browser_id_31``: first space-separated token of
      ``id_30`` / ``id_31``.
    * ``had_id``: constant 1, so that rows without an identity record can be
      told apart after a left merge.

    ``device_name`` is then normalised to a brand through ordered substring
    rules, and every name occurring fewer than ``min_count`` times is
    replaced by "Others".

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain the string columns ``DeviceInfo``, ``id_30``, ``id_31``.
    min_count : int, default 100
        Names rarer than this are pooled into "Others".
    """
    # Split once and index the resulting lists. Unlike expand=True followed by
    # [1], this does not raise KeyError when no value contains a '/'.
    device_parts = dataframe['DeviceInfo'].str.split('/')
    dataframe['device_name'] = device_parts.str[0]
    dataframe['device_version'] = device_parts.str[1]
    dataframe['OS_id_30'] = dataframe['id_30'].str.split(' ').str[0]
    dataframe['browser_id_31'] = dataframe['id_31'].str.split(' ').str[0]

    # Ordered (substring, brand) rules. They are applied one after another on
    # the already-updated column, so the order is part of the behaviour.
    brand_rules = (
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    )
    for marker, brand in brand_rules:
        # All markers are plain substrings, so a literal match is sufficient.
        matches = dataframe['device_name'].str.contains(marker, na=False, regex=False)
        dataframe.loc[matches, 'device_name'] = brand

    # Classes that are too sparse are placed in a shared "Others" class
    name_counts = dataframe['device_name'].value_counts()
    rare_names = name_counts[name_counts < min_count].index
    dataframe.loc[dataframe['device_name'].isin(rare_names), 'device_name'] = "Others"
    dataframe['had_id'] = 1
    gc.collect()
    return dataframe

In [3]:
# Read the four competition tables from the Kaggle input directory.
folder_path = '/kaggle/input/ieee-fraud-detection/'
train_identity, train_transaction, test_identity, test_transaction = (
    pd.read_csv(f'{folder_path}{table}.csv')
    for table in ('train_identity', 'train_transaction', 'test_identity', 'test_transaction')
)

In [4]:
# The test identity file names its columns "id-xx"; switch them to "id_xx" so both identity tables use the same names.
test_identity.columns = test_identity.columns.str.replace("id-", "id_", regex=False)

In [5]:
# Sanity check: the column names should now all use the "id_xx" form.
print(test_identity.columns)

Index(['TransactionID', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06',
       'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14',
       'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22',
       'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object')


In [6]:
# Add the device / OS / browser features to both identity tables.
train_identity, test_identity = id_split(train_identity), id_split(test_identity)

In [7]:
# Both tables should report the same number of columns after id_split.
print(train_identity.shape)
print(test_identity.shape)

(144233, 46)
(141907, 46)


In [8]:
# Inspect the value distribution of each field added by id_split
for new_field in ('device_name', 'device_version', 'OS_id_30', 'browser_id_31'):
    print(train_identity[new_field].value_counts())

device_name
Windows            47722
iOS Device         19782
MacOS              12573
Samsung            12092
Trident             7440
Others              4978
RV                  4385
Motorola            2935
Huawei              2377
LG                  2331
Sony                 575
ZTE                  518
HTC                  406
hi6210sft Build      190
F3213 Build          125
Linux                121
F5121 Build          116
Name: count, dtype: int64
device_version
7.0                7440
NRD90M             5908
MMB29K             1874
MRA58K             1446
MMB29M             1342
                   ... 
Q1010                 1
V41020c               1
OPR4.170623.006       1
OPN27.76-12-22        1
HUAWEILDN-LX3         1
Name: count, Length: 293, dtype: int64
OS_id_30
Windows    36739
iOS        19782
Mac        13580
Android     6303
Linux       1136
other         15
func          10
Name: count, dtype: int64
browser_id_31
chrome               76059
mobile               283

In [9]:
# Attach the identity features to every transaction; transactions without an
# identity record keep NaN in the identity columns (left join).
merge_args = dict(on='TransactionID', how='left')
train = train_transaction.merge(train_identity, **merge_args)
test = test_transaction.merge(test_identity, **merge_args)

In [10]:
# The raw tables are fully contained in train / test now; drop the references.
del train_transaction, test_transaction, train_identity, test_identity

In [11]:
# Row count of train; later cells use it to split train+test concatenations back apart.
train_len = len(train)

## Feature Engineering

- DT_D (Days)
- DT_W (Weeks)
- DT_M (Months)

In [12]:
START_DATE = '2017-11-30'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
for df in [train, test]:
    # TransactionDT holds a number of seconds; it is turned into a timestamp by
    # adding it to START_DATE (vectorised instead of a per-row apply).
    df['TransactionDT'] = startdate + pd.to_timedelta(df['TransactionDT'], unit='s')
    iso = df['TransactionDT'].dt.isocalendar()
    # Day / week / month indices counted from the start of 2017.
    df['DT_D'] = ((df['TransactionDT'].dt.year - 2017) * 365 + df['TransactionDT'].dt.dayofyear).astype(np.int16)
    # ISO weeks belong to the ISO year, which differs from the calendar year
    # around New Year (e.g. 2018-12-31 is week 1 of ISO year 2019). Pairing the
    # week with the calendar year would fold such days onto the first week of
    # the calendar year they fall in.
    df['DT_W'] = (iso.year - 2017) * 52 + iso.week
    df['DT_M'] = (df['TransactionDT'].dt.year - 2017) * 12 + df['TransactionDT'].dt.month

In [13]:
# How many training transactions fall into each product code.
train.ProductCD.value_counts()

ProductCD
W    439670
C     68519
R     37699
H     33024
S     11628
Name: count, dtype: int64

Because the ProductCD categories differ both in fraud behaviour and in how their volume evolves over time, we split the per-day count encoding into five separate features, one per product code.

In [14]:
# Daily transaction count per product code, one feature per code. A row only
# carries the count in the feature of its own product; in the other four
# features it gets the sentinel listed below.
# NOTE(review): 'C' uses 999999 while every other product uses -999. This is
# reproduced as found; confirm whether the difference is intentional.
product_day_sentinels = {'W': -999, 'C': 999999, 'R': -999, 'H': -999, 'S': -999}

# train counts non-null isFraud values, test (which has no label) counts
# non-null TransactionAmt values.
for frame, counted_col in ((train, 'isFraud'), (test, 'TransactionAmt')):
    # Number of rows sharing this row's (ProductCD, DT_D), computed once
    # instead of re-running the same groupby + merge for every product.
    per_day_count = frame.groupby(['ProductCD', 'DT_D'])[counted_col].transform('count')
    for product, sentinel in product_day_sentinels.items():
        feature = f'ProductCD_{product}_Day'
        frame[feature] = per_day_count
        frame.loc[frame.ProductCD != product, feature] = sentinel

In [15]:
# Five new fields have been added; they are the last five columns of train
print(train.columns[-5:].tolist())

['ProductCD_W_Day', 'ProductCD_C_Day', 'ProductCD_R_Day', 'ProductCD_H_Day', 'ProductCD_S_Day']


In [16]:
# Build a pseudo user ID (uid1) from the card1-card6 fields, the two address
# fields and open_card (transaction day minus D1).
uid_fields = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'open_card']
for frame in (train, test):
    frame['open_card'] = frame.DT_D - frame['D1']
    frame['first_tran'] = frame.DT_D - frame['D2']
    # Space-join the string form of every field, left to right.
    uid = frame[uid_fields[0]].astype(str)
    for field in uid_fields[1:]:
        uid = uid + ' ' + frame[field].astype(str)
    frame['uid1'] = uid

In [17]:
# Number of distinct pseudo user IDs in each split.
print(train['uid1'].nunique())
print(test['uid1'].nunique())

222518
198011


In [18]:
# Identify a device using id_30, id_31, id_32, id_33, DeviceType, DeviceInfo
def device_hash(x):
    """Return a 15-hex-character fingerprint of the device fields of row ``x``.

    ``x`` is anything indexable by the six field names (a DataFrame row, a
    dict, ...). The string forms of the fields are concatenated without a
    separator and hashed with SHA-256; the first 15 hex digits are returned.
    """
    fields = ('id_30', 'id_31', 'id_32', 'id_33', 'DeviceType', 'DeviceInfo')
    fingerprint = ''.join(str(x[name]) for name in fields)
    digest = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()
    return digest[:15]

In [19]:
# Fingerprint every row's device (row-wise apply, so this is slow on the full data).
for df in (train, test):
    df['device_hash'] = df.apply(device_hash, axis=1)

In [20]:
# Number of distinct devices seen for each user, over train and test together
concat_df = pd.concat([frame[['uid1', 'device_hash']] for frame in (train, test)])
tmp = concat_df.groupby('uid1')['device_hash'].agg(['nunique'])

In [21]:
# Look up each row's user in the per-user device count.
devices_per_uid = tmp['nunique']
train['uid_device_nunique'] = train.uid1.map(devices_per_uid)
test['uid_device_nunique'] = test.uid1.map(devices_per_uid)

In [22]:
# Number of distinct users seen on each device, over train and test together
tmp = concat_df.groupby('device_hash')['uid1'].agg(['nunique'])
uids_per_device = tmp['nunique']
train['device_uid_nunique'] = train.device_hash.map(uids_per_device)
test['device_uid_nunique'] = test.device_hash.map(uids_per_device)

In [23]:
# The combined uid/device frame is no longer needed.
del concat_df

In [None]:
# Get the number of decimal places needed to write an amount
def change(dec):
    """Return how many decimal places (0 to 3) ``dec`` needs once rounded to 3.

    Examples: 100.0 -> 0, 50.5 -> 1, 1.25 -> 2, 0.251 -> 3.

    The amount is scaled to an integer number of thousandths and trailing
    zeros are stripped. The loop is bounded by ``num`` so that an amount which
    rounds to 0 terminates (0 % 10 is always 0, which previously never left
    the loop), and it uses integer division so the value stays exact.
    """
    thousandths = int(np.round(np.round(dec, 3) * 1000))
    num = 3
    while num > 0 and thousandths % 10 == 0:
        num -= 1
        thousandths //= 10
    return num
  
# Number of decimal places used by each transaction amount.
for frame in (train, test):
    frame['decimal_digit'] = frame['TransactionAmt'].map(change)

gc.collect()

In [None]:
# Transactions without an identity record got NaN from the left merge; mark them with 0
for frame in (train, test):
    frame['had_id'] = frame['had_id'].fillna(0)

In [None]:
# D series data has a trend of increasing with time. Future data is larger than past data,
# so each value is divided by the maximum of its own period (week or month) to keep only
# the relative relationship.
scaling_periods = (
    ('DT_W', ['D1', 'D2', 'D4', 'D6', 'D10', 'D11', 'D12', 'D14', 'D15']),
    ('DT_M', ['D3', 'D5', 'D7', 'D8', 'D13']),
)
for period, d_columns in scaling_periods:
    for t in d_columns:
        for frame in (train, test):
            frame[t + '_revised'] = frame[t] / frame.groupby(period)[t].transform('max')

In [None]:
# Rescale D14_revised for test week 78 by 530/900; missing values stay missing.
# NOTE(review): the constants 900 and 530 are unexplained in this notebook - confirm their origin.
in_week_78 = test.DT_W == 78
test.loc[in_week_78, 'D14_revised'] = test.loc[in_week_78, 'D14_revised'] / 900 * 530

In [None]:
# Subdivide the time into day of the week and hour of the day, flag rows whose
# purchaser and recipient e-mail domains are equal, and drop D9.
for frame in (train, test):
    stamps = frame['TransactionDT']
    frame['dow'] = stamps.dt.dayofweek
    frame['hour'] = stamps.dt.hour
for frame in (train, test):
    # Element-wise comparison on the raw arrays: two missing domains compare
    # as different, so they yield 0.
    same_domain = frame['P_emaildomain'].values == frame['R_emaildomain'].values
    frame['email_domain_comp'] = same_domain.astype(int)
for frame in (train, test):
    frame.drop(columns=['D9'], inplace=True)

In [None]:
# Categorical Variables
# Columns treated as categories: their NaN values are filled with the string
# 'missing' further down, and crossed features are appended to this list later.
cat_columns = ['uid1','id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9','hour','dow','device_name', 'device_version', 'OS_id_30',  'browser_id_31']
# Count Numerical
# Columns that additionally get a '<name>_count_full' frequency feature
# computed over train and test together.
count_columns = ['uid1','id_13','id_14','id_17','id_18','id_19','id_20','id_21','id_22','id_24','id_25','id_26','id_30','id_31','id_33',
                 'DeviceInfo','card6','P_emaildomain','R_emaildomain','card1','card2','card3','card5','addr1','addr2','hour','device_version',
                 'OS_id_30','browser_id_31']

In [None]:
# Give missing categorical values their own explicit 'missing' level
for frame in (train, test):
    for col in cat_columns:
        frame[col] = frame[col].fillna('missing')

In [None]:
# Every remaining NaN becomes the sentinel -999.
for frame in (train, test):
    frame.fillna(-999, inplace=True)

In [None]:
# Count encoding: how often each value occurs in train and test combined
for i in count_columns:
    # Build the frequency table once per column; it is identical for both frames.
    full_counts = pd.concat([train[i], test[i]], ignore_index=True).value_counts(dropna=False)
    train[i+'_count_full'] = train[i].map(full_counts)
    test[i+'_count_full'] = test[i].map(full_counts)

In [None]:
# Number of transactions (train and test together) on each calendar day and in each clock hour
train_test_all = pd.concat(
    [train[['TransactionDT', 'TransactionAmt']], test[['TransactionDT', 'TransactionAmt']]],
    ignore_index=True, sort=False,
)
stamps = train_test_all.TransactionDT
day_key = stamps.dt.date
hour_key = stamps.dt.strftime('%Y-%m-%d %H')  # same text as the first 13 characters of str(timestamp)
train_test_all['day_count'] = train_test_all.groupby(day_key)['TransactionAmt'].transform('count')
train_test_all['hour_count'] = train_test_all.groupby(hour_key)['TransactionAmt'].transform('count')
# The first train_len rows of the concatenation are train, the rest are test.
for count_name in ('day_count', 'hour_count'):
    train[count_name] = train_test_all[count_name].iloc[:train_len].tolist()
    test[count_name] = train_test_all[count_name].iloc[train_len:].tolist()

In [None]:
# Identify a commodity ("ProductID") by the combination of price and product category
for frame in (train, test):
    frame['ProductID'] = frame['TransactionAmt'].astype(str) + '_' + frame['ProductCD'].astype(str)
# Register the column under the name it actually has in the frames. (The
# earlier version appended the pre-rename name 'TransactionAmt__ProductCD' and
# then called `train.index(...)`, which raises TypeError because
# DataFrame.index is not callable.)
cat_columns.append('ProductID')

# Frequency of each ProductID over train and test combined.
product_counts = pd.concat([train['ProductID'], test['ProductID']], ignore_index=True).value_counts(dropna=False)
train['ProductID_count_full'] = train['ProductID'].map(product_counts)
test['ProductID_count_full'] = test['ProductID'].map(product_counts)

In [None]:
# Similarly, cross further pairs of categories. Each entry is "<left>__<right>";
# the new column holds the two values joined with '_'.
temp = ['DeviceInfo__P_emaildomain', 
        'card1__card5', 
        'card2__id_20',
        'card5__P_emaildomain', 
        'addr1__card1',
        'addr1__addr2',
        'card1__card2',
        'card2__addr1',
        'card1__P_emaildomain',
        'card2__P_emaildomain',
        'addr1__P_emaildomain',
        'DeviceInfo__id_31',
        'DeviceInfo__id_20',
        'DeviceType__id_31',
        'DeviceType__id_20',
        'DeviceType__P_emaildomain',
        'card1__M4',
        'card2__M4',
        'addr1__M4',
        'P_emaildomain__M4',
       'uid1__ProductID',
       'uid1__DeviceInfo']
for feature in temp:
  f1, f2 = feature.split('__')
  train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
  test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)
  cat_columns.append(feature)

# Count-encode every crossed feature over train and test combined.
for i in temp:
  # One frequency table per feature, shared by both frames.
  full_counts = pd.concat([train[i], test[i]], ignore_index=True).value_counts(dropna=False)
  train[i+'_count_full'] = train[i].map(full_counts)
  test[i+'_count_full'] = test[i].map(full_counts)

In [None]:
# Cross some categories and continuous variables: for every (continuous,
# category) pair, add the mean and standard deviation of the continuous value
# within the row's category, computed over train and test together.
con_fea = ['V258','C1','C14','C13','TransactionAmt','D15_revised','D2_revised','id_02','dist1','V294','C11']
cat_fea = ['card1','card2','addr1','card4','R_emaildomain','P_emaildomain','ProductID','uid1']
train_test = pd.concat([train[con_fea+cat_fea],test[con_fea+cat_fea]],ignore_index=True,sort=False)

for cont in con_fea:
  # -999 is the missing-value sentinel written earlier; turn it back into NaN
  # (once per column) so it does not distort the statistics.
  observed = train_test[cont].where(train_test[cont] != -999)
  for group_col in cat_fea:
    grouped = observed.groupby(train_test[group_col])
    group_mean = grouped.transform('mean')
    group_std = grouped.transform('std')
    # The first train_len rows of the concatenation are train, the rest test.
    train[cont+'_'+group_col+'_mean'] = group_mean[:train_len].tolist()
    train[cont+'_'+group_col+'_std'] = group_std[:train_len].tolist()
    test[cont+'_'+group_col+'_mean'] = group_mean[train_len:].tolist()
    test[cont+'_'+group_col+'_std'] = group_std[train_len:].tolist()

In [None]:
# Release the helper frames built above. `df` is the leftover loop variable
# from earlier `for df in [train, test]` loops.
del df
del train_test
del train_test_all
gc.collect()

In [None]:
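# Columns to remove: raw columns and low-importance features (presumably chosen from an earlier feature-importance ranking - TODO confirm)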
drop = ['DeviceInfo','device_version','DT_D','DT_W','DT_M','D15',
             'D2','D1','D4','D6','D10','D11','D12','D3','D5','D7','D8','D13','D14','TransactionAmt_ProductID_mean','V256', 'V223', 'V19', 'V244', 'V324', 'V37', 'V200', 'card3', 'C1_P_emaildomain_mean', 'V131', 'V35', 'id_17_count_full', 
        'id_17', 'V30', 'V163', 'V81', 'V332', 'V164', 'D7_revised', 'id_02_R_emaildomain_std', 'V336', 'M9', 'V301', 'V251', 'M8', 
        'V275', 'V5', 'V272', 'V331', 'V215', 'V291', 'V129', 'C11_P_emaildomain_mean', 'id_34', 'V160', 'V139', 'V124', 'V159',
        'V274', 'V59', 'V126', 'V52', 'V79', 'V271', 'V24', 'V137', 'V286', 'TransactionAmt_R_emaildomain_std', 'V335', 'V115', 
        'V198', 'V234', 'V298', 'V43', 'V258_card4_mean', 'V264', 'OS_id_30_count_full', 'V267', 'V169', 'V217', 'C3', 'V23', 
        'V287', 'id_18_count_full', 'V96', 'V208', 'card4', 'id_32', 'V232', 'V188', 'V4', 'V7', 'id_38', 'id_02_R_emaildomain_mean', 
        'V293', 'V219', 'V259', 'V276', 'C1_R_emaildomain_mean', 'V279', 'V102', 'V253', 'C13_R_emaildomain_std', 'V245', 'V73', 
        'browser_id_31', 'V74', 'V322', 'V209', 'V203', 'V273', 'V221', 'V40', 'V242', 'V289', 'D15_revised_R_emaildomain_std', 
        'C14_R_emaildomain_std', 'V150', 'V316', 'V239', 'V265', 'V278', 'V166', 'V172', 'V132', 'V93', 'V58', 'C1_R_emaildomain_std', 
        'V29', 'V300', 'V134', 'V254', 'V145', 'V141', 'C11_P_emaildomain_std', 'V292', 'V210', 'V231', 'V280', 'V158', 'V123', 'V135', 
        'V220', 'V39', 'V26', 'V238', 'id_11', 'V319', 'V125', 'id_37', 'V206', 'C1_card4_mean', 'V94', 'V304', 'V57', 'V270', 'V33', 
        'V170', 'V202', 'V218', 'V108', 'V303', 'V213', 'V222', 'V64', 'V263', 'V326', 'V10', 'V147', 'V101', 'V142', 'V97', 'V214', 
        'V105', 'V60', 'V171', 'V329', 'ProductCD', 'V216', 'V34', 'V25', 'V6', 'TransactionAmt_card4_mean', 'V212', 'V250', 'V3',
        'V63', 'V194', 'id_36', 'V178', 'V42', 'V85', 'V193', 'V290', 'id_23', 'V258_card4_std', 'V15', 'V288', 'id_15', 'V182', 'V2', 
        'V192', 'V260', 'V235', 'id_26_count_full', 'V138', 'id_24', 'id_10', 'C1_card4_std', 'V11', 'id_08', 'id_25_count_full', 'id_07', 
        'V167', 'V51', 'V229', 'V248', 'V197', 'V230', 'V144', 'V233', 'V157', 'dist1_card4_std', 'V284', 'V140', 'addr2_count_full', 'V154', 
        'V22', 'V204', 'M1', 'V71', 'V211', 'V255', 'V72', 'TransactionAmt_card4_std', 'V1', 'V80', 'V184', 'V299', 'C11_R_emaildomain_mean', 
        'V173', 'V177', 'id_04', 'D15_revised_card4_std', 'V180', 'V228', 'V151', 'V186', 'OS_id_30', 'V109', 'DeviceType', 'V18', 'V17', 
        'id_26', 'V247', 'V9', 'V191', 'V148', 'V65', 'V196', 'id_21', 'V297', 'V46', 'V338', 'addr2', 'V95', 'V92', 'dist1_card4_mean', 
        'V334', 'V100', 'id_25', 'V179', 'V104', 'V116', 'V16', 'V183', 'id_21_count_full', 'V302', 'V199', 'V227', 'C11_R_emaildomain_std', 
        'V176', 'V249', 'V237', 'V327', 'id_16', 'V155', 'V8', 'V252', 'V175', 'V339', 'V330', 'V181', 'V190', 'C14_card4_mean', 'V14', 'V337', 
        'C14_card4_std', 'id_35', 'id_02_card4_mean', 'V110', 'id_12', 'V226', 'V168', 'V21', 'V153', 'V195', 'id_02_card4_std', 'V236', 'V174', 
        'id_28', 'V84', 'V32', 'V106', 'V41', 'V111', 'V112', 'V114', 'V146', 'V328', 'V50', 'id_29', 'C13_card4_mean', 'V103', 'V98', 'V121', 
        'id_24_count_full', 'D2_revised_card4_mean', 'had_id', 'V113', 'D2_revised_card4_std', 'V240', 'TransactionAmt_ProductID_std', 'V185', 
        'id_22_count_full', 'id_22', 'V31', 'C13_card4_std', 'V68', 'V88', 'V294_card4_std', 'V294_card4_mean', 'V122', 'dist1_R_emaildomain_mean', 
        'V118', 'V269', 'V107', 'V305', 'V117', 'V119', 'V120', 'C11_card4_mean', 'C11_card4_std', 'dist1_R_emaildomain_std', 'V89', 'V241', 'id_27', 
        'V325', 'V28', 'D15_revised_card4_mean', 'V27']

In [None]:
# Remove the listed columns from both frames, in two batches of the drop list.
drop1, drop2 = drop[:200], drop[200:]

for frame in (train, test):
    for batch in (drop1, drop2):
        frame.drop(columns=batch, inplace=True)

In [None]:
# Separate the label from the features and leave out the identifier and timestamp columns.
y_train = train['isFraud'].copy()
X_train = train.drop(columns=['TransactionID', 'isFraud', 'TransactionDT'])
X_test = test.drop(columns=['TransactionID', 'TransactionDT'])

In [None]:
# Everything needed from here on lives in X_train / X_test / y_train.
del train,test

In [None]:
# Shapes of the final feature matrices.
print(X_train.shape)
print(X_test.shape)

In [None]:
# Candidate categorical feature names for the models. Names that were dropped
# are removed from this list in the next cell, and it is later intersected
# with the columns actually present in X_train.
cat = ['uid1','id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
       'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
       'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9','hour','dow','device_name', 'OS_id_30',  'browser_id_31','ProductID',
       'DeviceInfo__P_emaildomain', 'card1__card5', 'card2__id_20','card5__P_emaildomain', 'addr1__card1','addr1__addr2','card1__card2','card2__addr1','card1__P_emaildomain',
        'card2__P_emaildomain','addr1__P_emaildomain','DeviceInfo__id_31','DeviceInfo__id_20','DeviceType__id_31','DeviceType__id_20','DeviceType__P_emaildomain',
        'card1__M4','card2__M4','addr1__M4','P_emaildomain__M4','uid1__ProductID','uid1__DeviceInfo']
        

In [None]:
# Forget every categorical name that was just dropped from the frames.
# The lists are edited in place so existing references stay valid.
dropped_names = set(drop)
cat[:] = [name for name in cat if name not in dropped_names]
cat_columns[:] = [name for name in cat_columns if name not in dropped_names]

In [None]:
# device_hash was only a helper for the uid/device counts; it is not a model feature.
for features in (X_train, X_test):
    features.drop(columns=['device_hash'], inplace=True)

In [None]:
import os

# Persist the engineered features so the modelling section can reload them.
processed_dir = '/kaggle/working/processed'
os.makedirs(processed_dir, exist_ok=True)  # ensure the directory exists

for stem, payload in (('X_train2', X_train), ('X_test2', X_test), ('y_train2', y_train.to_frame())):
    payload.to_pickle(f'{processed_dir}/{stem}.pkl')

## Model

In [None]:
# Reload the engineered features; the label was stored as a one-column frame.
X_train = pd.read_pickle("/kaggle/working/processed/X_train2.pkl")
X_test = pd.read_pickle("/kaggle/working/processed/X_test2.pkl")
y_train = pd.read_pickle("/kaggle/working/processed/y_train2.pkl").isFraud

In [None]:
# Keep only the categorical names that are real columns of X_train.
cat = set(X_train.columns).intersection(cat)

In [None]:
# For every categorical column, report how much of its vocabulary is shared
# between train and test, then replace values seen in only one split by -999.
for column in cat:
    train_set, test_set = set(X_train[column]), set(X_test[column])
    tt = train_set & test_set
    print('----------------------------------------')
    print(column)
    print(f'train:{len(tt)/len(train_set)}')
    print(f'test:{len(tt)/len(test_set)}')

    def keep_shared(value, shared=tt):
        # A value survives only if it occurs in both splits.
        return value if value in shared else -999

    X_train[column] = X_train[column].map(keep_shared)
    X_test[column] = X_test[column].map(keep_shared)

### CatBoost

In [None]:
## Memory Reducer
# :df pandas dataframe to reduce size
# type: pd.DataFrame()
# :verbose
# type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    A column whose dtype is one of int16/32/64 or float16/32/64 is converted
    to the first type of its ladder (int8 -> int64, or float16 -> float32)
    whose limits strictly contain the column's min and max. A float column
    that fits neither float type becomes float64. Other columns are left
    alone. With ``verbose`` the resulting size and the reduction are printed.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)
    megabyte = 1024**2

    start_mem = df.memory_usage().sum() / megabyte
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_ladder:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_ladder:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # No narrower float type fits (this is also the path taken
                # when min/max are NaN, since every comparison is False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / megabyte
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


In [None]:
%%time
# Downcast the numeric columns of the training features (prints the new size).
X_train = reduce_mem_usage(X_train)

In [None]:
cat = list(set(cat) & set(X_train.columns))

N_SPLITS = 5


def load_processed_frame(stem):
    """Reload a pickled feature frame, downcast it and prepare it for CatBoost.

    CatBoost only accepts integer or string values in categorical features,
    while these columns mix numbers with the 'missing' placeholder, so every
    column listed in ``cat`` is converted to text.
    """
    frame = reduce_mem_usage(pd.read_pickle(f"{processed_dir}/{stem}.pkl"), verbose=False)
    frame[cat] = frame[cat].astype(str)
    return frame


kf = KFold(n_splits=N_SPLITS)
resu1 = 0
impor1 = 0
y_pred = 0
n_rows = X_train.shape[0]
stack_train = np.zeros([n_rows,])

# Initialize lists to store metrics for each fold
auc_scores = []
f1_scores = []
recall_scores = []
precision_scores = []
conf_matrices = []  # Initialize the list for confusion matrices

# The full frames are reloaded from disk inside every fold so that they never
# sit in memory next to the fold copies during training; release them here.
# NOTE(review): reloading means the rare-category masking applied to the
# in-memory X_train / X_test earlier is not seen by CatBoost - confirm intent.
del X_train, X_test
gc.collect()

# Unshuffled KFold only needs the number of rows to build its index splits.
for train_index, test_index in kf.split(np.zeros(n_rows)):
    fold_source = load_processed_frame("X_train2")
    X_train2 = fold_source.iloc[train_index, :]
    y_train2 = y_train.iloc[train_index]
    X_test2 = fold_source.iloc[test_index, :]
    y_test2 = y_train.iloc[test_index]

    del fold_source
    print("check1")

    # NOTE(review): devices="0,1" asks for two GPUs although the first cell
    # sets CUDA_VISIBLE_DEVICES="1" - confirm the intended device list.
    clf = cb.CatBoostClassifier(
        n_estimators=100000,
        random_state=0,
        learning_rate=0.1,
        depth=10,
        task_type="GPU",
        devices="0,1",
        early_stopping_rounds=400,
        eval_metric="AUC",
        border_count=254,
        l2_leaf_reg=2,
    )
    # The string-valued columns must be declared categorical; otherwise
    # CatBoost tries to parse them as numbers and fails.
    clf.fit(X_train2, y_train2, eval_set=(X_test2, y_test2), cat_features=cat, verbose=100)
    del X_train2, y_train2

    print("check2")

    # Predict probabilities and labels
    temp_predict_proba = clf.predict_proba(X_test2)[:, 1]
    temp_predict_labels = clf.predict(X_test2)

    # Store predictions for stacking
    stack_train[test_index] = temp_predict_proba

    # Calculate metrics
    auc = roc_auc_score(y_test2, temp_predict_proba)
    f1 = f1_score(y_test2, temp_predict_labels)
    recall = recall_score(y_test2, temp_predict_labels)
    precision = precision_score(y_test2, temp_predict_labels)

    # Append metrics to lists
    auc_scores.append(auc)
    f1_scores.append(f1)
    recall_scores.append(recall)
    precision_scores.append(precision)

    # Print metrics for the current fold
    print(f"Fold Metrics: AUC = {auc:.4f}, F1 = {f1:.4f}, Recall = {recall:.4f}, Precision = {precision:.4f}")

    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test2, temp_predict_labels)
    conf_matrices.append(conf_matrix)
    print("Confusion Matrix:")
    print(conf_matrix)
    
    del X_test2, y_test2

    print("check3")

    # Average the fold models' predictions on the real test set.
    holdout = load_processed_frame("X_test2")
    y_pred += clf.predict_proba(holdout)[:, 1] / N_SPLITS
    del holdout

    print("check4")

    resu1 += auc / N_SPLITS
    impor1 += clf.feature_importances_ / N_SPLITS
    gc.collect()

# Put the feature frames back so that later cells can keep using them (the
# loop used to leave both names deleted, breaking every following cell).
X_train = reduce_mem_usage(pd.read_pickle(f"{processed_dir}/X_train2.pkl"), verbose=False)
X_test = reduce_mem_usage(pd.read_pickle(f"{processed_dir}/X_test2.pkl"), verbose=False)

# Print average metrics across all folds
print(f"Average Metrics: AUC = {np.mean(auc_scores):.4f}, F1 = {np.mean(f1_scores):.4f}, Recall = {np.mean(recall_scores):.4f}, Precision = {np.mean(precision_scores):.4f}")
print("Confusion Matrices for Each Fold:")
for i, cm in enumerate(conf_matrices):
    print(f"Fold {i + 1}:")
    print(cm)
print(f'End AUC: {resu1}')

#### CatBoost Submission

### Extreme Gradient Boosting (XGBoost)

In [None]:
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS)
resu1 = 0
impor1 = 0
y_pred = 0

# Start from the pickled feature frames so this cell does not depend on what
# earlier modelling cells did to (or deleted from) X_train / X_test.
X_train = pd.read_pickle(f"{processed_dir}/X_train2.pkl")
X_test = pd.read_pickle(f"{processed_dir}/X_test2.pkl")

# XGBoost needs purely numeric input, so every non-numeric column is target
# encoded. Taking the columns from the frame itself (rather than from
# `cat_columns`) avoids names that are no longer present. The values are cast
# to text first because these columns mix numbers with the 'missing' string.
encode_cols = [c for c in X_train.columns if not pd.api.types.is_numeric_dtype(X_train[c])]
X_train[encode_cols] = X_train[encode_cols].astype(str)
X_test[encode_cols] = X_test[encode_cols].astype(str)

stack_train = np.zeros([X_train.shape[0],])

# Initialize lists to store metrics for each fold
auc_scores = []
f1_scores = []
recall_scores = []
precision_scores = []
conf_matrices = []  # Initialize the list for confusion matrices

for train_index, test_index in kf.split(X_train, y_train):
    # Independent copies: the encoder output must not write into X_train.
    X_train2 = X_train.iloc[train_index, :].copy()
    y_train2 = y_train.iloc[train_index]
    X_test2 = X_train.iloc[test_index, :].copy()
    y_test2 = y_train.iloc[test_index]

    # Fit the target encoder on the training part of the fold only, then apply
    # the same mapping to the validation part and to the real test set.
    encoder = TargetEncoder(cols=encode_cols, smoothing=5)
    X_train2 = encoder.fit_transform(X_train2, y_train2)
    X_test2 = encoder.transform(X_test2)
    X_test_encoded = encoder.transform(X_test)

    # `n_gpus` is no longer an XGBoost parameter and has been dropped.
    # NOTE(review): tree_method='gpu_hist' is deprecated in recent XGBoost
    # releases in favour of device='cuda' - confirm against the installed version.
    clf = xgb.XGBClassifier(
        n_estimators=100000,
        max_depth=11,
        learning_rate=0.01,
        random_state=0,
        subsample=0.8,
        colsample_bytree=0.6,
        min_child_weight=3,
        reg_alpha=1,
        reg_lambda=0.01,
        n_jobs=-1,
        tree_method='gpu_hist',
        eval_metric='auc',
        early_stopping_rounds=500
    )
    clf.fit(X_train2, y_train2, eval_set=[(X_test2, y_test2)], verbose=30)
    del X_train2, y_train2

    # Predict probabilities and labels
    temp_predict_proba = clf.predict_proba(X_test2)[:, 1]
    temp_predict_labels = clf.predict(X_test2)

    # Store predictions for stacking
    stack_train[test_index] = temp_predict_proba

    # Calculate metrics
    auc = roc_auc_score(y_test2, temp_predict_proba)
    f1 = f1_score(y_test2, temp_predict_labels)
    recall = recall_score(y_test2, temp_predict_labels)
    precision = precision_score(y_test2, temp_predict_labels)

    # Append metrics to lists
    auc_scores.append(auc)
    f1_scores.append(f1)
    recall_scores.append(recall)
    precision_scores.append(precision)

    # Print metrics for the current fold
    print(f"Fold Metrics: AUC = {auc:.4f}, F1 = {f1:.4f}, Recall = {recall:.4f}, Precision = {precision:.4f}")

    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_test2, temp_predict_labels)
    conf_matrices.append(conf_matrix)
    print("Confusion Matrix:")
    print(conf_matrix)
    
    # Update predictions and feature importances
    y_pred += clf.predict_proba(X_test_encoded)[:, 1] / N_SPLITS
    resu1 += auc / N_SPLITS
    impor1 += clf.feature_importances_ / N_SPLITS
    
    del X_test2, y_test2, X_test_encoded
    gc.collect()

# Print average metrics across all folds
print(f"Average Metrics: AUC = {np.mean(auc_scores):.4f}, F1 = {np.mean(f1_scores):.4f}, Recall = {np.mean(recall_scores):.4f}, Precision = {np.mean(precision_scores):.4f}")
print("Confusion Matrices for Each Fold:")
for i, cm in enumerate(conf_matrices):
    print(f"Fold {i + 1}:")
    print(cm)
print(f'End AUC: {resu1}')

In [None]:
# Fill the competition's sample-submission template with the fold-averaged
# test probabilities held in `y_pred`, then save it as the submission file.
sample_submission = pd.read_csv(
    "/kaggle/input/ieee-fraud-detection/sample_submission.csv"
).assign(isFraud=y_pred)

sample_submission.to_csv("/kaggle/working/submission.csv", index=False)


#### XGBoost Submission

In [None]:
# Save the XGBoost test-set predictions in submission format ...
result = pd.read_csv(f'{folder_path}sample_submission.csv').assign(isFraud=y_pred)
result.to_csv('/kaggle/working/xgb.csv', index=False)

# ... and the out-of-fold training predictions, for later stacking/ensembling.
df = pd.DataFrame({'train': stack_train})
df.to_csv('/kaggle/working/xgb_train.csv', index=False)

#### Leaderboard Score
- **Public Score** : 0.960077 
- **Private Score** : 0.935508

### LightGBM

#### LightGBM Submission

#### Leaderboard Score
- **Public Score** : 0.961445 
- **Private Score** : 0.938790

In [None]:
# K-fold cross-validated LightGBM.
# Produces: `stack_train` (out-of-fold probabilities for stacking), `y_pred`
# (fold-averaged test-set probabilities), `resu1` (mean fold AUC) and `impor1`
# (mean gain-based feature importances), plus per-fold metric lists.
kf = KFold(n_splits=5)
n_folds = kf.get_n_splits()  # single source of truth for the fold averages below
resu1 = 0
impor1 = 0
y_pred = 0
stack_train = np.zeros([X_train.shape[0],])

# Initialize lists to store metrics for each fold
auc_scores = []
f1_scores = []
recall_scores = []
precision_scores = []
conf_matrices = []  # To store confusion matrices for each fold

for train_index, test_index in kf.split(X_train, y_train):
    X_train2 = X_train.iloc[train_index, :]
    y_train2 = y_train.iloc[train_index]
    X_test2 = X_train.iloc[test_index, :]
    y_test2 = y_train.iloc[test_index]

    clf = lgb.LGBMClassifier(
        n_estimators=100000,
        random_state=42,
        subsample=0.7,
        device="gpu",
        gpu_platform_id=0,
        gpu_device_id=1,
        gpu_use_dp=True,
        colsample_bytree=0.7, learning_rate=0.005, importance_type='gain',
        max_depth=-1,
        num_leaves=256,
        min_child_samples=20,
        min_split_gain=0.001,
        bagging_freq=1,
        reg_alpha=0,
        reg_lambda=0,
        n_jobs=-1,
        metric="None",
    )
    # Early stopping has to be registered as a training callback. Calling
    # something on the booster after fit() cannot stop training (and Booster
    # has no `early_stopping` method), so all 100000 rounds would be trained.
    clf.fit(
        X_train2,
        y_train2,
        eval_set=[(X_test2, y_test2)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(500)],
    )

    # Predict probabilities and labels on the held-out fold
    temp_predict_proba = clf.predict_proba(X_test2)[:, 1]
    temp_predict_labels = clf.predict(X_test2)

    # Store predictions for stacking
    stack_train[test_index] = temp_predict_proba

    # Calculate metrics
    auc = roc_auc_score(y_test2, temp_predict_proba)
    f1 = f1_score(y_test2, temp_predict_labels)
    recall = recall_score(y_test2, temp_predict_labels)
    precision = precision_score(y_test2, temp_predict_labels)
    conf_matrix = confusion_matrix(y_test2, temp_predict_labels)

    # Append metrics to lists
    auc_scores.append(auc)
    f1_scores.append(f1)
    recall_scores.append(recall)
    precision_scores.append(precision)
    conf_matrices.append(conf_matrix)

    # Print metrics for the current fold
    print(f"Fold Metrics: AUC = {auc:.4f}, F1 = {f1:.4f}, Recall = {recall:.4f}, Precision = {precision:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)

    # Update predictions for the test set
    y_pred += clf.predict_proba(X_test)[:, 1] / n_folds

    # Update average AUC
    resu1 += auc / n_folds

    # Update feature importances
    impor1 += clf.feature_importances_ / n_folds

    # Garbage collection
    gc.collect()

# Print average metrics across all folds
print(f"Average Metrics: AUC = {np.mean(auc_scores):.4f}, F1 = {np.mean(f1_scores):.4f}, Recall = {np.mean(recall_scores):.4f}, Precision = {np.mean(precision_scores):.4f}")
print("Confusion Matrices for Each Fold:")
for i, cm in enumerate(conf_matrices):
    print(f"Fold {i + 1}:")
    print(cm)

print(f"End AUC: {resu1}")

In [None]:
# Save the LightGBM test-set predictions in submission format ...
result = pd.read_csv(f'{folder_path}sample_submission.csv').assign(isFraud=y_pred)
result.to_csv(f'{folder_path}lgbm.csv', index=False)

# ... and the out-of-fold training predictions, for later stacking/ensembling.
df = pd.DataFrame({'train': stack_train})
df.to_csv(f'{folder_path}lgb_train.csv', index=False)

#### Leaderboard Score
- **Public Score** : 0.958433 
- **Private Score** : 0.933450


## Ensemble 
Ensemble learning combines the predictions of several models. Because different models tend to make different errors, a blend usually achieves better predictive performance than any single model on its own.

In [None]:
# Reload only the target column from the raw training file.
# Note: this rebinds `train_transaction` to a one-column frame.
train_transaction = pd.read_csv(
    f'{folder_path}train_transaction.csv',
    usecols=['isFraud'],
)
y_train = train_transaction['isFraud']

In [None]:
# Load each model's test-set submission and its out-of-fold training predictions.
# The XGBoost submission is bound to `xgb_sub` rather than `xgb`, so the
# xgboost module imported at the top of the notebook is not shadowed (which
# would break any later re-run of the XGBoost training cell).
# LightGBM outputs are currently not loaded here:
#lgb_sub = pd.read_csv(f'{folder_path}lgbm.csv')
#lgb_train = pd.read_csv(f'{folder_path}lgb_train.csv')
xgb_sub = pd.read_csv(f'{folder_path}xgb.csv')
xgb_train = pd.read_csv(f'{folder_path}xgb_train.csv')
cat = pd.read_csv(f'{folder_path}catboost.csv')
cat_train = pd.read_csv(f'{folder_path}catboost_train.csv')

### ROC_AUC Score

In [None]:
# ROC AUC of each model's out-of-fold training predictions against the true labels.
#print(roc_auc_score(y_train.values,lgb_train.train.values))
for oof_frame in (xgb_train, cat_train):
    print(roc_auc_score(y_train.values, oof_frame.train.values))

#### Ensemble Submission

In [None]:
# Blend the LightGBM and CatBoost test-set predictions into the final submission.
# `lgb` is the lightgbm module imported at the top of the notebook, not a
# DataFrame, so the LightGBM submission is read back from the file written by
# the LightGBM submission cell (`lgbm.csv`).
LGB_WEIGHT = 0.8
CAT_WEIGHT = 0.2

lgb_sub = pd.read_csv(f'{folder_path}lgbm.csv')
sub = lgb_sub.copy()
sub['isFraud'] = LGB_WEIGHT * lgb_sub['isFraud'] + CAT_WEIGHT * cat['isFraud']
sub.to_csv(f'{folder_path}ensemble_model.csv', index=False)

#### Final Leaderboard Score
- **Public Score** : 0.963487 
- **Private Score** : 0.941655
