# Data dictionary https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#latest-601500

In [2]:
# Load required libraries
import numpy as np
import pandas as pd

# Custom utils from kesh-utils (Check source code: https://github.com/KeshavShetty/kesh-utils)
from KUtils.common import utils
from KUtils.eda import chartil
from KUtils.eda import data_preparation as dp
from KUtils.classifier import generic_classifier_utils as gcu

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, recall_score, precision_score

import datetime 

In [3]:
# Show all column in head()
pd.set_option('display.max_columns', None)

In [4]:
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [5]:
# Train data 
train_transaction_df = pd.read_csv('train_transaction.csv')
train_identity_df = pd.read_csv('train_identity.csv')

# Test data
test_transaction_df = pd.read_csv('test_transaction.csv')
test_identity_df = pd.read_csv('test_identity.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [1]:
identinty_columns = list(train_identity_df.columns)
identinty_columns.remove('TransactionID')

NameError: name 'train_identity_df' is not defined

In [18]:
print(train_identity_df.shape)
print(train_transaction_df.shape)
print(test_identity_df.shape)
print(test_transaction_df.shape)

(144233, 41)
(423033, 394)
(141907, 41)
(506691, 393)


In [6]:
train_df = pd.merge(train_transaction_df, train_identity_df, how='left', on='TransactionID') # Used left as Indentity may not exist for all transactions
test_df = pd.merge(test_transaction_df, test_identity_df, how='left', on='TransactionID')
print(train_df.shape)
print(test_df.shape)

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [8]:
# Clean the memory
del train_transaction_df
del train_identity_df
del test_transaction_df
del test_identity_df

In [9]:
train_inner_df = train_df.loc[~(np.isnan(train_df['id_01']))]
train_outer_df = train_df.loc[np.isnan(train_df['id_01'])]
train_outer_df = train_outer_df.drop(identinty_columns, axis=1)

test_inner_df = test_df.loc[~(np.isnan(test_df['id_01']))]
test_outer_df = test_df.loc[np.isnan(test_df['id_01'])]
test_outer_df = test_outer_df.drop(identinty_columns, axis=1)

del train_df
del test_df

In [10]:
print(train_inner_df.shape)
print(train_outer_df.shape)
   
print(test_inner_df.shape)
print(test_outer_df.shape)

(144233, 434)
(446307, 394)
(141907, 433)
(364784, 393)


In [11]:
train_inner_df['isFraud']=train_inner_df['isFraud'].astype('category')
train_outer_df['isFraud']=train_outer_df['isFraud'].astype('category')

In [12]:
# card1 - card6: payment card information, such as card type, card category, issue bank, country, etc. 
# Converting it to categorical column
for card_col in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']:
    train_inner_df[card_col]=train_inner_df[card_col].astype('category')
    train_outer_df[card_col]=train_outer_df[card_col].astype('category')

    test_inner_df[card_col]=test_inner_df[card_col].astype('category')
    test_outer_df[card_col]=test_outer_df[card_col].astype('category')

In [13]:
# addr1-addr2 : address
# Converting it to categorical column
for addr_col in ['addr1', 'addr2']:
    train_inner_df[addr_col]=train_inner_df[addr_col].astype('category')
    train_outer_df[addr_col]=train_outer_df[addr_col].astype('category')

    test_inner_df[addr_col]=test_inner_df[addr_col].astype('category')
    test_outer_df[addr_col]=test_outer_df[addr_col].astype('category')

In [14]:
# Drop columns with more than 90% nan
def dropLargeNanCols(df):
    colNulls = (df.isnull().sum(axis=0))*100/(df.shape[0])
    colNulls = colNulls[colNulls>90]
    nullcolumns = list(colNulls.index)
    print('Dropping below columns ')
    print(colNulls)
    for aCol in nullcolumns:        
        df.drop([aCol], axis=1, inplace=True)        
    print('Done')
    return df, nullcolumns

train_inner_df, inner_columns_to_drop = dropLargeNanCols(train_inner_df)
train_outer_df, outer_columns_to_drop = dropLargeNanCols(train_outer_df) # Use inner_columns_to_drop & outer_columns_to_drop lists to drop columns from test_df

test_inner_df.drop(inner_columns_to_drop, axis=1, inplace=True)
test_outer_df.drop(outer_columns_to_drop, axis=1, inplace=True)

Dropping below columns 
dist1    100.000000
D11      100.000000
M1       100.000000
M2       100.000000
M3       100.000000
M5       100.000000
M6       100.000000
M7       100.000000
M8       100.000000
M9       100.000000
V1       100.000000
V2       100.000000
V3       100.000000
V4       100.000000
V5       100.000000
V6       100.000000
V7       100.000000
V8       100.000000
V9       100.000000
V10      100.000000
V11      100.000000
id_07     96.425922
id_08     96.425922
id_21     96.423149
id_22     96.416215
id_23     96.416215
id_24     96.708798
id_25     96.441868
id_26     96.420375
id_27     96.416215
dtype: float64
Done
Dropping below columns 
dist2             99.992382
R_emaildomain     98.609029
D6                98.697309
D7                99.377783
D8               100.000000
D9               100.000000
D12               98.698654
D13               99.996415
D14               99.995071
V138              99.930989
V139              99.930989
V140              99.930

In [15]:
print(inner_columns_to_drop)
print(outer_columns_to_drop)

['dist1', 'D11', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'id_07', 'id_08', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27']
['dist2', 'R_emaildomain', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V

In [16]:
def extract_date_info(df):
    
    # create date column
    START_DATE = '2017-12-01'
    startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    df['TransactionDT'] = df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
    
    df['year'] = df['TransactionDT'].dt.year
    df['month'] = df['TransactionDT'].dt.month
    df['dow'] = df['TransactionDT'].dt.dayofweek
    df['hour'] = df['TransactionDT'].dt.hour
    df['day'] = df['TransactionDT'].dt.day 

    return df

train_inner_df = extract_date_info(train_inner_df)
train_outer_df = extract_date_info(train_outer_df)

test_inner_df = extract_date_info(test_inner_df)
test_outer_df = extract_date_info(test_outer_df)

train_inner_df.drop('TransactionDT', axis=1, inplace=True)
train_outer_df.drop('TransactionDT', axis=1, inplace=True)

test_inner_df.drop('TransactionDT', axis=1, inplace=True)
test_outer_df.drop('TransactionDT', axis=1, inplace=True)

In [17]:
def id_split(df, isInner):
    if isInner:
        df['device_name'] = df['DeviceInfo'].str.split('/', expand=True)[0]
        df['device_version'] = df['DeviceInfo'].str.split('/', expand=True)[1]

        df['browser_id_31'] = df['id_31'].str.split(' ', expand=True)[0]
        df['version_id_31'] = df['id_31'].str.split(' ', expand=True)[1]
        
        df.loc[df['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
        df.loc[df['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
        df.loc[df['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
        df.loc[df['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
        df.loc[df['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
        df.loc[df['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
        df.loc[df['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
        df.loc[df['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
        df.loc[df['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
        df.loc[df['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
        df.loc[df['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
        df.loc[df['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
        df.loc[df['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
        df.loc[df['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
        df.loc[df['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
        df.loc[df['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
        df.loc[df['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

        df.loc[df.device_name.isin(df.device_name.value_counts()[df.device_name.value_counts() < 200].index), 'device_name'] = "Others"

        df['screen_width'] = df['id_33'].str.split('x', expand=True)[0]
        df['screen_height'] = df['id_33'].str.split('x', expand=True)[1]        
        df['id_34'] = df['id_34'].str.split(':', expand=True)[1] 
        
        # Replace all missing vlues with nan
        df['device_name'].fillna('_Unknown', inplace=True)
        df['device_version'].fillna('_Unknown', inplace=True)
        df['browser_id_31'].fillna('_Unknown', inplace=True)
        df['version_id_31'].fillna('_Unknown', inplace=True)
        df['screen_width'].fillna('_Unknown', inplace=True)
        df['screen_height'].fillna('_Unknown', inplace=True)
        df['id_34'].fillna('_Unknown', inplace=True)
        
        
        df['had_id'] = 1
    
    return df

train_inner_df = id_split(train_inner_df, True)
train_outer_df = id_split(train_outer_df, False) 

test_inner_df = id_split(test_inner_df, True)
test_outer_df = id_split(test_outer_df, False)

In [18]:
emails_provider = {'gmail': 'google', 'outlook': 'microsoft', 'yahoo': 'yahoo', 'mail': 'other', 'anonymous': 'other', 'hotmail': 'microsoft', 'verizon': 'yahoo', 'aol': 'other',
                   'me': 'apple', 'comcast': 'other', 'optonline': 'other', 'cox': 'other', 'charter': 'spectrum', 'rocketmail': 'yahoo', 'prodigy': 'att', 'embarqmail': 'centurylink',
                   'icloud': 'apple', 'live': 'microsoft','att': 'att','juno': 'other','ymail': 'yahoo','sbcglobal': 'att','bellsouth': 'other','msn': 'microsoft', 'q': 'centurylink',
                   'centurylink': 'centurylink','servicios-ta': 'other','earthlink': 'other','cfl': 'other','roadrunner': 'other','netzero': 'other','gmx': 'other','suddenlink': 'other',
                   'frontiernet': 'yahoo','windstream': 'other','frontier': 'yahoo','mac': 'apple','aim': 'aol','web': 'other','twc': 'spectrum','cableone': 'other','sc': 'other',
                   'ptd': 'other', 'protonmail' : 'other'}

In [19]:
def getDomainName(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split('.')
        if len(all_words)>0:
            return all_words[0]
        else:
            return np.nan
        
def getDomainGroup(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split('.')
        if len(all_words)>1:
            return all_words[len(all_words)-1]
        else:
            return np.nan
        
# Apply to P_emaildomain(train and test)    
train_inner_df['P_emaildomain_hostname'] = train_inner_df['P_emaildomain'].apply(getDomainName)
train_inner_df['P_emaildomain_domaingroup']  = train_inner_df['P_emaildomain'].apply(getDomainGroup)
train_inner_df['P_email_provider'] = train_inner_df['P_emaildomain_hostname'].map(emails_provider)

# Apply to R_emaildomain 
train_inner_df['R_emaildomain_hostname'] = train_inner_df['R_emaildomain'].apply(getDomainName)
train_inner_df['R_emaildomain_domaingroup']  = train_inner_df['R_emaildomain'].apply(getDomainGroup)
train_inner_df['R_email_provider'] = train_inner_df['R_emaildomain_hostname'].map(emails_provider)

test_inner_df['P_emaildomain_hostname'] = test_inner_df['P_emaildomain'].apply(getDomainName)
test_inner_df['P_emaildomain_domaingroup']  = test_inner_df['P_emaildomain'].apply(getDomainGroup)
test_inner_df['P_email_provider'] = test_inner_df['P_emaildomain_hostname'].map(emails_provider)

# Apply to R_emaildomain 
test_inner_df['R_emaildomain_hostname'] = test_inner_df['R_emaildomain'].apply(getDomainName)
test_inner_df['R_emaildomain_domaingroup']  = test_inner_df['R_emaildomain'].apply(getDomainGroup)
test_inner_df['R_email_provider'] = test_inner_df['R_emaildomain_hostname'].map(emails_provider)

# For outer only P_emaildomain. No need to apply on outer->R_emaildomain as it is removed from outer
train_outer_df['P_emaildomain_hostname'] = train_outer_df['P_emaildomain'].apply(getDomainName)
train_outer_df['P_emaildomain_domaingroup']  = train_outer_df['P_emaildomain'].apply(getDomainGroup)
train_outer_df['P_email_provider'] = train_outer_df['P_emaildomain_hostname'].map(emails_provider)

test_outer_df['P_emaildomain_hostname'] = test_outer_df['P_emaildomain'].apply(getDomainName)
test_outer_df['P_emaildomain_domaingroup']  = test_outer_df['P_emaildomain'].apply(getDomainGroup)
test_outer_df['P_email_provider'] = test_outer_df['P_emaildomain_hostname'].map(emails_provider)

# Drop original columns
train_inner_df.drop('P_emaildomain', axis=1, inplace=True)
train_inner_df.drop('R_emaildomain', axis=1, inplace=True)
test_inner_df.drop('P_emaildomain', axis=1, inplace=True)
test_inner_df.drop('R_emaildomain', axis=1, inplace=True)

train_outer_df.drop('P_emaildomain', axis=1, inplace=True)
test_outer_df.drop('P_emaildomain', axis=1, inplace=True)

In [20]:
test_inner_df['id_30'].unique()

array([nan, 'Android 6.0.1', 'iOS 11.4.0', 'Windows 7', 'iOS 10.1.1',
       'Windows 10', 'Mac OS X 10_13_5', 'iOS 11.3.0', 'Linux',
       'Android 5.1.1', 'iOS 9.3.5', 'Android 7.0', 'iOS 11.2.6',
       'Mac OS X 10.13', 'Android 8.0.0', 'Windows 8.1', 'Android',
       'Mac OS X 10_12_6', 'iOS 10.3.3', 'Mac', 'Windows 8',
       'Mac OS X 10_9_5', 'Android 7.1.1', 'iOS 11.1.1', 'Android 8.1.0',
       'iOS 11.2.5', 'Mac OS X 10_11_6', 'Android 9', 'Mac OS X 10_10_5',
       'iOS 11.2.2', 'iOS 11.3.1', 'Windows Vista', 'Windows XP',
       'Mac OS X 10.11', 'Mac OS X 10_8_5', 'Mac OS X 10_13_4',
       'iOS 10.3.2', 'iOS 11.2.1', 'iOS 12.0.0', 'Mac OS X 10_13_1',
       'iOS 10.0.2', 'Mac OS X 10_7_5', 'iOS 11.1.2', 'Mac OS X 10_12_1',
       'Mac OS X 10_13_3', 'iOS 11.0.1', 'Android 6.0', 'iOS 10.2.1',
       'iOS', 'Mac OS X 10.12', 'Mac OS X 10_13_2', 'Mac OS X 10_11_5',
       'Mac OS X 10_12_5', 'Mac OS X 10_13_6', 'Mac OS X 10.14',
       'Mac OS X 10_14_0', 'Mac OS X 10_11_

In [21]:
# id_30
def getMajorNameId30(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split(' ')
        if len(all_words)>0:
            return all_words[0]
        else:
            return np.nan
        
def getMinorNameId30(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split(' ')
        if len(all_words)>0:
            return all_words[len(all_words)-1]
        else:
            return np.nan
        
train_inner_df['id_30_major'] = train_inner_df['id_30'].apply(getMajorNameId30)
test_inner_df['id_30_major'] = test_inner_df['id_30'].apply(getMajorNameId30)

train_inner_df['id_30_minor'] = train_inner_df['id_30'].apply(getMinorNameId30)
test_inner_df['id_30_minor'] = test_inner_df['id_30'].apply(getMinorNameId30)
# Only inner has id_30

train_inner_df.drop('id_30', axis=1, inplace=True)
test_inner_df.drop('id_30', axis=1, inplace=True)

In [22]:
# For id_31 (dataset e.g: samsung browser 6.2)
def getMajorNameId31(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split(' ')
        if len(all_words)>1:
            return all_words[0]
        else:
            all_words = input_str.split('/')
            if len(all_words)>1:
                return all_words[0]
            else:
                return np.nan
            
def getMinorNameId31(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split(' ')
        if len(all_words)>1:
            return all_words[len(all_words)-1]
        else:
            all_words = input_str.split('/')
            if len(all_words)>1:
                return all_words[len(all_words)-1]
            else:
                return np.nan
            
train_inner_df['id_31_major'] = train_inner_df['id_31'].apply(getMajorNameId31)
test_inner_df['id_31_major'] = test_inner_df['id_31'].apply(getMajorNameId31)

train_inner_df['id_31_minor'] = train_inner_df['id_31'].apply(getMinorNameId31)
test_inner_df['id_31_minor'] = test_inner_df['id_31'].apply(getMinorNameId31)

# Only inner has id_30

train_inner_df.drop('id_31', axis=1, inplace=True)
test_inner_df.drop('id_31', axis=1, inplace=True)

In [23]:
# id_33 (e.g: 1920x1018)
def getXResId33(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split('x')
        if len(all_words)>0:
            return '-'+all_words[0]+'-'
        else:
            return np.nan
        
def getYResId33(input_str):
    if pd.isnull(input_str):
        return np.nan
    else:
        all_words = input_str.split('x')
        if len(all_words)>1:
            return '-'+all_words[1]+'-'
        else:
            return np.nan
        
train_inner_df['id_33_X'] = train_inner_df['id_33'].apply(getXResId33)
train_inner_df['id_33_Y'] = train_inner_df['id_33'].apply(getYResId33)

test_inner_df['id_33_X'] = test_inner_df['id_33'].apply(getXResId33)
test_inner_df['id_33_Y'] = test_inner_df['id_33'].apply(getYResId33)

train_inner_df.drop('id_33', axis=1, inplace=True)
test_inner_df.drop('id_33', axis=1, inplace=True)

In [24]:
def transactionAmtDecimal(df):     
    df['TransactionAmt_Log'] = np.log(df['TransactionAmt'])
    df['TransactionAmt_decimal'] = ((df['TransactionAmt'] - df['TransactionAmt'].astype(int)) * 1000).astype(int)
    return df

train_inner_df = transactionAmtDecimal(train_inner_df)
train_outer_df = transactionAmtDecimal(train_outer_df) 

test_inner_df = transactionAmtDecimal(test_inner_df)
test_outer_df = transactionAmtDecimal(test_outer_df) 

In [25]:
test_inner_df.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D12,D13,D14,D15,M4,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_28,id_29,id_32,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,year,month,dow,hour,day,device_name,device_version,browser_id_31,version_id_31,screen_width,screen_height,had_id,P_emaildomain_hostname,P_emaildomain_domaingroup,P_email_provider,R_emaildomain_hostname,R_emaildomain_domaingroup,R_email_provider,id_30_major,id_30_minor,id_31_major,id_31_minor,id_33_X,id_33_Y,TransactionAmt_Log,TransactionAmt_decimal
37,3663586,6.271,C,15885,545.0,185.0,visa,138.0,debit,,,,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,,,,0.0,,0.0,,,,0.0,0.0,0.0,0.0,0.0,M2,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,,1.0,1.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,-45.0,280290.0,,,0.0,0.0,,,100.0,NotFound,27.0,,New,NotFound,225.0,15.0,427.0,563.0,New,NotFound,,_Unknown,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13,2018,7,0,0,2,Huawei,HUAWEIMYA-L13,chrome,67.0,_Unknown,_Unknown,1,hotmail,com,microsoft,hotmail,com,microsoft,,,chrome,android,,,1.835936,270
39,3663588,50.0,S,2453,399.0,150.0,american express,137.0,credit,494.0,87.0,37.0,4.0,8.0,0.0,4.0,0.0,0.0,0.0,25.0,0.0,30.0,5.0,0.0,57.0,4.0,34.0,34.0,1.0,,,634.0,1.0,22.0,0.0,458.0,,,,634.0,,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,7.0,2.0,0.0,7.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,280.0,90.0,0.0,280.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,40.0,0.0,0.0,0.0,0.0,40.0,40.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0,7.0,0.0,2.0,1.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,2.0,1.0,0.0,270.0,90.0,0.0,270.0,0.0,90.0,60.0,310.0,130.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2.0,0.0,8.0,2.0,0.0,0.0,0.0,0.0,310.0,90.0,0.0,310.0,90.0,0.0,0.0,0.0,0.0,3579.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,Found,,-300.0,Found,Found,166.0,,542.0,368.0,Found,Found,24.0,2,T,F,T,T,mobile,LGLS676 Build/MXB48T,2018,7,0,0,2,Others,MXB48T,chrome,67.0,1280,720,1,,,,gmail,com,google,Android,6.0.1,chrome,android,-1280-,-720-,3.912023,0
48,3663597,37.318,C,15885,545.0,185.0,visa,138.0,debit,,,,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,,,,0.0,,0.0,,,,0.0,0.0,0.0,0.0,0.0,M2,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,,1.0,1.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,-5.0,185210.0,,,1.0,0.0,,,100.0,NotFound,52.0,-360.0,New,NotFound,225.0,,271.0,507.0,New,NotFound,,_Unknown,F,T,T,F,desktop,Trident/7.0,2018,7,0,0,2,Trident,7.0,ie,11.0,_Unknown,_Unknown,1,anonymous,com,other,anonymous,com,other,,,ie,tablet,,,3.619476,318
52,3663601,6.271,C,15885,545.0,185.0,visa,138.0,debit,,,,2.0,4.0,0.0,3.0,0.0,3.0,3.0,3.0,0.0,3.0,3.0,3.0,2.0,1.0,,,0.0,567.0,288.0,567.0,288.0,0.0,0.0,0.0,567.0,0.0,0.0,0.0,M2,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.271,6.271,6.271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.271,6.271,6.271,1.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.271,6.271,6.271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.271,6.271,6.271,1.0,1.0,,,,0.0,0.0,0.0,0.0,,,2.0,2.0,2.0,0.0,0.0,0.0,,1.0,1.0,1.0,,,1.0,1.0,1.0,1.0,6.271,6.271,6.271,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,6.271,6.271,6.271,,,,,,,,,,,,,,,,,,,-45.0,252944.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,NotFound,27.0,,Found,Found,225.0,15.0,427.0,563.0,Found,Found,,_Unknown,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13,2018,7,0,0,2,Huawei,HUAWEIMYA-L13,chrome,67.0,_Unknown,_Unknown,1,hotmail,com,microsoft,hotmail,com,microsoft,,,chrome,android,,,1.835936,270
53,3663602,52.258,C,9633,130.0,185.0,visa,138.0,debit,,,,8.0,21.0,0.0,4.0,0.0,4.0,4.0,5.0,0.0,6.0,11.0,11.0,10.0,6.0,,,,316.0,2.0,316.0,2.0,,,0.0,316.0,0.0,,12.0,M2,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,,1.0,1.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,-95.0,328680.0,,,7.0,-33.0,,,100.0,NotFound,27.0,,New,NotFound,225.0,15.0,567.0,507.0,New,NotFound,,_Unknown,F,F,T,F,mobile,SM-G9650 Build/R16NW,2018,7,0,0,2,Samsung,R16NW,chrome,67.0,_Unknown,_Unknown,1,hotmail,com,microsoft,hotmail,com,microsoft,,,chrome,android,,,3.956193,258


In [26]:
# First level of missing treatment
inner_first_level_minng_treatment_columns = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'D15', 'id_01', 'id_02', 'id_36']
outer_first_level_minng_treatment_columns = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'D15']
#Inner
for a_column_to_treat in inner_first_level_minng_treatment_columns:
    print('Treating ' + a_column_to_treat)
    if train_inner_df[a_column_to_treat].dtype.kind=='O': # Categorical
        
        train_inner_df[a_column_to_treat] = train_inner_df[a_column_to_treat].astype('str')
        test_inner_df[a_column_to_treat] = test_inner_df[a_column_to_treat].astype('str')
        
        train_inner_df[a_column_to_treat].fillna('_Unknown', inplace=True)
        test_inner_df[a_column_to_treat].fillna('_Unknown', inplace=True)
        
        train_inner_df[a_column_to_treat] = train_inner_df[a_column_to_treat].astype('category')
        test_inner_df[a_column_to_treat] = test_inner_df[a_column_to_treat].astype('category')
    else: # numerical        
        train_inner_df[a_column_to_treat].fillna(-999, inplace=True)
        test_inner_df[a_column_to_treat].fillna(-999, inplace=True)
#Outer
for a_column_to_treat in outer_first_level_minng_treatment_columns:
    print('Treating ' + a_column_to_treat)
    if train_outer_df[a_column_to_treat].dtype.kind=='O': # Categorical
        train_outer_df[a_column_to_treat] = train_outer_df[a_column_to_treat].astype('str')
        test_outer_df[a_column_to_treat] = test_outer_df[a_column_to_treat].astype('str')
        
        train_outer_df[a_column_to_treat].fillna('_Unknown', inplace=True)
        test_outer_df[a_column_to_treat].fillna('_Unknown', inplace=True)
        
        train_outer_df[a_column_to_treat] = train_outer_df[a_column_to_treat].astype('category')
        test_outer_df[a_column_to_treat] = test_outer_df[a_column_to_treat].astype('category')        
    else: # numerical        
        train_outer_df[a_column_to_treat].fillna(-999, inplace=True)
        test_outer_df[a_column_to_treat].fillna(-999, inplace=True)

Treating card1
Treating card2
Treating card3
Treating card4
Treating card5
Treating card6
Treating addr1
Treating D15
Treating id_01
Treating id_02
Treating id_36
Treating card1
Treating card2
Treating card3
Treating card4
Treating card5
Treating card6
Treating addr1
Treating D15


In [27]:
def transAmountFeatureEng(df, isInner):
    df['TransactionAmt_to_mean_card1'] = df['TransactionAmt'] / df.groupby(['card1'])['TransactionAmt'].transform('mean')
    df['TransactionAmt_to_mean_card4'] = df['TransactionAmt'] / df.groupby(['card4'])['TransactionAmt'].transform('mean')
    
    if isInner:
        df['id_02_to_mean_card1'] = df['id_02'] / df.groupby(['card1'])['id_02'].transform('mean')
        df['id_02_to_mean_card4'] = df['id_02'] / df.groupby(['card4'])['id_02'].transform('mean')

    df['D15_to_mean_card1'] = df['D15'] / df.groupby(['card1'])['D15'].transform('mean')
    df['D15_to_mean_card4'] = df['D15'] / df.groupby(['card4'])['D15'].transform('mean')
    
    df['D15_to_mean_addr1'] = df['D15'] / df.groupby(['addr1'])['D15'].transform('mean')
    df['D15_to_mean_card4'] = df['D15'] / df.groupby(['card4'])['D15'].transform('mean')
    
    df['TransactionAmt_to_std_card1'] = df['TransactionAmt'] / df.groupby(['card1'])['TransactionAmt'].transform('std')
    df['TransactionAmt_to_std_card4'] = df['TransactionAmt'] / df.groupby(['card4'])['TransactionAmt'].transform('std')
    
    df['TransactionAmt_to_std_card1']=df['TransactionAmt_to_std_card1'].replace([np.inf], 999)
    df['TransactionAmt_to_std_card1']=df['TransactionAmt_to_std_card1'].replace([-np.inf], -999)
    
    df['TransactionAmt_to_std_card4']=df['TransactionAmt_to_std_card4'].replace([np.inf], 999)
    df['TransactionAmt_to_std_card4']=df['TransactionAmt_to_std_card4'].replace([-np.inf], -999)

    if isInner:
        df['id_02_to_std_card1'] = df['id_02'] / df.groupby(['card1'])['id_02'].transform('std')
        df['id_02_to_std_card4'] = df['id_02'] / df.groupby(['card4'])['id_02'].transform('std')
        
        df['id_02_to_std_card1']=df['id_02_to_std_card1'].replace([np.inf], 999)
        df['id_02_to_std_card1']=df['id_02_to_std_card1'].replace([-np.inf], -999)
        
        df['id_02_to_std_card4']=df['id_02_to_std_card4'].replace([np.inf], 1)
        df['id_02_to_std_card4']=df['id_02_to_std_card4'].replace([-np.inf], -1)
    

    df['D15_to_std_card1'] = df['D15'] / df.groupby(['card1'])['D15'].transform('std')
    df['D15_to_std_card4'] = df['D15'] / df.groupby(['card4'])['D15'].transform('std')
    
    df['D15_to_std_card1']=df['D15_to_std_card1'].replace([np.inf], 999)
    df['D15_to_std_card1']=df['D15_to_std_card1'].replace([-np.inf], -999)
    
    df['D15_to_std_card4']=df['D15_to_std_card4'].replace([np.inf], 999)
    df['D15_to_std_card4']=df['D15_to_std_card4'].replace([-np.inf], -999)
        

    df['D15_to_std_addr1'] = df['D15'] / df.groupby(['addr1'])['D15'].transform('std')
    df['D15_to_std_card4'] = df['D15'] / df.groupby(['card4'])['D15'].transform('std')
    
    df['D15_to_std_addr1']=df['D15_to_std_addr1'].replace([np.inf], 999)
    df['D15_to_std_addr1']=df['D15_to_std_addr1'].replace([-np.inf], -999)
    
    df['D15_to_std_card4']=df['D15_to_std_card4'].replace([np.inf], 999)
    df['D15_to_std_card4']=df['D15_to_std_card4'].replace([-np.inf], -999)    
    return df
    
train_inner_df = transAmountFeatureEng(train_inner_df, True)
train_outer_df = transAmountFeatureEng(train_outer_df, False)

test_inner_df = transAmountFeatureEng(test_inner_df, True)
test_outer_df = transAmountFeatureEng(test_outer_df, False)

In [28]:
train_inner_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D12,D13,D14,D15,M4,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_28,id_29,id_32,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,year,month,dow,hour,day,device_name,device_version,browser_id_31,version_id_31,screen_width,screen_height,had_id,P_emaildomain_hostname,P_emaildomain_domaingroup,P_email_provider,R_emaildomain_hostname,R_emaildomain_domaingroup,R_email_provider,id_30_major,id_30_minor,id_31_major,id_31_minor,id_33_X,id_33_Y,TransactionAmt_Log,TransactionAmt_decimal,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,id_02_to_mean_card1,id_02_to_mean_card4,D15_to_mean_card1,D15_to_mean_card4,D15_to_mean_addr1,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,id_02_to_std_card1,id_02_to_std_card4,D15_to_std_card1,D15_to_std_card4,D15_to_std_addr1
4,2987004,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,-999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1803.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15557.990234,169690.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,New,NotFound,32.0,2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,2017,12,5,0,2,Samsung,NRD90M,samsung,browser,2220,1080,1,gmail,com,google,,,,Android,7.0,samsung,6.2,-2220-,-1080-,3.912023,0,0.606061,0.700202,0.764773,0.381001,1.0,2.596141,1.08415,0.633724,0.566274,1.753301,0.424652,-999.0,-1.895981,-3.418647
8,2987008,0,15.0,H,2803,100.0,150.0,visa,226.0,debit,337.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,-999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1804.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15607.990234,169740.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,98945.0,,,0.0,-5.0,,,100.0,NotFound,49.0,-300.0,New,NotFound,166.0,,621.0,500.0,New,NotFound,32.0,1,T,F,F,T,mobile,iOS Device,2017,12,5,0,2,iOS Device,_Unknown,mobile,safari,1334,750,1,anonymous,com,other,,,,iOS,11.1.2,mobile,11.0,-1334-,-750-,2.70805,0,0.180093,0.185831,0.89228,0.586767,1.129304,1.92212,1.187537,0.212693,0.161859,0.782785,0.621704,-2.901258,-1.883397,-2.410692
10,2987010,0,75.887,C,16496,352.0,117.0,mastercard,134.0,credit,,,,1.0,4.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,M0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,3.0,4.0,4.0,2.0,1.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,166.215393,166.215393,166.215393,90.327904,31.841299,90.327904,90.327904,90.327904,90.327904,0.0,0.0,0.0,75.887497,75.887497,75.887497,3.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,0.0,2.0,4.0,4.0,4.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,166.215393,166.215393,166.215393,90.327904,90.327904,90.327904,0.0,90.327904,90.327904,90.327904,0.0,0.0,0.0,75.887497,75.887497,75.887497,3.0,3.0,3.0,4.0,4.0,2.0,2.0,1.0,2.0,2.0,2.0,4.0,4.0,4.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,166.215393,166.215393,166.215393,90.327904,90.327904,31.841299,90.327904,90.327904,90.327904,90.327904,0.0,0.0,0.0,75.887497,75.887497,75.887497,,,,,,,,,,,,,,,,,,,-5.0,191631.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,NotFound,52.0,,Found,Found,121.0,,410.0,142.0,Found,Found,,_Unknown,F,F,T,T,desktop,Windows,2017,12,5,0,2,Windows,_Unknown,chrome,62.0,_Unknown,_Unknown,1,gmail,com,google,gmail,com,google,,,chrome,62.0,,,4.329245,887,1.289521,1.062724,1.339566,1.031426,0.0,-0.0,-0.0,2.636139,0.859456,5.574408,1.149596,0.0,0.0,0.0
11,2987011,0,16.495,C,4461,375.0,185.0,mastercard,224.0,debit,,,30.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,,,0.0,,0.0,,,,0.0,0.0,0.0,0.0,0.0,M0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,-5.0,221832.0,,,0.0,-6.0,,,100.0,NotFound,52.0,,New,NotFound,225.0,,176.0,507.0,New,NotFound,,_Unknown,F,F,T,T,desktop,,2017,12,5,0,2,_Unknown,_Unknown,chrome,62.0,_Unknown,_Unknown,1,hotmail,com,microsoft,hotmail,com,microsoft,,,chrome,62.0,,,2.803057,495,0.417514,0.230996,0.846149,1.193978,-0.0,-0.0,-0.0,0.437795,0.186814,1.332674,1.330772,0.0,0.0,0.0
16,2987016,0,30.0,H,1790,555.0,150.0,visa,226.0,debit,170.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,26.0,0.0,,,,,-999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1805.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15622.990234,169755.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7460.0,0.0,0.0,1.0,0.0,0.0,0.0,100.0,NotFound,,-300.0,Found,Found,166.0,15.0,529.0,575.0,Found,Found,24.0,2,T,F,T,T,desktop,MacOS,2017,12,5,0,2,MacOS,_Unknown,chrome,62.0,1280,800,1,aol,com,other,,,,Mac,10_11_6,chrome,62.0,-1280-,-800-,3.401197,0,0.692308,0.371663,0.137576,0.04424,1.0,1.92212,1.067055,2.598076,0.323718,0.167968,0.046874,-999.0,-1.883397,-3.911396


In [29]:
train_outer_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D10,D11,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,year,month,dow,hour,day,P_emaildomain_hostname,P_emaildomain_domaingroup,P_email_provider,TransactionAmt_Log,TransactionAmt_decimal,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,D15_to_mean_card1,D15_to_mean_card4,D15_to_mean_addr1,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,D15_to_std_card1,D15_to_std_card4,D15_to_std_addr1
0,2987000,0,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,13.0,13.0,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,2017,12,5,0,2,,,,4.226834,500,0.154796,0.197225,0.0,0.0,0.0,0.170922,0.146421,0.0,0.0,0.0
1,2987001,0,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,0.0,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,12,5,0,2,gmail,com,google,3.367296,0,0.116987,0.192096,0.0,0.0,0.0,0.060363,0.102363,0.0,0.0,0.0
2,2987002,0,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,0.0,315.0,315.0,T,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,12,5,0,2,outlook,com,microsoft,4.077537,0,0.609264,0.395942,4.039594,2.077792,1.97919,0.58929,0.23297,1.124217,1.107922,1.109901
3,2987003,0,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,84.0,,111.0,,,,M0,T,F,,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,28.0,0.0,10.0,4.0,1.0,38.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,1758.0,925.0,0.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0,38.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0,2017,12,5,0,2,yahoo,com,yahoo,3.912023,0,0.392864,0.3312,0.648329,0.813621,0.885663,0.251555,0.176488,0.387324,0.388294,0.402605
5,2987005,0,49.0,W,5937,555.0,150.0,visa,226.0,debit,272.0,87.0,36.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,0.0,0.0,0.0,T,T,T,M1,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,12,5,0,2,gmail,com,google,3.89182,0,0.365477,0.328833,0.0,0.0,0.0,0.491192,0.193484,0.0,0.0,0.0


In [30]:
print(train_inner_df.shape)
print(train_outer_df.shape)
   
print(test_inner_df.shape)
print(test_outer_df.shape)

(144233, 438)
(446307, 244)
(141907, 437)
(364784, 243)


In [31]:
def multipleColumnMapping(df, test_df, isInner):
    # Encoding - count encoding for both df and test_df
    feature_list = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6' ]
    if isInner:
        feature_list = feature_list + ['id_36']
        
    for feature in feature_list:
        df[feature + '_count_full'] = df[feature].map(pd.concat([df[feature], test_df[feature]], ignore_index=True).value_counts(dropna=False))
        test_df[feature + '_count_full'] = test_df[feature].map(pd.concat([df[feature], test_df[feature]], ignore_index=True).value_counts(dropna=False))
            

    # Encoding - count encoding separately for df and test_df
    if isInner:
        for feature in ['id_01', 'id_31_major', 'id_33_X', 'id_36']:
            df[feature + '_count_dist'] = df[feature].map(df[feature].value_counts(dropna=False))
            test_df[feature + '_count_dist'] = test_df[feature].map(test_df[feature].value_counts(dropna=False))
        
    return df, test_df
        
train_inner_df, test_inner_df = multipleColumnMapping(train_inner_df, test_inner_df, True)
train_outer_df, test_outer_df = multipleColumnMapping(train_outer_df, test_outer_df, False)


In [32]:
train_inner_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D12,D13,D14,D15,M4,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_28,id_29,id_32,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,year,month,dow,hour,day,device_name,device_version,browser_id_31,version_id_31,screen_width,screen_height,had_id,P_emaildomain_hostname,P_emaildomain_domaingroup,P_email_provider,R_emaildomain_hostname,R_emaildomain_domaingroup,R_email_provider,id_30_major,id_30_minor,id_31_major,id_31_minor,id_33_X,id_33_Y,TransactionAmt_Log,TransactionAmt_decimal,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,id_02_to_mean_card1,id_02_to_mean_card4,D15_to_mean_card1,D15_to_mean_card4,D15_to_mean_addr1,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,id_02_to_std_card1,id_02_to_std_card4,D15_to_std_card1,D15_to_std_card4,D15_to_std_addr1,card1_count_full,card2_count_full,card3_count_full,card4_count_full,card5_count_full,card6_count_full,id_36_count_full,id_01_count_dist,id_31_major_count_dist,id_33_X_count_dist,id_36_count_dist
4,2987004,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,-999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1803.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15557.990234,169690.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,New,NotFound,32.0,2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,2017,12,5,0,2,Samsung,NRD90M,samsung,browser,2220,1080,1,gmail,com,google,,,,Android,7.0,samsung,6.2,-2220-,-1080-,3.912023,0,0.606061,0.700202,0.764773,0.381001,1.0,2.596141,1.08415,0.633724,0.566274,1.753301,0.424652,-999.0,-1.895981,-3.418647,10,9939,160555,87352,17292,139575,267353,19555,2028,720,134066
8,2987008,0,15.0,H,2803,100.0,150.0,visa,226.0,debit,337.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,-999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1804.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15607.990234,169740.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,98945.0,,,0.0,-5.0,,,100.0,NotFound,49.0,-300.0,New,NotFound,166.0,,621.0,500.0,New,NotFound,32.0,1,T,F,F,T,mobile,iOS Device,2017,12,5,0,2,iOS Device,_Unknown,mobile,safari,1334,750,1,anonymous,com,other,,,,iOS,11.1.2,mobile,11.0,-1334-,-750-,2.70805,0,0.180093,0.185831,0.89228,0.586767,1.129304,1.92212,1.187537,0.212693,0.161859,0.782785,0.621704,-2.901258,-1.883397,-2.410692,1209,1948,160555,177024,106731,145291,267353,82170,28368,6447,134066
10,2987010,0,75.887,C,16496,352.0,117.0,mastercard,134.0,credit,,,,1.0,4.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,M0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,3.0,4.0,4.0,2.0,1.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,166.215393,166.215393,166.215393,90.327904,31.841299,90.327904,90.327904,90.327904,90.327904,0.0,0.0,0.0,75.887497,75.887497,75.887497,3.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,0.0,2.0,4.0,4.0,4.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,166.215393,166.215393,166.215393,90.327904,90.327904,90.327904,0.0,90.327904,90.327904,90.327904,0.0,0.0,0.0,75.887497,75.887497,75.887497,3.0,3.0,3.0,4.0,4.0,2.0,2.0,1.0,2.0,2.0,2.0,4.0,4.0,4.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,166.215393,166.215393,166.215393,90.327904,90.327904,31.841299,90.327904,90.327904,90.327904,90.327904,0.0,0.0,0.0,75.887497,75.887497,75.887497,,,,,,,,,,,,,,,,,,,-5.0,191631.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,NotFound,52.0,,Found,Found,121.0,,410.0,142.0,Found,Found,,_Unknown,F,F,T,T,desktop,Windows,2017,12,5,0,2,Windows,_Unknown,chrome,62.0,_Unknown,_Unknown,1,gmail,com,google,gmail,com,google,,,chrome,62.0,,,4.329245,887,1.289521,1.062724,1.339566,1.031426,0.0,-0.0,-0.0,2.636139,0.859456,5.574408,1.149596,0.0,0.0,0.0,4,156,2796,87352,239,139575,267353,82170,75631,70944,134066
11,2987011,0,16.495,C,4461,375.0,185.0,mastercard,224.0,debit,,,30.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,,,0.0,,0.0,,,,0.0,0.0,0.0,0.0,0.0,M0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,-5.0,221832.0,,,0.0,-6.0,,,100.0,NotFound,52.0,,New,NotFound,225.0,,176.0,507.0,New,NotFound,,_Unknown,F,F,T,T,desktop,,2017,12,5,0,2,_Unknown,_Unknown,chrome,62.0,_Unknown,_Unknown,1,hotmail,com,microsoft,hotmail,com,microsoft,,,chrome,62.0,,,2.803057,495,0.417514,0.230996,0.846149,1.193978,-0.0,-0.0,-0.0,0.437795,0.186814,1.332674,1.330772,0.0,0.0,0.0,5565,11377,101061,87352,47583,145291,267353,82170,75631,70944,134066
16,2987016,0,30.0,H,1790,555.0,150.0,visa,226.0,debit,170.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,26.0,0.0,,,,,-999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1805.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15622.990234,169755.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7460.0,0.0,0.0,1.0,0.0,0.0,0.0,100.0,NotFound,,-300.0,Found,Found,166.0,15.0,529.0,575.0,Found,Found,24.0,2,T,F,T,T,desktop,MacOS,2017,12,5,0,2,MacOS,_Unknown,chrome,62.0,1280,800,1,aol,com,other,,,,Mac,10_11_6,chrome,62.0,-1280-,-800-,3.401197,0,0.692308,0.371663,0.137576,0.04424,1.0,1.92212,1.067055,2.598076,0.323718,0.167968,0.046874,-999.0,-1.883397,-3.911396,3,21998,160555,177024,106731,145291,267353,19555,75631,5446,134066


In [33]:
print(train_inner_df.shape)
print(train_outer_df.shape)
   
print(test_inner_df.shape)
print(test_outer_df.shape)

(144233, 449)
(446307, 250)
(141907, 448)
(364784, 249)


In [34]:
df = train_outer_df
cat_column_names = list(df.select_dtypes(['category','object']).columns)
numerical_column_names =  [i for i in df.columns if not i in cat_column_names]

In [35]:
cat_column_names

['isFraud',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain_hostname',
 'P_emaildomain_domaingroup',
 'P_email_provider']

In [36]:
numerical_column_names

['TransactionID',
 'TransactionAmt',
 'dist1',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D10',
 'D11',
 'D15',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 'V97',
 'V98',
 'V99',
 'V100',

In [37]:
test_inner_df['id_31_major'] = test_inner_df['id_31_major'].astype('str')
test_inner_df.loc[test_inner_df['id_31_major']=='Lanix','id_31_major']=22

In [38]:
test_inner_df.loc[test_inner_df['id_31_major']=='Lanix','id_31_major']

Series([], Name: id_31_major, dtype: object)

In [39]:
from sklearn import preprocessing

def missingImputerAndScaling(df, test_df):
    cat_column_names = list(df.select_dtypes(['category','object']).columns)
    numerical_column_names =  [i for i in df.columns if not i in cat_column_names]
    numerical_column_names.remove('TransactionID')
    cat_column_names.remove('isFraud')
    
    print(cat_column_names)
    print(numerical_column_names)
    
    # Treat mising values
    # Get list of the columns which has >0 null values
    aSeries = df.isnull().sum()
    columns_to_treat = list(aSeries.index)
    columns_to_treat.remove('isFraud')
    columns_to_treat.remove('TransactionID')

    for a_column_to_treat in columns_to_treat:
        #if train_df[a_column_to_treat].isnull().sum()>0:
        print('Treating ' + a_column_to_treat)
        if df[a_column_to_treat].dtype.kind=='O': # Categorical            
            df[a_column_to_treat] = df[a_column_to_treat].astype('str') 
            df[a_column_to_treat].fillna('_Unknown', inplace=True)
            df[a_column_to_treat] = df[a_column_to_treat].astype('category')
            
            test_df[a_column_to_treat] = test_df[a_column_to_treat].astype('str') 
            test_df[a_column_to_treat].fillna('_Unknown', inplace=True)
            test_df[a_column_to_treat] = test_df[a_column_to_treat].astype('category')
        else: # numerical
            #print(min(df[a_column_to_treat]))
            col_mean = np.mean(df[a_column_to_treat])
            col_sd = np.std(df[a_column_to_treat])
            df[a_column_to_treat].fillna(-999, inplace=True)
            test_df[a_column_to_treat].fillna(-999, inplace=True)
    print('Missing treatment Done')    
    
    print('Label Encoder')
    for a_column_to_treat in cat_column_names:
        le = preprocessing.LabelEncoder()
        le.fit(list(df[a_column_to_treat].astype(str).values) + list(test_df[a_column_to_treat].astype(str).values))
        df[a_column_to_treat] = le.transform(list(df[a_column_to_treat].astype(str).values))
        test_df[a_column_to_treat] = le.transform(list(test_df[a_column_to_treat].astype(str).values))
        
    # Scaling
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import RobustScaler

    for a_column_to_treat in numerical_column_names:
        print(a_column_to_treat)
        scaler = RobustScaler().fit(df[[a_column_to_treat]])
        scaler.transform(df[[a_column_to_treat]])
        scaler.transform(test_df[[a_column_to_treat]])
        
    return df, test_df
    

In [40]:
train_inner_df, test_inner_df = missingImputerAndScaling(train_inner_df, test_inner_df)
train_outer_df, test_outer_df = missingImputerAndScaling(train_outer_df, test_outer_df)

['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'M4', 'id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'device_name', 'device_version', 'browser_id_31', 'version_id_31', 'screen_width', 'screen_height', 'P_emaildomain_hostname', 'P_emaildomain_domaingroup', 'P_email_provider', 'R_emaildomain_hostname', 'R_emaildomain_domaingroup', 'R_email_provider', 'id_30_major', 'id_30_minor', 'id_31_major', 'id_31_minor', 'id_33_X', 'id_33_Y']
['TransactionAmt', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D12', 'D13', 'D14', 'D15', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 

In [41]:
train_inner_df.to_csv("train_inner_df.csv", index=False)
test_inner_df.to_csv("test_inner_df.csv", index=False)

train_outer_df.to_csv("train_outer_df.csv", index=False)
test_outer_df.to_csv("test_outer_df.csv", index=False)

In [42]:
print(train_inner_df.shape)
print(train_outer_df.shape)
   
print(test_inner_df.shape)
print(test_outer_df.shape)

(144233, 449)
(446307, 250)
(141907, 448)
(364784, 249)


# 1. Model 1 on inner (With Bayesian Optimization)

In [None]:
train_df = train_inner_df
# Putting feature variable to X
X = train_df.drop(['TransactionID', 'isFraud'],axis=1)

# Putting response variable to y
y = train_df['isFraud']

y.head()

from sklearn.model_selection import train_test_split

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7,test_size=0.3,random_state=100)

In [None]:
import lightgbm as lgb
lgb_train = lgb.Dataset(data=X_train.astype('float32'), label=y_train.astype('float32'))
lgb_test  = lgb.Dataset(data=X_test.astype('float32'),  label=y_test.astype('float32'))


from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

def train_model(num_leaves, min_data_in_leaf, max_depth, bagging_fraction, feature_fraction, lambda_l1, lambda_l2):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': False,
        'boost_from_average': True,
        'num_threads': 4,
        
        'num_leaves': int(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),
        'max_depth': int(max_depth),
        'bagging_fraction' : bagging_fraction,
        'feature_fraction' : feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2
    }
    
    lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_test, verbose_eval=1000)
    
    y = lgb_model.predict(X_test.astype('float32'), num_iteration=lgb_model.best_iteration)
    
    score = roc_auc_score(y_test.astype('float32'), y)
    return score

bounds = {
    'num_leaves': (50, 500),
    'min_data_in_leaf': (20, 200),
    'max_depth':(2, 50),
    'bagging_fraction' : (0.01, 0.99),
    'feature_fraction' : (0.01, 0.99),
    'lambda_l1': (0, 2),
    'lambda_l2': (0, 2)
}

bo = BayesianOptimization(train_model, bounds, random_state=42)

bo.maximize(init_points=10, n_iter=15, acq='ucb', xi=0.0, alpha=1e-6)

In [None]:
#Extracting the best parameters
params = bo.max['params']
print(params)

#Converting the max_depth and n_estimator values from float to int
params['bagging_seed']= int(params['bagging_seed'])
params['num_leaves']= int(params['num_leaves'])
params['min_data_in_leaf']= int(params['min_data_in_leaf'])

#Initialize an LGBClassifier with the tuned parameters and fit the training data
final_clf = lgb.LGBMClassifier(**params).fit(X_train, y_train)

#predicting for training set
y_pred_default = final_clf.predict(X_test)

In [None]:
inner_model = final_clf

In [None]:
# Printing classification report
print(classification_report(y_test, y_pred_default))

local_confusion_matrix = metrics.confusion_matrix(y_test, y_pred_default )

# Printing confusion matrix and accuracy
print('Confusion Matrix:')
print(local_confusion_matrix)

# Accuracy, precision, recall and f1 score
print('\nScores:')
accuracy = metrics.accuracy_score(y_test, y_pred_default)
precision = metrics.precision_score(y_test, y_pred_default)
recall = metrics.recall_score(y_test, y_pred_default)
f1_score = metrics.f1_score(y_test, y_pred_default)
roc_auc = metrics.roc_auc_score(y_test, y_pred_default)
sensitivity = recall
specificity =  local_confusion_matrix[0,0]/(local_confusion_matrix[0,0]+local_confusion_matrix[0,1])
print(" Accuracy {0:.3f}, \n Sensitivity {1:.3f}, \n Specificity {2:.3f}, \n Precision {3:.3f}, \n Recall {4:.3f}, \n f1_score {5:.3f}, \n roc_auc {6:.3f}".format(
    accuracy, sensitivity,specificity, precision,recall,f1_score,roc_auc))

In [None]:
# Feature importance
importances = final_clf.feature_importances_
unique_dict = dict(zip(X_train.columns, importances))
chartil.core_barchart_from_series(
    pd.Series(unique_dict), 
    optional_settings={'sort_by_value':True, 'decimal_precision':2}) 

# 2 Model 2 on outer (With Bayesian Optimization)

In [None]:
train_df = train_outer_df
# Putting feature variable to X
X = train_df.drop(['TransactionID', 'isFraud'],axis=1)

# Putting response variable to y
y = train_df['isFraud']

y.head()

from sklearn.model_selection import train_test_split

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7,test_size=0.3,random_state=100)

In [None]:
import lightgbm as lgb
lgb_train = lgb.Dataset(data=X_train.astype('float32'), label=y_train.astype('float32'))
lgb_test  = lgb.Dataset(data=X_test.astype('float32'),  label=y_test.astype('float32'))


from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

def train_model(num_leaves, min_data_in_leaf, max_depth, bagging_fraction, feature_fraction, lambda_l1, lambda_l2):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': False,
        'boost_from_average': True,
        'num_threads': 4,
        
        'num_leaves': int(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),
        'max_depth': int(max_depth),
        'bagging_fraction' : bagging_fraction,
        'feature_fraction' : feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2
    }
    
    lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_test, verbose_eval=1000)
    
    y = lgb_model.predict(X_test.astype('float32'), num_iteration=lgb_model.best_iteration)
    
    score = roc_auc_score(y_test.astype('float32'), y)
    return score

bounds = {
    'num_leaves': (50, 500),
    'min_data_in_leaf': (20, 200),
    'max_depth':(2, 50),
    'bagging_fraction' : (0.01, 0.99),
    'feature_fraction' : (0.01, 0.99),
    'lambda_l1': (0, 2),
    'lambda_l2': (0, 2)
}

bo = BayesianOptimization(train_model, bounds, random_state=42)

bo.maximize(init_points=10, n_iter=15, acq='ucb', xi=0.0, alpha=1e-6)

In [None]:
#Extracting the best parameters
params = bo.max['params']
print(params)

#Converting the max_depth and n_estimator values from float to int
params['max_depth']= int(params['max_depth'])
params['num_leaves']= int(params['num_leaves'])
params['min_data_in_leaf']= int(params['min_data_in_leaf'])

#Initialize an LGBClassifier with the tuned parameters and fit the training data
final_clf = lgb.LGBMClassifier(**params).fit(X_train, y_train)

#predicting for training set
y_pred_default = final_clf.predict(X_test)

In [None]:
outer_model = final_clf

In [None]:
# Printing classification report
print(classification_report(y_test, y_pred_default))

local_confusion_matrix = metrics.confusion_matrix(y_test, y_pred_default )

# Printing confusion matrix and accuracy
print('Confusion Matrix:')
print(local_confusion_matrix)

# Accuracy, precision, recall and f1 score
print('\nScores:')
accuracy = metrics.accuracy_score(y_test, y_pred_default)
precision = metrics.precision_score(y_test, y_pred_default)
recall = metrics.recall_score(y_test, y_pred_default)
f1_score = metrics.f1_score(y_test, y_pred_default)
roc_auc = metrics.roc_auc_score(y_test, y_pred_default)
sensitivity = recall
specificity =  local_confusion_matrix[0,0]/(local_confusion_matrix[0,0]+local_confusion_matrix[0,1])
print(" Accuracy {0:.3f}, \n Sensitivity {1:.3f}, \n Specificity {2:.3f}, \n Precision {3:.3f}, \n Recall {4:.3f}, \n f1_score {5:.3f}, \n roc_auc {6:.3f}".format(
    accuracy, sensitivity,specificity, precision,recall,f1_score,roc_auc))

In [None]:
# Feature importance
importances = final_clf.feature_importances_
unique_dict = dict(zip(X_train.columns, importances))
chartil.core_barchart_from_series(
    pd.Series(unique_dict), 
    optional_settings={'sort_by_value':True, 'decimal_precision':2}) 

# Apply final model on test

In [None]:
print(inner_model)
print(outer_model)

In [None]:
test_outer_df.head()

In [None]:
test_inner_df.head()

In [None]:
ext_inner_pred_proba = inner_model.predict_proba(test_inner_df.drop(['TransactionID'], axis=1))


In [None]:
ext_outer_pred_proba = outer_model.predict_proba(test_outer_df.drop(['TransactionID'], axis=1))  

In [None]:
ext_inner_pred_proba


In [None]:
ext_inner_pred_proba[:,1]

In [None]:
test_inner_df['isFraud'] = ext_inner_pred_proba[:,1]
test_outer_df['isFraud'] = ext_outer_pred_proba[:,1]

In [None]:
test_inner_df.head()

In [None]:
inner_tst = test_inner_df.loc[:,['TransactionID', 'isFraud' ]]
outer_tst = test_outer_df.loc[:,['TransactionID', 'isFraud' ]]

In [None]:
outer_tst.head()

In [None]:
submission_csv = pd.concat([inner_tst, outer_tst])

In [None]:
submission_csv.shape

In [None]:
submission_csv.to_csv("inner_outer_fsubmission.csv", index=False)


# https://www.kaggle.com/davidcairuz/feature-engineering-lightgbm-w-gpu

In [None]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "max_depth": 50,
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
         }

final_clf = lgb.LGBMClassifier(**params).fit(X_train, y_train)

#predicting for training set
y_pred_default = final_clf.predict(X_test)

In [None]:
# Printing classification report
print(classification_report(y_test, y_pred_default))

local_confusion_matrix = metrics.confusion_matrix(y_test, y_pred_default )

# Printing confusion matrix and accuracy
print('Confusion Matrix:')
print(local_confusion_matrix)

# Accuracy, precision, recall and f1 score
print('\nScores:')
accuracy = metrics.accuracy_score(y_test, y_pred_default)
precision = metrics.precision_score(y_test, y_pred_default)
recall = metrics.recall_score(y_test, y_pred_default)
f1_score = metrics.f1_score(y_test, y_pred_default)
roc_auc = metrics.roc_auc_score(y_test, y_pred_default)
sensitivity = recall
specificity =  local_confusion_matrix[0,0]/(local_confusion_matrix[0,0]+local_confusion_matrix[0,1])
print(" Accuracy {0:.3f}, \n Sensitivity {1:.3f}, \n Specificity {2:.3f}, \n Precision {3:.3f}, \n Recall {4:.3f}, \n f1_score {5:.3f}, \n roc_auc {6:.3f}".format(
    accuracy, sensitivity,specificity, precision,recall,f1_score,roc_auc))

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

dtree = lgb.LGBMClassifier()
cv_folds = 5
parameters = {
    'num_leaves':  range(10, 1001, 100),
    'min_child_samples': range(10, 1001, 100),
    'subsample' : [0.1, 0.2, 0.4, 0.6, 0.8, 0.9], 
    'colsample_bytree' : [0.1, 0.2, 0.4, 0.6, 0.8, 0.9], 
    'reg_alpha': [0.0001, 0.001, 0.1, 0, 1, 2, 5, 7, 10],
    'reg_lambda': [0.0001, 0.001, 0.1, 0, 1, 2, 5, 7, 10, 20],
    'min_data_in_leaf': range(10, 50),
    'max_depth': range(2, 50),
    'bagging_fraction' : [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
}

model_scoring = {'Precision': make_scorer(precision_score)}


treeGrid = RandomizedSearchCV(dtree, parameters, cv=cv_folds, scoring=model_scoring, refit='Precision', return_train_score=True, verbose = 1)
treeGrid.fit(X_train,y_train)

In [None]:
print("Best score: ", treeGrid.best_score_)
print("Best Estimator: ", treeGrid.best_estimator_)
print("Best Params: ", treeGrid.best_params_)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

lgb_final = lgb.LGBMClassifier(???)
lgb_final.fit(X_train, y_train)
# Making predictions
y_pred_default = lgb_final.predict(X_test)

# Printing classification report
print(classification_report(y_test, y_pred_default))

# Printing confusion matrix and accuracy
print(confusion_matrix(y_test, y_pred_default))

In [None]:
# Printing classification report
print(classification_report(y_test, y_pred_default))

local_confusion_matrix = metrics.confusion_matrix(y_test, y_pred_default )

# Printing confusion matrix and accuracy
print('Confusion Matrix:')
print(local_confusion_matrix)

# Accuracy, precision, recall and f1 score
print('\nScores:')
accuracy = metrics.accuracy_score(y_test, y_pred_default)
precision = metrics.precision_score(y_test, y_pred_default)
recall = metrics.recall_score(y_test, y_pred_default)
f1_score = metrics.f1_score(y_test, y_pred_default)
roc_auc = metrics.roc_auc_score(y_test, y_pred_default)
sensitivity = recall
specificity =  local_confusion_matrix[0,0]/(local_confusion_matrix[0,0]+local_confusion_matrix[0,1])
print(" Accuracy {0:.3f}, \n Sensitivity {1:.3f}, \n Specificity {2:.3f}, \n Precision {3:.3f}, \n Recall {4:.3f}, \n f1_score {5:.3f}, \n roc_auc {6:.3f}".format(
    accuracy, sensitivity,specificity, precision,recall,f1_score,roc_auc))

# 2. Model 2 on outer

In [None]:
train_df = train_outer_df
# Putting feature variable to X
X = train_df.drop(['TransactionID', 'isFraud'],axis=1)

# Putting response variable to y
y = train_df['isFraud']

y.head()

from sklearn.model_selection import train_test_split

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7,test_size=0.3,random_state=100)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

dtree = lgb.LGBMClassifier()
cv_folds = 5
parameters = {
    'num_leaves':  range(10, 1001, 100),
    'min_child_samples': range(10, 1001, 100),
    'subsample' : [0.1, 0.2, 0.4, 0.6, 0.8, 0.9], 
    'colsample_bytree' : [0.1, 0.2, 0.4, 0.6, 0.8, 0.9], 
    'reg_alpha': [0.0001, 0.001, 0.1, 0, 1, 2, 5, 7, 10],
    'reg_lambda': [0.0001, 0.001, 0.1, 0, 1, 2, 5, 7, 10, 20],
    'min_data_in_leaf': range(10, 50),
    'max_depth': range(2, 50),
    'bagging_fraction' : [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
}

model_scoring = {'Recall': make_scorer(recall_score)}


treeGrid = RandomizedSearchCV(dtree, parameters, cv=cv_folds, scoring=model_scoring, refit='Recall', return_train_score=True, verbose = 1)

treeGrid.fit(X_train,y_train)

In [None]:
print("Best score: ", treeGrid.best_score_)
print("Best Estimator: ", treeGrid.best_estimator_)
print("Best Params: ", treeGrid.best_params_)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

best_params = treeGrid.best_params_

lgb_final = lgb.LGBMClassifier(subsample=0.9, reg_lambda=5, reg_alpha=0.0001, num_leaves=810, min_data_in_leaf=22, min_child_samples=410, max_depth=38, colsample_bytree=0.8, bagging_fraction=0.4)
lgb_final.fit(X_train, y_train)
# Making predictions
y_pred_default = lgb_final.predict(X_test)

# Printing classification report
print(classification_report(y_test, y_pred_default))

# Printing confusion matrix and accuracy
print(confusion_matrix(y_test, y_pred_default))

In [None]:
# Printing classification report
print(classification_report(y_test, y_pred_default))

local_confusion_matrix = metrics.confusion_matrix(y_test, y_pred_default )

# Printing confusion matrix and accuracy
print('Confusion Matrix:')
print(local_confusion_matrix)

# Accuracy, precision, recall and f1 score
print('\nScores:')
accuracy = metrics.accuracy_score(y_test, y_pred_default)
precision = metrics.precision_score(y_test, y_pred_default)
recall = metrics.recall_score(y_test, y_pred_default)
f1_score = metrics.f1_score(y_test, y_pred_default)
roc_auc = metrics.roc_auc_score(y_test, y_pred_default)
sensitivity = recall
specificity =  local_confusion_matrix[0,0]/(local_confusion_matrix[0,0]+local_confusion_matrix[0,1])
print(" Accuracy {0:.3f}, \n Sensitivity {1:.3f}, \n Specificity {2:.3f}, \n Precision {3:.3f}, \n Recall {4:.3f}, \n f1_score {5:.3f}, \n roc_auc {6:.3f}".format(
    accuracy, sensitivity,specificity, precision,recall,f1_score,roc_auc))

In [None]:
lgb_final