## Setup

We'll begin with some imports that will be useful throughout the notebook, and set up some objects and variables we'll need.

### Import dependencies

In [1]:
import datetime
import io
import os
import warnings
from math import sqrt
from time import gmtime, strftime

import boto3
import numpy as np
import pandas as pd
import sagemaker
import sklearn
from sagemaker.serializers import CSVSerializer
from sagemaker.tuner import (CategoricalParameter, ContinuousParameter,
                             HyperparameterTuner, IntegerParameter)
from sklearn.datasets import dump_svmlight_file
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

### Constants

In [2]:
# Ignore warnings
warnings.filterwarnings('ignore')

# The object info
bucket = 'test-asc-sagemaker-fraud'
prefix = 'ieee-fraud-detection'
key = 'preprocessed-dataset'
train_identity_key = prefix + '/raw-data/train_identity.csv'
train_transaction_key = prefix + '/raw-data/train_transaction.csv'
test_identity_key = prefix + '/raw-data/test_identity.csv'
test_transaction_key = prefix + '/raw-data/test_transaction.csv'

# The random seed
random_seed = 42

# Set random seed in numpy
np.random.seed(random_seed)

# set threshold of correlation matrix
threshold = 0.98

# get sagemaker session
session = sagemaker.Session()

# set instance type
instance_type = 'ml.m5.2xlarge'

# name of the target
target = 'isFraud'

# set Fold
FOLDS = 7

### Functions

#### Reduce memory usage

iterate through all the columns of a dataframe and modify the data type to reduce the memory usage

In [3]:
## Function to reduce the DataFrame memory usage
def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        # change the data type depends on value boundary
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

#### Transform datetime

transform datetime to year, month, day, hour, minute, second

In [4]:
def datetime_transformer(df, datetime_vars):
    
    # Due to the datetime in this dataset is deltatime, so we need to set a start date
    START_DATE = '2017-12-01'
    startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
    
    # The dictionary with key as datetime type and value as datetime type operator
    dict_ = {'year'   : lambda x : x.dt.year,
             'month'  : lambda x : x.dt.month,
             'day'    : lambda x : x.dt.day,
             'hour'   : lambda x : x.dt.hour,
             'minute' : lambda x : x.dt.minute,
             'second' : lambda x : x.dt.second}
    
    # Make a copy of df
    df_datetime = df.copy(deep=True)
    
    # For each variable in datetime_vars
    for var in datetime_vars:
        # Cast the variable to datetime
        
        df_datetime[var] = pd.to_datetime(df_datetime[var].apply(lambda x: (startdate + datetime.timedelta(seconds=x))))
        
        # For each item (datetime_type and datetime_type_operator) in dict_
        for datetime_type, datetime_type_operator in dict_.items():
            # Add a new variable to df_datetime where:
            # the variable's name is var + '_' + datetime_type
            # the variable's values are the ones obtained by datetime_type_operator
            df_datetime[var + '_' + datetime_type] = datetime_type_operator(df_datetime[var])
            
    # Remove datetime_vars from df_datetime
    df_datetime = df_datetime.drop(columns=datetime_vars)
                
    return df_datetime

#### Check nan value

Check nan value with data type and proporion.

In [5]:
def nan_checker(df):

    # Get the dataframe of variables with NaN, their proportion of NaN and data type
    df_nan = pd.DataFrame([[var, df[var].isna().sum() / df.shape[0], df[var].dtype]
                           for var in df.columns if df[var].isna().sum() > 0],
                          columns=['var', 'proportion', 'dtype'])
    
    # Sort df_nan in accending order of the proportion of NaN
    df_nan = df_nan.sort_values(by='proportion', ascending=False).reset_index(drop=True)
    
    return df_nan

#### Check categorical variables

Find the categorical variables and number of unique value.

In [6]:
def cat_var_checker(df, dtype='object'):

    # Get the dataframe of categorical variables and their number of unique value
    df_cat = pd.DataFrame([[var, df[var].nunique(dropna=False)]
                           # If the data type is dtype
                           for var in df.columns if df[var].dtype == dtype],
                          columns=['var', 'nunique'])
    
    # Sort df_cat in accending order of the number of unique value
    df_cat = df_cat.sort_values(by='nunique', ascending=False).reset_index(drop=True)
    
    return df_cat

## Load Data

Load the raw data from S3 bucket

In [7]:
train_identity = pd.read_csv('s3://{}/{}'.format(bucket, train_identity_key))
train_transaction = pd.read_csv('s3://{}/{}'.format(bucket, train_transaction_key))
test_identity = pd.read_csv('s3://{}/{}'.format(bucket, test_identity_key))
test_transaction = pd.read_csv('s3://{}/{}'.format(bucket, test_transaction_key))

# Combine the datasets
df_train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
df_test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Get the name of the target
target = 'isFraud'

In [8]:
# Print the dimension of df_train
pd.DataFrame([[df_train.shape[0], df_train.shape[1]]], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,590540,434


In [9]:
# Print the first 5 rows of df_train
df_train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [10]:
# Print the dimension of df_test
pd.DataFrame([[df_test.shape[0], df_test.shape[1]]], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,506691,433


In [11]:
# Print the first 5 rows of df_test
df_test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [12]:
# delete raw dataframe to reduce memory usage
del train_identity, train_transaction, test_identity, test_transaction

## Split Data (Optional)

Under normal circumstances, we need to split the training data set into the training set, validation set, and test set. However, since there is a given test set in the current scenario, we did not split the training set to maintain the sample size, so that to improve the model performance. Also, we will split the validation set during the cross-validation when we train the model.

In [13]:
# Divide the data into training (60%) and test (40%)
# df_train, df_test = train_test_split(train, 
#                                      train_size=0.6, 
#                                      random_state=random_seed, 
#                                      stratify=train[target])

# Divide the test data into validation (50%) and test (50%)
# df_val, df_test = train_test_split(df_test, 
#                                    train_size=0.5, 
#                                    random_state=random_seed, 
#                                    stratify=df_test[target])

# Reset the index
# df_train, df_val, df_test = df_train.reset_index(drop=True), df_val.reset_index(drop=True), df_test.reset_index(drop=True)

## Feature Engineering

### Handling uncommon variables (Optional)

This step is not required if the training set/validation set/test set is divided from a data set.

In [14]:
# Get the features in the training data and test data
common_vars = np.intersect1d(df_train.columns, np.union1d(df_test.columns, [target]))

# Print the common features
pd.DataFrame(common_vars, columns=['common feature'])

Unnamed: 0,common feature
0,C1
1,C10
2,C11
3,C12
4,C13
...,...
391,card5
392,card6
393,dist1
394,dist2


In [15]:
# Get the features in the training data but not in the validation or test data
uncommon_feature_train_not_test = np.setdiff1d(df_train.columns, common_vars)

# Print the uncommon features
pd.DataFrame(uncommon_feature_train_not_test, columns=['uncommon feature'])

Unnamed: 0,uncommon feature
0,id_01
1,id_02
2,id_03
3,id_04
4,id_05
5,id_06
6,id_07
7,id_08
8,id_09
9,id_10


In [16]:
# Get the features in the test data but not in the training or validation data
uncommon_feature_test_not_train = np.setdiff1d(df_test.columns, common_vars)

# Print the uncommon features
pd.DataFrame(uncommon_feature_test_not_train, columns=['uncommon feature'])

Unnamed: 0,uncommon feature
0,id-01
1,id-02
2,id-03
3,id-04
4,id-05
5,id-06
6,id-07
7,id-08
8,id-09
9,id-10


In [17]:
# Handling uncommon data
df_test.columns = df_test.columns.str.replace('-', '_')

In [18]:
# Check again
common_vars_after = np.intersect1d(df_train.columns, np.union1d(df_test.columns, [target]))

# Print the common features
pd.DataFrame(common_vars, columns=['common feature'])

Unnamed: 0,common feature
0,C1
1,C10
2,C11
3,C12
4,C13
...,...
391,card5
392,card6
393,dist1
394,dist2


In [19]:
# Get the features in the training data but not in the validation or test data
uncommon_feature_train_not_test_after = np.setdiff1d(df_train.columns, common_vars_after)

# Print the uncommon features
pd.DataFrame(uncommon_feature_train_not_test_after, columns=['uncommon feature'])

Unnamed: 0,uncommon feature


In [20]:
# Get the features in the test data but not in the training or validation data
uncommon_feature_test_not_train_after = np.setdiff1d(df_test.columns, common_vars_after)

# Print the uncommon features
pd.DataFrame(uncommon_feature_test_not_train_after, columns=['uncommon feature'])

Unnamed: 0,uncommon feature


### Handling identifiers

In [21]:
# remove TransactionID from data sets as they are not meaningful for training
df_train.drop(columns=['TransactionID'], inplace=True)
df_test.drop(columns=['TransactionID'], inplace=True)

In [22]:
# Print the first 5 rows of df_train
df_train.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [23]:
# Print the first 5 rows of df_test
df_test.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## Handling datetime

In [24]:
# Get the date time variables
datetime_vars = ['TransactionDT']

In [25]:
# Call datetime_transformer on df_train
df_train = datetime_transformer(df_train, datetime_vars)

# Print the first 5 rows of df_train
df_train.head()

Unnamed: 0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_37,id_38,DeviceType,DeviceInfo,TransactionDT_year,TransactionDT_month,TransactionDT_day,TransactionDT_hour,TransactionDT_minute,TransactionDT_second
0,0,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,,,,,2017,12,2,0,0,0
1,0,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,,,,,2017,12,2,0,0,1
2,0,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,...,,,,,2017,12,2,0,1,9
3,0,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,...,,,,,2017,12,2,0,1,39
4,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,...,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,2017,12,2,0,1,46


In [26]:
# Call datetime_transformer on df_test
df_test = datetime_transformer(df_test, datetime_vars)

# Print the first 5 rows of df_test
df_test.head()

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,id_37,id_38,DeviceType,DeviceInfo,TransactionDT_year,TransactionDT_month,TransactionDT_day,TransactionDT_hour,TransactionDT_minute,TransactionDT_second
0,31.95,W,10409,111.0,150.0,visa,226.0,debit,170.0,87.0,...,,,,,2018,7,2,0,0,24
1,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,87.0,...,,,,,2018,7,2,0,1,3
2,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,87.0,...,,,,,2018,7,2,0,1,50
3,284.95,W,10989,360.0,150.0,visa,166.0,debit,205.0,87.0,...,,,,,2018,7,2,0,1,50
4,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,87.0,...,,,,,2018,7,2,0,1,57


### Binning

Binning is a simple technique that groups different values into bins. In this dataset, we want to bin email domain features.

In [27]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
          'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
          'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
          'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other',
          'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
          'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 
          'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
          'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
          'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
          'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 
          'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 
          'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
          'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

us_emails = ['gmail', 'net', 'edu']

In [28]:
for c in ['P_emaildomain', 'R_emaildomain']:
    df_train[c + '_bin'] = df_train[c].map(emails)
    df_test[c + '_bin'] = df_test[c].map(emails)
    
    df_train[c + '_suffix'] = df_train[c].map(lambda x: str(x).split('.')[-1])
    df_test[c + '_suffix'] = df_test[c].map(lambda x: str(x).split('.')[-1])
    
    df_train[c + '_suffix'] = df_train[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us') 
    df_test[c + '_suffix'] = df_test[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

In [29]:
# Print the first 5 rows of df_train
df_train.head()

Unnamed: 0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,TransactionDT_year,TransactionDT_month,TransactionDT_day,TransactionDT_hour,TransactionDT_minute,TransactionDT_second,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix
0,0,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,2017,12,2,0,0,0,,,,
1,0,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,2017,12,2,0,0,1,google,com,,
2,0,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,...,2017,12,2,0,1,9,microsoft,com,,
3,0,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,...,2017,12,2,0,1,39,yahoo,com,,
4,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,...,2017,12,2,0,1,46,google,com,,


In [30]:
# Print the first 5 rows of df_test
df_test.head()

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,TransactionDT_year,TransactionDT_month,TransactionDT_day,TransactionDT_hour,TransactionDT_minute,TransactionDT_second,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix
0,31.95,W,10409,111.0,150.0,visa,226.0,debit,170.0,87.0,...,2018,7,2,0,0,24,google,com,,
1,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,87.0,...,2018,7,2,0,1,3,aol,com,,
2,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,87.0,...,2018,7,2,0,1,50,microsoft,com,,
3,284.95,W,10989,360.0,150.0,visa,166.0,debit,205.0,87.0,...,2018,7,2,0,1,50,google,com,,
4,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,87.0,...,2018,7,2,0,1,57,google,com,,


### Handling missing value

In [31]:
# Call nan_checker on df_train
df_train_nan = nan_checker(df_train)

# Print df_train_nan
df_train_nan

Unnamed: 0,var,proportion,dtype
0,id_24,0.991962,float64
1,id_25,0.991310,float64
2,id_07,0.991271,float64
3,id_08,0.991271,float64
4,id_21,0.991264,float64
...,...,...,...
411,V284,0.000020,float64
412,V280,0.000020,float64
413,V279,0.000020,float64
414,V292,0.000020,float64


In [32]:
# Print the unique data type of variables with NaN
pd.DataFrame(df_train_nan['dtype'].unique(), columns=['dtype'])

Unnamed: 0,dtype
0,float64
1,object


In [33]:
# Call nan_checker on df_train
df_test_nan = nan_checker(df_test)

# Print df_train_nan
df_test_nan

Unnamed: 0,var,proportion,dtype
0,id_24,0.990645,float64
1,id_25,0.990055,float64
2,id_26,0.990039,float64
3,id_07,0.990016,float64
4,id_08,0.990016,float64
...,...,...,...
382,V316,0.000006,float64
383,C12,0.000006,float64
384,C11,0.000006,float64
385,V292,0.000006,float64


In [34]:
# Print the unique data type of variables with NaN
pd.DataFrame(df_test_nan['dtype'].unique(), columns=['dtype'])

Unnamed: 0,dtype
0,float64
1,object


In [35]:
df_nan = pd.merge(df_train_nan, df_test_nan, how="outer", on=["var", "dtype"])
df_nan

Unnamed: 0,var,proportion_x,dtype,proportion_y
0,id_24,0.991962,float64,0.990645
1,id_25,0.991310,float64,0.990055
2,id_07,0.991271,float64,0.990016
3,id_08,0.991271,float64,0.990016
4,id_21,0.991264,float64,0.990016
...,...,...,...,...
425,C9,,float64,0.000006
426,C10,,float64,0.000006
427,C14,,float64,0.000006
428,C12,,float64,0.000006


In [36]:
# Get the variables with missing values, their proportion of missing values and data type
df_miss = df_nan[df_nan['dtype'] != 'object'].reset_index(drop=True)

# Print df_miss
df_miss

Unnamed: 0,var,proportion_x,dtype,proportion_y
0,id_24,0.991962,float64,0.990645
1,id_25,0.991310,float64,0.990055
2,id_07,0.991271,float64,0.990016
3,id_08,0.991271,float64,0.990016
4,id_21,0.991264,float64,0.990016
...,...,...,...,...
393,C9,,float64,0.000006
394,C10,,float64,0.000006
395,C14,,float64,0.000006
396,C12,,float64,0.000006


In [37]:
# Get the variables with missing values, their proportion of missing values and data type
df_miss_obj = df_nan[df_nan['dtype'] == 'object'].reset_index(drop=True)

# Print df_miss
df_miss_obj

Unnamed: 0,var,proportion_x,dtype,proportion_y
0,id_27,0.991247,object,0.99001
1,id_23,0.991247,object,0.99001
2,id_33,0.875895,object,0.860524
3,id_30,0.868654,object,0.860548
4,id_34,0.868248,object,0.857556
5,DeviceInfo,0.799055,object,0.772925
6,id_16,0.78098,object,0.751827
7,R_emaildomain,0.767516,object,0.731848
8,R_emaildomain_bin,0.767516,object,0.731848
9,id_31,0.762451,object,0.730358


Drop column if the proportion of missing value is over 0.9, impute the rest by mean of column values.

In [38]:
# If there are missing values
if len(df_miss['var']) > 0:
    # The SimpleImputer
    si = SimpleImputer(missing_values=np.nan, strategy='mean')
    
    # Drop the most missing column
    for index, row in df_miss.iterrows():
        if row['proportion_x'] > 0.9 or row['proportion_y'] > 0.9:
            df_train = df_train.drop(row['var'], axis=1)
            df_test = df_test.drop(row['var'], axis=1)
            df_miss = df_miss.drop(index, axis=0)
            
    # Impute the variables with missing values in df_train and df_test 
    df_train[df_miss['var']] = si.fit_transform(df_train[df_miss['var']])
    df_test[df_miss['var']] = si.transform(df_test[df_miss['var']])

In [39]:
# if len(df_miss_obj['var']) > 0:
#     # Drop the most missing column
#     for index, row in df_miss_obj.iterrows():
#         if row['proportion_x'] > 0.9 or row['proportion_y'] > 0.9:
#             df_train = df_train.drop(row['var'], axis=1)
#             df_test = df_test.drop(row['var'], axis=1)
#             df_miss_obj = df_miss_obj.drop(index, axis=0)

#     df_train[list(df_miss_obj['var'].values)] = df_train[list(df_miss_obj['var'].values)].fillna('empty')
#     df_test[list(df_miss_obj['var'].values)] = df_test[list(df_miss_obj['var'].values)].fillna('empty')

**Note** that the ordinal encoder cannot handle null values (nan), so we also need to handle the missing values of the object type. This is not required if you use a one-hot encoder or label encoder.

### Encoding categorical features

In [40]:
# Call cat_var_checker on df_train
df_cat = cat_var_checker(df_train)

# Print the dataframe
df_cat

Unnamed: 0,var,nunique
0,DeviceInfo,1787
1,id_33,261
2,id_31,131
3,id_30,76
4,R_emaildomain,61
5,P_emaildomain,60
6,R_emaildomain_bin,10
7,P_emaildomain_bin,10
8,R_emaildomain_suffix,9
9,P_emaildomain_suffix,9


In fact, only the training set should be fit and the transform operation should be performed on both the training set and the testing set. However, there are multiple categories in the test set that have not appeared in the training set, the ordinal encoder will label them as unknown_value and assign them the same value. This operation will destroy the sample distribution of the testing set, so here we choose to use label encoder to fit and transform both training and testing datasets, you can also try to use ordinal encoder to fit and transform missing values processed training and testing sets.

In [41]:
# oe = OrdinalEncoder()

# oe.fit(df_train[list(df_cat['var'].values)] + df_test[list(df_cat['var'].values)])
# df_train[list(df_cat['var'].values)] = oe.transform(df_train[list(df_cat['var'].values)])
# df_test[list(df_cat['var'].values)] = oe.transform(df_test[list(df_cat['var'].values)])

In [42]:
for col in df_cat['var']:
    # The LabelEncoder
    le = LabelEncoder()
    le.fit(list(df_train[col].values) + list(df_test[col].values))
    df_train[col] = le.transform(list(df_train[col].values))
    df_test[col] = le.transform(list(df_test[col].values))

In [43]:
# Print the dimension of df_train
pd.DataFrame([[df_train.shape[0], df_train.shape[1]]], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,590540,432


In [44]:
# Print the dimension of df_test
pd.DataFrame([[df_test.shape[0], df_test.shape[1]]], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,506691,431


**Note** that the number of columns in the training set and testing set is the same because pandas automatically fill the isFraud column in testing set when we perform the concat operation.

### Numerical transform

There is no logic in them - simply aggregations on top features.

In [45]:
df_train['Trans_min_mean'] = df_train['TransactionAmt'] - df_train['TransactionAmt'].mean()
df_train['Trans_min_std'] = df_train['Trans_min_mean'] / df_train['TransactionAmt'].std()

df_test['Trans_min_mean'] = df_test['TransactionAmt'] - df_test['TransactionAmt'].mean()
df_test['Trans_min_std'] = df_test['Trans_min_mean'] / df_test['TransactionAmt'].std()

In [46]:
df_train['TransactionAmt_to_mean_card1'] = df_train['TransactionAmt'] / df_train.groupby(['card1'])['TransactionAmt'].transform('mean')
df_train['TransactionAmt_to_mean_card4'] = df_train['TransactionAmt'] / df_train.groupby(['card4'])['TransactionAmt'].transform('mean')
df_train['TransactionAmt_to_std_card1'] = df_train['TransactionAmt'] / df_train.groupby(['card1'])['TransactionAmt'].transform('std')
df_train['TransactionAmt_to_std_card4'] = df_train['TransactionAmt'] / df_train.groupby(['card4'])['TransactionAmt'].transform('std')

df_test['TransactionAmt_to_mean_card1'] = df_test['TransactionAmt'] / df_test.groupby(['card1'])['TransactionAmt'].transform('mean')
df_test['TransactionAmt_to_mean_card4'] = df_test['TransactionAmt'] / df_test.groupby(['card4'])['TransactionAmt'].transform('mean')
df_test['TransactionAmt_to_std_card1'] = df_test['TransactionAmt'] / df_test.groupby(['card1'])['TransactionAmt'].transform('std')
df_test['TransactionAmt_to_std_card4'] = df_test['TransactionAmt'] / df_test.groupby(['card4'])['TransactionAmt'].transform('std')

In [47]:
df_train['id_02_to_mean_card1'] = df_train['id_02'] / df_train.groupby(['card1'])['id_02'].transform('mean')
df_train['id_02_to_mean_card4'] = df_train['id_02'] / df_train.groupby(['card4'])['id_02'].transform('mean')
df_train['id_02_to_std_card1'] = df_train['id_02'] / df_train.groupby(['card1'])['id_02'].transform('std')
df_train['id_02_to_std_card4'] = df_train['id_02'] / df_train.groupby(['card4'])['id_02'].transform('std')


df_test['id_02_to_mean_card1'] = df_test['id_02'] / df_test.groupby(['card1'])['id_02'].transform('mean')
df_test['id_02_to_mean_card4'] = df_test['id_02'] / df_test.groupby(['card4'])['id_02'].transform('mean')
df_test['id_02_to_std_card1'] = df_test['id_02'] / df_test.groupby(['card1'])['id_02'].transform('std')
df_test['id_02_to_std_card4'] = df_test['id_02'] / df_test.groupby(['card4'])['id_02'].transform('std')


df_train['D15_to_mean_card1'] = df_train['D15'] / df_train.groupby(['card1'])['D15'].transform('mean')
df_train['D15_to_mean_card4'] = df_train['D15'] / df_train.groupby(['card4'])['D15'].transform('mean')
df_train['D15_to_std_card1'] = df_train['D15'] / df_train.groupby(['card1'])['D15'].transform('std')
df_train['D15_to_std_card4'] = df_train['D15'] / df_train.groupby(['card4'])['D15'].transform('std')

df_test['D15_to_mean_card1'] = df_test['D15'] / df_test.groupby(['card1'])['D15'].transform('mean')
df_test['D15_to_mean_card4'] = df_test['D15'] / df_test.groupby(['card4'])['D15'].transform('mean')
df_test['D15_to_std_card1'] = df_test['D15'] / df_test.groupby(['card1'])['D15'].transform('std')
df_test['D15_to_std_card4'] = df_test['D15'] / df_test.groupby(['card4'])['D15'].transform('std')

df_train['D15_to_mean_addr1'] = df_train['D15'] / df_train.groupby(['addr1'])['D15'].transform('mean')
df_train['D15_to_mean_addr2'] = df_train['D15'] / df_train.groupby(['addr2'])['D15'].transform('mean')
df_train['D15_to_std_addr1'] = df_train['D15'] / df_train.groupby(['addr1'])['D15'].transform('std')
df_train['D15_to_std_addr2'] = df_train['D15'] / df_train.groupby(['addr2'])['D15'].transform('std')


df_test['D15_to_mean_addr1'] = df_test['D15'] / df_test.groupby(['addr1'])['D15'].transform('mean')
df_test['D15_to_mean_addr2'] = df_test['D15'] / df_test.groupby(['addr2'])['D15'].transform('mean')
df_test['D15_to_std_addr1'] = df_test['D15'] / df_test.groupby(['addr1'])['D15'].transform('std')
df_test['D15_to_std_addr2'] = df_test['D15'] / df_test.groupby(['addr2'])['D15'].transform('std')

In [48]:
df_train['TransactionAmt'] = (df_train['TransactionAmt']+1).transform(np.log)

df_test['TransactionAmt'] = (df_test['TransactionAmt']+1).transform(np.log)

### Reduce memory usage

In [49]:
df_train = reduce_mem_usage(df_train)

Memory usage of dataframe is 2031.96 MB
Memory usage after optimization is: 557.55 MB
Decreased by 72.6%


In [50]:
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 1739.59 MB
Memory usage after optimization is: 485.63 MB
Decreased by 72.1%


### Feature Selection (Depends on Data set)

Choose a feature selection method based on the specifics of your dataset, here we use the correlation matrix to select the final features for training.

In [51]:
# Absolute value correlation matrix
corr_matrix = df_train[df_train[target].notnull()].corr().abs()

# Getting the upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold) or any(upper[column] == 0)]

df_train = df_train.drop(columns = to_drop)
df_test = df_test.drop(columns = to_drop)

In [52]:
# Print the dimension of df_train
pd.DataFrame([[df_train.shape[0], df_train.shape[1]]], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,590540,382


In [53]:
# Print the dimension of df_test
pd.DataFrame([[df_test.shape[0], df_test.shape[1]]], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,506691,381


### Scaling

In [54]:
# Get the feature matrix
X_train = df_train[np.setdiff1d(df_train.columns, [target])].values
X_test = df_test[np.setdiff1d(df_train.columns, [target])].values

# Get the target vector
y_train = df_train[target].values

Since Scaler can not handle nan of inf in ndarray, we need to convert them to int value

In [55]:
X_train = np.nan_to_num(X_train, nan=0, posinf=999999, neginf=-999999)
X_test = np.nan_to_num(X_test, nan=0, posinf=999999, neginf=-999999)

#### Normalization

In [56]:
# minmax_scaler = MinMaxScaler()
# df_train.iloc[:, 1:] = minmax_scaler.fit_transform(df_train.iloc[:,1:], df_train[target])
# df_test.iloc[:, 1:] = minmax_scaler.transform(df_test.iloc[:, 1:])

#### Standardize

In [57]:
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

## Upload to S3

In [58]:
tss = TimeSeriesSplit(n_splits=FOLDS)

for tr_idx, val_idx in tss.split(X_train, y_train):

    X_tr, X_vl = X_train[tr_idx, :], X_train[val_idx, :]
    y_tr, y_vl = y_train[tr_idx], y_train[val_idx]

Under normal circumstances, it is necessary to cyclically split the training set to generate the validation set. But for aws built-in algorithms, we need to save the dataset in csv or libsvm format. Of course, you can use the sagemaker channel to generate multiple training sets and validation sets to test separately, but here as a demonstration, we only take one division method to generate training sets and validation sets for subsequent model training.

In [59]:
train_buf = io.BytesIO()

sklearn.datasets.dump_svmlight_file(X_tr, y_tr, train_buf)
train_buf.seek(0);

In [60]:
boto3.resource('s3', region_name='us-west-2').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(train_buf)

s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('Uploaded training data location: {}'.format(s3_train_data))

train_output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(train_output_location))

Uploaded training data location: s3://test-asc-sagemaker-fraud/ieee-fraud-detection/train/preprocessed-dataset
Training artifacts will be uploaded to: s3://test-asc-sagemaker-fraud/ieee-fraud-detection/output


In [61]:
val_buf = io.BytesIO()

sklearn.datasets.dump_svmlight_file(X_vl, y_vl, val_buf)
val_buf.seek(0);

In [63]:
boto3.resource('s3', region_name='us-west-2').Bucket(bucket).Object(os.path.join(prefix, 'val', key)).upload_fileobj(val_buf)

s3_val_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('Uploaded validation data location: {}'.format(s3_val_data))

Uploaded validation data location: s3://test-asc-sagemaker-fraud/ieee-fraud-detection/train/preprocessed-dataset


## Model training

In [64]:
# get the built-in image from aws
container = sagemaker.image_uris.retrieve('xgboost', boto3.Session().region_name, '1.2-2')

# create a estimator for training
clf = sagemaker.estimator.Estimator(image_uri=container,
                                    role=sagemaker.get_execution_role(),
                                    instance_count=1, 
                                    instance_type=instance_type,
                                    output_path=train_output_location,
                                    sagemaker_session=session,
                                    base_job_name="{}-xgb".format('omv'))

### Hyperparameter Tuning & training

In [65]:
# init the hyperparameters
scale_pos_weight = sqrt(np.count_nonzero(y_train==0)/np.count_nonzero(y_train))
clf.set_hyperparameters(
    eval_metric="auc",
    objective="binary:logistic",
    scale_pos_weight=scale_pos_weight
)

# set the validation metric for hypermeter tuning
objective_metric_name = "validation:auc"

In [66]:
# set the range of hypermeters
hyperparameter_ranges = {
    "alpha": ContinuousParameter(0.01, 0.4, scaling_type="Logarithmic"),
    "lambda": ContinuousParameter(0.01, 0.4, scaling_type="Logarithmic"),
    "colsample_bytree": ContinuousParameter(0.3, 0.9, scaling_type="Logarithmic"),
    "gamma": ContinuousParameter(0.01, 0.7, scaling_type="Logarithmic"),
    "max_depth": IntegerParameter(7, 23, scaling_type="Logarithmic"),
    "subsample": ContinuousParameter(0.2, 0.9, scaling_type="Logarithmic"),
    "eta": ContinuousParameter(0.01, 0.2, scaling_type="Logarithmic"),
    "num_round": IntegerParameter(400, 1000, scaling_type="Logarithmic")
    
}

In [67]:
# init hyperparameter tuning job
tuner_log = HyperparameterTuner(
    clf,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=5,
    max_parallel_jobs=5,
    strategy="Random",
)

# start fit
tuner_log.fit(
    {"train": s3_train_data, "validation": s3_val_data},
    include_cls_metadata=False,
    job_name="xgb-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime()),
)

.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


## Model Deploying

In [68]:
# deploy the model with the best hyperparameters
predictor = tuner_log.deploy(initial_instance_count=1,
                       model_name="{}-xgb".format('omv-log'),
                       endpoint_name="{}-xgb".format('omv-log'),
                       instance_type=instance_type,
                       serializer=CSVSerializer(),
                       deserializer=None)


2022-05-02 20:22:31 Starting - Preparing the instances for training
2022-05-02 20:22:31 Downloading - Downloading input data
2022-05-02 20:22:31 Training - Training image download completed. Training in progress.
2022-05-02 20:22:31 Uploading - Uploading generated training model
2022-05-02 20:22:31 Completed - Training job completed
-----!

**Note** that AWS provided Random Search and Bayesian Search for hyperparameter tuning. Although different methods will not make much difference in the performance of the model, but please choose the method that best fits your models.

## Model evaluation

In [69]:
def predict(current_predictor, data):
    predictions = ''
    for array in data:
        predictions = ','.join([predictions, current_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [71]:
# split the test data into smaller size of batches to query the endpoint due to the large size of test data.
batch_size = 1500
predict_prob = []
for i in np.arange(0, len(X_test), step=batch_size):
    predict_prob_batch = predict(predictor, X_test[i : (i + batch_size), :])
    predict_prob.append(predict_prob_batch)

predict_prob = np.concatenate(predict_prob, axis=0)

In [None]:
preds = np.empty(1)
for i in range(0, len(X_test), 1000):
    data = X_test[i:i+1000]
    raw_preds = predict(predictor, data)
    preds = np.concatenate((preds, raw_preds), axis=None)
preds = preds[1:]

In [72]:
test_transaction = pd.read_csv('s3://test-asc-sagemaker-fraud/ieee-fraud-detection/raw-data/test_transaction.csv', usecols=[0])
test_transaction['isFraud'] = predict_prob
sub = test_transaction[['TransactionID', 'isFraud']]

In [73]:
sub.to_csv('xgb_5.csv', index=False)

test changes