# IEEE-CIS Fraud Detection

Dataset description: https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#latest-643955

# Import necessary packages

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Load training set

In [3]:
# Folder that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at another machine/location by editing a single line.
DATA_DIR = 'C://Users//Minghao Lyu//Desktop//Capstone'

# Transaction-level table and the identity table, both read in full.
transaction = pd.read_csv(DATA_DIR + '//train_transaction.csv')
identify = pd.read_csv(DATA_DIR + '//train_identity.csv')

Define a function to reduce memory usage by downcasting numeric columns

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (object, int8, unsigned, bool, ...) is left
    untouched. Integer columns are cast to the first of int8, int16, int32,
    int64 whose range contains the column's min and max. Float columns are cast
    to float16 or float32 when the range fits, otherwise to float64.

    Range checks are inclusive, so a column whose extremes sit exactly on a
    dtype's limits (e.g. -128..127) still gets that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The choice is made on value range only. float16 keeps roughly three
    significant decimal digits, so downcast float columns can lose precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
        else:
            # NaN or infinite extremes fail every comparison and therefore
            # fall through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame with a zero-byte footprint reports 0%.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

Merge the two datasets into a single dataset with a left join on TransactionID

In [5]:
# Left join: keep every transaction row and attach identity columns where a
# matching TransactionID exists.
train = transaction.merge(identify, how='left', on='TransactionID')

In [6]:
# Downcast numeric columns; reduce_mem_usage modifies the frame in place and
# returns that same frame, so rebinding `train` keeps the name pointing at it.
train = reduce_mem_usage(train)

Mem. usage decreased to 650.48 Mb (66.8% reduction)


Quick summary of train

In [7]:
# Print the index range, column count, per-dtype column counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float16(354), float32(45), int16(1), int32(2), int8(1), object(31)
memory usage: 650.5+ MB


Delete the datasets that are no longer needed to release memory

In [8]:
# The merged `train` frame supersedes both source tables; drop the names so
# their memory can be reclaimed.
del transaction
del identify

Target field description:

In [38]:
# Number of rows for each distinct value of the target column.
isfraud = train['isFraud'].value_counts()

# One bar per target value, with the raw count printed above each bar.
trace = go.Bar(x=isfraud.index,
               y=isfraud.values,
               text=isfraud.values,
               textposition='outside')
fig = go.Figure(data=[trace])

# Title and axis labels so the chart stands on its own; a categorical x axis
# shows only the actual target values instead of a continuous numeric scale.
fig.update_layout(title='Transaction count by isFraud value',
                  xaxis_title='isFraud',
                  yaxis_title='Number of transactions',
                  xaxis_type='category')
fig.show()

Identify categorical columns

In [39]:
# Names of all columns stored with the object dtype, in frame order.
cat_cols = train.select_dtypes(include='object').columns.tolist()

In [40]:
# Display the list of object-dtype column names collected above.
cat_cols

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'id_12',
 'id_15',
 'id_16',
 'id_23',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']

Number of unique values in each categorical field

In [15]:
# Count distinct non-null values for every categorical column in one pass,
# then print "<column> <count>" per line in the order of cat_cols.
for col_name, n_unique in train[cat_cols].nunique().items():
    print(col_name, n_unique)

ProductCD 5
card4 4
card6 4
P_emaildomain 59
R_emaildomain 60
M1 2
M2 2
M3 2
M4 3
M5 2
M6 2
M7 2
M8 2
M9 2
id_12 2
id_15 3
id_16 2
id_23 3
id_27 2
id_28 2
id_29 2
id_30 75
id_31 130
id_33 260
id_34 4
id_35 2
id_36 2
id_37 2
id_38 2
DeviceType 2
DeviceInfo 1786


Correlation between all numerical fields

In [41]:
# Pearson correlation matrix over the numeric columns only. `train` still
# contains object-dtype columns, and pandas >= 2.0 raises on them because
# DataFrame.corr no longer drops non-numeric columns by default; selecting the
# numeric columns explicitly works on both older and newer pandas.
corr = train.select_dtypes(include='number').corr(method='pearson')