# IEEE-CIS Fraud Detection

Dataset description: https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#latest-643955

# Import necessary packages

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Load training set and test set

In [66]:
# Single place to point the notebook at the competition CSV files: edit this
# one constant instead of four separate paths when running on another machine.
DATA_DIR = 'C://Users//Minghao Lyu//Desktop//Capstone'

# Training tables.
transaction = pd.read_csv(f'{DATA_DIR}//train_transaction.csv')
identify = pd.read_csv(f'{DATA_DIR}//train_identity.csv')
# Test tables.
transaction_test = pd.read_csv(f'{DATA_DIR}//test_transaction.csv')
identify_test = pd.read_csv(f'{DATA_DIR}//test_identity.csv')

Define a function to reduce memory usage

In [4]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Each int16/int32/int64/float16/float32/float64 column is inspected; if its
    minimum and maximum fit inside a narrower dtype of the same family, the
    column is replaced by a copy cast to that dtype. Columns are never widened,
    and columns of any other dtype are left alone.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be shrunk.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        When True, float columns whose range fits in half precision are stored
        as float16. Half precision keeps only about three significant decimal
        digits and represents integers exactly only up to 2048, so pass False
        to make float32 the narrowest float type when values must survive the
        round trip more faithfully.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    if use_float16:
        float_candidates = (np.float16, np.float32, np.float64)
    else:
        float_candidates = (np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo
        for candidate in candidates:
            # Candidates are ordered by size, so once one is as wide as the
            # current dtype nothing narrower fits: keep the column unchanged.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # With an all-NaN column both comparisons are False, so the column
            # simply keeps its dtype instead of being upcast.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

Merge the transaction and identity tables into one dataset, joining on TransactionID

In [67]:
# Left joins on TransactionID: every transaction row is kept, whether or not
# it has a matching identity row.
train = transaction.merge(identify, how='left', on='TransactionID')
test = transaction_test.merge(identify_test, how='left', on='TransactionID')

In [68]:
# reduce_mem_usage downcasts the columns in place and returns the same frame.
train = reduce_mem_usage(train)

Mem. usage decreased to 650.48 Mb (66.8% reduction)


In [69]:
# Same in-place downcast for the test frame.
test = reduce_mem_usage(test)

Mem. usage decreased to 565.37 Mb (66.3% reduction)


Quick summary of the train and test sets

In [7]:
# Row/column counts, dtype breakdown and memory footprint of the merged training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float16(354), float32(45), int16(1), int32(2), int8(1), object(31)
memory usage: 650.5+ MB


In [70]:
# Row/column counts, dtype breakdown and memory footprint of the merged test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506691 entries, 0 to 506690
Columns: 433 entries, TransactionID to DeviceInfo
dtypes: float16(346), float32(53), int16(1), int32(2), object(31)
memory usage: 565.4+ MB


In [62]:
# Number of training columns that contain at least one missing value.
n_missing_train = train.isnull().any().sum()
print(f'There are {n_missing_train} columns in train dataset with missing values.')

There are 414 columns in train dataset with missing values.


In [71]:
# Number of test columns that contain at least one missing value.
# The message now names the test dataset, which is the frame actually counted.
n_missing_test = test.isnull().any().sum()
print(f'There are {n_missing_test} columns in test dataset with missing values.')

There are 385 columns in train dataset with missing values.


Delete the datasets that are no longer needed to release memory

In [None]:
# Remove the four names bound to the raw tables; the merged train/test frames
# are used in the cells that follow.
del transaction, transaction_test
del identify, identify_test

Target field description:

In [76]:
# Class balance of the target: number of transactions per isFraud value.
isfraud = train['isFraud'].value_counts()
trace = go.Bar(
    x=isfraud.index,
    y=isfraud.values,
    text=isfraud.values,  # print the count above each bar
    textposition='outside',
)
fig = go.Figure(data=[trace])
# Title and axis labels so the chart can be read on its own.
fig.update_layout(
    title='Transactions per isFraud value',
    xaxis_title='isFraud',
    yaxis_title='Number of transactions',
)
fig.show()

Identify categorical columns

In [39]:
# Columns stored with the object dtype are treated as the categorical fields.
cat_cols = train.select_dtypes(include=['object']).columns.tolist()

In [40]:
# Display the names of the object-dtype columns.
cat_cols

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'id_12',
 'id_15',
 'id_16',
 'id_23',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']

Number of unique values in each categorical field

In [65]:
# One nunique() call over the categorical columns replaces the manual
# accumulation loop; missing values are not counted, as with Series.nunique().
unique_counts = train[cat_cols].nunique()
x = unique_counts.index.tolist()
y = unique_counts.values.tolist()

data = go.Bar(
    x=x,
    y=y,
    text=y,  # print the count above each bar
    textposition='outside',
)
fig = go.Figure([data])
# Title and axis labels so the chart can be read on its own.
fig.update_layout(
    title='Unique values per categorical field',
    xaxis_title='Categorical field',
    yaxis_title='Number of unique values',
)
fig.show()

Correlation between every pair of numerical fields

In [41]:
# Select the numeric/boolean columns explicitly before correlating: recent
# pandas versions no longer drop object columns inside corr() and raise on
# the string-valued fields instead.
corr = train.select_dtypes(include=['number', 'bool']).corr('pearson')

In [42]:
# Move the field names out of the index into a regular 'index' column so that
# later cells can read them by name. `corr` is overwritten, so run this once
# per freshly computed correlation matrix.
corr = corr.reset_index()

In [43]:
# Preview the first rows; the field names are in the 'index' column.
corr.head()

Unnamed: 0,index,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
0,TransactionID,1.0,0.014166,0.99828,0.012025,0.010122,-0.0199,-0.010091,-0.0239,-0.00037,...,0.222829,0.111975,-0.01503,0.084105,-0.029004,0.052621,-0.038339,0.020672,0.01367,-0.066437
1,isFraud,0.014166,1.0,0.013103,0.01132,-0.01364,0.003388,0.154151,-0.03358,0.005596,...,0.1501,0.050004,-0.041721,0.061597,0.063544,0.118409,-0.001905,0.034045,0.099587,0.069702
2,TransactionDT,0.99828,0.013103,1.0,0.01192,0.010625,-0.019202,-0.011222,-0.024132,-5.1e-05,...,0.214802,0.110234,-0.011815,0.082009,-0.024249,0.051437,-0.031312,0.020372,0.01779,-0.071392
3,TransactionAmt,0.012025,0.01132,0.01192,1.0,-0.005725,0.016136,-0.109788,0.003061,-0.007421,...,-0.334974,-0.096853,0.076174,-0.105868,-0.031841,-0.10462,0.08011,0.016142,0.023098,-0.041851
4,card1,0.010122,-0.01364,0.010625,-0.005725,1.0,0.00496,0.002965,-0.093633,0.020369,...,0.018892,0.008869,0.022617,0.015917,0.044073,0.016853,0.05755,-0.010192,0.078487,-0.009524


For each field, identify the other field with which it has the highest (positive) correlation

In [59]:
# For every column of `corr`, record the other field with the largest
# correlation value.
# The field's own row is excluded by name instead of being assumed to sort
# first, so a tie at 1.0 or a NaN self-correlation cannot make a field be
# reported as its own best match. The helper 'index' column is still
# processed, so the list keeps one leading entry for it.
corr_high = []
for field in corr.columns:
    # Sort once per field (NaN values go last) and drop the field's own row.
    ranked = corr.sort_values([field], ascending=False)
    others = ranked[ranked['index'] != field]
    corr_high.append({
        'corr1': field,
        'corr2': others['index'].iloc[0],
        'corr_value_+': others[field].iloc[0],
    })

In [60]:
# Build the summary table and drop, by name, the entry produced for the helper
# 'index' column. Unlike list.pop(0), this does not depend on the entry's
# position and gives the same result if the cell is re-run after corr_high
# has already become a DataFrame.
corr_high = pd.DataFrame(corr_high)
corr_high = corr_high[corr_high['corr1'] != 'index'].reset_index(drop=True)
# The discarded entry held a field name in the value column, which leaves the
# column with object dtype; convert it back to float.
corr_high = corr_high.astype({'corr_value_+': float})

In [61]:
# Preview each field (corr1) with its most correlated partner (corr2).
corr_high.head()

Unnamed: 0,corr1,corr2,corr_value_+
0,TransactionID,TransactionDT,0.99828
1,isFraud,V257,0.38306
2,TransactionDT,TransactionID,0.99828
3,TransactionAmt,V139,0.222308
4,card1,V330,0.146483
